Piotr Robak,
Antoni Zajko,
Mikołaj Roguski,
Dawid Płudowski,

Load Dataset

In [None]:
!pip install datasets

In [1]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments





In [3]:
def preprocess_text(text):
    # Remove HTML tags, non-alphabetic characters, and lowercase the text
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in text.split() if word not in stop_words]
    return tokens

In [4]:
import nltk
nltk.download('stopwords')
# Apply preprocessing to the dataset
ds['train'] = ds['train'].map(lambda x: {'tokens': preprocess_text(x['text'])})
ds['test'] = ds['test'].map(lambda x: {'tokens': preprocess_text(x['text'])})

# Extract training and test data
train_texts = ds['train']['tokens']
train_labels = ds['train']['label']
test_texts = ds['test']['tokens']
test_labels = ds['test']['label']

Map: 100%|██████████| 25000/25000 [00:10<00:00, 2330.90 examples/s]


In [5]:
# Function to convert text into embeddings
def vectorize_text_with_given_model(tokens, model, vector_size=100):
    vec = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count > 0:
        vec /= count
    return vec

In [12]:
def load_glove_embeddings(filepath):
    embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load GloVe embeddings (Ensure you download GloVe before running this) !!!!!!
#!!!!!!!!!
#!!!!!!!
#https://www.kaggle.com/datasets/danielwillgeorge/glove6b100dtxt?resource=download

In [13]:
class Glove_model:
    def __init__(self):
        self.wv = load_glove_embeddings('glove.6B.100d.txt')

In [19]:


vectorizers = {}
vectorizers["fastText"] = FastText(sentences=train_texts, vector_size=100, window=5, min_count=5, workers=4)
vectorizers["Word2Vec"] = Word2Vec(sentences=train_texts, vector_size=100, window=5, min_count=5, workers=4)
vectorizers["Glove"] = Glove_model()

classifiers = {}
classifiers["RandomForestClassifier"] = RandomForestClassifier(max_depth=2, random_state=0)
classifiers["SVM"] = SVC(kernel='linear', C=1.0)
classifiers["Ada"] = AdaBoostClassifier( n_estimators=50, random_state=42)
classifiers["LogisticRegression"] = LogisticRegression(max_iter=1000)

results_f1 = {}
results_recall = {}
results_precision = {}

for vectorizer_name in vectorizers:

    partial_result_f1 = {}
    partial_result_recall = {}
    partial_result_precision = {}

    vector_model = vectorizers[vectorizer_name]
    # Vectorize training and test data
    X_train = np.array([vectorize_text_with_given_model(tokens, vector_model) for tokens in train_texts])
    X_test = np.array([vectorize_text_with_given_model(tokens, vector_model) for tokens in test_texts])

    for classifier_name in classifiers:

        classifier = classifiers[classifier_name]
        classifier.fit(X_train, train_labels)

        # Predict on the test set
        y_pred = classifier.predict(X_test)

        # Calculate evaluation metrics
        accuracy = accuracy_score(test_labels, y_pred)
        precision = precision_score(test_labels, y_pred, average='weighted')
        recall = recall_score(test_labels, y_pred, average='weighted')
        f1 = f1_score(test_labels, y_pred, average='weighted')

        partial_result_f1[classifier_name] = f1
        partial_result_recall[classifier_name] = recall
        partial_result_precision[classifier_name] = precision

        # Print evaluation results
        print(f"Test Accuracy: {accuracy * 100:.2f}%")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}")

    results_f1[vectorizer_name]=partial_result_f1
    results_recall[vectorizer_name]=partial_result_recall
    results_precision[vectorizer_name]=partial_result_precision

print(pd.DataFrame(results_f1))

Test Accuracy: 69.68%
Precision: 0.6981155586241017
Recall: 0.69684
F1 Score: 0.6963512425875268
Test Accuracy: 80.16%
Precision: 0.8016400945943338
Recall: 0.80164
F1 Score: 0.8016399844485749




Test Accuracy: 75.43%
Precision: 0.7543528241302129
Recall: 0.75432
F1 Score: 0.7543120735178408
Test Accuracy: 79.95%
Precision: 0.7995044759849846
Recall: 0.79948
F1 Score: 0.799475903212493
Test Accuracy: 69.27%
Precision: 0.6952684045196933
Recall: 0.69272
F1 Score: 0.6917141575081024
Test Accuracy: 81.45%
Precision: 0.8144863118660737
Recall: 0.81448
F1 Score: 0.8144790691301774




Test Accuracy: 76.69%
Precision: 0.7669904154113837
Recall: 0.76692
F1 Score: 0.766904630996702
Test Accuracy: 81.44%
Precision: 0.8143600985833269
Recall: 0.81436
F1 Score: 0.8143599854458228
Test Accuracy: 70.99%
Precision: 0.7102591112773012
Recall: 0.70992
F1 Score: 0.7098029907085929
Test Accuracy: 79.74%
Precision: 0.7973941760628824
Recall: 0.79736
F1 Score: 0.7973541780665941




Test Accuracy: 75.50%
Precision: 0.7550511391816923
Recall: 0.755
F1 Score: 0.7549877184163685
Test Accuracy: 79.71%
Precision: 0.7971116431030585
Recall: 0.79708
F1 Score: 0.7970745969893895
                        fastText  Word2Vec     Glove
RandomForestClassifier  0.696351  0.691714  0.709803
SVM                     0.801640  0.814479  0.797354
Ada                     0.754312  0.766905  0.754988
LogisticRegression      0.799476  0.814360  0.797075


Result F1

In [20]:
print(pd.DataFrame(results_f1))

                        fastText  Word2Vec     Glove
RandomForestClassifier  0.696351  0.691714  0.709803
SVM                     0.801640  0.814479  0.797354
Ada                     0.754312  0.766905  0.754988
LogisticRegression      0.799476  0.814360  0.797075


Result precision

In [21]:
print(pd.DataFrame(results_precision))

                        fastText  Word2Vec     Glove
RandomForestClassifier  0.698116  0.695268  0.710259
SVM                     0.801640  0.814486  0.797394
Ada                     0.754353  0.766990  0.755051
LogisticRegression      0.799504  0.814360  0.797112


Result Recall

In [22]:
print(pd.DataFrame(results_recall))

                        fastText  Word2Vec    Glove
RandomForestClassifier   0.69684   0.69272  0.70992
SVM                      0.80164   0.81448  0.79736
Ada                      0.75432   0.76692  0.75500
LogisticRegression       0.79948   0.81436  0.79708


In [16]:
import torch
# Load the DistilBERT tokenizer and model for sequence classification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the text for DistilBERT
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# Apply tokenization to the dataset
tokenized_ds = ds.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Split dataset into train and test sets
train_dataset = tokenized_ds['train']
test_dataset = tokenized_ds['test']

# Define metrics function for evaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()

# Print evaluation results
print(f"Accuracy: {metrics['eval_accuracy']:.2f}")
print(f"Precision: {metrics['eval_precision']}")
print(f"Recall: {metrics['eval_recall']}")
print(f"F1 Score: {metrics['eval_f1']}")

KeyboardInterrupt: 

Because of lack of computing power, we were unable to test transformers, but code above works.