# Tutorial 2 - Embeddings

Done by:
- Nigel Teo
- Marc Martinez
- Menon Lainaud
- Alessandro Gentili

## TASK: sentiment analysis classification

Dataset: IMBD dataset with the defined split  into train/test,
e.g. from here:  https://huggingface.co/datasets/stanfordnlp/imdb

Methods: try different methods with embedding-based models, 
i.e. word2vec, fasttext, glove, transformers etc. 


Outcome: table with metrics for all tested models 
and data-processing pipelines (F1 score, precision, recall)

In [2]:
# !pip install gensim
# !pip install datasets
# !pip install transformers

In [3]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import torch


# Import word embedding models (word2vec, fasttext, glove, transformers)
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.models import LdaModel


# Import transformers
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments


# Import sklearn libraries and metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier



  from .autonotebook import tqdm as notebook_tqdm


## Preprocess Data

In [4]:
# import dataset from Dataset: IMBD dataset with the defined split  into train/test,
# e.g. from here:  https://huggingface.co/datasets/stanfordnlp/imdb

# Load the dataset
from datasets import load_dataset
dataset = load_dataset('imdb')

nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anadu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the dataset into a pandas dataframe
df = pd.DataFrame(dataset['train'])

# df.head()

# Preprocess the text data
def preprocess_text(text):
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(text))
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    # Converting to Lowercase
    text = text.lower()
    # Remove stop words using NLTK
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply the preprocess_text function to the 'text' column
df['text'] = df['text'].apply(preprocess_text)

df.head()



Unnamed: 0,text,label
0,rented curious yellow video store controversy ...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godard masculin féminin...,0
4,oh brother hearing ridiculous film umpteen yea...,0


In [6]:
# Split the data into training and testing sets
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
X_train_encoded = tokenizer(X_train.tolist(), truncation=True, padding=True)
X_test_encoded = tokenizer(X_test.tolist(), truncation=True, padding=True)

# Convert the encoded data into PyTorch tensors
X_train_input_ids = torch.tensor(X_train_encoded['input_ids'])

## Word2Vec Model

In [7]:
# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=X_train.apply(lambda x: x.split()), vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(X_train.apply(lambda x: x.split()), total_examples=word2vec_model.corpus_count, epochs=10)

# Encode the text data using the Word2Vec model
X_train_word2vec = X_train.apply(lambda x: np.mean([word2vec_model.wv[word] for word in x.split() if word in word2vec_model.wv], axis=0))
X_test_word2vec = X_test.apply(lambda x: np.mean([word2vec_model.wv[word] for word in x.split() if word in word2vec_model.wv], axis=0))



In [8]:
# Evaluate the Word2Vec model using classifiers with statistics

classifiers = {}
classifiers['SVM'] = SVC()
classifiers['Logistic Regression'] = LogisticRegression()
classifiers['Random Forest'] = RandomForestClassifier()
classifiers['Gradient Boosting'] = GradientBoostingClassifier()

results_df = pd.DataFrame(columns=['Classifier', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

for name, classifier in classifiers.items():
    classifier.fit(X_train_word2vec.tolist(), y_train)
    y_pred = classifier.predict(X_test_word2vec.tolist())
    print(f'Classifier: {name}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.5f}')
    print(f'F1 Score: {f1_score(y_test, y_pred):.5f}')
    print(f'Precision: {precision_score(y_test, y_pred):.5f}')
    print(f'Recall: {recall_score(y_test, y_pred):.5f}')
    
    # Create a DataFrame with values rounded to 5 decimal places
    df = pd.DataFrame([[
        "word2vec + " + name, 
        round(accuracy_score(y_test, y_pred), 5), 
        round(f1_score(y_test, y_pred), 5), 
        round(precision_score(y_test, y_pred), 5), 
        round(recall_score(y_test, y_pred), 5)
    ]], columns=['Classifier', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])
    
    results_df = pd.concat([results_df, df], ignore_index=True)

results_df

Classifier: SVM
Accuracy: 0.86680
F1 Score: 0.86780
Precision: 0.85625
Recall: 0.87968
Classifier: Logistic Regression
Accuracy: 0.86320
F1 Score: 0.86396
Precision: 0.85411
Recall: 0.87404
Classifier: Random Forest
Accuracy: 0.83640
F1 Score: 0.83866
Precision: 0.82244
Recall: 0.85553
Classifier: Gradient Boosting
Accuracy: 0.84160
F1 Score: 0.84185
Precision: 0.83551
Recall: 0.84829


Unnamed: 0,Classifier,Accuracy,F1 Score,Precision,Recall
0,word2vec + SVM,0.8668,0.8678,0.85625,0.87968
1,word2vec + Logistic Regression,0.8632,0.86396,0.85411,0.87404
2,word2vec + Random Forest,0.8364,0.83866,0.82244,0.85553
3,word2vec + Gradient Boosting,0.8416,0.84185,0.83551,0.84829


## FastText Model

In [9]:
# Train the FastText model
fasttext_model = FastText(sentences=X_train.apply(lambda x: x.split()), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model.train(X_train.apply(lambda x: x.split()), total_examples=fasttext_model.corpus_count, epochs=10)

# Encode the text data using the FastText model
X_train_fasttext = X_train.apply(lambda x: np.mean([fasttext_model.wv[word] for word in x.split() if word in fasttext_model.wv], axis=0))
X_test_fasttext = X_test.apply(lambda x: np.mean([fasttext_model.wv[word] for word in x.split() if word in fasttext_model.wv], axis=0))



In [10]:
# Evaluate the FastText model using classifiers with statistics


for name, classifier in classifiers.items():
    classifier.fit(X_train_fasttext.tolist(), y_train)
    y_pred = classifier.predict(X_test_fasttext.tolist())
    print(f'Classifier: {name}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.5f}')
    print(f'F1 Score: {f1_score(y_test, y_pred):.5f}')
    print(f'Precision: {precision_score(y_test, y_pred):.5f}')
    print(f'Recall: {recall_score(y_test, y_pred):.5f}')
    
    # Create a DataFrame with values rounded to 5 decimal places
    df = pd.DataFrame([[
        "fasttext + " + name, 
        round(accuracy_score(y_test, y_pred), 5), 
        round(f1_score(y_test, y_pred), 5), 
        round(precision_score(y_test, y_pred), 5), 
        round(recall_score(y_test, y_pred), 5)
    ]], columns=['Classifier', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])
    
    results_df = pd.concat([results_df, df], ignore_index=True)

results_df


Classifier: SVM
Accuracy: 0.85760
F1 Score: 0.85788
Precision: 0.85109
Recall: 0.86479


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classifier: Logistic Regression
Accuracy: 0.85340
F1 Score: 0.85314
Precision: 0.84956
Recall: 0.85674
Classifier: Random Forest
Accuracy: 0.81320
F1 Score: 0.81585
Precision: 0.79977
Recall: 0.83260
Classifier: Gradient Boosting
Accuracy: 0.82720
F1 Score: 0.82823
Precision: 0.81847
Recall: 0.83823


Unnamed: 0,Classifier,Accuracy,F1 Score,Precision,Recall
0,word2vec + SVM,0.8668,0.8678,0.85625,0.87968
1,word2vec + Logistic Regression,0.8632,0.86396,0.85411,0.87404
2,word2vec + Random Forest,0.8364,0.83866,0.82244,0.85553
3,word2vec + Gradient Boosting,0.8416,0.84185,0.83551,0.84829
4,fasttext + SVM,0.8576,0.85788,0.85109,0.86479
5,fasttext + Logistic Regression,0.8534,0.85314,0.84956,0.85674
6,fasttext + Random Forest,0.8132,0.81585,0.79977,0.8326
7,fasttext + Gradient Boosting,0.8272,0.82823,0.81847,0.83823


## Use of GloVe

In [11]:

def load_glove_embeddings(filepath):
    embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load the GloVe embeddings
glove_embeddings = load_glove_embeddings(r'C:\NUS\Y4S1\Natural Language Processing\glove model\glove.6B.100d.txt')


In [12]:
# Train the GloVe model
def encode_sentence_glove(sentence, glove_embeddings, vector_size=100):
    words = sentence.split()
    vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        # If no word is found in GloVe, return a zero vector of the same size
        return np.zeros(vector_size)

# Encode training and test data
X_train_glove = X_train.apply(lambda x: encode_sentence_glove(x, glove_embeddings, vector_size=100))
X_test_glove = X_test.apply(lambda x: encode_sentence_glove(x, glove_embeddings, vector_size=100))




In [13]:
# Evaluate the GloVe model using classifiers with statistics

for name, classifier in classifiers.items():
    classifier.fit(X_train_glove.tolist(), y_train)
    y_pred = classifier.predict(X_test_glove.tolist())
    print(f'Classifier: {name}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.5f}')
    print(f'F1 Score: {f1_score(y_test, y_pred):.5f}')
    print(f'Precision: {precision_score(y_test, y_pred):.5f}')
    print(f'Recall: {recall_score(y_test, y_pred):.5f}')
    
    # Create a DataFrame with values rounded to 5 decimal places
    df = pd.DataFrame([[
        "glove + " + name, 
        round(accuracy_score(y_test, y_pred), 5), 
        round(f1_score(y_test, y_pred), 5), 
        round(precision_score(y_test, y_pred), 5), 
        round(recall_score(y_test, y_pred), 5)
    ]], columns=['Classifier', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])
    
    results_df = pd.concat([results_df, df], ignore_index=True)

results_df


Classifier: SVM
Accuracy: 0.79800
F1 Score: 0.79768
Precision: 0.79418
Recall: 0.80121


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classifier: Logistic Regression
Accuracy: 0.79320
F1 Score: 0.79178
Precision: 0.79242
Recall: 0.79115
Classifier: Random Forest
Accuracy: 0.75760
F1 Score: 0.75828
Precision: 0.75168
Recall: 0.76499
Classifier: Gradient Boosting
Accuracy: 0.77320
F1 Score: 0.77383
Precision: 0.76710
Recall: 0.78068


Unnamed: 0,Classifier,Accuracy,F1 Score,Precision,Recall
0,word2vec + SVM,0.8668,0.8678,0.85625,0.87968
1,word2vec + Logistic Regression,0.8632,0.86396,0.85411,0.87404
2,word2vec + Random Forest,0.8364,0.83866,0.82244,0.85553
3,word2vec + Gradient Boosting,0.8416,0.84185,0.83551,0.84829
4,fasttext + SVM,0.8576,0.85788,0.85109,0.86479
5,fasttext + Logistic Regression,0.8534,0.85314,0.84956,0.85674
6,fasttext + Random Forest,0.8132,0.81585,0.79977,0.8326
7,fasttext + Gradient Boosting,0.8272,0.82823,0.81847,0.83823
8,glove + SVM,0.798,0.79768,0.79418,0.80121
9,glove + Logistic Regression,0.7932,0.79178,0.79242,0.79115


In [14]:
# Export results_df to a CSV file at C:\NUS\Y4S1\Natural Language Processing with the name Tutorial_2_results.csv
results_df.to_csv(r'C:\NUS\Y4S1\Natural Language Processing\Tutorial_2_results.csv', index=False)


## Key insights:
Word2Vec shows the highest performance across all metrics, with SVM achieving the best results: 86.68% accuracy and 0.8678 F1 score.
FastText comes in a close second, with SVM performing best again with 85.76% accuracy and 0.85788 F1 score. Logistic Regression also performs well on FastText embeddings.
GloVe generally underperforms compared to Word2Vec and FastText, with its best result coming from SVM, achieving 79.8% accuracy and 0.79768 F1 score.
Random Forest and Gradient Boosting models perform worse overall than SVM and Logistic Regression, especially when paired with GloVe embeddings, which show the lowest scores for these models.


## Overall:
SVM and Logistic Regression perform best when paired with Word2Vec and FastText embeddings, while GloVe performs notably worse across all machine learning models.

## Using DistilBertForSequenceClassification Transformer

The below code is correct, however it is computationally expensive and takes a long time to run on my computer.

In [None]:
# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the training and testing data
X_train_encoded = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
X_test_encoded = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Convert labels to PyTorch tensors
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        # No need to clone tensors; just access them directly
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Access the label directly
        return item

    def __len__(self):
        return len(self.labels)

# Prepare the datasets
train_dataset = TextDataset(X_train_encoded, y_train.values)  # Ensure y_train is a NumPy array or list
test_dataset = TextDataset(X_test_encoded, y_test.values)      # Ensure y_test is a NumPy array or list

print(len(X_train_encoded['input_ids']), len(X_train_encoded['attention_mask']), len(y_train))

X_train_encoded = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
X_test_encoded = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')


# Load the DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

# Define the Trainer object
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)
print(X_train.iloc[3102])
print(y_train.iloc[3102])
# Train the model
trainer.train()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


20000 20000 20000


  0%|          | 0/3750 [04:25<?, ?it/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


even subscribe knee jerk anti free trade politics movie still tired note played clink clink clink even accept preacher peroxide hair advocates return first principles reverend billy pretty hard look serious figure clownish reverend sort wakes every morning aspiration ethereal see face tv climbs back bed night pretty wife admit would take tons save dreary mess movie interminable bus rides worst part progress shown guess colored line moving across map aww guessed oh well virtue short favorable thing say hmmmm yep afraid
0


  0%|          | 3/3750 [03:04<63:07:33, 60.65s/it]

In [49]:
# Evaluate the DistilBERT model using classifiers with statistics



NameError: name 'X_train_lda' is not defined