### Nazira Tukeyeva | NLP Tutorial exercise

In [1]:
!pip install datasets



In [13]:
from datasets import load_dataset
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import gensim.downloader as api
import fasttext
import torch
from transformers import BertTokenizer, BertModel

In [1]:
df = load_dataset("stanfordnlp/imdb")
df_train = df['train']
df_test = df['test']



In [2]:
train_df = pd.DataFrame(df_train)
test_df = pd.DataFrame(df_test)

In [8]:
print(train_df.head(1))
print(test_df.head(1))

                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
                                                text  label
0  I love sci-fi and am willing to put up with a ...      0


In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

In [10]:
print(train_df['text'].head(1))
print(test_df['text'].head(1))

0    i rented i am curiousyellow from my video stor...
Name: text, dtype: object
0    i love scifi and am willing to put up with a l...
Name: text, dtype: object


In [11]:
# !pip install gensim



In [12]:
# !pip install gensim fasttext transformers torchtext xgboost scikit-learn

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-c

In [6]:
# !pip install transformers

Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp39-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp39-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 9.9/9.9 MB 51.3 kB/s eta 0:00:00
Downloading safetensors-0.4.5-cp39-none-win_amd64.whl (286 kB)
Downloading tokenizers-0.20.1-cp39-none-win_amd64.whl (2.4 MB)




   ---------------------------------------- 2.4/2.4 MB 36.4 kB/s eta 0:00:00
Installing collected packages: safetensors, tokenizers, transformers
Successfully installed safetensors-0.4.5 tokenizers-0.20.1 transformers-4.45.2


In [11]:
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

def train_and_evaluate(X_train, X_test, y_train, y_test, model_name):
    results = {}
    for clf_name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        results[clf_name] = {
            'Precision': report['weighted avg']['precision'],
            'Recall': report['weighted avg']['recall'],
            'F1-Score': report['weighted avg']['f1-score']
        }
    return results

# embediding methods
def get_word2vec_embeddings(text, model):
    words = text.split()
    feature_vec = np.mean([model[word] for word in words if word in model], axis=0)
    return feature_vec if feature_vec.size > 0 else np.zeros(300)

def get_fasttext_embeddings(text, model):
    words = text.split()
    feature_vec = np.mean([model.get_word_vector(word) for word in words], axis=0)
    return feature_vec if feature_vec.size > 0 else np.zeros(300)

def get_glove_embeddings(text, glove_model):
    words = text.split()
    feature_vec = np.mean([glove_model.vectors[glove_model.stoi[word]].numpy() for word in words if word in glove_model.stoi], axis=0)
    return feature_vec if feature_vec.size > 0 else np.zeros(300)

def get_bert_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy().flatten()

In [3]:
word2vec_model = api.load("word2vec-google-news-300")
word2vec_model.save("word2vec_model.bin")

In [14]:
ft_model = fasttext.load_model('cc.en.300.bin')
ft_model.save_model('saved_ft_model.bin')

In [19]:
# !pip uninstall torch torchtext -y

Found existing installation: torch 2.5.0
Uninstalling torch-2.5.0:
  Successfully uninstalled torch-2.5.0
Found existing installation: torchtext 0.18.0
Uninstalling torchtext-0.18.0:
  Successfully uninstalled torchtext-0.18.0


In [21]:
# !pip install torch==2.0.0 torchtext==0.15.1

Collecting torch==2.0.0
  Using cached torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.1
  Using cached torchtext-0.15.1-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0)
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0)
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.0)
  Using cached nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.0)
  Using cached nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.0)
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6

In [17]:
# !pip install torchtext



In [7]:
import torchtext.vocab as vocab

glove = vocab.GloVe(name='6B', dim=300)
joblib.dump(glove, 'saved_glove_model.pkl')

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
torch.save(bert_model.state_dict(), 'saved_bert_model.pth')
tokenizer.save_pretrained('saved_bert_tokenizer')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [17]:
embedding_methods = {
    'Word2Vec': lambda x: get_word2vec_embeddings(x, word2vec_model),
    'FastText': lambda x: get_fasttext_embeddings(x, ft_model),
    'GloVe': lambda x: get_glove_embeddings(x, glove),
    'BERT': lambda x: get_bert_embeddings(x, tokenizer, bert_model)
}

In [18]:
from gensim.models import KeyedVectors
word2vec_model = KeyedVectors.load("word2vec_model.bin")

ft_model = fasttext.load_model('saved_ft_model.bin')

glove = joblib.load('saved_glove_model.pkl')

bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.load_state_dict(torch.load('saved_bert_model.pth'))
tokenizer = BertTokenizer.from_pretrained('saved_bert_tokenizer')

In [19]:
def emb(train_df, test_df, y_train, y_test):
    results = []
    for embed_name, embed_func in embedding_methods.items():
        print(f"{embed_name} embeddings.")
        X_train = np.stack(train_df['text'].apply(embed_func).values)
        X_test = np.stack(test_df['text'].apply(embed_func).values)
        metrics = train_and_evaluate(X_train, X_test, y_train, y_test, embed_name)

        for clf_name, metric in metrics.items():
            results.append({
                'Embedding': embed_name,
                'Classifier': clf_name,
                'Precision': metric['Precision'],
                'Recall': metric['Recall'],
                'F1-Score': metric['F1-Score']
            })

    return pd.DataFrame(results)

y_train = train_df['label']
y_test = test_df['label']

Word2Vec embeddings.
FastText embeddings.
GloVe embeddings.
BERT embeddings.


In [30]:
results_df = emb(train_df, test_df, y_train, y_test)

In [36]:
results_df['F1-Score'] = results_df['F1-Score'].astype(float)
final = results_df.groupby('Embedding').apply(lambda x: x.nlargest(2, 'F1-Score')).reset_index(drop=True)

final

Unnamed: 0,Embedding,Classifier,Precision,Recall,F1-Score
0,BERT,Logistic Regression,0.80123,0.78011,0.79045
1,BERT,XGB,0.79518,0.75211,0.77128
2,FastText,XGB,0.85011,0.79173,0.81288
3,FastText,Random Forest,0.80367,0.75314,0.77341
4,GloVe,XGB,0.76458,0.75142,0.75776
5,GloVe,Random Forest,0.7531,0.73789,0.74503
6,Word2Vec,XGB,0.83045,0.79523,0.81267
7,Word2Vec,Logistic Regression,0.75532,0.76214,0.81047
