In [1]:
import nltk                               
from nltk.corpus import twitter_samples   
from nltk.corpus import stopwords
import matplotlib.pyplot as plt      
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 
from nltk.classify import SklearnClassifier

In [2]:
import pandas as pd

# Relative parquet paths of each split inside the stanfordnlp/imdb dataset repository.
splits = {
    'train': 'plain_text/train-00000-of-00001.parquet',
    'test': 'plain_text/test-00000-of-00001.parquet',
    'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet',
}

# Load the labelled train and test splits (in that order); the unsupervised split is not read here.
df_train, df_test = (
    pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits[split_name])
    for split_name in ("train", "test")
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Display the training frame to inspect its `text` and `label` columns.
df_train

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [5]:
# Fetch the NLTK stop-word corpus that `stopwords.words("english")` reads in later cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maaja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# English stop words as a set for fast membership tests.
# NOTE(review): a later cell builds an identical set named `stop_words`, and
# `stopwords_set` is not referenced in the visible cells — confirm whether it is still needed.
stopwords_set = set(stopwords.words("english"))

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
import nltk

def preprocess_text(text):
    """Lower-case ``text`` and split it into tokens with NLTK's ``word_tokenize``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    The token sequence produced by ``word_tokenize`` for the lower-cased text.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

# Tokenize every review and keep the result in a new 'tokens' column on each frame.
df_train['tokens'] = df_train['text'].apply(preprocess_text)
df_test['tokens'] = df_test['text'].apply(preprocess_text)

# Features are the token lists, targets are the labels.
X_train = df_train['tokens']
y_train = df_train['label']

X_test = df_test['tokens']
y_test = df_test['label']

In [8]:
# Display the tokenized training documents as a sanity check.
X_train

0        [i, rented, i, am, curious-yellow, from, my, v...
1        [``, i, am, curious, :, yellow, '', is, a, ris...
2        [if, only, to, avoid, making, this, type, of, ...
3        [this, film, was, probably, inspired, by, goda...
4        [oh, ,, brother, ..., after, hearing, about, t...
                               ...                        
24995    [a, hit, at, the, time, but, now, better, cate...
24996    [i, love, this, movie, like, no, other, ., ano...
24997    [this, film, and, it, 's, sequel, barry, mcken...
24998    ['the, adventures, of, barry, mckenzie, ', sta...
24999    [the, story, centers, around, barry, mckenzie,...
Name: tokens, Length: 25000, dtype: object

# Comparison of embedding methods: Word2Vec vs. FastText

In [10]:
# English stop words as a set; read as a module-level name by `remove_stopwords` below.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The relative order of the surviving tokens is preserved and a new list is
    returned; ``tokens`` itself is not modified.
    """
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Stop-word-filtered token lists, one per document, for each split.
X_train_clean = list(map(remove_stopwords, X_train))
X_test_clean = list(map(remove_stopwords, X_test))

In [12]:
from gensim.models import Word2Vec

# Train a 100-dimensional Word2Vec model on the training tokens.
# NOTE(review): the model is fit on `X_train` (stop words included), while the
# document vectors below are averaged over `X_train_clean` — confirm this is intended.
# NOTE(review): no `seed` is passed, so embeddings may differ between runs.
w2v_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=2, workers=4)

def vectorize_sentence_w2v(tokens, model):
    """Embed a token list as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    model : object
        Any model exposing keyed vectors as ``model.wv`` that supports
        ``word in model.wv`` and ``model.wv[word]`` (both the Word2Vec and
        FastText models in this notebook are passed here).

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors found for ``tokens``. When no
        token has a vector, a zero vector whose length matches the model's
        embedding size is returned.
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]

    if not vectors:
        # Size the fallback from the model instead of assuming 100 dimensions,
        # and return an ndarray so both code paths yield the same type.
        # The default of 100 keeps the previous behaviour for objects that do
        # not expose `vector_size`.
        dim = getattr(keyed_vectors, "vector_size", 100)
        return np.zeros(dim, dtype=np.float32)

    return sum(vectors) / len(vectors)

# One averaged Word2Vec vector per stop-word-filtered document, for each split.
X_train_w2v = [
    vectorize_sentence_w2v(doc_tokens, w2v_model)
    for doc_tokens in X_train_clean
]
X_test_w2v = [
    vectorize_sentence_w2v(doc_tokens, w2v_model)
    for doc_tokens in X_test_clean
]

In [13]:
from gensim.models import FastText

# Train a 100-dimensional FastText model on the training tokens, using the same
# hyper-parameters as the Word2Vec model above.
fasttext_model = FastText(sentences=X_train, vector_size=100, window=5, min_count=2, workers=4)

# Persist the trained model to "fasttext.model" in the working directory, then read it back.
# NOTE(review): reloading right after saving looks redundant within this session —
# confirm whether the round trip is only meant to verify that the file loads.
fasttext_model.save("fasttext.model")
fasttext_model = FastText.load("fasttext.model")

# One averaged FastText vector per stop-word-filtered document, for each split.
# `vectorize_sentence_w2v` only needs `model.wv`, so it is reused for FastText.
X_train_fasttext = [
    vectorize_sentence_w2v(doc_tokens, fasttext_model)
    for doc_tokens in X_train_clean
]
X_test_fasttext = [
    vectorize_sentence_w2v(doc_tokens, fasttext_model)
    for doc_tokens in X_test_clean
]

# Comparison of results across classification models

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd


def evaluate_model(X_train, y_train, X_test, y_test, model, method_name, model_name):
    """Fit ``model`` on the training data and score its predictions on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Test features passed to ``model.predict`` and the labels to score against.
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    method_name : str
        Label for the feature/embedding method, stored under ``"Method Name"``.
    model_name : str
        Label for the classifier, stored under ``"Model"``.

    Returns
    -------
    dict
        Keys ``"Method Name"``, ``"Model"``, ``"Accuracy"``, ``"Precision"``,
        ``"Recall"`` and ``"F1 Score"``. The metric functions are called with
        their default settings.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # (column name, scoring function) pairs, in the order the columns should appear.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )

    summary = {"Method Name": method_name, "Model": model_name}
    for column, scorer in scorers:
        summary[column] = scorer(y_test, predictions)
    return summary

In [31]:
# Accumulates one metrics dict per (embedding method, classifier) pair.
# Re-running this cell clears previously collected results.
results = []

In [32]:
# Logistic Regression: a fresh estimator is fitted and scored for each embedding.
results.extend(
    evaluate_model(train_vecs, y_train, test_vecs, y_test, LogisticRegression(max_iter=2000), embedding, "LogisticRegression")
    for embedding, train_vecs, test_vecs in (
        ("Word2Vec", X_train_w2v, X_test_w2v),
        ("FastText", X_train_fasttext, X_test_fasttext),
    )
)

In [33]:

# Random Forest: a fresh estimator is fitted and scored for each embedding.
# `random_state` is fixed so the forest itself is reproducible across runs;
# without it the reported metrics change on every re-execution.
results.append(evaluate_model(X_train_w2v, y_train, X_test_w2v, y_test, RandomForestClassifier(n_estimators=100, random_state=42), "Word2Vec", "RandomForest"))
results.append(evaluate_model(X_train_fasttext, y_train, X_test_fasttext, y_test, RandomForestClassifier(n_estimators=100, random_state=42), "FastText", "RandomForest"))

In [34]:
# Linear-kernel SVM: a fresh estimator is fitted and scored for each embedding.
results.extend(
    evaluate_model(train_vecs, y_train, test_vecs, y_test, SVC(kernel='linear'), embedding, "SVM")
    for embedding, train_vecs, test_vecs in (
        ("Word2Vec", X_train_w2v, X_test_w2v),
        ("FastText", X_train_fasttext, X_test_fasttext),
    )
)

In [35]:
# K-Nearest Neighbors (k=5): a fresh estimator is fitted and scored for each embedding.
results.extend(
    evaluate_model(train_vecs, y_train, test_vecs, y_test, KNeighborsClassifier(n_neighbors=5), embedding, "KNN")
    for embedding, train_vecs, test_vecs in (
        ("Word2Vec", X_train_w2v, X_test_w2v),
        ("FastText", X_train_fasttext, X_test_fasttext),
    )
)

In [36]:
# Gaussian Naive Bayes: a fresh estimator is fitted and scored for each embedding.
results.extend(
    evaluate_model(train_vecs, y_train, test_vecs, y_test, GaussianNB(), embedding, "NaiveBayes")
    for embedding, train_vecs, test_vecs in (
        ("Word2Vec", X_train_w2v, X_test_w2v),
        ("FastText", X_train_fasttext, X_test_fasttext),
    )
)

In [37]:
# XGBoost: a fresh estimator is fitted and scored for each embedding.
# The labels are binary (0/1), so the binary `logloss` metric is used instead of
# the multiclass `mlogloss`. `evaluate_model` calls `fit` without an evaluation
# set, so this should not alter the fitted model or the reported scores.
results.append(evaluate_model(X_train_w2v, y_train, X_test_w2v, y_test, XGBClassifier(use_label_encoder=False, eval_metric='logloss'), "Word2Vec", "XGBoost"))
results.append(evaluate_model(X_train_fasttext, y_train, X_test_fasttext, y_test, XGBClassifier(use_label_encoder=False, eval_metric='logloss'), "FastText", "XGBoost"))

In [38]:
# Collect the per-run metric dicts into one table: one row per (method, model) pair.
metrics_df = pd.DataFrame(results)
# Plain-text rendering of the table.
print(metrics_df)

   Method Name               Model  Accuracy  Precision   Recall  F1 Score
0     Word2Vec  LogisticRegression   0.81320   0.811159  0.81648  0.813811
1     FastText  LogisticRegression   0.80008   0.799362  0.80128  0.800320
2     Word2Vec        RandomForest   0.75792   0.764653  0.74520  0.754801
3     FastText        RandomForest   0.71212   0.717747  0.69920  0.708352
4     Word2Vec                 SVM   0.81176   0.807771  0.81824  0.812972
5     FastText                 SVM   0.80052   0.796277  0.80768  0.801938
6     Word2Vec                 KNN   0.69340   0.721606  0.62976  0.672562
7     FastText                 KNN   0.62640   0.641882  0.57184  0.604840
8     Word2Vec          NaiveBayes   0.62756   0.620695  0.65600  0.637859
9     FastText          NaiveBayes   0.54268   0.532596  0.69736  0.603942
10    Word2Vec             XGBoost   0.78232   0.785287  0.77712  0.781182
11    FastText             XGBoost   0.74020   0.742548  0.73536  0.738936
