## Autorzy 
- Tomasz Krupiński 
- Szymon Pawlonka 

In [9]:
from datasets import load_dataset
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score, roc_auc_score
import fasttext
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [10]:
ds = load_dataset("stanfordnlp/imdb")

In [11]:
train_data = ds['train']['text']
y_train = ds['train']['label']
test_data = ds['test']['text']
y_test = ds['test']['label']

In [82]:
train_data[:5]

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [34]:
def summary(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"F1: {f1}")
    print(f"Recall: {recall}")
    print(f"AUC: {auc}")
    print(f"Accuracy: {accuracy}")

def add_results(results: list, method:str, y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    results = results.append({'method': method, 'accuracy': accuracy, 'auc': auc, 'f1': f1, 'recall': recall})
    return results

In [35]:
all_results = []

# Word2Vec

In [36]:
word2vec_model = Word2Vec(sentences=train_data, vector_size=100, window=5, min_count=2, workers=4)

In [37]:
def get_average_word2vec(tokens, model, vector_size):
    valid_tokens = [token for token in tokens if token in model.wv.index_to_key]
    
    if not valid_tokens:
        return np.zeros(vector_size)
    
    return np.mean([model.wv[token] for token in valid_tokens], axis=0)

vector_size = word2vec_model.vector_size
X_train = np.array([get_average_word2vec(text, word2vec_model, vector_size) for text in train_data])
X_test = np.array([get_average_word2vec(text, word2vec_model, vector_size) for text in test_data])


In [38]:
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

In [54]:
y_pred = classifier.predict(X_test)
summary(y_pred, y_test)

F1: 0.6112206341383276
Recall: 0.6280384875084403
AUC: 0.6216910807371886
Accuracy: 0.62136


In [55]:
add_results(all_results, 'Word2Vec + RF', y_test, y_pred)

# fasttext

In [41]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
filtered_sentences = []

for text in train_data:
    word_tokens = [token for token in word_tokenize(text)]
    filtered_sentence = [
        word 
        for word in word_tokens 
        if word.lower() not in stop_words
        and word != "<br />"
    ]
    
    filtered_sentences.append(" ".join(filtered_sentence))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pavonism/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/pavonism/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/pavonism/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
train_data_for_fasttext = [
    f"__label__{y_train[index]} {text}" 
    for index, text in enumerate(filtered_sentences)
]

with open('fasttext_train.txt', 'w') as f:
    for line in train_data_for_fasttext:
        f.write(f"{line}\n")


In [57]:
import fasttext

fasttext_model = fasttext.train_supervised(input='fasttext_train.txt', lr=0.1, epoch=25, wordNgrams=2, dim=100)

Read 4M words
Number of words:  131671
Number of labels: 2
Progress: 100.0% words/sec/thread: 1141243 lr:  0.000000 avg.loss:  0.113124 ETA:   0h 0m 0s


In [58]:
y_pred, *_ = fasttext_model.predict(test_data)

In [59]:
y_pred = [0 if pred[0] == '__label__0' else 1 for pred in y_pred]

In [60]:
summary(y_pred, y_test)

F1: 0.8683257569920642
Recall: 0.8374201218606033
AUC: 0.8654264012526028
Accuracy: 0.86328


In [62]:
add_results(all_results, 'fasttext', y_test, y_pred)

# LSTM

In [14]:
max_len = 100
vocab_size = 10000
embedding_dim = 128

In [15]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data)
sequences = tokenizer.texts_to_sequences(train_data)

In [16]:
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [17]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(LSTM(units=128))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [20]:
history = model.fit(
    padded_sequences, 
    np.array(y_train), 
    epochs=10, 
    batch_size=2, 
    validation_split=0.2
)

Epoch 1/10


2024-10-10 21:42:53.930723: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 10ms/step - accuracy: 0.7132 - loss: 0.5469 - val_accuracy: 0.7764 - val_loss: 0.5121
Epoch 2/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 9ms/step - accuracy: 0.9093 - loss: 0.2317 - val_accuracy: 0.6806 - val_loss: 0.7453
Epoch 3/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 9ms/step - accuracy: 0.9518 - loss: 0.1399 - val_accuracy: 0.7906 - val_loss: 0.4994
Epoch 4/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 9ms/step - accuracy: 0.9747 - loss: 0.0792 - val_accuracy: 0.7344 - val_loss: 0.9655
Epoch 5/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 9ms/step - accuracy: 0.9872 - loss: 0.0414 - val_accuracy: 0.8192 - val_loss: 0.6950
Epoch 6/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 9ms/step - accuracy: 0.9929 - loss: 0.0235 - val_accuracy: 0.7370 - val_loss: 1.1977
Epoch 7

In [63]:
model.summary()

In [50]:
test_sequences = tokenizer.texts_to_sequences(test_data)
new_padded_sequences = pad_sequences(test_sequences, maxlen=max_len, padding='post')

predictions = model.predict(new_padded_sequences)
predicted_labels = [1 if p > 0.5 else 0 for p in predictions]

summary(y_test, predicted_labels)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step
F1: 0.77740745678354
Recall: 0.69976
AUC: 0.7996399999999999
Accuracy: 0.79964


In [64]:
add_results(all_results, 'LSTM', y_test, predicted_labels)

# GloVe

In [77]:
from sklearn.linear_model import LogisticRegression
import re
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/pavonism/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [78]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return text

def sentence_to_embedding(sentence, embeddings_index, embedding_dim):
    words = word_tokenize(sentence)
    embeddings = []
    for word in words:
        if word in embeddings_index:
            embeddings.append(embeddings_index[word])
        else:
            embeddings.append(np.zeros(embedding_dim))
    if len(embeddings) == 0:
        return np.zeros(embedding_dim)
    return np.mean(embeddings, axis=0)

In [71]:
glove_file = '/home/pavonism/glove.6B.300d.txt'
embeddings_index = load_glove_embeddings(glove_file)
embedding_dim = 300

In [79]:
def preprocess_data(x, y):
    data = {
        'text': x,
        'label': y
    }

    df = pd.DataFrame(data)

    df['text'] = df['text'].apply(preprocess_text)
    df['embedding'] = df['text'].apply(lambda x: sentence_to_embedding(x, embeddings_index, embedding_dim))

    return np.vstack(df['embedding'].values), df['label'].values

In [80]:
x_train, y_train = preprocess_data(train_data, y_train)
x_test, y_test = preprocess_data(test_data, y_test)

In [81]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
summary(y_test, y_pred)

F1: 0.8312787043751511
Recall: 0.82536
AUC: 0.8324799999999999
Accuracy: 0.83248


In [75]:
add_results(all_results, 'GloVe', y_test, y_pred)

# Results

In [76]:
pd.DataFrame(all_results)

Unnamed: 0,method,accuracy,auc,f1,recall
0,Word2Vec + RF,0.62136,0.62136,0.611221,0.59528
1,fasttext,0.86328,0.86328,0.868326,0.9016
2,LSTM,0.79964,0.79964,0.777407,0.69976
3,GloVe,0.83248,0.83248,0.831279,0.82536
