In [11]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
from nltk.corpus import stopwords
import matplotlib.pyplot as plt            # library for visualization
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets
from nltk.classify import SklearnClassifier
from wordcloud import WordCloud,STOPWORDS

In [12]:
from datasets import load_dataset

# Load the IMDb dataset
dataset = load_dataset('stanfordnlp/imdb')

# Display the first few rows of the dataset
print(dataset['train'][0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [14]:
from gensim.models import Word2Vec

In [15]:
# models:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# models initialization

models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("SVM", SVC(kernel='linear')),
    ("Random Forest", RandomForestClassifier(n_estimators=15)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=15)),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=3))
]

In [17]:
# Convert to pandas DataFrame 
x_train = pd.DataFrame(dataset['train']['text'])
x_test = pd.DataFrame(dataset['test']['text'])
y_train = pd.DataFrame(dataset['train']['label'])
y_test = pd.DataFrame(dataset['test']['label'])

# # Check label distribution
label_distribution = y_train.value_counts()
print(label_distribution)

# vectorizer = TfidfVectorizer(max_features=5000)
# x_train = vectorizer.fit_transform(dataset['train']['text'])
# x_test = vectorizer.fit_transform(dataset['test']['text'])

0
0    12500
1    12500
Name: count, dtype: int64


In [18]:
# Function to train model and evaluate performance
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

In [19]:
# Function to find the longest sentence
def find_longest_sentence(df, column_name):
    # Count words in each sentence
    df['word_count'] = df[column_name].apply(lambda x: len(x.split()))
    
    # Find the index of the longest sentence
    longest_index = df['word_count'].idxmax()
    
    # Get the longest sentence
    longest_sentence = df.loc[longest_index, column_name]
    
    return longest_sentence, df['word_count'].max()

# Example usage
longest_sentence, max_word_count = find_longest_sentence(x_train, 0)

print("Longest Sentence:", longest_sentence)
print("Word Count:", max_word_count)

Longest Sentence: Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on him in the corner. With Benoit stomping away on Bubba, Guerrero set up a table outside. Spike dashed into the ring and somersaulted over the top rope onto Guerrero on the outside! After recovering and taking care of Spike, Guerrero slipped a table into the ring and helped the Wolverine set it up. The tandem then set up for a double superplex from the middle rope which would have put Bubba thro

In [20]:
# ========= Word2Vec ==========

# Tokenize the text (you can adjust this based on your needs)
tokenized_texts = [text.split() for text in x_train[0]]  # Simple whitespace tokenization

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=300, window=5, min_count=1, workers=4)


In [21]:
# Create sentence embeddings by averaging word vectors
def get_sentence_embedding(sentence, model):
    # Get the word vectors for each word in the sentence
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    # Average the word vectors (if no words are found, return a zero vector)
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [22]:
# Create embeddings for the training set
X_train_word_2_vec = np.array([get_sentence_embedding(sentence.split(), word2vec_model) for sentence in x_train[0]])
y_train_word_2_vec = np.array(y_train).flatten()  # Labels for training

In [23]:
# Repeat for the test set
X_test_word2vec = np.array([get_sentence_embedding(sentence.split(), word2vec_model) for sentence in x_test[0]])
y_test_word_2_vec = np.array(y_test).flatten()  # Your test labels

In [24]:
# Create an empty list to store evaluation results
results = []

# Evaluate each model
for model_name, model in models:
    accuracy, precision, recall, f1 = evaluate_model(model, X_train_word_2_vec, X_test_word2vec, y_train_word_2_vec, y_test_word_2_vec)
    results.append([model_name, accuracy, precision, recall, f1])
    print(model_name)

# Convert results to a DataFrame for a table format
results_df_word_2_vec = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])

# Print the results table
print(results_df_word_2_vec)

Logistic Regression
SVM
Random Forest
Gradient Boosting
K-Nearest Neighbors
                 Model  Accuracy  Precision   Recall  F1-Score
0  Logistic Regression   0.78168   0.770846  0.80168  0.785961
1                  SVM   0.78436   0.772146  0.80680  0.789093
2        Random Forest   0.67324   0.666615  0.69312  0.679609
3    Gradient Boosting   0.68424   0.691184  0.66608  0.678400
4  K-Nearest Neighbors   0.63952   0.648224  0.61016  0.628616


In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/szymi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/szymi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
# Text preprocessing
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatization
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

x_train = pd.DataFrame(dataset['train']['text'])
x_test = pd.DataFrame(dataset['test']['text'])

x_train['preprocessed'] = x_train[0].apply(preprocess_text)
x_test['preprocessed'] = x_test[0].apply(preprocess_text)

# Tokenize the text (you can adjust this based on your needs)
tokenized_texts = [text.split() for text in x_train['preprocessed']]  # Simple whitespace tokenization

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=300, window=5, min_count=1, workers=4)

In [28]:
# Create embeddings for the training set
X_train_word_2_vec_preprocessed = np.array([get_sentence_embedding(sentence.split(), word2vec_model) for sentence in x_train['preprocessed']])
y_train_word_2_vec_preprocessed = np.array(y_train).flatten()  # Labels for training

# Repeat for the test set
X_test_word2vec_preprocessed = np.array([get_sentence_embedding(sentence.split(), word2vec_model) for sentence in x_test['preprocessed']])
y_test_word_2_vec_preprocessed = np.array(y_test).flatten()  # Your test labels

In [29]:
# Create an empty list to store evaluation results
results = []

# Evaluate each model
for model_name, model in models:
    accuracy, precision, recall, f1 = evaluate_model(model, X_train_word_2_vec_preprocessed, X_test_word2vec_preprocessed, y_train_word_2_vec_preprocessed, y_test_word_2_vec_preprocessed)
    results.append([model_name, accuracy, precision, recall, f1])
    print(model_name)

# Convert results to a DataFrame for a table format
results_df_word_2_vec_preprocessed = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])

# Print the results table
print(results_df_word_2_vec_preprocessed)

Logistic Regression
SVM
Random Forest
Gradient Boosting
K-Nearest Neighbors
                 Model  Accuracy  Precision   Recall  F1-Score
0  Logistic Regression   0.81368   0.809960  0.81968  0.814791
1                  SVM   0.81580   0.809729  0.82560  0.817588
2        Random Forest   0.74436   0.741329  0.75064  0.745955
3    Gradient Boosting   0.74088   0.740572  0.74152  0.741046
4  K-Nearest Neighbors   0.70248   0.717664  0.66760  0.691727


In [30]:
# ======= FastText =======

from gensim.models import FastText

# Load or train FastText model
fasttext_model = FastText(tokenized_texts, vector_size=100, window=5, min_count=1)

# Create sentence embeddings by averaging word vectors
def get_fast_text_embedding(sentence, model):
    return np.mean([model.wv[word] for word in sentence if word in model.wv], axis=0)


X_train_fast_text = np.array([get_fast_text_embedding(sentence.split(), fasttext_model) for sentence in x_train[0]])
y_train_fast_text = np.array(y_train).flatten()  # Labels for training

# Repeat for the test set
X_test_fast_text= np.array([get_fast_text_embedding(sentence.split(), fasttext_model) for sentence in x_test[0]])
y_test_fast_text = np.array(y_test).flatten()  # Your test labels

In [31]:
# Create an empty list to store evaluation results
# I had to change vector size, because it was executing around 30 mins and nothing :)
results = []

# Evaluate each model
for model_name, model in models:
    accuracy, precision, recall, f1 = evaluate_model(model, X_train_fast_text, X_test_fast_text, y_train_fast_text, y_test_fast_text)
    results.append([model_name, accuracy, precision, recall, f1])
    print(model_name)

# Convert results to a DataFrame for a table format
results_df_fast_text = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])

# Print the results table
print(results_df_fast_text)

Logistic Regression
SVM
Random Forest
Gradient Boosting
K-Nearest Neighbors
                 Model  Accuracy  Precision   Recall  F1-Score
0  Logistic Regression   0.79124   0.794992  0.78488  0.789904
1                  SVM   0.79368   0.797584  0.78712  0.792318
2        Random Forest   0.71380   0.715125  0.71072  0.712916
3    Gradient Boosting   0.71300   0.725349  0.68560  0.704915
4  K-Nearest Neighbors   0.66560   0.672788  0.64480  0.658497


In [32]:
X_train_fast_text_preprocessed = np.array([get_fast_text_embedding(sentence.split(), fasttext_model) for sentence in x_train['preprocessed']])
y_train_fast_text_preprocessed = np.array(y_train).flatten()  # Labels for training

# Repeat for the test set
X_test_fast_text_preprocessed = np.array([get_fast_text_embedding(sentence.split(), fasttext_model) for sentence in x_train['preprocessed']])
y_test_fast_text_preprocessed = np.array(y_test).flatten()  # Your test labels

In [33]:
# Create an empty list to store evaluation results
results = []

# Evaluate each model
for model_name, model in models:
    accuracy, precision, recall, f1 = evaluate_model(model, X_train_fast_text_preprocessed, X_test_fast_text_preprocessed, y_train_fast_text_preprocessed, y_test_fast_text_preprocessed)
    results.append([model_name, accuracy, precision, recall, f1])
    print(model_name)

# Convert results to a DataFrame for a table format
results_df_fast_text_preprocessed = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])

# Print the results table
print(results_df_fast_text_preprocessed)

Logistic Regression
SVM
Random Forest
Gradient Boosting
K-Nearest Neighbors
                 Model  Accuracy  Precision   Recall  F1-Score
0  Logistic Regression   0.79244   0.787993  0.80016  0.794030
1                  SVM   0.79384   0.787088  0.80560  0.796236
2        Random Forest   0.99664   0.996481  0.99680  0.996641
3    Gradient Boosting   0.73800   0.742343  0.72904  0.735631
4  K-Nearest Neighbors   0.83524   0.848482  0.81624  0.832049


In [34]:
# ======= sentence-transformers/all-MiniLM-L6-v2 =======

import torch
from transformers import AutoTokenizer, AutoModel

# Load the MiniLM v6 model and tokenizer from Hugging Face
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # MiniLM v2
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_mini = AutoModel.from_pretrained(model_name)

# Function to encode text
def encode_text(text, model):
    # Tokenize the input text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    
    # Use the model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return embeddings



In [35]:
x_train_encoded = [encode_text(text, model_mini) for text in x_train[0]]

In [36]:
x_test_encoded = [encode_text(text, model_mini) for text in x_test[0]]

In [37]:
y_train_mini = np.array(y_train).flatten()  # Labels for training
y_test_mini = np.array(y_test).flatten()  # Your test labels

In [38]:
x_train_numpy = [tensor.cpu().detach().numpy() for tensor in x_train_encoded if isinstance(tensor, torch.Tensor)]
x_train_flattened = np.array([np.mean(embedding, axis=0) for embedding in x_train_numpy])
x_test_numpy = [tensor.cpu().detach().numpy() for tensor in x_test_encoded if isinstance(tensor, torch.Tensor)]
x_test_flattened = np.array([np.mean(embedding, axis=0) for embedding in x_test_numpy])

In [39]:
# Create an empty list to store evaluation results
results = []

# Evaluate each model
for model_name, model in models:
    accuracy, precision, recall, f1 = evaluate_model(model, x_train_flattened, x_test_flattened, y_train_mini, y_test_mini)
    results.append([model_name, accuracy, precision, recall, f1])
    print(model_name)

# Convert results to a DataFrame for a table format
results_df_mini = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])

# Print the results table
print(results_df_mini)

Logistic Regression
SVM
Random Forest
Gradient Boosting
K-Nearest Neighbors
                 Model  Accuracy  Precision   Recall  F1-Score
0  Logistic Regression   0.78452   0.787673  0.77904  0.783333
1                  SVM   0.78468   0.788722  0.77768  0.783162
2        Random Forest   0.61556   0.617049  0.60920  0.613099
3    Gradient Boosting   0.63524   0.638827  0.62232  0.630466
4  K-Nearest Neighbors   0.59720   0.621209  0.49816  0.552921


: 

In [46]:
x_train_encoded_preprocessed = [encode_text(text, model_mini) for text in x_train['preprocessed']]
x_test_encoded_preprocessed = [encode_text(text, model_mini) for text in x_test['preprocessed']]
y_train_mini = np.array(y_train).flatten()  # Labels for training
y_test_mini = np.array(y_test).flatten()  # Your test labels

x_train_numpy_preprocessed = [tensor.cpu().detach().numpy() for tensor in x_train_encoded_preprocessed if isinstance(tensor, torch.Tensor)]
x_train_flattened_preprocessed = np.array([np.mean(embedding, axis=0) for embedding in x_train_numpy_preprocessed])
x_test_numpy_preprocessed = [tensor.cpu().detach().numpy() for tensor in x_test_encoded_preprocessed if isinstance(tensor, torch.Tensor)]
x_test_flattened_preprocessed = np.array([np.mean(embedding, axis=0) for embedding in x_test_numpy_preprocessed])

In [47]:
# Create an empty list to store evaluation results
results = []

# Evaluate each model
for model_name, model in models:
    accuracy, precision, recall, f1 = evaluate_model(model, x_train_flattened_preprocessed, x_test_flattened_preprocessed, y_train_mini, y_test_mini)
    results.append([model_name, accuracy, precision, recall, f1])
    print(model_name)

# Convert results to a DataFrame for a table format
results_df_mini_preprocessed = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])

# Print the results table
print(results_df_mini_preprocessed)

Logistic Regression
SVM
Random Forest
Gradient Boosting
K-Nearest Neighbors
                 Model  Accuracy  Precision   Recall  F1-Score
0  Logistic Regression   0.79144   0.795315  0.78488  0.790063
1                  SVM   0.78936   0.792874  0.78336  0.788089
2        Random Forest   0.63828   0.640380  0.63080  0.635554
3    Gradient Boosting   0.65404   0.657402  0.64336  0.650305
4  K-Nearest Neighbors   0.60580   0.634033  0.50048  0.559396


In [51]:
# Concatenate with distinguishing labels for df1 and df2 using 'keys'
merged_df = pd.concat([results_df_word_2_vec,
                       results_df_word_2_vec_preprocessed,
                       results_df_fast_text,
                       results_df_fast_text_preprocessed,
                       results_df_mini, 
                       results_df_mini_preprocessed], 
                      keys=[
                          'results_df_word_2_vec',
                          'results_df_word_2_vec_preprocessed',
                          'results_df_fast_text',
                          'results_df_fast_text_preprocessed',
                          'results_df_mini', 
                          'results_df_mini_preprocessed'])

# Reset index and drop the inner index
merged_df = merged_df.reset_index(level=1, drop=True)

# Replace repeated index values with empty strings except for the first occurrence in each group
merged_df.index = merged_df.index.to_series().mask(merged_df.index.duplicated(), '')

# Print the final formatted DataFrame
print(merged_df)

                                            Model  Accuracy  Precision  \
results_df_mini               Logistic Regression   0.78452   0.787673   
                                              SVM   0.78468   0.788722   
                                    Random Forest   0.61520   0.616448   
                                Gradient Boosting   0.63524   0.638827   
                              K-Nearest Neighbors   0.59720   0.621209   
results_df_mini_preprocessed  Logistic Regression   0.79144   0.795315   
                                              SVM   0.78936   0.792874   
                                    Random Forest   0.63828   0.640380   
                                Gradient Boosting   0.65404   0.657402   
                              K-Nearest Neighbors   0.60580   0.634033   

                               Recall  F1-Score  
results_df_mini               0.77904  0.783333  
                              0.77768  0.783162  
                              0.609