In [4]:
import pandas as pd
import numpy as np
import kagglehub
import os

## 1. Download data and explore

In [5]:
# Fetch the IMDB 50k-review dataset through kagglehub; `path` is the local
# directory that holds the downloaded files (it is listed in the next cell).
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

In [7]:
# List the downloaded files to find the name of the CSV to load
os.listdir(path)

['IMDB Dataset.csv']

In [8]:
# Build the CSV path from the download directory and load it into a DataFrame
file_path = os.path.join(path, 'IMDB Dataset.csv')
df = pd.read_csv(file_path)

# Display the frame (pandas shows only the first and last rows)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [18]:
# Class-balance check: count how many reviews carry each sentiment value

df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Perfectly balanced.

In [20]:
# Null-value check: compare each column's non-null count with the row count

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


No null values.

In [21]:
# Encode the sentiment strings as integer class labels:
#   "positive" -> 1
#   "negative" -> 0

label_by_sentiment = {"positive": 1, "negative": 0}
df["label"] = df["sentiment"].map(label_by_sentiment)

# Sanity check: both classes should keep their original counts
df["label"].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

In [29]:
# Review length, measured in whitespace-separated words

def _count_words(review_text):
    """Return the number of whitespace-separated tokens in one review."""
    return len(review_text.split())


df["review_length"] = df["review"].apply(_count_words)
df["review_length"].describe()

count    50000.000000
mean       231.156940
std        171.343997
min          4.000000
25%        126.000000
50%        173.000000
75%        280.000000
max       2470.000000
Name: review_length, dtype: float64

In [30]:
# Print one raw review in full to see whether it contains HTML markup
# such as <br /><br />

print(df["review"].iloc[0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

The review contains HTML `<br />` tags; these must be removed during pre-processing.

## 2. Text pre-processing

* Remove HTML
* Lowercase
* Tokenization
* Lemmatization

In [33]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

# Download the WordNet corpora needed for lemmatization
# (nltk reports "already up-to-date" when they are present locally).
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chiara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Chiara\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [34]:
# Lemmatizer initialization: a single shared instance, reused by
# preprocess_review for every token.

lemmatizer = WordNetLemmatizer() 

In [36]:
# Function for pre-processing

def preprocess_review(text):
    """Turn one raw review string into a list of lemmatized tokens.

    Steps, in order:
      1. replace every ``<...>`` tag (e.g. ``<br />``) with a space;
      2. lowercase the text;
      3. tokenize with gensim's ``simple_preprocess``;
      4. lemmatize each token with the module-level ``lemmatizer``.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument,
    so tokens are handled with its default setting; the sample output in this
    notebook shows e.g. "has" -> "ha". Confirm that this is acceptable.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    list of str
        The lemmatized tokens in their original order.
    """
    # Strip HTML tags (non-greedy so each tag is matched on its own)
    without_html = re.sub(r"<.*?>", " ", text)

    # Lowercase, then tokenize (numbers and punctuation are eliminated here)
    words = simple_preprocess(without_html.lower())

    # Lemmatize token by token
    return list(map(lemmatizer.lemmatize, words))

In [38]:
from tqdm import tqdm

# Register tqdm's pandas integration so `progress_apply` is available
tqdm.pandas()

# Run the pre-processing on every review with a progress bar;
# the result holds one list of tokens per review.
tokenized_reviews = df["review"].progress_apply(preprocess_review)

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:32<00:00, 1542.77it/s]


In [40]:
# Check: show the first 30 tokens of the first review and confirm that
# the number of tokenized reviews matches the number of rows.

print(tokenized_reviews.iloc[0][:30])
print(len(tokenized_reviews))

['one', 'of', 'the', 'other', 'reviewer', 'ha', 'mentioned', 'that', 'after', 'watching', 'just', 'oz', 'episode', 'you', 'll', 'be', 'hooked', 'they', 'are', 'right', 'a', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'the', 'first']
50000


I don't remove stopwords because I think that some of them may be important for defining the context in this case.

## 3. Train Word2Vec 

In [41]:
from gensim.models import Word2Vec

In [43]:
# Train the model

# Hyperparameters handed to gensim's Word2Vec below
vector_size = 200  # dimensionality of each word vector
window = 5         # context window size
min_count = 5      # minimum word frequency (rarer words are presumably dropped — see gensim docs)
workers = 4        # number of worker threads

# NOTE(review): no explicit `seed` is passed and training uses several
# workers, so the embeddings may differ between runs — confirm whether
# exact reproducibility matters here.
w2v_model = Word2Vec(
    sentences=tokenized_reviews, 
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=0  # 0 -> CBOW, 1 -> Skipgram
)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [44]:
# Vocabulary: number of distinct words that received a vector

print("Vocabulary size:")
len(w2v_model.wv)

Vocabulary size:


34821

In [46]:
# Vector size: should equal the `vector_size` passed at training time

print("Vector size:")
w2v_model.vector_size

Vector size:


200

In [52]:
# Check: qualitative sanity test of the embeddings — list the words whose
# vectors are most similar to the vector of "good".

w2v_model.wv.most_similar("good")

[('decent', 0.6980888843536377),
 ('great', 0.6614031791687012),
 ('bad', 0.658441960811615),
 ('cool', 0.6235573887825012),
 ('fine', 0.602043092250824),
 ('nice', 0.5987997651100159),
 ('solid', 0.5821413993835449),
 ('lousy', 0.5643934607505798),
 ('impressive', 0.5503801703453064),
 ('funny', 0.5474219918251038)]

In [53]:
# Function to perform average Word2Vec

def avg_word2vec(doc, model):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.
    model : object
        Either a trained model that exposes its vectors as ``model.wv``
        (e.g. a gensim ``Word2Vec``) or the keyed-vector object itself.
        The vectors object must support ``word in kv``, ``kv[word]`` and
        ``kv.vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``: the element-wise mean of the
        vectors that were found, or an all-zero float32 vector when none of
        the tokens is in the vocabulary (or ``doc`` is empty).
    """
    # Accept both a full model and bare keyed vectors, so the same helper
    # works for self-trained and pretrained embeddings.
    keyed_vectors = getattr(model, "wv", model)

    vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]

    if not vectors:
        # float32 matches gensim's vector dtype; a float64 fallback would make
        # np.vstack silently upcast the whole feature matrix.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)

In [55]:
# Create X

# One averaged embedding per review, stacked row-wise into a 2-D feature matrix
X = np.vstack([avg_word2vec(doc, w2v_model) for doc in tqdm(tokenized_reviews)])

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:12<00:00, 4002.04it/s]


In [56]:
# Expect one row per review and one column per embedding dimension
X.shape

(50000, 200)

In [57]:
# Create y: the integer labels (1 = positive, 0 = negative) as a NumPy array,
# in the same row order as X

y = df["label"].values
y.shape

(50000,)

In [58]:
# Train test split: hold out 20% for testing; `stratify=y` keeps the class
# ratio in both parts and `random_state=42` fixes the shuffle.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## 4. Which classifier is best at predicting sentiment?

* Logistic Regression
* Linear SVM
* MLP

### 4.1. Logistic Regression

In [59]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression baseline on the averaged Word2Vec features.
# NOTE(review): the classes are exactly balanced (25000 each, stratified
# split), so class_weight="balanced" should have no practical effect here.
lr = LogisticRegression(
    max_iter = 1000,
    class_weight="balanced",
    random_state=42
)

# Fit on the training split, then predict labels for the held-out test split
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [61]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Logistic Regression scores on the held-out test split
print("Accuracy: ", accuracy_score(y_test, y_pred_lr))

# Header and body go out in one call each, separated by a newline
print("\nConfusion matrix:", confusion_matrix(y_test, y_pred_lr), sep="\n")
print("\nClassification report:", classification_report(y_test, y_pred_lr), sep="\n")

Accuracy:  0.8663

Confusion matrix:
[[4336  664]
 [ 673 4327]]

Classification report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      5000
           1       0.87      0.87      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



### 4.2. SVM

In [62]:
from sklearn.svm import LinearSVC

# Linear SVM on the same features and split as the Logistic Regression
svm = LinearSVC(class_weight="balanced", random_state=42)

# Fit on the training split, then predict labels for the held-out test split
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [63]:
# Linear SVM scores on the held-out test split
print("Accuracy: ", accuracy_score(y_test, y_pred_svm))

# Header and body go out in one call each, separated by a newline
print("\nConfusion matrix:", confusion_matrix(y_test, y_pred_svm), sep="\n")
print("\nClassification report:", classification_report(y_test, y_pred_svm), sep="\n")

Accuracy:  0.8659

Confusion matrix:
[[4320  680]
 [ 661 4339]]

Classification report:
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      5000
           1       0.86      0.87      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



### 4.3. MLP

In [66]:
# Input size 200 -> hidden layer size 128 -> output size 1 for binary classification

from sklearn.neural_network import MLPClassifier

# Single-hidden-layer network on the same features and split as the linear models.
# NOTE(review): max_iter=20 caps training at 20 iterations, so the optimizer
# may stop before converging — confirm this is intentional.
mlp = MLPClassifier(
    hidden_layer_sizes=(128,),
    activation="relu",
    max_iter=20,
    random_state=42
)

# Fit on the training split, then predict labels for the held-out test split
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)



In [67]:
# MLP scores on the held-out test split
print("Accuracy: ", accuracy_score(y_test, y_pred_mlp))

# Header and body go out in one call each, separated by a newline
print("\nConfusion matrix:", confusion_matrix(y_test, y_pred_mlp), sep="\n")
print("\nClassification report:", classification_report(y_test, y_pred_mlp), sep="\n")

Accuracy:  0.8681

Confusion matrix:
[[4147  853]
 [ 466 4534]]

Classification report:
              precision    recall  f1-score   support

           0       0.90      0.83      0.86      5000
           1       0.84      0.91      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



Performance is almost the same for all three models, so a linear model is already enough. I choose Logistic Regression because it is the simplest.

## 5. Pre-trained Google News and Logistic Regression

How does performance change when using an already trained Word2Vec model rather than training the model with my dataset?

In [69]:
import gensim.downloader as api

# Load the pretrained Google News Word2Vec vectors (300 dimensions) through
# gensim's downloader. The returned object is indexed directly by word
# (no `.wv` attribute is used on it in this notebook).
w2v_google = api.load("word2vec-google-news-300")

In [70]:
# Average Word2Vec

def avg_word2vec_google(doc, model):
    """Average the pretrained word vectors of the known tokens of one document.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.
    model : object
        Keyed-vector object that supports ``word in model``, ``model[word]``
        and ``model.vector_size`` (e.g. the vectors returned by gensim's
        downloader).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors that were found, or an all-zero float32 vector when none
        of the tokens is in the vocabulary (or ``doc`` is empty).
    """
    vectors = [model[word] for word in doc if word in model]

    if not vectors:
        # float32 matches gensim's vector dtype; a float64 fallback would make
        # np.vstack silently upcast the whole feature matrix.
        return np.zeros(model.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)

In [71]:
# Create X

# One averaged pretrained embedding per review, stacked row-wise into a 2-D matrix
X_google = np.vstack([avg_word2vec_google(doc, w2v_google) for doc in tqdm(tokenized_reviews)])
X_google.shape

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:11<00:00, 4209.85it/s]


(50000, 300)

In [72]:
# Train the model

# Same split settings as for the self-trained embeddings (20% test, seed 42,
# stratified), so both experiments are evaluated on the same reviews.
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_google, y, test_size=0.2, random_state=42, stratify=y)

lr_google = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
# Fit against the labels produced by THIS split (y_train_g) rather than the
# y_train left over from the earlier cell, so the cell does not depend on the
# two splits happening to be identical.
lr_google.fit(X_train_g, y_train_g)

y_pred_google = lr_google.predict(X_test_g)

In [73]:
# Performance

# Score against y_test_g, the labels that belong to X_test_g, instead of the
# y_test left over from the earlier split.
print("Accuracy: ", accuracy_score(y_test_g, y_pred_google))

print("\nConfusion matrix:")
print(confusion_matrix(y_test_g, y_pred_google))

print("\nClassification report:")
print(classification_report(y_test_g, y_pred_google))

Accuracy:  0.8479

Confusion matrix:
[[4255  745]
 [ 776 4224]]

Classification report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      5000
           1       0.85      0.84      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



Word2Vec trained on the task-specific data performs about 1.8 percentage points better in accuracy (0.8663 vs. 0.8479).