Dataset source: </br>
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews


contributors:
* Patrycja Wysocka
* Łukasz Jaremek
* Stanisław Kurzątkowski

In [1]:
import string
import re

from sklearn.metrics import (
    accuracy_score, precision_score, f1_score, recall_score,
    classification_report
)
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import nltk


In [2]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wpartycja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/wpartycja/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/wpartycja/nltk_data...


In [3]:
df = pd.read_csv("IMDB Dataset.csv")

df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})

df


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [4]:
empty_rows = df["review"].isna().sum()
print(f"Liczba pustych wierszy: {empty_rows}\n")

no_letter_rows = df["review"].apply(lambda x: all(char not in string.ascii_letters for char in str(x))).sum()
print(f"Liczba wierszy bez liter: {no_letter_rows}\n")

df["review_length"] = df["review"].apply(lambda x: len(str(x)))
length_distribution = df["review_length"].describe()
print("Rozkład długości wierszy:")
print(length_distribution)

del df["review_length"]


Liczba pustych wierszy: 0

Liczba wierszy bez liter: 0

Rozkład długości wierszy:
count    50000.000000
mean      1309.431020
std        989.728014
min         32.000000
25%        699.000000
50%        970.000000
75%       1590.250000
max      13704.000000
Name: review_length, dtype: float64


In [5]:
def clean_review(review: str) -> str:
    review = re.sub(r"<.*?>", "", review)

    # 1. Usuwanie adresów e-mail
    review = re.sub(r"\S+@\S+\.\S+", "", review)

    # 2. Usuwanie linków (http, https, ftp itd.)
    review = re.sub(r"http\S+|www\S+|ftp\S+", "", review)

    # 3. Usuwanie emotikonów (proste emotikony jak :) :( itp.)
    emoticon_pattern = r"[:;=8][-o*\"]?[)\]dDpP/\:}({@|\\]"
    review = re.sub(emoticon_pattern, "", review)

    # 4. Usuwanie wszelkich znaków nie-alfanumerycznych (poza spacjami)
    review = re.sub(r"[^A-Za-z0-9\s]", "", review)

    # 5. Usuwanie dodatkowych białych znaków
    review = re.sub(r"\s+", " ", review).strip()

    return review


df["cleaned_review"] = df["review"].apply(lambda x: clean_review(str(x)))

df


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production The filming tech...
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,Basically theres a family where a little boy J...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Matteis Love in the Time of Money is a ...
...,...,...,...
49995,I thought this movie did a down right good job...,1,I thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,Bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,I am a Catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,0,Im going to have to disagree with the previous...


In [6]:
def remove_stop_words(text: str) -> str:
    words = text.split()

    filtered_words = [
        word
        for word in words
        if word.lower() not in stop_words
        ]

    return " ".join(filtered_words)


df["cleaned_review"] = df["cleaned_review"].apply(lambda x: remove_stop_words(str(x)))

df


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,One reviewers mentioned watching 1 Oz episode ...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,Basically theres family little boy Jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Matteis Love Time Money visually stunni...
...,...,...,...
49995,I thought this movie did a down right good job...,1,thought movie right good job wasnt creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,Bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,Catholic taught parochial elementary schools n...
49998,I'm going to have to disagree with the previou...,0,Im going disagree previous comment side Maltin...


In [7]:
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text: str) -> str:
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)


df["cleaned_review"] = df["cleaned_review"].apply(
    lambda x: lemmatize_text(str(x))
    )

df


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,One reviewer mentioned watching 1 Oz episode y...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,Basically there family little boy Jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Matteis Love Time Money visually stunni...
...,...,...,...
49995,I thought this movie did a down right good job...,1,thought movie right good job wasnt creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,Bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,Catholic taught parochial elementary school nu...
49998,I'm going to have to disagree with the previou...,0,Im going disagree previous comment side Maltin...


In [8]:
def tokenize(text: str) -> list[str]:
    return text.split(" ")


df["tokens"] = df["cleaned_review"].apply(
    lambda x: tokenize(str(x))
    )

df


Unnamed: 0,review,sentiment,cleaned_review,tokens
0,One of the other reviewers has mentioned that ...,1,One reviewer mentioned watching 1 Oz episode y...,"[One, reviewer, mentioned, watching, 1, Oz, ep..."
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,0,Basically there family little boy Jake think t...,"[Basically, there, family, little, boy, Jake, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Matteis Love Time Money visually stunni...,"[Petter, Matteis, Love, Time, Money, visually,..."
...,...,...,...,...
49995,I thought this movie did a down right good job...,1,thought movie right good job wasnt creative or...,"[thought, movie, right, good, job, wasnt, crea..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,Bad plot bad dialogue bad acting idiotic direc...,"[Bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,0,Catholic taught parochial elementary school nu...,"[Catholic, taught, parochial, elementary, scho..."
49998,I'm going to have to disagree with the previou...,0,Im going disagree previous comment side Maltin...,"[Im, going, disagree, previous, comment, side,..."


In [9]:
model = Word2Vec(
    sentences=df["tokens"], vector_size=100, window=5, min_count=1, workers=4
    )


In [10]:
def document_vector(doc: list[str]) -> np.ndarray:
    doc = [word for word in doc if word in model.wv.index_to_key]

    if len(doc) > 0:
        return np.mean(model.wv[doc], axis=0)

    return np.zeros(model.vector_size)


x = np.array([document_vector(doc) for doc in df["tokens"]])
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True
    )


In [11]:
svm = SVC(kernel="linear")
svm.fit(X_train, y_train)


In [12]:
y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.8535
Precision: 0.8459
Recall: 0.8647
F1 Score: 0.8552


In [13]:
print("Classification report:\n", classification_report(y_test, y_pred))


Classification report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85      4998
           1       0.85      0.86      0.86      5002

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

