In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score, accuracy_score
import re
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

In [20]:
data = pd.read_csv("./imdb.csv", encoding="latin1")

In [22]:
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = re.sub("(.)\\1{2,}", "\\1", x)
    x = re.sub(r'[^\w\s]', '', x)
    return x

In [23]:
data = data[data["label"]!="unsup"]

In [24]:
data["review"] = data["review"].apply(lambda x: get_clean(x))

In [25]:
train_set = data[data["type"] == "train"]
test_set = data[data["type"] == "test"]

In [26]:
x_train = train_set["review"]
y_train = train_set["label"]
x_test = test_set["review"]
y_test = test_set["label"]

### Model with LinearSVC

In [27]:
tfidf = TfidfVectorizer()
svc = LinearSVC(max_iter=5000)
pipe = make_pipeline(tfidf, svc)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)


print(classification_report(y_test, y_pred))

In [None]:
accuracy_score(y_test, y_pred)

### Model with KNeighborsClassifier

In [28]:
tfidf = TfidfVectorizer()
knn = KNeighborsClassifier(n_neighbors=5)
dtc = DecisionTreeClassifier(max_depth=5)
lr = LogisticRegression()
pipe = make_pipeline(tfidf, knn)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.68      0.66      0.67     12500
         pos       0.67      0.68      0.68     12500

    accuracy                           0.67     25000
   macro avg       0.67      0.67      0.67     25000
weighted avg       0.67      0.67      0.67     25000



In [None]:
accuracy_score(y_test, y_pred)

### DecisionTreeClassifier

In [None]:
tfidf = TfidfVectorizer()
dtc = DecisionTreeClassifier(max_depth=5)
pipe = make_pipeline(tfidf, dtc)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

print(classification_report(y_test, y_pred))

In [None]:
accuracy_score(y_test, y_pred)

### Logistic regression

In [None]:
tfidf = TfidfVectorizer()
lr = LogisticRegression()
pipe = make_pipeline(tfidf, lr)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

print(classification_report(y_test, y_pred))

0.6718

In [None]:
accuracy_score(y_test, y_pred)

### Model with RandomForestClassifier

In [None]:
tfidf = TfidfVectorizer()
rfc = RandomForestClassifier(n_estimators=2, random_state=42)
pipe = make_pipeline(tfidf, rfc)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

print(classification_report(y_test, y_pred))

In [None]:
accuracy_score(y_test, y_pred)