In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report

In [32]:
# Load the tab-separated movie review dataset into a DataFrame.
data = pd.read_csv('data/moviereviews.tsv', delimiter='\t')

# Drop rows with missing values, but only if the frame actually contains any.
has_missing = data.isna().to_numpy().any()
if has_missing:
    data.dropna(inplace=True)

# Locate reviews made up solely of whitespace characters and remove those rows.
whitespace_only = data.review.str.isspace()
empty_review = data[whitespace_only].index.tolist()
data.drop(index=empty_review, inplace=True)

def train_model(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, print test metrics, return predictions.

    Parameters
    ----------
    model_name : str
        Label used in the printed report (shown upper-cased).
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test
        Feature matrices passed straight to ``model.fit`` / ``model.predict``.
        They must already be in a form the model accepts (in this notebook:
        the TF-IDF matrices, not the raw review text).
    y_train, y_test
        Labels aligned row-for-row with ``X_train`` / ``X_test``.

    Returns
    -------
    The value returned by ``model.predict(X_test)``.
    """
    print(f'BEGIN. {model_name.upper()}......')
    # Train and predict on the arguments rather than on notebook-level globals,
    # so the function does what its signature says and works on a fresh kernel
    # regardless of which variables happen to exist.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{model_name.upper()}: \t\t{accuracy_score(y_test, y_pred) * 100:.2f}%')
    print(classification_report(y_test, y_pred))
    print(f'END. {model_name.upper()}')
    print('======================================================')
    return y_pred

print('=============Splitting the data=============')
X = data.review
y = data.label
print(f'Data shape: {data.shape}')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f'X_Train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_Test shape: {X_test.shape}, y_test shape: {y_test.shape}')

print('\n============Message Preprocessing============')
# Fit the TF-IDF vocabulary (unigrams and bigrams) on the training text only,
# then reuse that fitted vocabulary to transform the test text.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

print('Training and testing data shape after pre-processing:')
print(f'X_Train shape: {X_train_vect.shape}, y_train shape: {y_train.shape}')
print(f'X_Test shape: {X_test_vect.shape}, y_test shape: {y_test.shape}')

print('\n=============Model Building==================')
# Each model is given the vectorised matrices explicitly; the raw text Series
# (X_train / X_test) cannot be consumed by these estimators.
lr_model = LogisticRegression()
lr_y_pred = train_model('Logistic Regression', lr_model, X_train_vect, X_test_vect, y_train, y_test)

svm_model = LinearSVC()
svm_y_pred = train_model('Support Vector Machine', svm_model, X_train_vect, X_test_vect, y_train, y_test)

nb_model = MultinomialNB()
nb_y_pred = train_model('Naive Bayes', nb_model, X_train_vect, X_test_vect, y_train, y_test)

Data shape: (1938, 2)
X_Train shape: (1356,), y_train shape: (1356,)
X_Test shape: (582,), y_test shape: (582,)

Training and testing data shape after pre-processing:
X_Train shape: (1356, 402033), y_train shape: (1356,)
X_Test shape: (582, 402033), y_test shape: (582,)

BEGIN. LOGISTIC REGRESSION......
LOGISTIC REGRESSION: 		80.76%
              precision    recall  f1-score   support

         neg       0.84      0.74      0.79       282
         pos       0.78      0.87      0.82       300

    accuracy                           0.81       582
   macro avg       0.81      0.81      0.81       582
weighted avg       0.81      0.81      0.81       582

END. LOGISTIC REGRESSION
BEGIN. SUPPORT VECTOR MACHINE......
SUPPORT VECTOR MACHINE: 		84.02%
              precision    recall  f1-score   support

         neg       0.87      0.78      0.83       282
         pos       0.81      0.89      0.85       300

    accuracy                           0.84       582
   macro avg       0.84   