In [13]:
from mytools import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

print("Imports done.")


Imports done.


In [2]:
merged = select_features(load_csv("train"), load_csv("movies"))
inspect(merged)

Shape of the dataframe: (162758, 12)

Columns in the dataframe:
Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director', 'boxOffice'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162758 entries, 0 to 162757
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   movieid             162758 non-null  object 
 1   reviewerName        162758 non-null  object 
 2   isFrequentReviewer  162758 non-null  bool   
 3   reviewText          162758 non-null  object 
 4   sentiment           162758 non-null  object 
 5   audienceScore       149510 non-null  float64
 6   rating              162758 non-null  object 
 7   runtimeMinutes      159382 non-null  float64
 8   genre               160320 non-null  object 
 9   originalLanguage    159468 non-null  object 
 10  

In [3]:
num_pipe, cat_pipe, text_pipe = get_preprocessing_pipelines()
num_pipe, cat_pipe, text_pipe

(Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler())]),
 Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))]),
 Pipeline(steps=[('tvec', TfidfVectorizer())]))

## Apply PCA on reviewText  

In [4]:
train_reviewText = merged["reviewText"]
train_reviewText.shape

(162758,)

In [10]:
# Best vanilla logreg for comparison
ct_logreg = ColumnTransformer(transformers=[
    ("num", num_pipe, ["audienceScore"]),
    ("cat", cat_pipe, ["rating", "isFrequentReviewer"]),
    ("tvec", TfidfVectorizer(ngram_range=(1,2)), "reviewText")
])

pipe_logreg = Pipeline(steps=[
    ("preprocessor", ct_logreg),
    ("model", LogisticRegression())
])

pipe_logreg.set_params(preprocessor__tvec__ngram_range=(1,2), preprocessor__tvec__max_features=None)
pipe_logreg.set_params(model__C=10, model__max_iter=100000)

split_train_predict(merged[["audienceScore", "rating", "isFrequentReviewer", "reviewText"]], merged["sentiment"], pipe_logreg, test_size=0.25)

              precision    recall  f1-score   support

    NEGATIVE       0.79      0.67      0.73     13717
    POSITIVE       0.84      0.91      0.88     26973

    accuracy                           0.83     40690
   macro avg       0.82      0.79      0.80     40690
weighted avg       0.83      0.83      0.83     40690

[[ 9162  4555]
 [ 2385 24588]]


In [14]:
# Try PCA on similar features as in vanilla logreg above
# Using truncatedSVD instead of PCA because latter doesn't work with sparse matrices

# https://stats.stackexchange.com/questions/239481/difference-between-scikit-learn-implementations-of-pca-and-truncatedsvd 

ct_pca_logreg = ColumnTransformer(transformers=[
    ("num", num_pipe, ["audienceScore"]),
    ("cat", cat_pipe, ["rating", "isFrequentReviewer"]),
    ("tvec", TfidfVectorizer(ngram_range=(1,2)), "reviewText")
])

pipe_pca_logreg = Pipeline(steps=[
    ("preprocessor", ct_pca_logreg),
    # ("pca", PCA(n_components=1000)),
    ("tsvd", TruncatedSVD(n_components=1000)),
    ("model", LogisticRegression())
])

split_train_predict(merged[["audienceScore", "rating", "isFrequentReviewer", "reviewText"]], merged["sentiment"], pipe_pca_logreg, test_size=0.25)

MemoryError: Unable to allocate 6.70 GiB for an array with shape (890718, 1010) and data type float64