In [188]:
from mytools import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer

print("Imported all libraries successfully!")

Imported all libraries successfully!


In [None]:
traindf = load_csv("train")
moviesdf = load_csv("movies")
traindf.shape, moviesdf.shape

## Examine movies.csv data  

In [None]:
moviesdf.head()

In [None]:
moviesdf.columns

In [None]:
moviesdf.info()

In [None]:
moviesdf.isnull().sum()

In [None]:
moviesdf.describe()

In [None]:
moviesdf["genre"].value_counts()

## Drop duplicates from moviesdf dataframe  

In [None]:
movies_unique = moviesdf.drop_duplicates(subset=["movieid"])
movies_unique.shape, moviesdf.shape

## Merge traindf and moviesdf  

In [None]:
train_movies_merged = pd.merge(traindf, movies_unique, on="movieid")
train_movies_merged.shape

In [None]:
train_movies_merged.columns

In [None]:
train_movies_merged = train_movies_merged[['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'title', 'audienceScore', 'rating', 'ratingContents',
       'releaseDateTheaters', 'releaseDateStreaming', 'runtimeMinutes',
       'genre', 'originalLanguage', 'director', 'boxOffice', 'distributor',
       'soundType', 'sentiment']]
train_movies_merged.shape

In [None]:
train_movies_merged.head()

In [None]:
train_movies_merged.isnull().sum()

## Clean data in merged df  

In [None]:
# Fill missing values in "reviewText", 'rating" column with empty string and "NA" respectively
# Clean language names

train_final = train_movies_merged.copy()
train_final["reviewText"] = train_final["reviewText"].fillna(" ")
train_final["rating"] = train_final["rating"].fillna("NA")
train_final["originalLanguage"].replace({"English (United Kingdom)": "English", 
                                         "English (Australia)" : "English",
                                         "French (France)": "French", 
                                         "French (Canada)": "French",
                                         "Portuguese (Brazil)": "Portuguese",
                                         "Spanish (Spain)": "Spanish"},                                         
                                         inplace=True)
train_final["reviewText"].isna().sum()

In [None]:
train_final["rating"].value_counts()

In [None]:
train_final["genre"].value_counts()

In [None]:
train_final["originalLanguage"].unique(), train_final["originalLanguage"].value_counts()

In [None]:
train_final.columns

## Keep only the columns to work on  

In [None]:
train_final = train_final.drop(columns=["title", "ratingContents", "releaseDateTheaters", "releaseDateStreaming", "boxOffice", "distributor", "soundType"])
train_final.shape,  train_final.columns

## Separate features and labels  

In [None]:
train_features = train_final.iloc[:, :-1]
train_labels = train_final.iloc[:, -1]
train_features.shape, train_labels.shape

In [None]:
train_features.head()

In [None]:
train_labels.head()

## Try "select_features" function from mytools module  

In [168]:
df = select_features(load_csv("train"), load_csv("movies"))
df.head()

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,sentiment,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
1,marvelous_pirate,Sharon Foster,False,&#91;T&#93;he haphazard way this story is asse...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
2,marvelous_pirate,Melinda Dunn,False,The stop-motion artistry of Wendell &amp; Wild...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
3,marvelous_pirate,Mr. Wayne Smith,False,Wendell &amp; Wild is narratively overstuffed ...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
4,marvelous_pirate,Connor Nelson,False,For being about the Netherworlds&#44; it&#8217...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso


In [169]:
df.shape

(162758, 11)

In [170]:
df.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

In [171]:
dftest = select_features(load_csv("test"), load_csv("movies"))
dftest.head()

Unnamed: 0,movieid,reviewerName,isTopCritic,reviewText,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,legend_marty_mcfly_oracle,John Kim,False,Green slowly cranks up the dread with style an...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
1,legend_marty_mcfly_oracle,Kathleen Poole,False,Considering this is the 13th Halloween movie&#...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
2,legend_marty_mcfly_oracle,Kenneth Lamb,False,Halloween Ends is by no means the worst horror...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
3,legend_marty_mcfly_oracle,Brittany Lane,False,A concluding chapter that shares more DNA with...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
4,legend_marty_mcfly_oracle,Yolanda Thomas,False,For a film called Halloween Ends&#44; let&#821...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett


In [172]:
dftest.shape

(55315, 10)

In [173]:
dftest.columns

Index(['movieid', 'reviewerName', 'isTopCritic', 'reviewText', 'audienceScore',
       'rating', 'runtimeMinutes', 'genre', 'originalLanguage', 'director'],
      dtype='object')

In [174]:
df.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

## Train models  

In [175]:
train1_fs = df[['movieid', 
                'reviewerName', 
                'isFrequentReviewer', 
                'reviewText',
                'audienceScore', 
                'rating', 
                'runtimeMinutes', 
                'genre', 
                'originalLanguage', 
                'director'
                ]]

# train1_fs = df[['reviewText',
#                 'audienceScore', 
#                 'runtimeMinutes', 
#                 'genre', 
#                 'originalLanguage', 
#                 'director'
#               ]]

train1_labels = df['sentiment']
train1_fs.shape, train1_labels.shape

((162758, 10), (162758,))

In [176]:
# train1_fs.loc[:, "audienceScore"] = SimpleImputer(strategy='mean', missing_values=np.nan).fit_transform(train1_fs[["audienceScore"]])
# train1_fs.loc[:, "runtimeMinutes"] = SimpleImputer(strategy='mean', missing_values=np.nan).fit_transform(train1_fs[["runtimeMinutes"]])
# train1_fs.loc[:, "genre"] = SimpleImputer(strategy='most_frequent').fit_transform(train1_fs[["genre"]])
# train1_fs.loc[:, "originalLanguage"] = SimpleImputer(strategy='constant', fill_value='Unknown').fit_transform(train1_fs[["originalLanguage"]])
# train1_fs.isnull().sum()

In [177]:
X_train, X_test, y_train, y_test = train_test_split(train1_fs, train1_labels, test_size=0.25, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((122068, 10), (40690, 10), (122068,), (40690,))

In [178]:
train1_fs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162758 entries, 0 to 162757
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   movieid             162758 non-null  object 
 1   reviewerName        162758 non-null  object 
 2   isFrequentReviewer  162758 non-null  bool   
 3   reviewText          162758 non-null  object 
 4   audienceScore       149510 non-null  float64
 5   rating              162758 non-null  object 
 6   runtimeMinutes      159382 non-null  float64
 7   genre               160320 non-null  object 
 8   originalLanguage    159468 non-null  object 
 9   director            162758 non-null  object 
dtypes: bool(1), float64(2), object(7)
memory usage: 12.6+ MB


In [179]:
train1_fs.isnull().sum()

movieid                   0
reviewerName              0
isFrequentReviewer        0
reviewText                0
audienceScore         13248
rating                    0
runtimeMinutes         3376
genre                  2438
originalLanguage       3290
director                  0
dtype: int64

In [180]:
# ct1_imputer = ColumnTransformer(transformers=[
#                                 ('imputer_audienceScore', SimpleImputer(strategy='mean', missing_values=np.nan), ['audienceScore']),
#                                 ('imputer_runtimeMinutes', SimpleImputer(strategy='mean', missing_values=np.nan), ['runtimeMinutes']),
#                                 ('imputer_genre', SimpleImputer(strategy='most_frequent'), ['genre']),
#                                 ('imputer_lang', SimpleImputer(strategy='constant', fill_value='Unknown'), ['originalLanguage']),
#                                 ], 
#                                 remainder='passthrough')

In [264]:
pipe_imputer_num = Pipeline(steps=[
                                ('imp1', SimpleImputer(strategy='mean', missing_values=np.nan)),
                                # ('toarray', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
                            ])

pipe_imputer_freq = Pipeline(steps=[
                                ('imp2', SimpleImputer(strategy='most_frequent')),
                                ('toarray', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
                            ])

pipe_imputer_const = Pipeline(steps=[
                                ('imp3', SimpleImputer(strategy='constant', fill_value='Unknown')),
                                ('toarray', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
                            ])

# pipe_tvec = Pipeline(steps=[
#                             ('tvec', TfidfVectorizer(max_features=10000)),
#                             ('toarray', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
#                             ])

vectorizer_params = dict(max_features=10000)
pipe_text = Pipeline(steps=[
                            # ('imp', SimpleImputer(strategy='most_frequent')),
                            # ("squeez", FunctionTransformer(lambda x: x.squeeze())),
                            # ("vect", CountVectorizer(**vectorizer_params)),
                            # ("tfidf", TfidfTransformer()),
                            ("tvec", TfidfVectorizer(**vectorizer_params)),
                            # ("toarray", FunctionTransformer(lambda x: x.toarray())),
                        ])

In [265]:
train1_fs.isna().sum()

movieid                   0
reviewerName              0
isFrequentReviewer        0
reviewText                0
audienceScore         13248
rating                    0
runtimeMinutes         3376
genre                  2438
originalLanguage       3290
director                  0
dtype: int64

In [266]:
train1_fs.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

In [318]:
ct2 = ColumnTransformer(transformers=[
                                    ('num', pipe_imputer_num, ['audienceScore', 'runtimeMinutes']),
                                    ('text', pipe_text, 'director'),
                                    ('reviewText', pipe_text, 'reviewText'),
                                    ], 
                                    remainder='drop', sparse_threshold=0.3)

In [319]:
# ct1 = ColumnTransformer(transformers=[
#                         ('tvec_movieid', TfidfVectorizer(), ['movieid']),
#                          ('tvec_reviewerName', TfidfVectorizer(max_features=10000), ['reviewerName']),
#                         #  ('ohe_freqRev', OneHotEncoder(handle_unknown='ignore'), ['isFrequentReviewer']),
#                         ('tvec_isFreq', TfidfVectorizer(), ['isFrequentReviewer']),
#                          ('tvec_reviewText', TfidfVectorizer(ngram_range=(1,2), max_features=10000), ['reviewText']),
#                         #  ('std_scaler_audienceScore', StandardScaler(), ['audienceScore']),
#                         #  ('ohe_rating', OneHotEncoder(handle_unknown='ignore'), ['rating']),
#                         ('tvec_rating', TfidfVectorizer(), ['rating']),
#                         #  ('mm_scaler_runtimeMinutes', MinMaxScaler(), ['runtimeMinutes']),
#                          ('tvec_genre', TfidfVectorizer(max_features=20), ['genre']),
#                          ('tvec_originalLanguage', TfidfVectorizer(max_features=100), ['originalLanguage']),
#                          ('tvec_director', TfidfVectorizer(max_features=10000), ['director']),
#                          ], remainder='passthrough', sparse_threshold=0.3)
# ct1

In [320]:
pipe1 = Pipeline(steps=[
                        # ('imputer', ct1_imputer),
                        ('transformer', ct2), 
                        ('logreg', LogisticRegression(C=2, max_iter=1000))
                        ])

In [321]:
pipe1.fit(X_train, y_train)

In [None]:
pipe1.score(X_train, y_train)

0.7395058491988072

In [None]:
predict_n_evaluate(pipe1, X_test, y_test)

y_pred shape: (40690,)
Summary of predictions: (array(['NEGATIVE', 'POSITIVE'], dtype=object), array([ 8600, 32090], dtype=int64))
              precision    recall  f1-score   support

    NEGATIVE       0.63      0.40      0.49     13717
    POSITIVE       0.74      0.88      0.81     26973

    accuracy                           0.72     40690
   macro avg       0.69      0.64      0.65     40690
weighted avg       0.71      0.72      0.70     40690

[[ 5444  8273]
 [ 3156 23817]]


array(['POSITIVE', 'POSITIVE', 'POSITIVE', ..., 'POSITIVE', 'POSITIVE',
       'POSITIVE'], dtype=object)

### test file

In [None]:
# test1 = dftest[['reviewerName', 'isTopCritic', 'reviewText']]
# test1.shape

In [None]:
X_train.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

In [None]:
X_train["rating"].value_counts()

NA       47713
R        37952
PG-13    27155
PG        8806
NC-17      153
TVPG       147
TV14       110
TVMA        32
Name: rating, dtype: int64

In [None]:
vec = TfidfVectorizer()
yyyy = vec.fit_transform(X_train["originalLanguage"].astype(str))
ypd = pd.DataFrame(yyyy.toarray())
ypd

MemoryError: Unable to allocate 81.0 MiB for an array with shape (122068, 87) and data type float64

In [None]:
ypd.sum()

0      34.0
1      29.0
2       8.0
3     534.0
4      24.0
      ...  
82     43.0
83     10.0
84     14.0
85      2.0
86     29.0
Length: 87, dtype: float64

In [None]:
vec.get_feature_names()

['afrikaans',
 'albanian',
 'amharic',
 'arabic',
 'aramaic',
 'armenian',
 'azerbaijani',
 'bambara',
 'bangla',
 'bosnian',
 'bulgarian',
 'catalan',
 'chinese',
 'croatian',
 'crp',
 'czech',
 'danish',
 'dutch',
 'dzongkha',
 'english',
 'estonian',
 'filipino',
 'finnish',
 'french',
 'galician',
 'georgian',
 'german',
 'greek',
 'gujarati',
 'hebrew',
 'hindi',
 'hungarian',
 'icelandic',
 'indonesian',
 'inuktitut',
 'irish',
 'italian',
 'japanese',
 'kalaallisut',
 'kannada',
 'khmer',
 'korean',
 'kurdish',
 'language',
 'lao',
 'latvian',
 'lingala',
 'lithuanian',
 'luxembourgish',
 'macedonian',
 'malay',
 'malayalam',
 'maltese',
 'maori',
 'marathi',
 'mongolian',
 'nan',
 'nepali',
 'norwegian',
 'pashto',
 'persian',
 'polish',
 'portuguese',
 'romanian',
 'romany',
 'russian',
 'serbian',
 'slovak',
 'slovenian',
 'somali',
 'spanish',
 'swahili',
 'swedish',
 'tagalog',
 'tamil',
 'telugu',
 'thai',
 'tibetan',
 'turkish',
 'ukrainian',
 'unknown',
 'urdu',
 'vietna

In [None]:
vec.vocabulary_

{'english': 19,
 'french': 23,
 'korean': 41,
 'unknown': 80,
 'language': 43,
 'chinese': 12,
 'spanish': 70,
 'nan': 56,
 'italian': 36,
 'japanese': 37,
 'german': 26,
 'hebrew': 29,
 'malayalam': 51,
 'serbian': 66,
 'persian': 60,
 'hindi': 30,
 'marathi': 54,
 'czech': 15,
 'hungarian': 31,
 'thai': 76,
 'finnish': 22,
 'portuguese': 62,
 'albanian': 1,
 'danish': 16,
 'vietnamese': 82,
 'swedish': 72,
 'romanian': 63,
 'turkish': 78,
 'bangla': 8,
 'russian': 65,
 'arabic': 3,
 'tamil': 74,
 'polish': 61,
 'telugu': 75,
 'croatian': 13,
 'norwegian': 58,
 'bulgarian': 10,
 'icelandic': 32,
 'kannada': 39,
 'kurdish': 42,
 'lithuanian': 47,
 'dutch': 17,
 'afrikaans': 0,
 'tagalog': 73,
 'dzongkha': 18,
 'wolof': 84,
 'urdu': 81,
 'pashto': 59,
 'greek': 27,
 'romany': 64,
 'amharic': 2,
 'macedonian': 49,
 'khmer': 40,
 'estonian': 20,
 'filipino': 21,
 'inuktitut': 34,
 'slovenian': 68,
 'welsh': 83,
 'georgian': 25,
 'catalan': 11,
 'indonesian': 33,
 'yiddish': 86,
 'swahili'

In [None]:
vec.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}