In [1]:
from mytools import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer

print("Imported all libraries successfully!")

Imported all libraries successfully!


In [None]:
traindf = load_csv("train")
moviesdf = load_csv("movies")
traindf.shape, moviesdf.shape

## Examine movies.csv data  

In [None]:
moviesdf.head()

In [None]:
moviesdf.columns

In [None]:
moviesdf.info()

In [None]:
moviesdf.isnull().sum()

In [None]:
moviesdf.describe()

In [None]:
moviesdf["genre"].value_counts()

## Drop duplicates from moviesdf dataframe  

In [None]:
movies_unique = moviesdf.drop_duplicates(subset=["movieid"])
movies_unique.shape, moviesdf.shape

## Merge traindf and moviesdf  

In [None]:
train_movies_merged = pd.merge(traindf, movies_unique, on="movieid")
train_movies_merged.shape

In [None]:
train_movies_merged.columns

In [None]:
train_movies_merged = train_movies_merged[['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'title', 'audienceScore', 'rating', 'ratingContents',
       'releaseDateTheaters', 'releaseDateStreaming', 'runtimeMinutes',
       'genre', 'originalLanguage', 'director', 'boxOffice', 'distributor',
       'soundType', 'sentiment']]
train_movies_merged.shape

In [None]:
train_movies_merged.head()

In [None]:
train_movies_merged.isnull().sum()

## Clean data in merged df  

In [None]:
# Fill missing values in "reviewText", 'rating" column with empty string and "NA" respectively
# Clean language names

train_final = train_movies_merged.copy()
train_final["reviewText"] = train_final["reviewText"].fillna(" ")
train_final["rating"] = train_final["rating"].fillna("NA")
train_final["originalLanguage"].replace({"English (United Kingdom)": "English", 
                                         "English (Australia)" : "English",
                                         "French (France)": "French", 
                                         "French (Canada)": "French",
                                         "Portuguese (Brazil)": "Portuguese",
                                         "Spanish (Spain)": "Spanish"},                                         
                                         inplace=True)
train_final["reviewText"].isna().sum()

In [None]:
train_final["rating"].value_counts()

In [None]:
train_final["genre"].value_counts()

In [None]:
train_final["originalLanguage"].unique(), train_final["originalLanguage"].value_counts()

In [None]:
train_final.columns

## Keep only the columns to work on  

In [None]:
train_final = train_final.drop(columns=["title", "ratingContents", "releaseDateTheaters", "releaseDateStreaming", "boxOffice", "distributor", "soundType"])
train_final.shape,  train_final.columns

## Separate features and labels  

In [None]:
train_features = train_final.iloc[:, :-1]
train_labels = train_final.iloc[:, -1]
train_features.shape, train_labels.shape

In [None]:
train_features.head()

In [None]:
train_labels.head()

## Try "select_features" function from mytools module  

In [2]:
df = select_features(load_csv("train"), load_csv("movies"))
df.head()

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,sentiment,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
1,marvelous_pirate,Sharon Foster,False,&#91;T&#93;he haphazard way this story is asse...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
2,marvelous_pirate,Melinda Dunn,False,The stop-motion artistry of Wendell &amp; Wild...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
3,marvelous_pirate,Mr. Wayne Smith,False,Wendell &amp; Wild is narratively overstuffed ...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
4,marvelous_pirate,Connor Nelson,False,For being about the Netherworlds&#44; it&#8217...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso


In [3]:
df.shape

(162758, 11)

In [4]:
df.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

In [5]:
dftest = select_features(load_csv("test"), load_csv("movies"))
dftest.head()

Unnamed: 0,movieid,reviewerName,isTopCritic,reviewText,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,legend_marty_mcfly_oracle,John Kim,False,Green slowly cranks up the dread with style an...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
1,legend_marty_mcfly_oracle,Kathleen Poole,False,Considering this is the 13th Halloween movie&#...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
2,legend_marty_mcfly_oracle,Kenneth Lamb,False,Halloween Ends is by no means the worst horror...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
3,legend_marty_mcfly_oracle,Brittany Lane,False,A concluding chapter that shares more DNA with...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
4,legend_marty_mcfly_oracle,Yolanda Thomas,False,For a film called Halloween Ends&#44; let&#821...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett


In [6]:
dftest.shape

(55315, 10)

In [7]:
dftest.columns

Index(['movieid', 'reviewerName', 'isTopCritic', 'reviewText', 'audienceScore',
       'rating', 'runtimeMinutes', 'genre', 'originalLanguage', 'director'],
      dtype='object')

In [8]:
df.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

## Train models  

In [9]:
train1_fs = df[['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'audienceScore', 'rating', 'runtimeMinutes', 
       'genre', 'originalLanguage', 
       'director'
       ]]
train1_labels = df['sentiment']
train1_fs.shape, train1_labels.shape

((162758, 10), (162758,))

In [29]:
train1_fs["audienceScore"] = SimpleImputer(strategy='mean', missing_values=np.nan).fit_transform(train1_fs[["audienceScore"]])
train1_fs["runtimeMinutes"] = SimpleImputer(strategy='mean', missing_values=np.nan).fit_transform(train1_fs[["runtimeMinutes"]])
train1_fs["genre"] = SimpleImputer(strategy='most_frequent').fit_transform(train1_fs[["genre"]])
train1_fs["originalLanguage"] = SimpleImputer(strategy='constant', fill_value='Unknown').fit_transform(train1_fs[["originalLanguage"]])
train1_fs.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train1_fs["audienceScore"] = SimpleImputer(strategy='mean', missing_values=np.nan).fit_transform(train1_fs[["audienceScore"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train1_fs["runtimeMinutes"] = SimpleImputer(strategy='mean', missing_values=np.nan).fit_transform(train1_fs[["runtimeMinutes"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

movieid               0
reviewerName          0
isFrequentReviewer    0
reviewText            0
audienceScore         0
rating                0
runtimeMinutes        0
genre                 0
originalLanguage      0
director              0
dtype: int64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(train1_fs, train1_labels, test_size=0.25, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((122068, 10), (40690, 10), (122068,), (40690,))

In [31]:
train1_fs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162758 entries, 0 to 162757
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   movieid             162758 non-null  object 
 1   reviewerName        162758 non-null  object 
 2   isFrequentReviewer  162758 non-null  bool   
 3   reviewText          162758 non-null  object 
 4   audienceScore       162758 non-null  float64
 5   rating              162758 non-null  object 
 6   runtimeMinutes      162758 non-null  float64
 7   genre               162758 non-null  object 
 8   originalLanguage    162758 non-null  object 
 9   director            162758 non-null  object 
dtypes: bool(1), float64(2), object(7)
memory usage: 16.6+ MB


In [32]:
train1_fs.isnull().sum()

movieid               0
reviewerName          0
isFrequentReviewer    0
reviewText            0
audienceScore         0
rating                0
runtimeMinutes        0
genre                 0
originalLanguage      0
director              0
dtype: int64

In [33]:
# ct1_imputer = ColumnTransformer(transformers=[
#                                 ('imputer_audienceScore', SimpleImputer(strategy='mean', missing_values=np.nan), ['audienceScore']),
#                                 ('imputer_runtimeMinutes', SimpleImputer(strategy='mean', missing_values=np.nan), ['runtimeMinutes']),
#                                 ('imputer_genre', SimpleImputer(strategy='most_frequent'), ['genre']),
#                                 ('imputer_lang', SimpleImputer(strategy='constant', fill_value='Unknown'), ['originalLanguage']),
#                                 ], 
#                                 remainder='passthrough')

In [52]:
ct1 = ColumnTransformer(transformers=[
                        ('tvec_movieid', TfidfVectorizer(), ['movieid']),
                         ('tvec_reviewerName', TfidfVectorizer(), ['reviewerName']),
                         ('ohe_freqRev', OneHotEncoder(handle_unknown='ignore'), ['isFrequentReviewer']),
                         ('tvec_reviewText', TfidfVectorizer(ngram_range=(1,2), max_features=10000), ['reviewText']),
                        #  ('std_scaler_audienceScore', StandardScaler(), ['audienceScore']),
                         ('ohe_rating', OneHotEncoder(handle_unknown='ignore'), ['rating']),
                         ('mm_scaler_runtimeMinutes', MinMaxScaler(), ['runtimeMinutes']),
                         ('tvec_genre', TfidfVectorizer(max_features=20), ['genre']),
                         ('tvec_originalLanguage', TfidfVectorizer(max_features=100), ['originalLanguage']),
                         ('tvec_director', TfidfVectorizer(max_features=10000), ['director']),
                         ], remainder='passthrough', sparse_threshold=0.3)
ct1

ColumnTransformer(remainder='passthrough',
                  transformers=[('tvec_movieid', TfidfVectorizer(),
                                 ['movieid']),
                                ('tvec_reviewerName', TfidfVectorizer(),
                                 ['reviewerName']),
                                ('ohe_freqRev',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['isFrequentReviewer']),
                                ('tvec_reviewText',
                                 TfidfVectorizer(max_features=10000,
                                                 ngram_range=(1, 2)),
                                 ['reviewText']),
                                ('ohe_rating',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['rating']),
                                ('mm_scaler_runtimeMinutes', MinMaxScaler(),
                                 ['runtimeMinutes']),


In [55]:
pipe1 = Pipeline(steps=[
                        # ('imputer', ct1_imputer),
                        ('transformer', ct1), 
                        ('logreg', LogisticRegression(C=2, max_iter=1000))
                        ])

In [56]:
pipe1.fit(X_train, y_train)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 2 has size 122068

In [None]:
pipe1.score(X_train, y_train)

In [None]:
predict_n_evaluate(pipe1, X_test, y_test)

### test file

In [None]:
# test1 = dftest[['reviewerName', 'isTopCritic', 'reviewText']]
# test1.shape