In [1]:
from mytools import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score


print("Imports done.")

Imports done.


## Merge train.csv and movies.csv file  

In [2]:
merged = select_features(load_csv("train"), load_csv("movies"))
merged.head()

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,sentiment,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
1,marvelous_pirate,Sharon Foster,False,&#91;T&#93;he haphazard way this story is asse...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
2,marvelous_pirate,Melinda Dunn,False,The stop-motion artistry of Wendell &amp; Wild...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
3,marvelous_pirate,Mr. Wayne Smith,False,Wendell &amp; Wild is narratively overstuffed ...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
4,marvelous_pirate,Connor Nelson,False,For being about the Netherworlds&#44; it&#8217...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso


In [3]:
inspect(merged)

Shape of the dataframe: (162758, 11)

Columns in the dataframe:
Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162758 entries, 0 to 162757
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   movieid             162758 non-null  object 
 1   reviewerName        162758 non-null  object 
 2   isFrequentReviewer  162758 non-null  bool   
 3   reviewText          162758 non-null  object 
 4   sentiment           162758 non-null  object 
 5   audienceScore       149510 non-null  float64
 6   rating              162758 non-null  object 
 7   runtimeMinutes      159382 non-null  float64
 8   genre               160320 non-null  object 
 9   originalLanguage    159468 non-null  object 
 10  director     

## Evaluate numerical features one by one  

In [4]:
pipe_num = Pipeline(steps=[
                            ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)),
                            ("scaler", MinMaxScaler())
                        ])
pipe_num

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler())])

In [5]:
pipe = Pipeline(steps=[
                        ("preprocessor", pipe_num),
                        ("model", LogisticRegression())
                    ])
pipe

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('imputer', SimpleImputer()),
                                 ('scaler', MinMaxScaler())])),
                ('model', LogisticRegression())])

In [6]:
def split_train_predict(features, labels, pipeline, test_size=0.25, random_state=42):
    # cols = features.columns
    if len(features.shape) == 1:
        features = features.to_numpy().reshape(-1, 1)  # reshape to 2D array
    features = pd.DataFrame(features)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=test_size, random_state=random_state)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    return #y_pred

In [7]:
split_train_predict(merged["audienceScore"], merged["sentiment"], pipe)

              precision    recall  f1-score   support

    NEGATIVE       0.58      0.25      0.35     13717
    POSITIVE       0.70      0.91      0.79     26973

    accuracy                           0.69     40690
   macro avg       0.64      0.58      0.57     40690
weighted avg       0.66      0.69      0.64     40690

[[ 3446 10271]
 [ 2449 24524]]


In [8]:
split_train_predict(merged["runtimeMinutes"], merged["sentiment"], pipe)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    NEGATIVE       0.00      0.00      0.00     13717
    POSITIVE       0.66      1.00      0.80     26973

    accuracy                           0.66     40690
   macro avg       0.33      0.50      0.40     40690
weighted avg       0.44      0.66      0.53     40690

[[    0 13717]
 [    0 26973]]


In [9]:
split_train_predict(merged[["audienceScore", "runtimeMinutes"]], merged["sentiment"], pipe)

              precision    recall  f1-score   support

    NEGATIVE       0.58      0.26      0.36     13717
    POSITIVE       0.71      0.91      0.79     26973

    accuracy                           0.69     40690
   macro avg       0.65      0.58      0.58     40690
weighted avg       0.66      0.69      0.65     40690

[[ 3524 10193]
 [ 2506 24467]]


## Consider other columns in movies.csv file  

In [10]:
merged.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

In [11]:
len(merged["originalLanguage"].unique())

86

In [12]:
tvec = TfidfVectorizer()
tvec

TfidfVectorizer()

In [13]:
pipe_txt = Pipeline(steps=[
                            # ("imputer", SimpleImputer(strategy="most_frequent", missing_values=np.nan)),
                            # ("dfmaker", FunctionTransformer(np.reshape, kw_args={'newshape':-1})),
                            ("tvec", tvec)
                        ])
pipe_txt

Pipeline(steps=[('tvec', TfidfVectorizer())])

In [14]:
ct_txt = ColumnTransformer(transformers=[
                                        ("tvec", pipe_txt, ["originalLanguage"]),
                                        ("tvec2", pipe_txt, ["genre"]),
                                        ], remainder="drop")
ct_txt

ColumnTransformer(transformers=[('tvec',
                                 Pipeline(steps=[('tvec', TfidfVectorizer())]),
                                 ['originalLanguage']),
                                ('tvec2',
                                 Pipeline(steps=[('tvec', TfidfVectorizer())]),
                                 ['genre'])])

In [15]:
pipe2 = Pipeline(steps=[
                        ("preprocessor", pipe_txt),
                        ("model", LogisticRegression())
                    ])
pipe2

Pipeline(steps=[('preprocessor', Pipeline(steps=[('tvec', TfidfVectorizer())])),
                ('model', LogisticRegression())])

In [16]:
split_train_predict(merged["originalLanguage"], merged["sentiment"], pipe2)

AttributeError: 'int' object has no attribute 'lower'