In [1]:
from mytools import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

print("Imported all libraries successfully!")

Imported all libraries successfully!


In [2]:
# Read the raw review table and the movie-metadata table with the project's
# load_csv helper, then display both shapes as a quick sanity check.
traindf, moviesdf = load_csv("train"), load_csv("movies")
(traindf.shape, moviesdf.shape)

((162758, 5), (143258, 14))

## Examine movies.csv data  

In [3]:
moviesdf.head()

Unnamed: 0,movieid,title,audienceScore,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,boxOffice,distributor,soundType
0,han_solo_heroic_myth_wonder_woman,Han Solo Heroic Myth Wonder Woman,50.0,,,,2018-08-25,75.0,"Comedy, Horror, Sci-fi",English,Claude Nicolet,,,
1,voyage_tyler_durden_han_solo_stardust,Voyage Tyler Durden Han Solo Stardust,,,,,2020-02-11,114.0,Drama,English,Nathan Haack,,,
2,norman_bates_ferris_bueller_hermione_granger_v...,Norman Bates Ferris Bueller Hermione Granger V...,43.0,,,,,120.0,Drama,Korean,Theresa Smith,,,
3,elegant_hermione_granger,Elegant Hermione Granger,60.0,,,,2020-10-23,90.0,"Action, Mystery & thriller",English,Donald Lewis,,,
4,adventure_rocky_balboa,Adventure Rocky Balboa,70.0,,,,2017-03-27,80.0,"Fantasy, Adventure, Animation",English,Howard Simms,,,


In [4]:
moviesdf.columns

Index(['movieid', 'title', 'audienceScore', 'rating', 'ratingContents',
       'releaseDateTheaters', 'releaseDateStreaming', 'runtimeMinutes',
       'genre', 'originalLanguage', 'director', 'boxOffice', 'distributor',
       'soundType'],
      dtype='object')

In [5]:
moviesdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143258 entries, 0 to 143257
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   movieid               143258 non-null  object 
 1   title                 143258 non-null  object 
 2   audienceScore         73248 non-null   float64
 3   rating                13991 non-null   object 
 4   ratingContents        13991 non-null   object 
 5   releaseDateTheaters   30773 non-null   object 
 6   releaseDateStreaming  79420 non-null   object 
 7   runtimeMinutes        129431 non-null  float64
 8   genre                 132175 non-null  object 
 9   originalLanguage      129400 non-null  object 
 10  director              143258 non-null  object 
 11  boxOffice             14743 non-null   object 
 12  distributor           23005 non-null   object 
 13  soundType             15917 non-null   object 
dtypes: float64(2), object(12)
memory usage: 15.3+ MB


In [6]:
moviesdf.isnull().sum()

movieid                      0
title                        0
audienceScore            70010
rating                  129267
ratingContents          129267
releaseDateTheaters     112485
releaseDateStreaming     63838
runtimeMinutes           13827
genre                    11083
originalLanguage         13858
director                     0
boxOffice               128515
distributor             120253
soundType               127341
dtype: int64

In [7]:
moviesdf.describe()

Unnamed: 0,audienceScore,runtimeMinutes
count,73248.0,129431.0
mean,55.674967,93.708578
std,24.553648,28.129175
min,0.0,1.0
25%,37.0,84.0
50%,57.0,92.0
75%,76.0,103.0
max,100.0,2700.0


In [8]:
moviesdf["genre"].value_counts()

Drama                                          27860
Documentary                                    15162
Comedy                                         11514
Mystery & thriller                              7015
Comedy, Drama                                   5479
                                               ...  
Fantasy, Drama, Musical                            1
Holiday, Drama, Musical                            1
Drama, War, Adventure, Action                      1
Action, Adventure, Comedy, Drama, Animation        1
Western, Comedy, Animation                         1
Name: genre, Length: 2912, dtype: int64

## Drop duplicates from moviesdf dataframe  

In [9]:
# Keep only the first row seen for each movieid; later rows carrying the same
# id are discarded. The before/after shapes show how many rows were removed.
movies_unique = moviesdf[~moviesdf.duplicated(subset="movieid", keep="first")]
(movies_unique.shape, moviesdf.shape)

((126404, 14), (143258, 14))

## Merge traindf and moviesdf  

In [10]:
# Attach movie metadata to every review.
# - how="left" keeps each review even when its movieid has no metadata row
#   (the default inner join would silently drop such reviews).
# - validate="many_to_one" raises MergeError if movies_unique still contains
#   duplicate movieid values, which would otherwise multiply review rows.
train_movies_merged = pd.merge(
    traindf,
    movies_unique,
    on="movieid",
    how="left",
    validate="many_to_one",
)
train_movies_merged.shape

(162758, 18)

In [11]:
train_movies_merged.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'title', 'audienceScore', 'rating', 'ratingContents',
       'releaseDateTheaters', 'releaseDateStreaming', 'runtimeMinutes',
       'genre', 'originalLanguage', 'director', 'boxOffice', 'distributor',
       'soundType'],
      dtype='object')

In [12]:
# Move the label column ("sentiment") to the last position while keeping every
# other column in its existing order. The order is derived from the frame
# itself, so the cell keeps working if the merged schema gains or loses columns.
train_movies_merged = train_movies_merged[
    [col for col in train_movies_merged.columns if col != "sentiment"] + ["sentiment"]
]
train_movies_merged.shape

(162758, 18)

In [13]:
train_movies_merged.head()

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,title,audienceScore,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,boxOffice,distributor,soundType,sentiment
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,Marvelous Pirate,65.0,PG-13,"['Violence', 'Brief Strong Language', 'Substan...",2022-10-21,2022-10-28,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso,,Netflix,,POSITIVE
1,marvelous_pirate,Sharon Foster,False,&#91;T&#93;he haphazard way this story is asse...,Marvelous Pirate,65.0,PG-13,"['Violence', 'Brief Strong Language', 'Substan...",2022-10-21,2022-10-28,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso,,Netflix,,NEGATIVE
2,marvelous_pirate,Melinda Dunn,False,The stop-motion artistry of Wendell &amp; Wild...,Marvelous Pirate,65.0,PG-13,"['Violence', 'Brief Strong Language', 'Substan...",2022-10-21,2022-10-28,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso,,Netflix,,POSITIVE
3,marvelous_pirate,Mr. Wayne Smith,False,Wendell &amp; Wild is narratively overstuffed ...,Marvelous Pirate,65.0,PG-13,"['Violence', 'Brief Strong Language', 'Substan...",2022-10-21,2022-10-28,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso,,Netflix,,NEGATIVE
4,marvelous_pirate,Connor Nelson,False,For being about the Netherworlds&#44; it&#8217...,Marvelous Pirate,65.0,PG-13,"['Violence', 'Brief Strong Language', 'Substan...",2022-10-21,2022-10-28,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso,,Netflix,,POSITIVE


In [14]:
train_movies_merged.isnull().sum()

movieid                     0
reviewerName                0
isFrequentReviewer          0
reviewText               6447
title                       0
audienceScore           13248
rating                  63724
ratingContents          63724
releaseDateTheaters     35834
releaseDateStreaming    16185
runtimeMinutes           3376
genre                    2438
originalLanguage         3290
director                    0
boxOffice               52776
distributor             40663
soundType               85367
sentiment                   0
dtype: int64

## Clean data in merged df  

In [15]:
# Fill missing values: "reviewText" -> empty string, "rating" -> "NA".
# Collapse regional language variants onto their base language name.

# Work on a copy so the merged frame displayed above stays untouched.
train_final = train_movies_merged.copy()
train_final["reviewText"] = train_final["reviewText"].fillna("")
train_final["rating"] = train_final["rating"].fillna("NA")
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form triggers a warning in recent
# pandas and does not update train_final once Copy-on-Write is enabled.
train_final["originalLanguage"] = train_final["originalLanguage"].replace(
    {
        "English (United Kingdom)": "English",
        "English (Australia)": "English",
        "French (France)": "French",
        "French (Canada)": "French",
        "Portuguese (Brazil)": "Portuguese",
        "Spanish (Spain)": "Spanish",
    }
)
# Confirm that no missing review text remains (expected: 0).
train_final["reviewText"].isna().sum()

0

In [16]:
train_final["rating"].value_counts()

NA       63724
R        50331
PG-13    36380
PG       11734
NC-17      201
TVPG       200
TV14       146
TVMA        42
Name: rating, dtype: int64

In [17]:
train_final["genre"].value_counts()

Drama                                               24159
Comedy                                              11252
Documentary                                          9805
Comedy, Drama                                        9775
Mystery & thriller                                   5012
                                                    ...  
Fantasy, Lgbtq+, Musical                                1
Action, Comedy, Animation                               1
Western, Adventure, Romance                             1
Action, Comedy, Foreign                                 1
Action, Crime, Drama, Horror, Mystery & thriller        1
Name: genre, Length: 1162, dtype: int64

In [18]:
train_final["originalLanguage"].unique(), train_final["originalLanguage"].value_counts()

(array(['English', 'Spanish', 'Portuguese', 'Russian', 'Japanese',
        'Chinese', 'Danish', 'French', nan, 'Italian', 'Korean', 'Thai',
        'Filipino', 'Czech', 'Indonesian', 'German', 'Persian',
        'Hungarian', 'Malayalam', 'Hebrew', 'Arabic', 'Vietnamese',
        'Dutch', 'Hindi', 'Polish', 'Khmer', 'Tibetan', 'Swedish',
        'Aramaic', 'Turkish', 'Norwegian', 'Inuktitut', 'Yiddish',
        'Romanian', 'Nepali', 'Tagalog', 'Finnish', 'Icelandic', 'crp',
        'Telugu', 'Bulgarian', 'Greek', 'Serbian', 'Albanian', 'Wolof',
        'Dzongkha', 'Unknown language', 'Tamil', 'Swahili', 'Bosnian',
        'Georgian', 'Marathi', 'Welsh', 'Bangla', 'Estonian', 'Gujarati',
        'Afrikaans', 'Mongolian', 'Croatian', 'Maori', 'Ukrainian',
        'Kalaallisut', 'Bambara', 'Lithuanian', 'Catalan', 'Armenian',
        'Urdu', 'Slovak', 'Lao', 'Lingala', 'Kurdish', 'Pashto', 'Romany',
        'Xhosa', 'Kannada', 'Luxembourgish', 'Maltese', 'Amharic',
        'Galician', 'Mal

In [19]:
train_final.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText', 'title',
       'audienceScore', 'rating', 'ratingContents', 'releaseDateTheaters',
       'releaseDateStreaming', 'runtimeMinutes', 'genre', 'originalLanguage',
       'director', 'boxOffice', 'distributor', 'soundType', 'sentiment'],
      dtype='object')

In [20]:
# Remove the seven metadata columns that are excluded from the feature set.
# NOTE: this cell rebinds train_final, so running it a second time raises
# KeyError (the columns are already gone); re-run the cleaning cell first.
train_final = train_final.drop(
    columns=[
        "title",
        "ratingContents",
        "releaseDateTheaters",
        "releaseDateStreaming",
        "boxOffice",
        "distributor",
        "soundType",
    ]
)
(train_final.shape, train_final.columns)

((162758, 11),
 Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
        'audienceScore', 'rating', 'runtimeMinutes', 'genre',
        'originalLanguage', 'director', 'sentiment'],
       dtype='object'))

## Separate features and labels  

In [21]:
# Split by column name rather than by position, so the result does not depend
# on "sentiment" happening to be the last column of train_final.
train_features = train_final.drop(columns="sentiment")
train_labels = train_final["sentiment"]
train_features.shape, train_labels.shape

((162758, 10), (162758,))

In [22]:
train_features.head()

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
1,marvelous_pirate,Sharon Foster,False,&#91;T&#93;he haphazard way this story is asse...,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
2,marvelous_pirate,Melinda Dunn,False,The stop-motion artistry of Wendell &amp; Wild...,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
3,marvelous_pirate,Mr. Wayne Smith,False,Wendell &amp; Wild is narratively overstuffed ...,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
4,marvelous_pirate,Connor Nelson,False,For being about the Netherworlds&#44; it&#8217...,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso


In [23]:
train_labels.head()

0    POSITIVE
1    NEGATIVE
2    POSITIVE
3    NEGATIVE
4    POSITIVE
Name: sentiment, dtype: object

## Try "select_features" function from mytools module  

In [24]:
# Run the packaged select_features helper on freshly loaded train and movies
# tables and preview the first rows of what it returns.
# NOTE(review): traindf / moviesdf hold the same files already; presumably the
# reload is deliberate so the helper receives untouched inputs - confirm.
df = select_features(
    load_csv("train"),
    load_csv("movies"),
)
df.head(n=5)

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,sentiment,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
1,marvelous_pirate,Sharon Foster,False,&#91;T&#93;he haphazard way this story is asse...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
2,marvelous_pirate,Melinda Dunn,False,The stop-motion artistry of Wendell &amp; Wild...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
3,marvelous_pirate,Mr. Wayne Smith,False,Wendell &amp; Wild is narratively overstuffed ...,NEGATIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso
4,marvelous_pirate,Connor Nelson,False,For being about the Netherworlds&#44; it&#8217...,POSITIVE,65.0,PG-13,105.0,"Comedy, Animation, Adventure, Fantasy",English,Bennie Basso


In [25]:
df.shape

(162758, 11)

In [26]:
df.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director'],
      dtype='object')

In [27]:
# Apply the same select_features helper to the test reviews, again paired with
# a freshly loaded movies table, and preview the first rows of the result.
dftest = select_features(
    load_csv("test"),
    load_csv("movies"),
)
dftest.head(n=5)

Unnamed: 0,movieid,reviewerName,isTopCritic,reviewText,audienceScore,rating,runtimeMinutes,genre,originalLanguage,director
0,legend_marty_mcfly_oracle,John Kim,False,Green slowly cranks up the dread with style an...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
1,legend_marty_mcfly_oracle,Kathleen Poole,False,Considering this is the 13th Halloween movie&#...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
2,legend_marty_mcfly_oracle,Kenneth Lamb,False,Halloween Ends is by no means the worst horror...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
3,legend_marty_mcfly_oracle,Brittany Lane,False,A concluding chapter that shares more DNA with...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett
4,legend_marty_mcfly_oracle,Yolanda Thomas,False,For a film called Halloween Ends&#44; let&#821...,57.0,R,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett


In [28]:
dftest.shape

(55315, 10)

In [29]:
dftest.columns

Index(['movieid', 'reviewerName', 'isTopCritic', 'reviewText', 'audienceScore',
       'rating', 'runtimeMinutes', 'genre', 'originalLanguage', 'director'],
      dtype='object')