In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from itertools import compress

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE

import scipy.stats as stats
# Progress marker: runs after every import above has executed.
print("Imports done.")

Imports done.


### Custom functions  

In [58]:
def load_csv(filename: str, data_dir: str = "data") -> pd.DataFrame:
    """Read ``<data_dir>/<filename>.csv`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name without the ``.csv`` extension (e.g. ``"train"``).
    data_dir : str, default "data"
        Directory that contains the CSV file. The default keeps the original
        behaviour of reading from the ``data/`` folder.

    Returns
    -------
    pd.DataFrame
        The parsed file, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(f"{data_dir}/{filename}.csv")

In [59]:
def inspect(df: pd.DataFrame) -> None:
    """Print a quick structural overview of ``df``.

    Prints, in order: the shape, the column index, the ``DataFrame.info()``
    report and the per-column count of missing values. Returns ``None``.
    """
    print(f"Shape of the dataframe: {df.shape}")
    print()
    print(f"Columns in the dataframe:\n{df.columns}")
    print()
    # DataFrame.info() writes its report itself and returns None, so it is called
    # directly; interpolating it into print() would emit a stray "None" line.
    df.info()
    print()
    print(f"Missing values:\n{df.isna().sum()}")

In [60]:
def name_fl(name):
    """Reduce a full name to "<first token> <last token>".

    Parameters
    ----------
    name : str
        Whitespace-separated name, e.g. ``"John Ronald Tolkien"``.

    Returns
    -------
    The first and last tokens joined by a single space. A single-token name is
    returned once (not duplicated). Non-string values (e.g. NaN/None) and
    strings without any token are returned unchanged instead of raising.
    """
    # Missing values arrive as float NaN / None when applied over a pandas column.
    if not isinstance(name, str):
        return name
    tokens = name.split()
    if not tokens:
        return name
    if len(tokens) == 1:
        # Avoid turning "Cher" into "Cher Cher".
        return tokens[0]
    return f"{tokens[0]} {tokens[-1]}"

In [61]:
def _parse_box_office(raw: pd.Series) -> pd.Series:
    """Convert box-office strings such as "$1.2M", "$350K" or "$900" to floats.

    "$" and "," characters are removed, then a number with an optional K/M/B
    suffix is expected. Missing or unparseable values become NaN.
    """
    text = (
        raw.astype(object)
        .str.strip()
        .str.replace(r"[$,]", "", regex=True)
    )
    parts = text.str.extract(r"^(\d+\.?\d*|\.\d+)\s*([KMB]?)$")
    amount = parts[0].astype(float)
    scale = parts[1].map({"": 1.0, "K": 1e3, "M": 1e6, "B": 1e9})
    return amount * scale


def select_features(df: pd.DataFrame, moviesdf: pd.DataFrame) -> pd.DataFrame:
    '''
    Merge review rows with per-movie metadata, then clean and engineer the feature columns.

    Note that the first df must be "train" or "test" and the second df should be "movies".
    Note: Sentiment column is present only in "train.csv" file and not "test.csv" file.

    Parameters
    ----------
    df : pd.DataFrame
        Review-level frame. Must contain "movieid", "reviewText" and "reviewerName".
    moviesdf : pd.DataFrame
        Movie-level frame keyed by "movieid". After merging, the columns "title",
        "ratingContents", "releaseDateTheaters", "releaseDateStreaming", "distributor",
        "soundType", "rating", "genre", "originalLanguage", "audienceScore",
        "runtimeMinutes" and "boxOffice" must be present.

    Returns
    -------
    pd.DataFrame
        One row per row of ``df`` (left join) with missing values imputed, "boxOffice"
        converted to numbers, and two added columns: "reviewYN" (1 when the review
        text was missing, else 0) and "reviewWC" (whitespace-token count of the review).
        All imputation statistics are computed from the frame being processed.
    '''
    # Collapse duplicate movie rows: groupby().first() keeps, for every column, the
    # first non-null value per movieid, so partially filled duplicates complement
    # each other instead of one of them being discarded.
    movies_unique = moviesdf.fillna(value=np.nan).groupby("movieid").first()

    # Left join keeps every review, even when the movie has no metadata.
    df_merged = pd.merge(df, movies_unique, on="movieid", how='left')

    # Rename "isTopCritic" column, if it exists, to "isFrequentReviewer"
    df_merged = df_merged.rename(columns={"isTopCritic": "isFrequentReviewer"})

    # Drop columns that are not used as features
    final = df_merged.drop(columns=["title", "ratingContents", "releaseDateTheaters",
                                    "releaseDateStreaming", "distributor", "soundType"])

    # Feature engineering on the review text (done before imputing the text itself).
    # reviewYN is a *missing* indicator: 1 when reviewText was null.
    final["reviewYN"] = np.where(final["reviewText"].isnull(), 1, 0)
    # Word count per review. A missing review is stringified to "nan" and therefore
    # counts as 1 word, which matches the single-word placeholder filled in below.
    final["reviewWC"] = final["reviewText"].map(lambda text: len(str(text).split()))
    # Fill missing values in "reviewText" with the placeholder word "neutral"
    final["reviewText"] = final["reviewText"].fillna("neutral")

    # Fill missing values in "rating", "genre", "originalLanguage" with the word "Unknown"
    final["rating"] = final["rating"].fillna("Unknown")
    final["genre"] = final["genre"].fillna("Unknown")
    final["originalLanguage"] = final["originalLanguage"].fillna("Unknown")

    # Impute missing values for "audienceScore" and "runtimeMinutes" columns
    final["audienceScore"] = final["audienceScore"].fillna(final["audienceScore"].mean())
    final["runtimeMinutes"] = final["runtimeMinutes"].fillna(final["runtimeMinutes"].median())

    # Parse "boxOffice" into numbers (no eval() on file contents) and impute the median
    final["boxOffice"] = _parse_box_office(final["boxOffice"])
    final["boxOffice"] = final["boxOffice"].fillna(final["boxOffice"].median())

    # (Optional) Replace high outliers in boxOffice with the median.
    # NOTE(review): the upper fence is median + 1.5*IQR (as in the original code), not
    # the conventional Q3 + 1.5*IQR, and only the upper side is treated - confirm intent.
    box_median = final["boxOffice"].median()
    q1, q3 = final["boxOffice"].quantile([0.25, 0.75])
    upper_limit = box_median + (1.5 * (q3 - q1))
    final.loc[final["boxOffice"] > upper_limit, "boxOffice"] = box_median

    # Clean language names. The result is assigned back: calling replace(inplace=True)
    # on a selected column does not update the parent frame under pandas Copy-on-Write.
    final["originalLanguage"] = final["originalLanguage"].replace({
        "English (United Kingdom)": "English",
        "English (Australia)": "English",
        "French (France)": "French",
        "French (Canada)": "French",
        "Portuguese (Brazil)": "Portuguese",
        "Spanish (Spain)": "Spanish",
    })

    # Clean reviewerName column: strip titles/suffixes, then keep first and last name.
    # NOTE(review): with regex=True the keys are regular expressions, so the "." in
    # "Mr. " etc. matches any character and the patterns are not anchored - confirm.
    pre_post_fixes = {"Mr. ": "", "Mrs. ": "", "Ms. ": "", "Dr. ": "",
                      " MD": "", " DDS": "", " DVM": "", " Jr.": "", " PhD": "", " II": "", " IV": ""}
    final["reviewerName"] = final["reviewerName"].replace(pre_post_fixes, regex=True)
    # na_action="ignore" leaves missing names as NaN instead of passing them to name_fl.
    final["reviewerName"] = final["reviewerName"].map(name_fl, na_action="ignore")

    return final

In [62]:
# Build the training feature table from the "train" and "movies" files, then summarise it.
merged = select_features(
    df=load_csv("train"),
    moviesdf=load_csv("movies"),
)
inspect(merged)

Shape of the dataframe: (162758, 14)

Columns in the dataframe:
Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director', 'boxOffice', 'reviewYN', 'reviewWC'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162758 entries, 0 to 162757
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   movieid             162758 non-null  object 
 1   reviewerName        162758 non-null  object 
 2   isFrequentReviewer  162758 non-null  bool   
 3   reviewText          162758 non-null  object 
 4   sentiment           162758 non-null  object 
 5   audienceScore       162758 non-null  float64
 6   rating              162758 non-null  object 
 7   runtimeMinutes      162758 non-null  float64
 8   genre               162758 non-null  object 
 9   originalLanguage    162758

In [63]:
# Same feature pipeline for the "test" file (it has no "sentiment" column).
merged_test = select_features(
    df=load_csv("test"),
    moviesdf=load_csv("movies"),
)
inspect(merged_test)

Shape of the dataframe: (55315, 13)

Columns in the dataframe:
Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director', 'boxOffice', 'reviewYN', 'reviewWC'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55315 entries, 0 to 55314
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieid             55315 non-null  object 
 1   reviewerName        55315 non-null  object 
 2   isFrequentReviewer  55315 non-null  bool   
 3   reviewText          55315 non-null  object 
 4   audienceScore       55315 non-null  float64
 5   rating              55315 non-null  object 
 6   runtimeMinutes      55315 non-null  float64
 7   genre               55315 non-null  object 
 8   originalLanguage    55315 non-null  object 
 9   director            55315 non-null  object 
 10  boxO

## Work on vocabulary  

In [64]:
# Independent copy of just the review text and its sentiment label.
rt_senti = merged.loc[:, ["reviewText", "sentiment"]].copy()
(rt_senti.shape, rt_senti.head())

((162758, 2),
                                           reviewText sentiment
 0  Henry Selick’s first movie since 2009’s Corali...  POSITIVE
 1  With a cast that reads like the Vogue Oscar pa...  NEGATIVE
 2  Creed II does not give us anything but another...  POSITIVE
 3  I know what you're thinking, but this is no Li...  POSITIVE
 4  Director Fernando Meirelles tells the story wi...  POSITIVE)

In [65]:
# Single-column frame holding an independent copy of the test review text.
rt_test = merged_test[["reviewText"]].copy()
(rt_test.shape, rt_test.head())

((55315, 1),
                                           reviewText
 0  Green slowly cranks up the dread with style an...
 1  Philip Noyce's direction is elegant and unforc...
 2  It wouldn't do to say what path Maria ultimate...
 3  Pig is not exactly the arthouse John Wick that...
 4  An imaginative no-budget musical of sorts abou...)

In [66]:
def get_vocab(text_df, ngram_range=(1,1)):
    """Return the vocabulary terms found in ``text_df`` for the given n-gram range.

    A ``TfidfVectorizer`` with English stop words is fitted on ``text_df`` and the
    keys of its learned ``vocabulary_`` mapping are returned as a list.
    """
    print(f"Generating vocabulary for ngram_range: {ngram_range}...")
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
    fitted = vectorizer.fit(text_df)
    return [term for term in fitted.vocabulary_]

In [67]:
def get_common_vocab(df_list: list, ngram_range_list: list, vocab_fn=None) -> set:
    """
    Get the vocabulary shared by every dataframe in ``df_list``.

    Parameters
    ----------
    df_list : list
        Frames (or any mapping-like objects) exposing a ``'reviewText'`` entry.
    ngram_range_list : list
        n-gram ranges, e.g. ``[(1, 1)]``. The common vocabulary is computed per range.
    vocab_fn : callable, optional
        ``vocab_fn(texts, ngram_range)`` returning an iterable of terms.
        Defaults to ``get_vocab``.

    Returns
    -------
    set
        Terms present in all frames. With several n-gram ranges the per-range
        results are merged (union); with a single range this is that range's
        common vocabulary. Empty inputs yield an empty set.
    """
    if vocab_fn is None:
        vocab_fn = get_vocab

    all_common = set()
    for ngram_range in ngram_range_list:
        # None means "no frame seen yet". An empty set is a legitimate running
        # intersection and must not be overwritten by the next frame's vocabulary.
        common = None
        for df in df_list:
            vocab_ngram_df = set(vocab_fn(df['reviewText'], ngram_range))
            print(f"Vocabulary for ngram-range {ngram_range} in df: {len(vocab_ngram_df)}")
            common = vocab_ngram_df if common is None else common & vocab_ngram_df
        if common is None:
            common = set()
        print(f"Common vocabulary for ngram-range {ngram_range}: {len(common)}")
        all_common |= common
    return all_common

In [68]:
# Unigrams that occur in both the labelled training reviews and the test reviews.
common_unigrams = get_common_vocab(
    df_list=[rt_senti, rt_test],
    ngram_range_list=[(1, 1)],
)
common_unigrams

Generating vocabulary for ngram_range: (1, 1)...


Vocabulary for ngram-range (1, 1) in df: 64912
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 43588
Common vocabulary for ngram-range (1, 1): 35946


{'bikers',
 'antidote',
 'starry',
 'honored',
 'frantic',
 'adding',
 'steinbauer',
 'downton',
 'toilets',
 'pictorial',
 'rothman',
 'longings',
 'robson',
 'discourage',
 'sasquatch',
 'ooey',
 'subconscious',
 'awakening',
 'narrating',
 'retrieval',
 'lensed',
 'undistinguished',
 'amazing',
 'abyss',
 'doubled',
 'looksee',
 'streetdance',
 'progenitor',
 'uninterested',
 'reductio',
 'kasper',
 'gram',
 'ersatz',
 'voiceover',
 'criminalizing',
 'nazi',
 'wigs',
 'floating',
 'tinha',
 'blossom',
 'mobius',
 'ec',
 'purview',
 'babe',
 'alps',
 'rooftops',
 'mueller',
 'loftiest',
 'crammed',
 'krieger',
 'hellraiser',
 'docu',
 'appropriating',
 'addled',
 'lumbers',
 'chai',
 'fresno',
 'distribution',
 'modernized',
 'orcas',
 'desk',
 'goldsworthy',
 'clique',
 'bombast',
 'iliza',
 'athletes',
 'immortals',
 'daylights',
 'ss',
 'mercedes',
 'kyle',
 'komandarev',
 'amarrar',
 'presently',
 'mackenzie',
 'obstreperous',
 'alphabet',
 'agreed',
 'servicing',
 'sassoon',
 'r

In [69]:
# Bigrams that occur in both the labelled training reviews and the test reviews.
common_bigrams = get_common_vocab(
    df_list=[rt_senti, rt_test],
    ngram_range_list=[(2, 2)],
)
common_bigrams

Generating vocabulary for ngram_range: (2, 2)...
Vocabulary for ngram-range (2, 2) in df: 1146211
Generating vocabulary for ngram_range: (2, 2)...
Vocabulary for ngram-range (2, 2) in df: 442876
Common vocabulary for ngram-range (2, 2): 115420


{'inspired feature',
 'career changer',
 'like minded',
 'audiences think',
 'jim gaffigan',
 'bittersweet note',
 'debates advocating',
 'narrative loses',
 'like superhero',
 'deserve having',
 'context shooting',
 'una experiencia',
 'new heights',
 'value systems',
 'like johnny',
 'funny fierce',
 'unflinching slice',
 'stick long',
 'watch seen',
 'bad word',
 'epic war',
 'performances universally',
 'searchers 1956',
 'style pure',
 'lacking spectacle',
 'stand ground',
 'like actually',
 'acted study',
 'place american',
 'gory gift',
 'roadrunner film',
 'attempts blend',
 'dramatic effect',
 'refreshingly adult',
 'taking inspiration',
 'winsome characters',
 'don surprised',
 'able resist',
 'viewers need',
 'epic tragedy',
 'surface level',
 'formula succeeds',
 'budget time',
 'preposterous story',
 'dark blue',
 'valentine genre',
 'brothers latest',
 'victory lap',
 'viewer ends',
 'result fun',
 'especially love',
 'camera eye',
 'feels disjointed',
 'paced outrageousl

In [70]:
# Trigrams that occur in both the labelled training reviews and the test reviews.
common_trigrams = get_common_vocab(
    df_list=[rt_senti, rt_test],
    ngram_range_list=[(3, 3)],
)
common_trigrams

Generating vocabulary for ngram_range: (3, 3)...


Vocabulary for ngram-range (3, 3) in df: 1443939
Generating vocabulary for ngram_range: (3, 3)...
Vocabulary for ngram-range (3, 3) in df: 498850
Common vocabulary for ngram-range (3, 3): 23768


{'hero color city',
 'original film enjoyable',
 'mishmash styles genres',
 'emmerich old hand',
 'para merecer uma',
 'man climbed window',
 'moments visual splendor',
 'little coming age',
 'best battle scenes',
 'bad taste fun',
 'dark places film',
 'review parents available',
 'command humor pathos',
 'self contained self',
 'funny romantic comedy',
 'good time movie',
 'sci fi extravaganza',
 'designed maximum impact',
 'drama comedic moments',
 'funny film invites',
 'coming age dramedy',
 'apos star wars',
 'movie wants make',
 'clever little film',
 'falls short character',
 'uses overused word',
 'comic book adaptation',
 'nice mix comedy',
 'indiana jones temple',
 'rendition 8220 ant',
 'ballad ricky bobby',
 'direction peter jackson',
 'prevents viewer connecting',
 'best films 2018',
 'fast moving thriller',
 'nicely paced struggles',
 'social issue drama',
 'handsomely mounted highly',
 'destined cult classic',
 'best thing say',
 'makes better use',
 'set pieces sense',

**Get the words common to all splits of the training text corpus**  

In [95]:
# Shuffle the labelled reviews with a fixed seed so the splits (and the common
# vocabulary derived from them) are reproducible across kernel restarts.
df_shuffled = rt_senti.sample(frac=1, random_state=42)

# Cut the shuffled frame into 10 near-equal parts. Positional indices are split
# instead of the frame itself, because np.array_split on a DataFrame goes through
# DataFrame.swapaxes, which newer pandas versions deprecate.
df_splits = [
    df_shuffled.iloc[positions]
    for positions in np.array_split(np.arange(len(df_shuffled)), 10)
]

In [96]:
# Unigrams that appear in every one of the training splits.
common_unigrams_train = get_common_vocab(
    df_list=df_splits,
    ngram_range_list=[(1, 1)],
)
common_unigrams_train

Generating vocabulary for ngram_range: (1, 1)...


Vocabulary for ngram-range (1, 1) in df: 25501
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25329
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25438
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25489
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25265
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25523
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25411
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25447
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25577
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 25485
Common vocabulary for ngram-range (1, 1): 9016


{'races',
 'cumulative',
 'center',
 'antidote',
 'mamma',
 'simply',
 'frantic',
 'adding',
 'occasional',
 'enterprise',
 'destination',
 'downton',
 'jerking',
 'slavery',
 'destroy',
 'opposed',
 'awakening',
 'aspire',
 'undistinguished',
 'amazing',
 'press',
 'ringing',
 'salvage',
 'sharper',
 'threat',
 'gonzo',
 'represents',
 'eda',
 'surprisingly',
 'loyalty',
 'seven',
 'shortcomings',
 'ersatz',
 'jake',
 'shows',
 'private',
 'pursuit',
 'nazi',
 'colorful',
 'situations',
 'plunge',
 'jittery',
 'tempered',
 'floating',
 'uninteresting',
 'psychedelic',
 'knowledge',
 'ghosts',
 'shattering',
 'babe',
 'amid',
 'crammed',
 'heyday',
 'percent',
 'batman',
 'candy',
 'docu',
 'charlotte',
 'green',
 'geek',
 'hader',
 'detracts',
 'memoir',
 'shoots',
 'award',
 'behave',
 'society',
 'ought',
 '84',
 'textbook',
 'persona',
 'rock',
 'help',
 'japan',
 'fragile',
 'puff',
 'kubrick',
 'kyle',
 'relatable',
 'spirals',
 'caesar',
 'sant',
 'pilot',
 'solidarity',
 'quest

In [101]:
# First 100 terms of the within-train common vocabulary, in sorted order.
sorted(common_unigrams_train)[:100]

['000',
 '007',
 '10',
 '100',
 '105',
 '11',
 '12',
 '13',
 '13th',
 '14',
 '15',
 '150',
 '160',
 '17',
 '1930s',
 '1940',
 '1940s',
 '1950s',
 '1960s',
 '1966',
 '1968',
 '1970s',
 '1972',
 '1973',
 '1977',
 '1978',
 '1980',
 '1980s',
 '1982',
 '1986',
 '1988',
 '1989',
 '1990s',
 '1994',
 '1995',
 '1999',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2004',
 '2005',
 '2006',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2022',
 '20th',
 '21',
 '21st',
 '22',
 '225',
 '23',
 '233',
 '24',
 '25',
 '28',
 '30',
 '300',
 '33',
 '35',
 '3d',
 '40',
 '41',
 '44',
 '45',
 '46',
 '47',
 '48',
 '50',
 '50s',
 '58',
 '59',
 '60',
 '60s',
 '63',
 '70',
 '70s',
 '75',
 '78',
 '80',
 '80s',
 '82',
 '8211',
 '8212',
 '8216',
 '8217',
 '8220',
 '8221']

In [None]:
# Terms 100-199 of the within-train common vocabulary, in sorted order.
sorted(common_unigrams_train)[100:200]

## Common vocabulary in train that is also in test  

In [98]:
# Number of within-train common unigrams that also appear in the train/test common vocabulary.
len(common_unigrams_train.intersection(common_unigrams))

9011

In [99]:
# First 100 overlapping terms, in sorted order.
sorted(common_unigrams_train.intersection(common_unigrams))[:100]

['000',
 '007',
 '10',
 '100',
 '105',
 '11',
 '12',
 '13',
 '13th',
 '14',
 '15',
 '150',
 '160',
 '17',
 '1930s',
 '1940',
 '1940s',
 '1950s',
 '1960s',
 '1966',
 '1968',
 '1970s',
 '1972',
 '1973',
 '1977',
 '1978',
 '1980',
 '1980s',
 '1982',
 '1986',
 '1988',
 '1989',
 '1990s',
 '1994',
 '1995',
 '1999',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2004',
 '2005',
 '2006',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2022',
 '20th',
 '21',
 '21st',
 '22',
 '225',
 '23',
 '233',
 '24',
 '25',
 '28',
 '30',
 '300',
 '33',
 '35',
 '3d',
 '40',
 '41',
 '44',
 '45',
 '46',
 '47',
 '48',
 '50',
 '50s',
 '58',
 '59',
 '60',
 '60s',
 '63',
 '70',
 '70s',
 '75',
 '78',
 '80',
 '80s',
 '82',
 '8211',
 '8212',
 '8216',
 '8217',
 '8220',
 '8221']