In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from itertools import compress

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB, ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPClassifier

# import lightgbm as ltb

import scipy.stats as stats
print("Imports done.")

Imports done.


In [10]:
# Runtime-environment switch read by load_csv(): "vscode" selects the local
# data/ directory, any other value selects the Kaggle input directory.
platform = 'vscode'  

In [3]:
# Helper function for loading files  

def load_csv(filename: str):
    """Read ``<filename>.csv`` from the data directory of the active platform.

    The module-level ``platform`` flag chooses the directory: the local
    ``data/`` folder when it equals "vscode", otherwise the Kaggle competition
    input folder. Returns the parsed DataFrame.
    """
    running_locally = platform == "vscode"
    csv_path = (
        f"data/{filename}.csv"
        if running_locally
        else f"/kaggle/input/sentiment-prediction-on-movie-reviews/{filename}.csv"
    )
    return pd.read_csv(csv_path)

In [4]:
def inspect(df: pd.DataFrame):
    """Print a quick structural overview of ``df``.

    Prints, in order: the shape, the column index, the ``DataFrame.info()``
    summary and the per-column count of missing values. Returns ``None``.
    """
    print(f"Shape of the dataframe: {df.shape}")
    print()
    print(f"Columns in the dataframe:\n{df.columns}")
    print()
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # it is called directly; formatting it into print() emitted a stray "None".
    df.info()
    print()
    print(f"Missing values:\n{df.isna().sum()}")

In [5]:
def name_fl(name):
    """Reduce a full name to "<first token> <last token>".

    The name is split on whitespace and only the first and last tokens are
    kept. A single-token name yields that token twice (e.g. "Madonna" ->
    "Madonna Madonna"), which is the existing behaviour and is preserved.
    An empty or whitespace-only name returns "" instead of raising IndexError.
    """
    tokens = name.split()
    if not tokens:
        # Nothing to keep; previously tokens[0] raised IndexError here.
        return ""
    return f"{tokens[0]} {tokens[-1]}"

In [7]:
def _parse_box_office(value):
    """Convert a box-office string such as "$1.2M", "$850K" or "$1234" to a float.

    Missing (non-string) and unparseable values yield NaN so the caller can
    impute them. The text is parsed explicitly instead of being passed to
    ``eval``, which would execute whatever the CSV cell contains.
    """
    if not isinstance(value, str):
        return np.nan
    text = value.strip().lstrip("$").replace(",", "")
    multiplier = 1.0
    if text.endswith("M"):
        multiplier, text = 1_000_000.0, text[:-1]
    elif text.endswith("K"):
        multiplier, text = 1_000.0, text[:-1]
    try:
        return float(text) * multiplier
    except ValueError:
        return np.nan


def select_features(df: pd.DataFrame, moviesdf: pd.DataFrame, row_thresh_null=None):
    '''
    Merge a reviews frame with the movies metadata and engineer the model features.

    Note that the first df must be "train" or "test" and the second df should be "movies".
    Note: Sentiment column is present only in "train.csv" file and not "test.csv" file.

    Parameters
    ----------
    df : reviews frame; must contain "movieid", "reviewText" and "reviewerName".
    moviesdf : movies metadata; collapsed to one row per "movieid" before merging.
    row_thresh_null : optional int. When given, rows of the merged frame with more
        than this many null columns are dropped before any imputation.

    Returns
    -------
    A new DataFrame with the merged columns (cleaned / imputed) plus the engineered
    columns reviewYN, reviewWC, genreSorted, rcSorted, releaseDate, releaseYear,
    releaseMonth, releaseDiff, audScoreBins, runtimeBins, boxOfficeBins and
    releaseDiffBins. Neither input frame is modified.

    All imputation statistics (mean / median) and the equal-width bins are computed
    from the frame passed in, so train and test are processed independently.
    '''
    # One row per movie: groupby().first() keeps, per column, the first non-null
    # value among the duplicated rows of a movieid.
    movies_unique = moviesdf.fillna(value=np.nan).groupby("movieid").first().reset_index()

    # Left join keeps every review even when the movie has no metadata row.
    df_merged = pd.merge(df, movies_unique, on="movieid", how='left')

    # Rename "isTopCritic" column, if it exists, to "isFrequentReviewer"
    df_merged = df_merged.rename(columns={"isTopCritic": "isFrequentReviewer"})

    # Drop rows (OPTIONAL): keep rows having at least (n_columns - row_thresh_null) non-null values
    if row_thresh_null is not None:
        df_merged = df_merged.dropna(axis=0, thresh=(df_merged.shape[1] - row_thresh_null))

    final = df_merged.copy()

    # --- Features derived from reviewText ---------------------------------
    # reviewYN is 1 when the review text is MISSING, 0 when it is present.
    final["reviewYN"] = np.where(final["reviewText"].isnull(), 1, 0)
    # Word count of the review. Missing reviews count as 0 words; the previous
    # str(NaN) -> "nan" conversion counted them as one word.
    final["reviewWC"] = final["reviewText"].fillna("").astype(str).str.split().str.len()

    # Strip digits from the text, then fill missing reviews with the word "neutral"
    final["reviewText"] = final["reviewText"].str.replace(r'\d+', '', regex=True)
    final["reviewText"] = final["reviewText"].fillna("neutral")

    # --- Categorical metadata ---------------------------------------------
    # Fill missing values in "rating", "originalLanguage", "genre" with the word "unknown"
    final["rating"] = final["rating"].fillna("unknown")
    final["originalLanguage"] = final["originalLanguage"].fillna("unknown")
    final["genre"] = final["genre"].fillna("unknown").str.replace("-", "", regex=False)
    # Order-independent genre key: the ", "-separated genres sorted and re-joined with ","
    final["genreSorted"] = final["genre"].apply(lambda x: ",".join(sorted(x.split(", "))))

    # --- Numeric metadata ---------------------------------------------------
    # Impute missing values for "audienceScore" (mean) and "runtimeMinutes" (median)
    final["audienceScore"] = final["audienceScore"].fillna(final["audienceScore"].mean())
    final["runtimeMinutes"] = final["runtimeMinutes"].fillna(final["runtimeMinutes"].median())

    # Parse "boxOffice" strings ("$1.2M", "$850K", ...) to floats, then impute with the median
    final["boxOffice"] = final["boxOffice"].apply(_parse_box_office)
    final["boxOffice"] = final["boxOffice"].fillna(final["boxOffice"].median())

    # Collapse regional language variants. The result is assigned back because an
    # in-place replace on a selected column is not guaranteed to update the frame.
    final["originalLanguage"] = final["originalLanguage"].replace({
        "English (United Kingdom)": "English",
        "English (Australia)": "English",
        "French (France)": "French",
        "French (Canada)": "French",
        "Portuguese (Brazil)": "Portuguese",
        "Spanish (Spain)": "Spanish",
    })

    # --- reviewerName: strip titles / suffixes, keep "<first> <last>" ------
    # NOTE(review): with regex=True the keys are regular expressions, so "." matches
    # any character and the patterns are not anchored; kept as-is to preserve the
    # existing feature values - confirm whether literal matching was intended.
    pre_post_fixes = {"Mr. ": "", "Mrs. ": "", "Ms. ": "", "Dr. ": "",
                      " MD": "", " DDS": "", " DVM": "", " Jr.": "", " PhD": "", " II": "", " IV": ""}
    final["reviewerName"] = final["reviewerName"].replace(pre_post_fixes, regex=True)
    final["reviewerName"] = final["reviewerName"].apply(name_fl)

    # --- ratingContents: order-independent, token-friendly key --------------
    final["ratingContents"] = final["ratingContents"].fillna("neutral")
    # Strip the surrounding brackets, sort the ", "-separated entries, re-join with ","
    final["rcSorted"] = final["ratingContents"].apply(lambda x: ",".join(sorted(x.strip("][").split(", "))))
    final["rcSorted"] = final["rcSorted"].str.replace("'", "", regex=False)
    # Slashes and whitespace become "_" so each entry stays a single token
    final["rcSorted"] = final["rcSorted"].str.replace(r"[/\s]", "_", regex=True)

    # Fill missing distributors
    final["distributor"] = final["distributor"].fillna("unknown")

    # --- Release dates -------------------------------------------------------
    date_cols = ["releaseDateTheaters", "releaseDateStreaming"]
    final[date_cols] = final[date_cols].astype('datetime64[ns]')

    # Earlier of the two dates; skipna=False leaves NaT when either date is missing,
    # and those rows are then filled with the median release date.
    final["releaseDate"] = final[date_cols].min(axis=1, skipna=False)
    final["releaseDate"] = final["releaseDate"].fillna(final["releaseDate"].median())

    final["releaseYear"] = final["releaseDate"].dt.year
    final["releaseMonth"] = final["releaseDate"].dt.month

    # Absolute gap in days between streaming and theatrical release; 0 when either is missing
    release_gap = final["releaseDateStreaming"] - final["releaseDateTheaters"]
    final["releaseDiff"] = (release_gap / np.timedelta64(1, 'D')).abs().fillna(value=0)

    # --- Binned versions of the numeric columns ------------------------------
    # An integer bin count gives equal-width bins over the observed range (labels=False -> bin index).
    num_bins_as = 20
    final["audScoreBins"] = pd.cut(final['audienceScore'], bins=num_bins_as, labels=False)

    # Fixed edges; values outside (0, 565] minutes fall in no bin and become NaN
    final["runtimeBins"] = pd.cut(final['runtimeMinutes'], bins=[0, 75, 120, 180, 565], labels=[4, 3, 2, 1])

    num_bins_bo = 5
    final["boxOfficeBins"] = pd.cut(final['boxOffice'], bins=num_bins_bo, labels=False)

    # Fixed edges in days; the -1 lower edge makes a gap of exactly 0 land in the first bin
    final["releaseDiffBins"] = pd.cut(final['releaseDiff'], bins=[-1, 180, 360, 1000, 40000], labels=[0, 1, 2, 3])

    return final

In [8]:
def split_train_predict(features, labels, pipeline, test_size=0.25, random_state=42):
    """Fit ``pipeline`` on a random train split and report hold-out metrics.

    A one-dimensional ``features`` input is reshaped to a single-column 2D
    array first; the features are then wrapped in a DataFrame, split with
    ``train_test_split``, and the pipeline is fitted on the training part.
    The classification report and confusion matrix for the held-out part
    are printed. Returns the fitted pipeline.
    """
    is_single_column = len(features.shape) == 1
    if is_single_column:
        # Estimators expect 2D input, so a lone column becomes shape (n, 1).
        features = features.to_numpy().reshape(-1, 1)
    feature_frame = pd.DataFrame(features)

    split = train_test_split(feature_frame, labels, test_size=test_size, random_state=random_state)
    X_fit, X_holdout, y_fit, y_holdout = split

    pipeline.fit(X_fit, y_fit)
    holdout_pred = pipeline.predict(X_holdout)

    for metric in (classification_report, confusion_matrix):
        print(metric(y_holdout, holdout_pred))
    return pipeline

In [11]:
# Build the merged training frame (train.csv joined with movies.csv, cleaned and
# feature-engineered; row_thresh_null=None means no null-based row dropping)
# and print its structural overview.
merged = select_features(load_csv("train"), load_csv("movies"), row_thresh_null=None)
inspect(merged)

Shape of the dataframe: (162758, 30)

Columns in the dataframe:
Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'title', 'audienceScore', 'rating', 'ratingContents',
       'releaseDateTheaters', 'releaseDateStreaming', 'runtimeMinutes',
       'genre', 'originalLanguage', 'director', 'boxOffice', 'distributor',
       'soundType', 'reviewYN', 'reviewWC', 'genreSorted', 'rcSorted',
       'releaseDate', 'releaseYear', 'releaseMonth', 'releaseDiff',
       'audScoreBins', 'runtimeBins', 'boxOfficeBins', 'releaseDiffBins'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162758 entries, 0 to 162757
Data columns (total 30 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   movieid               162758 non-null  object        
 1   reviewerName          162758 non-null  object        
 2   isFrequentReviewer    162758 non-null  bool          
 

## More helper functions  

In [12]:
# Testing predictions on missing reviewtext columns  

def predict_on_missing_review_data(pipe, selected_features, merged_train):
    """Score a fitted pipeline on the training rows that had no review text.

    Parameters
    ----------
    pipe : fitted estimator exposing ``predict`` and ``classes_``.
    selected_features : list of column names the pipeline was trained on.
    merged_train : training frame containing "reviewYN", "sentiment" and the
        selected feature columns.

    Prints the confusion matrix for the rows where ``reviewYN == 1`` and
    returns their micro-averaged F1 score.
    """
    # f1_score is not covered by the notebook's top-level sklearn.metrics import,
    # so it is imported here; without it the return statement raises NameError.
    from sklearn.metrics import f1_score

    # reset_index returns a new frame; the result must be kept to take effect.
    missing_reviews_train = merged_train.loc[merged_train['reviewYN'] == 1].reset_index(drop=True)
    X_train_miss_revs = missing_reviews_train.drop('sentiment', axis=1)[selected_features]
    y_train_miss_revs = missing_reviews_train['sentiment']

    y_pred_miss_revs = pipe.predict(X_train_miss_revs)
    print("Confusion matrix and f1-score for rows which have no reviewText in X_train: ")
    print(confusion_matrix(y_train_miss_revs, y_pred_miss_revs, labels=pipe.classes_))

    return f1_score(y_train_miss_revs, y_pred_miss_revs, average='micro')

In [13]:
# Function to build pipelines for GridSearchCV which goes into the final 'submit' function  
def build_pipeline(selected_model=LogisticRegression(),
                   selected_features={'txt': ['reviewText']},
                   param_grid=None,
                   vocab=None,
                   vocab_usage='tfidf',
                   strip_accents='unicode',
                   add_countvec=False):
    """Build the preprocessing + model pipeline and wrap it in GridSearchCV.

    Parameters
    ----------
    selected_model : estimator used as the final "model" step.
    selected_features : dict with optional keys 'cat', 'num' and 'txt', each a
        list of column names. The dict is only read, never modified.
    param_grid : grid passed to GridSearchCV. ``None`` selects a minimal
        default grid (``model__C=[1]``, ``model__solver=['liblinear']``).
    vocab : fixed vocabulary for the 'reviewText_3' vectorizer.
    vocab_usage : 'tfidf' or 'count' - which vectorizer gets ``vocab``. Any
        other value leaves 'reviewText_3' without a vectorizer.
    strip_accents : forwarded to the reviewText vectorizers.
    add_countvec : when True, 'reviewText_2' gets a CountVectorizer.

    Returns
    -------
    An unfitted ``GridSearchCV`` (cv=10, scoring "f1_micro", n_jobs=-1).

    Transformer names ("cat_<col>", "num_<col>", "txt_<col>") and the inner
    step names ("tvec" / "cvec") are unchanged, so existing ``param_grid``
    keys keep working. Text features with no configured vectorizer are
    skipped, and a notice is printed for each of them.
    """
    def tfidf_pipe(ngram_range, **vectorizer_kwargs):
        # Single-step pipeline so the vectorizer is addressable as "...__tvec__<param>".
        return Pipeline(steps=[("tvec", TfidfVectorizer(ngram_range=ngram_range, **vectorizer_kwargs))])

    def count_pipe(ngram_range, **vectorizer_kwargs):
        # Single-step pipeline so the vectorizer is addressable as "...__cvec__<param>".
        return Pipeline(steps=[("cvec", CountVectorizer(ngram_range=ngram_range, **vectorizer_kwargs))])

    # Missing groups simply contribute no columns; dict lookups replace the
    # earlier bare "except: pass" blocks, which also hid genuine errors.
    feature_groups = selected_features or {}
    cat_features = feature_groups.get('cat') or []
    num_features = feature_groups.get('num') or []
    txt_features = feature_groups.get('txt') or []

    # Numeric columns scaled with RobustScaler; all other numeric columns use MinMaxScaler.
    robust_scaled = ("runtimeMinutes", "boxOffice", "releaseDiff")

    # Vectorizer (with its n-gram range) for every supported text column.
    text_pipes = {
        'director': tfidf_pipe((2, 2)),
        'reviewerName': tfidf_pipe((2, 2)),
        'originalLanguage': tfidf_pipe((1, 1)),
        'releaseYear': tfidf_pipe((1, 1)),
        'genre': count_pipe((1, 1)),
        'genreSorted': count_pipe((1, 5)),
        'distributor': count_pipe((1, 3)),
        'reviewText': tfidf_pipe((1, 3), strip_accents=strip_accents),
        'ratingContents': count_pipe((1, 5)),
        'rcSorted': count_pipe((1, 1)),
        'title': tfidf_pipe((1, 5)),
        'movieid': count_pipe((1, 1)),
    }
    # 'reviewText_2' is a copy of reviewText vectorized with raw counts (opt-in).
    if add_countvec:
        text_pipes['reviewText_2'] = count_pipe((1, 3), strip_accents=strip_accents)
    # 'reviewText_3' is a copy of reviewText restricted to the supplied vocabulary.
    if vocab_usage == 'tfidf':
        text_pipes['reviewText_3'] = tfidf_pipe((1, 3), strip_accents=strip_accents, vocabulary=vocab)
    elif vocab_usage == 'count':
        text_pipes['reviewText_3'] = count_pipe((1, 3), strip_accents=strip_accents, vocabulary=vocab)

    TFs = []
    for c in cat_features:
        TFs.append((f"cat_{c}", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [c]))
    for n in num_features:
        scaler = RobustScaler() if n in robust_scaled else MinMaxScaler()
        TFs.append((f"num_{n}", scaler, [n]))
    for t in txt_features:
        text_pipe = text_pipes.get(t)
        if text_pipe is None:
            print(f"Skipping text feature '{t}': no vectorizer is configured for it with the current options.")
            continue
        # Text columns are selected by name (not a list) so the vectorizer receives 1-D input.
        TFs.append((f"txt_{t}", text_pipe, t))

    # Build ColumnTransformer
    ct = ColumnTransformer(transformers=TFs, remainder='drop')

    # Build Pipeline
    pipe = Pipeline(steps=[('ct', ct), ('model', selected_model)])
    print("\nPipeline built successfully.")

    # Only None selects the default grid; any grid that was passed (including an
    # empty one, which previously made this function return None) is used as given.
    if param_grid is None:
        param_grid = {"model__C": [1],
                      'model__solver': ['liblinear']}
        print("Full GridSearchCV pipeline built successfully with basic default param_grid.\n")
    else:
        print("\nFull GridSearchCV pipeline built successfully.")

    return GridSearchCV(pipe, param_grid, cv=10, scoring="f1_micro", n_jobs=-1)

In [14]:
def submit_v4(selected_model=LogisticRegression(C=1, solver='liblinear', max_iter=100000),
              selected_features={'txt': ['reviewText']},
              param_grid=None,
              vocab=None,
              vocab_usage='tfidf',
              strip_accents='unicode',
              add_countvec=False):
    """Train on the full train.csv, predict test.csv and write submission.csv.

    Parameters
    ----------
    selected_model : estimator used as the final pipeline step.
    selected_features : dict with optional keys 'cat', 'num', 'txt' listing
        column names. It is copied on entry and never modified.
    param_grid : grid for GridSearchCV (``None`` -> build_pipeline's default).
    vocab : optional fixed vocabulary; when non-empty, a 'reviewText_3' copy of
        the review text is added and vectorized with it.
    vocab_usage : forwarded to build_pipeline ('tfidf' or 'count').
    strip_accents : forwarded to build_pipeline.
    add_countvec : when True, a 'reviewText_2' copy of the review text is added
        for a CountVectorizer.

    Side effects: reads train/test/movies CSVs through load_csv, prints progress
    and model details, and writes "submission.csv" to the working directory.

    Returns
    -------
    The ``cv_results_`` dict of the fitted GridSearchCV.
    """
    print("\nRunning the submit_v4 function...")

    # Work on a private copy: appending to the caller's lists (or to the shared
    # default argument) leaked 'reviewText_2'/'reviewText_3' into later calls.
    selected_features = {group: list(cols) for group, cols in selected_features.items()}

    # Plain truthiness raises for array-like vocabularies, so test emptiness explicitly.
    if vocab is None:
        use_vocab = False
    else:
        try:
            use_vocab = len(vocab) > 0
        except TypeError:
            use_vocab = True    # unsized iterable: assume it carries entries

    # Fine tune selected_features
    if 'txt' in selected_features:
        if add_countvec and "reviewText_2" not in selected_features['txt']:
            selected_features['txt'].append('reviewText_2')
        if use_vocab and "reviewText_3" not in selected_features['txt']:
            selected_features['txt'].append('reviewText_3')
    print(f"\nSelected features: {selected_features}")

    # Build Pipeline (vocab_usage is forwarded; it used to be hard-coded to 'tfidf')
    pipe = build_pipeline(selected_model, selected_features, param_grid=param_grid,
                          vocab=vocab, vocab_usage=vocab_usage,
                          strip_accents=strip_accents, add_countvec=add_countvec)
    print(pipe)

    # Flat list of every selected column, in group order
    features = [col for cols in selected_features.values() for col in cols]

    # movies.csv is needed for both train and test, so it is read only once.
    movies = load_csv("movies")

    def prepare(split_name):
        # Merge + feature-engineer one split and add the duplicated text columns
        # that the extra vectorizers read from.
        frame = select_features(load_csv(split_name), movies, row_thresh_null=None)
        if add_countvec:
            frame['reviewText_2'] = frame['reviewText']
        if use_vocab:
            frame['reviewText_3'] = frame['reviewText']
        return frame

    # Retrain on the whole train.csv file
    merged = prepare("train")
    X_train = merged.drop(labels="sentiment", axis=1)[features]
    y_train = merged["sentiment"]

    # Check1
    print("\nCheck 1 complete.")
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Features in X_train: {X_train.columns}")

    # Fit
    print("\nTraining started with full pipeline...")
    pipe.fit(X_train, y_train)

    # Check2
    print("\nCheck 2 complete.")
    print("Details of the best model using full pipeline (GridSearchCV) on X_train: ")
    print(f"Best Params: {pipe.best_params_}")
    print(f"Best Score: {pipe.best_score_}")

    # F1 on the training rows that had no review text
    print(predict_on_missing_review_data(pipe, features, merged))

    # Predict on test.csv file
    merged_test = prepare("test")
    X_test = merged_test[features]

    # Check3
    print("\nCheck 3 complete.")
    print(f"Shape of X_test: {X_test.shape}")
    print(f"Features in X_test: {X_test.columns}")

    y_pred = pipe.predict(X_test)

    # Check4
    print("\nCheck 4 complete.")
    cv_results_df = pd.DataFrame(pipe.cv_results_)
    print("Details of the best model using full pipeline (GridSearchCV) on X_train: ")
    print(f"Best Estimator: {pipe.best_estimator_}")
    print(f"Best Params: {pipe.best_params_}")
    print(f"Best Score: {pipe.best_score_}")
    print(f"Best Index: {pipe.best_index_}")
    print(f"Refit Time: {pipe.refit_time_}")
    print(f"Shape of CV results dataframe: {cv_results_df.shape}")

    # Submission file: one "sentiment" column, row position written as the "id" index
    pred_df = pd.DataFrame(y_pred)
    pred_df.columns = ["sentiment"]
    pred_df.index.name = "id"
    pred_df.to_csv("submission.csv")

    print("\nSuccessfully created the submission file!!!")

    return pipe.cv_results_