In [21]:
from mytools import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
# sklearn configuration: ask scikit-learn to render estimators/pipelines as
# diagrams when they are displayed as a cell's last expression
from sklearn import set_config
set_config(display='diagram')

# Printed last so the cell output confirms every import above succeeded
print('Imports done')

Imports done


## Submit to Kaggle function  

`Inputs`
>> Features dictionary: the keys are 'cat', 'num' and 'txt'; each value is the list of column names to be encoded as that feature type.   

>> Model: the estimator used as the final step of the pipeline to make predictions.  

In [22]:
def _make_one_hot_encoder():
    """Return a dense-output ``OneHotEncoder`` that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    newer releases no longer accept the old name, so the new spelling is tried
    first and the old one is used only on installations that reject it.
    """
    # Imported locally so this does not rely on `from mytools import *`
    # happening to re-export OneHotEncoder.
    from sklearn.preprocessing import OneHotEncoder

    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn: `sparse_output` is not a known keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def build_pipeline(selected_model=None, selected_features=None):
    """Build an unfitted preprocessing + model pipeline.

    Parameters
    ----------
    selected_model : estimator, optional
        Used as the final ``'model'`` step. When omitted, a fresh
        ``LogisticRegression()`` is created on every call, so separate calls
        never share (and re-fit) the same estimator instance.
    selected_features : dict, optional
        Maps the feature kinds ``'cat'``, ``'num'`` and ``'txt'`` to lists of
        column names. A missing key is treated as an empty list. When omitted,
        ``{'txt': ['reviewText']}`` is used.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``'ct'`` (a ``ColumnTransformer`` that one-hot encodes the
        'cat' columns, min-max scales the 'num' columns, applies TF-IDF with
        uni- and bi-grams to the 'txt' columns and drops every other column)
        followed by ``'model'``.
    """
    if selected_model is None:
        selected_model = LogisticRegression()
    if selected_features is None:
        selected_features = {'txt': ['reviewText']}

    # One (name, transformer, columns) entry per column. Every entry gets its
    # own transformer instance so that changing the parameters of one entry
    # cannot silently change the others.
    TFs = []
    for c in selected_features.get('cat', []):
        TFs.append((f"cat_{c}", _make_one_hot_encoder(), [c]))
    for n in selected_features.get('num', []):
        TFs.append((f"num_{n}", MinMaxScaler(), [n]))
    for t in selected_features.get('txt', []):
        # The column is given as a bare name (not a one-element list) so the
        # vectorizer receives a 1-D sequence of documents rather than a 2-D frame.
        TFs.append((f"txt_{t}", TfidfVectorizer(ngram_range=(1, 2)), t))

    # Build ColumnTransformer
    ct = ColumnTransformer(transformers=TFs, remainder='drop')

    # Build Pipeline
    return Pipeline(steps=[('ct', ct), ('model', selected_model)])

In [23]:
# Column names grouped by the encoding build_pipeline applies to each group
cat_vars = ["rating", "isFrequentReviewer"]      # one-hot encoded
num_vars = ["audienceScore", "runtimeMinutes"]   # min-max scaled
txt_vars = [                                     # TF-IDF vectorised
    "originalLanguage",
    "genre",
    "director",
    "reviewerName",
    "reviewText",
]

# Selected-features dictionary in the layout build_pipeline expects
SFs = dict(cat=cat_vars, num=num_vars, txt=txt_vars)
SFs

{'cat': ['rating', 'isFrequentReviewer'],
 'num': ['audienceScore', 'runtimeMinutes'],
 'txt': ['originalLanguage',
  'genre',
  'director',
  'reviewerName',
  'reviewText']}

In [24]:
# Flatten the per-kind groups of SFs into a single list of column names,
# keeping the dictionary's group order ('cat', then 'num', then 'txt')
features = [column for group in SFs.values() for column in group]
features

['rating',
 'isFrequentReviewer',
 'audienceScore',
 'runtimeMinutes',
 'originalLanguage',
 'genre',
 'director',
 'reviewerName',
 'reviewText']

In [27]:
# Build (without fitting) the pipeline for the full feature set with the
# default model; as the cell's last expression it is displayed for inspection
build_pipeline(selected_features=SFs)

In [8]:
def _print_linear_params(model, missing_message):
    """Print ``intercept_`` and ``coef_`` of `model`, or `missing_message` if it has neither."""
    if hasattr(model, 'intercept_') and hasattr(model, 'coef_'):
        print(model.intercept_, model.coef_)
    else:
        print(missing_message)


def submit_v4(selected_model=None, selected_features=None):
    """Fit the pipeline on the full training data and write ``submission.csv``.

    Parameters
    ----------
    selected_model : estimator, optional
        Passed to ``build_pipeline`` as the final step. When omitted, a fresh
        ``LogisticRegression()`` is created for this call.
    selected_features : dict, optional
        Maps ``'cat'``, ``'num'`` and ``'txt'`` to lists of column names.
        Missing keys are treated as empty lists. When omitted,
        ``{'txt': ['reviewText']}`` is used.

    Returns
    -------
    str
        A fixed success message, returned after the file has been written.

    Notes
    -----
    ``load_csv`` and ``select_features`` are defined outside this cell; their
    exact behaviour is not visible here. The code only relies on
    ``select_features`` returning a frame that contains the requested feature
    columns (plus ``"sentiment"`` for the training data).
    """
    if selected_model is None:
        selected_model = LogisticRegression()
    if selected_features is None:
        selected_features = {'txt': ['reviewText']}

    # Make every feature kind present so a partial dictionary (such as the
    # default) cannot raise KeyError here or inside build_pipeline.
    selected_features = {
        kind: list(selected_features.get(kind, []))
        for kind in ('cat', 'num', 'txt')
    }

    # Build Pipeline
    pipe = build_pipeline(selected_model, selected_features)
    # The estimator lives inside the pipeline; the bare estimator passed in
    # has no `named_steps`, so always inspect it through the pipeline.
    model_step = pipe.named_steps['model']

    # Features list
    features = selected_features['cat'] + selected_features['num'] + selected_features['txt']

    # Retrain on the whole train.csv file
    merged = select_features(load_csv("train"), load_csv("movies"))
    y_train = merged["sentiment"]
    X_train = merged.drop(labels="sentiment", axis=1)[features]

    # Check1: training matrix and the model state before fitting
    print(X_train.shape)
    print(X_train.head())
    _print_linear_params(model_step, "Model not trained yet!")

    # Fit
    pipe.fit(X_train, y_train)

    # Check2: learned parameters (only linear models expose them)
    _print_linear_params(model_step, "Fitted model has no intercept_/coef_ to display.")

    # Predict on test.csv file
    merged_test = select_features(load_csv("test"), load_csv("movies"))
    X_test = merged_test[features]

    # Check3: test matrix
    print(X_test.shape)
    print(X_test.head())

    y_pred = pipe.predict(X_test)

    # One row per test example; the positional index is written as the "id" column
    pred_df = pd.DataFrame(y_pred, columns=["sentiment"])
    pred_df.index.name = "id"
    pred_df.to_csv("submission.csv")

    return "Successfully created the submission file!!!"