In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.sparse import csr_matrix

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
#from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer

# Evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

# with text

In [2]:
train = pd.read_csv('train_process_2.csv')
test = pd.read_csv('test_process_2.csv')

In [3]:
train.head()

Unnamed: 0,isAdult,runtime,budget,revenue,type_movie,type_short,type_tvEpisode,type_tvMiniSeries,type_tvMovie,type_tvSeries,...,760,761,762,763,764,765,766,767,sentiment_score,averageRating
0,0,-0.553539,-0.26443,-0.190517,1,0,0,0,0,0,...,0.180402,-0.309614,-0.109253,-0.083931,-0.124375,-0.049131,0.448019,0.212206,0.385075,5.6
1,0,-0.020035,0.52012,0.682158,1,0,0,0,0,0,...,0.037835,-0.344181,-0.057734,0.087253,-0.018626,-0.022519,0.800948,0.356451,0.237056,6.1
2,0,-0.104272,-0.26443,-0.190673,1,0,0,0,0,0,...,-0.030777,-0.322845,-0.061202,0.305364,-0.348252,-0.281722,0.328683,0.15772,-0.939,5.3
3,0,-0.216589,-0.26443,-0.190673,1,0,0,0,0,0,...,-0.032892,-0.349964,-0.195581,0.195616,-0.03752,-0.17388,0.454948,0.368299,0.596515,4.4
4,0,0.288836,-0.26443,-0.190673,1,0,0,0,0,0,...,-0.070856,-0.477748,-0.230749,0.008394,-0.032509,-0.091296,0.341122,0.329087,0.74938,5.9


In [4]:
X_train = train.drop(columns = ['averageRating'])
y_train = train['averageRating']
X_test = test.drop(columns = ['averageRating'])
y_test = test['averageRating']

In [6]:
X_train

Unnamed: 0,isAdult,runtime,budget,revenue,type_movie,type_short,type_tvEpisode,type_tvMiniSeries,type_tvMovie,type_tvSeries,...,759,760,761,762,763,764,765,766,767,sentiment_score
0,0,-0.553539,-0.264430,-0.190517,1,0,0,0,0,0,...,-0.328453,0.180402,-0.309614,-0.109253,-0.083931,-0.124375,-0.049131,0.448019,0.212206,0.385075
1,0,-0.020035,0.520120,0.682158,1,0,0,0,0,0,...,-0.558157,0.037835,-0.344181,-0.057734,0.087253,-0.018626,-0.022519,0.800948,0.356451,0.237056
2,0,-0.104272,-0.264430,-0.190673,1,0,0,0,0,0,...,-0.493396,-0.030777,-0.322845,-0.061202,0.305364,-0.348252,-0.281722,0.328683,0.157720,-0.939000
3,0,-0.216589,-0.264430,-0.190673,1,0,0,0,0,0,...,-0.389747,-0.032892,-0.349964,-0.195581,0.195616,-0.037520,-0.173880,0.454948,0.368299,0.596515
4,0,0.288836,-0.264430,-0.190673,1,0,0,0,0,0,...,-0.396414,-0.070856,-0.477748,-0.230749,0.008394,-0.032509,-0.091296,0.341122,0.329087,0.749380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29755,0,0.064203,-0.264430,-0.190673,1,0,0,0,0,0,...,-0.361886,0.046745,-0.424633,-0.166735,0.323392,-0.161053,-0.138874,0.589103,0.572579,0.571975
29756,0,-0.048114,-0.264430,-0.190673,1,0,0,0,0,0,...,-0.262824,-0.060787,-0.349424,0.010337,0.326076,-0.252016,-0.099298,0.656841,0.135185,0.645110
29757,0,-0.104272,0.677031,0.366954,1,0,0,0,0,0,...,-0.309824,-0.101431,-0.520131,-0.186858,-0.131616,-0.061005,-0.103117,0.227377,0.119793,0.446428
29758,0,-0.300826,-0.264430,-0.190673,1,0,0,0,0,0,...,-0.435759,-0.025843,-0.432768,-0.210309,0.181765,-0.159357,-0.155233,0.152872,0.439998,0.510545


## Baseline

In [21]:
def Clf_train(classifier, X_train_norm, y_train, X_test_norm, y_test):    
    # classifier
    clf = classifier
    # fit data
    clf.fit(X_train_norm, y_train)
    # get predictions
    y_pred = clf.predict(X_test_norm) # get predicted rating

    # mse
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    return mse, mae

In [23]:
def model_fit(X_train_df, y_train_df, X_test_df, y_test):
    models = {
    "Linear Regression": LinearRegression(),    
    "Random Forest": RandomForestRegressor(random_state = seed),
    "XGBoost": XGBRegressor(random_state = seed),
    "LightGBM": LGBMRegressor(random_state = seed)
    }
        
    mse_metrics = []
    mae_metrics = []

    for model_name in models:
        model = models[model_name]
        mae, mse = Clf_train(model,X_train_df,y_train_df,X_test_df,y_test)
        mse_metrics.append([model_name,mse])
        mae_metrics.append([model_name,mae])
        print("{}{}{}".format(model_name, " mse: ", mse))
        print("{}{}{}".format(model_name, " mae: ", mae))
    
    return mse_metrics, mae_metrics

In [12]:
seed = 2022

In [22]:
%%time
accuracy_base, f1_base = model_fit(X_train, y_train, X_test,y_test)

Linear Regression mse: 0.6612073935231855
Linear Regression mae: 0.7424857354856306
Random Forest mse: 0.6427543010752689
Random Forest mae: 0.712331686827957
XGBoost mse: 0.6343260736106545
XGBoost mae: 0.6868666431826888
LightGBM mse: 0.6017258864027116
LightGBM mae: 0.627597219219405


NameError: name 'nae_metrics' is not defined

In [None]:
mse_metrics = pd.DataFrame(mse_metrics, columns = ['model','Accuracy'])
mse_metrics

In [None]:
f1_base = pd.DataFrame(f1_base, columns = ['model','F1_score'])
f1_base

## Baseline finetuning

### LightGBM

In [24]:
%%time
# Start with low learning rate and basic parameter settings
lgbm_classifier = LGBMClassifier(random_state = seed)

params_lgbm = {
    'learning_rate' : [0.1,0.3],
    'max_depth':[10,None],
    'n_estimators':[100,200],
    'subsample': [0.3, 0.8], 
    'colsample_bytree': [0.5, 1],
    'reg_alpha':[0.5,1],
    'reg_lambda':[0.5,1],
    'min_child_weight':[1e-3,1e-2],
    'min_child_samples':[15,20]
}

gs_lgbm = GridSearchCV(estimator=lgbm_classifier, 
                 param_grid=params_lgbm, 
                 cv=5,
                 n_jobs = -1, 
                 scoring='accuracy') 

gs_lgbm.fit(X_train, y_train)

gs_lgbm.best_params_

NameError: name 'LGBMClassifier' is not defined

In [None]:
lgb_1 = gs_lgbm.best_estimator_
cm_lgb_1,score_lgb_1,f1_lgb_1 = Clf_train(lgb_1, X_train_norm, y_train, X_test_norm)

print(f"Accuracy Score for LGB: {score_lgb_1}")
print(f"F1 score for LGB: {f1_lgb_1}")
print(cm_lgb_1)

### XGBoost

In [None]:
# Start with low learning rate and basic parameter settings
#tune max_depth & min_child_weight
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 eval_metric='auc', nthread=4, scale_pos_weight=1, seed=27, use_label_encoder=False), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_train, y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Tune max_depth & min_child_weight further
param_test2 = {
 'max_depth':[8,9,10],
 'min_child_weight':[1,2,3]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 eval_metric='auc', nthread=4, scale_pos_weight=1, seed=27, use_label_encoder=False), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_train, y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Tune Gamma
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 eval_metric='auc', nthread=4, scale_pos_weight=1,seed=27,use_label_encoder=False), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4, cv=5)
gsearch3.fit(X_train, y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Tune subsample & colsample_bytree
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=9,
 min_child_weight=1, gamma=0.4, subsample=0.8, colsample_bytree=0.8,
 eval_metric='auc', nthread=4, scale_pos_weight=1,seed=27,use_label_encoder=False), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4, cv=5)
gsearch4.fit(X_train, y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
#Further tunning on subsample and colsample_bytree
param_test5 = {
 'subsample':[i/100.0 for i in range(85,100,5)],
 'colsample_bytree':[i/100.0 for i in range(65,80,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=9,
 min_child_weight=1, gamma=0.4, subsample=0.9, colsample_bytree=0.6,
 eval_metric='auc', nthread=4, scale_pos_weight=1,seed=27,use_label_encoder=False), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=4, cv=5)
gsearch5.fit(X_train, y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

In [None]:
#Tune Regularization Parameters
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=9,
 min_child_weight=1, gamma=0.4, subsample=0.9, colsample_bytree=0.7,
 eval_metric='auc', nthread=4, scale_pos_weight=1,seed=27,use_label_encoder=False), 
 param_grid = param_test6, scoring='roc_auc',n_jobs=4, cv=5)
gsearch6.fit(X_train, y_train)
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Now we have optimal hyper-parameters, lower the learning rate and re-train the model
xgb2 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=1000,
 max_depth=9,
 min_child_weight=1,
 gamma=0.4,
 subsample=0.9,
 colsample_bytree=0.7,
 reg_alpha=0.1,
 eval_metric='auc',
 nthread=4,
 scale_pos_weight=1,
 seed=27,
 use_label_encoder=False)
modelfit(xgb2, train, predictors)