In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.sparse import csr_matrix

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
#from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer

# Evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

# without text

In [2]:
train = pd.read_csv('train_process_2.csv')
test = pd.read_csv('test_process_2.csv')

In [3]:
train.head()

Unnamed: 0,isAdult,runtime,budget,revenue,type_movie,type_short,type_tvEpisode,type_tvMiniSeries,type_tvMovie,type_tvSeries,...,760,761,762,763,764,765,766,767,sentiment_score,averageRating
0,0,-0.553539,-0.26443,-0.190517,1,0,0,0,0,0,...,0.180402,-0.309614,-0.109253,-0.083931,-0.124375,-0.049131,0.448019,0.212206,0.385075,5.6
1,0,-0.020035,0.52012,0.682158,1,0,0,0,0,0,...,0.037835,-0.344181,-0.057734,0.087253,-0.018626,-0.022519,0.800948,0.356451,0.237056,6.1
2,0,-0.104272,-0.26443,-0.190673,1,0,0,0,0,0,...,-0.030777,-0.322845,-0.061202,0.305364,-0.348252,-0.281722,0.328683,0.15772,-0.939,5.3
3,0,-0.216589,-0.26443,-0.190673,1,0,0,0,0,0,...,-0.032892,-0.349964,-0.195581,0.195616,-0.03752,-0.17388,0.454948,0.368299,0.596515,4.4
4,0,0.288836,-0.26443,-0.190673,1,0,0,0,0,0,...,-0.070856,-0.477748,-0.230749,0.008394,-0.032509,-0.091296,0.341122,0.329087,0.74938,5.9


In [25]:
X_train = train.iloc[:,0:96]
y_train = train['averageRating']
X_test = test.iloc[:,0:96]
y_test = test['averageRating']

In [26]:
X_train

Unnamed: 0,isAdult,runtime,budget,revenue,type_movie,type_short,type_tvEpisode,type_tvMiniSeries,type_tvMovie,type_tvSeries,...,actor_Paul Newman,actor_Bruce Willis,actor_Barbara Stanwyck,actor_others,status_Canceled,status_In Production,status_Planned,status_Post Production,status_Released,status_Rumored
0,0,-0.553539,-0.264430,-0.190517,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,-0.020035,0.520120,0.682158,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,-0.104272,-0.264430,-0.190673,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,-0.216589,-0.264430,-0.190673,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0.288836,-0.264430,-0.190673,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29755,0,0.064203,-0.264430,-0.190673,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
29756,0,-0.048114,-0.264430,-0.190673,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
29757,0,-0.104272,0.677031,0.366954,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
29758,0,-0.300826,-0.264430,-0.190673,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


## Baseline

In [27]:
def Clf_train(classifier, X_train_norm, y_train, X_test_norm, y_test):    
    # classifier
    clf = classifier
    # fit data
    clf.fit(X_train_norm, y_train)
    # get predictions
    y_pred = clf.predict(X_test_norm) # get predicted rating

    # mse
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    return mse, mae

In [28]:
def model_fit(X_train_df, y_train_df, X_test_df, y_test):
    models = {
    "Linear Regression": LinearRegression(),    
    "Random Forest": RandomForestRegressor(random_state = seed),
    "XGBoost": XGBRegressor(random_state = seed),
    "LightGBM": LGBMRegressor(random_state = seed)
    }
        
    mse_metrics = []
    mae_metrics = []

    for model_name in models:
        model = models[model_name]
        mae, mse = Clf_train(model,X_train_df,y_train_df,X_test_df,y_test)
        mse_metrics.append([model_name,mse])
        mae_metrics.append([model_name,mae])
        print("{}{}{}".format(model_name, " mse: ", mse))
        print("{}{}{}".format(model_name, " mae: ", mae))
    
    return mse_metrics, mae_metrics

In [29]:
seed = 2022

In [30]:
%%time
mse_metrics, mae_metrics = model_fit(X_train, y_train, X_test,y_test)

Linear Regression mse: 0.6987309347173221
Linear Regression mae: 0.8442448629618331
Random Forest mse: 0.6593907725730811
Random Forest mae: 0.7652096364593123
XGBoost mse: 0.6345542671923995
XGBoost mae: 0.7047230271631109
LightGBM mse: 0.6310972048341736
LightGBM mae: 0.7003988191110373
Wall time: 12min 23s


In [31]:
mse_metrics = pd.DataFrame(mse_metrics, columns = ['model','mse'])
mse_metrics

Unnamed: 0,model,mse
0,Linear Regression,0.698731
1,Random Forest,0.659391
2,XGBoost,0.634554
3,LightGBM,0.631097


## Baseline finetuning

### LightGBM

### XGBoost