In [41]:
# Misc and Fundamentals
import pandas as pd
import matplotlib.pyplot as plt
import os
import joblib

# Setup 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Models
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
import statsmodels.api as sm

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [42]:
# Read the streaming-titles CSV into a DataFrame and preview the first three rows.
tv_df = pd.read_csv(filepath_or_buffer = "../Data/data/streaming_titles_clean.csv")
tv_df.head(n = 3)

Unnamed: 0,type,title,director,cast,country,release_year,rating,duration,description,score,...,genre.Coming_of_Age,genre.Anthology,genre.Buddy,genre.Parody,genre.Spy/Espionage,genre.Survival,genre.Soap_Opera_/_Melodrama,genre.Dance,genre.Medical,genre.Disaster
0,Movie,Silent Night,,,,2020,,94.0,"Mark, a low end South London hitman recently r...",56.0,...,False,False,False,False,False,False,False,False,False,False
1,Movie,The Marksman,,,,2021,PG-13,108.0,A hardened Arizona rancher tries to protect an...,57.0,...,False,False,False,False,False,False,False,False,False,False
2,Movie,Gaia,,,,2021,R,97.0,A forest ranger and two survivalists with a cu...,63.0,...,False,False,False,False,False,False,False,False,False,False


In [43]:
# Hold out a test partition; the fixed seed keeps the split (and therefore
# every later evaluation) reproducible.
tv_train, tv_test = train_test_split(tv_df, random_state = 2023)

# Predictors: every training column except the target. Target: "score".
X = tv_train.loc[:, ~tv_train.columns.isin(["score"])]
y = tv_train.loc[:, "score"]

In [44]:
def fitmodel(model, filename, df = tv_train):
    """Fit ``model`` once and cache it on disk, or reload the cached fit.

    When ``filename`` does not exist, ``model`` is fitted on ``df`` (all
    columns except ``"score"`` as predictors, ``"score"`` as the target) and
    written to ``filename`` with joblib. When the file already exists, the
    stored model is loaded and returned instead of refitting; a warning is
    printed if the stored object does not look like ``model``.

    Parameters
    ----------
    model : estimator or Pipeline
        Unfitted object exposing ``fit(X, y)``.
    filename : str
        Path of the joblib cache file.
    df : pandas.DataFrame, optional
        Training data containing a ``"score"`` column. The default is the
        ``tv_train`` object that existed when this ``def`` was executed, so
        re-run this cell after re-creating the split.

    Returns
    -------
    The freshly fitted ``model``, or the model loaded from ``filename``.
    """

    def step_names(estimator):
        # Plain estimators have no ``steps`` attribute; treat them as having none
        # so the mismatch check below cannot raise AttributeError.
        return tuple(step[0] for step in getattr(estimator, "steps", ()))

    if not os.path.isfile(filename):
        # Fit on the frame that was passed in rather than on notebook globals.
        features = df.loc[:, df.columns != "score"]
        target = df["score"]
        model.fit(features, target)

        # joblib.dump does not create missing directories; make sure the
        # target folder exists so a completed fit is not lost.
        cache_dir = os.path.dirname(filename)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok = True)
        joblib.dump(model, filename)
        return model

    # NOTE: joblib.load is pickle-based and can execute arbitrary code;
    # only load cache files that this notebook produced itself.
    cached_model = joblib.load(filename)

    # A different estimator class or different pipeline step names means the
    # cache was produced by another model definition.
    if type(model) != type(cached_model) or step_names(model) != step_names(cached_model):
        print ("\033[93m Warning: model mismatch. Delete the file {filename} and rerun or risk faulty models.\n \033[0m".format(filename=filename))

    return cached_model

In [45]:
from sklearn.metrics import get_scorer_names

# Look up the available scoring names; the first 20 entries are skipped in the display.
available_scorers = get_scorer_names()
available_scorers[20:]

['max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'top_k_accuracy',
 'v_measure_score']

In [71]:
from numpy import mean, std

# Baseline for later models: the standard deviation of the target equals the
# root MSE of always predicting the target's mean, hence the "rMSE" label.
target_mean = mean(y.values)
target_std = std(y.values)
print("mean: %0.1f, rMSE: %0.4f" % (target_mean, target_std))

mean: 48.4, rMSE: 21.7000


In [76]:
# Predictors for the linear model: every column whose name starts with
# "genre", plus the release year.
lm_predictors = list(tv_df.columns[tv_df.columns.str.startswith("genre")]) + ["release_year"]

# The ColumnTransformer passes the chosen predictors through unchanged; the
# remaining columns of X are not forwarded to the regression.
lm_pipe = Pipeline(steps =  [
    ("predictors", ColumnTransformer([("predictors","passthrough",lm_predictors)])),
    #("onehot",make_column_transformer((OneHotEncoder(), ['rating']),remainder='passthrough')),
    ("linear_regression", linear_model.LinearRegression())
    ])

# KFold.split returns a one-shot generator. Materialise it so the same folds
# can be reused (or this scoring re-run) without silently getting no splits.
tv_folded = list(KFold(n_splits = 5).split(X,y))

lm_cross_val = cross_val_score(estimator = lm_pipe,
                               cv = tv_folded,
                               scoring = 'neg_root_mean_squared_error',
                               X = X,
                               y = y,
                               verbose = 0)

# The scorer reports negated RMSE, so flip the sign of the mean for display.
print("%0.4f root MSE with a standard deviation of %0.4f." % (-1*lm_cross_val.mean(), lm_cross_val.std()))

# Fit on the full training split, or reload the cached fit if one exists.
lm = fitmodel(model = lm_pipe,
              filename = "models/linear_regression.joblib")

20.4527 root MSE with a standard deviation of 0.2753.
