In [1]:
# Misc and Fundamentals
import pandas as pd
import matplotlib.pyplot as plt
import os
import joblib

# Setup 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Models
from sklearn import tree
from sklearn import ensemble

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
tv_df = pd.read_csv("../Data/data/streaming_titles_clean.csv")
# Convert columns to dummies
print(tv_df["rating"].unique())

tv_df.head(3)

[nan 'PG-13' 'R' 'TV-14' 'PG' 'TV-PG' 'NOT RATED' 'TV-MA' 'G' 'TV-G'
 'TV-Y7' 'TV-Y' 'NR' '13+' '18+' '16+' 'TV-NR' 'ALL' '7+' 'NC-17'
 'UNRATED' 'AGES_16_' 'NOT_RATE' 'TV-Y7-FV' 'UR']


Unnamed: 0,type,title,director,cast,release_year,duration,description,score,imdbid,genre.Crime,...,rating_TV-14,rating_TV-G,rating_TV-MA,rating_TV-NR,rating_TV-PG,rating_TV-Y,rating_TV-Y7,rating_TV-Y7-FV,rating_UNRATED,rating_UR
0,Movie,Silent Night,,,2020,94.0,"Mark, a low end South London hitman recently r...",56.0,tt11628854,True,...,0,0,0,0,0,0,0,0,0,0
1,Movie,The Marksman,,,2021,108.0,A hardened Arizona rancher tries to protect an...,57.0,tt6902332,False,...,0,0,0,0,0,0,0,0,0,0
2,Movie,Gaia,,,2021,97.0,A forest ranger and two survivalists with a cu...,63.0,tt11881160,False,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Train/Test split
tv_train, tv_test = train_test_split(tv_df, random_state = 2023) # random state important for evaluation

X = tv_train.loc[:,tv_train.columns != "score"]
y = tv_train["score"]

The following function is to be able to save and load models without having to re-compute everything every time.

In [4]:
def fitmodel(model, filename, df = tv_train):
    
    if not os.path.isfile(filename):

        model.fit(X,y)

        joblib.dump(model, filename)

    else:
        modeltemp = joblib.load(filename)
        if (type(model) != type(modeltemp)) or \
            (tuple([k[0] for k in model.steps]) != tuple([k[0] for k in modeltemp.steps])):
            print ("\033[93m Warning: model mismatch. Delete the file {filename} and rerun or risk faulty models.\n \033[0m".format(filename=filename))
        model = modeltemp
    
    return model

In [5]:
from sklearn.metrics import get_scorer_names
# get_scorer_names()[20:]

We are going to use the `neg_root_mean_squared_error` metric, because it essentially gives us the weighted distance our model is from the correct metric.

In [6]:
from numpy import mean, std
print("mean: %0.1f, rMSE: %0.4f"% (mean(y.values),std(y.values)))

mean: 48.4, rMSE: 21.7000


A rMSE of 21.7 here means that if we guessed the mean every time, we would be on average off by 21.7 points. So, our model should try to get a better rMSE than 21.7.

In [7]:
from ModelHelpers import *

Let's start by making a Beta Regression predictor.

In [27]:
beta_predictors = columnstartswith("genre",df=tv_df) + ["rating"]

ct = make_column_transformer(\
       (ColumnTransformer(),[beta_predictors]),
       (OneHotEncoder(),["rating"]))

beta_pipe = Pipeline(steps =  [
    ("column_transform", ct),
    ("beta", BetaRegression(from_range=(0,100)))
    ])

beta_grid = dict(beta__l1_ratio = [0.0,0.25,0.5,0.75,1.0],
                 beta__scale = [0.1,1,10,100],
                 beta__alpha = [100,10,1,0.1,0.01]
                 )

tv_folded = KFold(n_splits = 5).split(X,y)

beta_grid_search = GridSearchCV(estimator = beta_pipe,
                                param_grid = beta_grid,
                                cv = tv_folded,
                                scoring = 'neg_root_mean_squared_error',
                                verbose = 0)

TypeError: __init__() missing 1 required positional argument: 'transformers'

In [9]:
beta_grid_result = beta_grid_search.fit(X,y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.sc

In [23]:
print("Best parameters: {}\nBest root MSE: %0.4f.".format(beta_grid_result.best_params_) % -beta_grid_result.best_score_)
beta_pipe.set_params(**beta_grid_result.best_params_)

Best parameters: {'beta__alpha': 0.01, 'beta__l1_ratio': 0.0, 'beta__scale': 1}
Best root MSE: 19.8303.


In [26]:
fitmodel(beta_pipe, "models/beta_regression.joblib")
pass