In [64]:
# Misc and Fundamentals
import pandas as pd
import matplotlib.pyplot as plt
import os
import joblib

# Setup 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn import ensemble
import statsmodels.api as sm

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [65]:
tv_df = pd.read_csv("../Data/data/streaming_titles_clean.csv")
tv_df.head(3)

Unnamed: 0,type,title,director,cast,country,release_year,rating,duration,description,score,...,genre.Coming_of_Age,genre.Anthology,genre.Buddy,genre.Parody,genre.Spy/Espionage,genre.Survival,genre.Soap_Opera_/_Melodrama,genre.Dance,genre.Medical,genre.Disaster
0,Movie,Silent Night,,,,2020,,94.0,"Mark, a low end South London hitman recently r...",56.0,...,False,False,False,False,False,False,False,False,False,False
1,Movie,The Marksman,,,,2021,PG-13,108.0,A hardened Arizona rancher tries to protect an...,57.0,...,False,False,False,False,False,False,False,False,False,False
2,Movie,Gaia,,,,2021,R,97.0,A forest ranger and two survivalists with a cu...,63.0,...,False,False,False,False,False,False,False,False,False,False


In [66]:
# Train/Test split
tv_train, tv_test = train_test_split(tv_df, random_state = 2023) # random state important for evaluation

X = tv_train.loc[:,tv_train.columns != "score"]
y = tv_train["score"]

The following function is to be able to save and load models without having to re-compute everything every time.

In [67]:
def fitmodel(model, filename, df = tv_train):
    
    if not os.path.isfile(filename):

        model.fit(X,y)

        joblib.dump(model, filename)

    else:
        modeltemp = joblib.load(filename)
        if (type(model) != type(modeltemp)) or \
            (tuple([k[0] for k in model.steps]) != tuple([k[0] for k in modeltemp.steps])):
            print ("\033[93m Warning: model mismatch. Delete the file {filename} and rerun or risk faulty models.\n \033[0m".format(filename=filename))
        model = modeltemp
    
    return model

In [68]:
from sklearn.metrics import get_scorer_names
get_scorer_names()[20:]

['max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'top_k_accuracy',
 'v_measure_score']

We are going to use the `neg_root_mean_squared_error` metric, because it essentially gives us the weighted distance our model is from the correct metric.

In [69]:
from numpy import mean, std
print("mean: %0.1f, rMSE: %0.4f"% (mean(y.values),std(y.values)))

mean: 48.4, rMSE: 21.7000


In [108]:
import math
import numpy as np

class ColumnSelector():
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return X[self.columns]

class DummyEncoder():
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return pd.get_dummies(data=X, columns=self.columns,prefix_sep=".")
    
def ProportionScale(X, from_range = (0,100), inverse = False):
    scale = from_range[1] - from_range[0]
    if inverse:
        output = X*scale+from_range[0]
    else:
        output = (X-from_range[0])/scale
    return output

class BetaRegression(LinearRegression):

    def __init__(self, from_range = (0,1)):
        self.from_range = from_range
        super().__init__()

    def fit(self, X, y):
        y = np.asarray(y)
        y = ProportionScale(y)
        y = np.log(y / (1 - y))
        fitted = super().fit(X, y)
        fitted = np.asarray(fitted)
        return ProportionScale(fitted, inverse = True)

    def predict(self, X):
        y = super().predict(X)
        return 1 / (np.exp(-y) + 1)



Let's start by making a Logistic Regression predictor.

In [110]:
beta_predictors = list(tv_df.columns[tv_df.columns.str.startswith("genre")]) + ["release_year"]
# print(lm_predictors)

beta_pipe = Pipeline(steps =  [
    ("predictors", ColumnSelector(columns=beta_predictors)),
    ("beta_regression", BetaRegression(from_range=(0,100)))
    ])

tv_folded = KFold(n_splits = 5).split(X,y)

lm_cross_val = cross_val_score(estimator = beta_pipe,
                               cv = tv_folded,
                               scoring = 'neg_root_mean_squared_error',
                               X = X,
                               y = y,
                               verbose = 0)

print("%0.4f root MSE with a standard deviation of %0.4f." % (-1*lm_cross_val.mean(), lm_cross_val.std()))

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Colton\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Colton\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Colton\AppData\Local\Temp\ipykernel_17612\908247937.py", line 44, in fit
    return ProportionScale(fitted, inverse = True)
  File "C:\Users\Colton\AppData\Local\Temp\ipykernel_17612\908247937.py", line 27, in ProportionScale
    output = X*scale+from_range[0]
TypeError: unsupported operand type(s) for *: 'BetaRegression' and 'int'
