In [1]:
# Misc and Fundamentals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import joblib

# Setup 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Models
from sklearn import tree
from sklearn import ensemble

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
# Load the prepared streaming-titles table.
tv_df = pd.read_csv("../Data/data/streaming_titles_final.csv")

# Inspect how many titles fall into each level of the categorical columns.
for categorical_column in ("rating", "country"):
    print(tv_df[categorical_column].value_counts())

# NC-17 has too few rows to fit a model on, so blank that rating out
# (replace it with NaN) instead of keeping it as its own category.
is_nc17 = tv_df["rating"] == "NC-17"
tv_df["rating"] = tv_df["rating"].mask(is_nc17, np.nan)

tv_df

rating
TV-MA    998
R        997
PG-13    673
TV-14    668
PG       626
G        286
16+      203
7+       122
NR        76
ALL       57
TV-Y      35
NC-17      2
Name: count, dtype: int64
country
United States     1809
India              301
United Kingdom     239
Japan              143
Canada              99
                  ... 
Luxembourg           1
Jordan               1
Iran                 1
Venezuela            1
Somalia              1
Name: count, Length: 64, dtype: int64


Unnamed: 0.1,Unnamed: 0,title,Number_MoviesShows_dir,dir_average_score,Number_MoviesShows_cast,cast_average_score,type,director,cast,country,...,genre.Coming_of_Age,genre.Anthology,genre.Buddy,genre.Parody,genre.Spy/Espionage,genre.Survival,genre.Soap_Opera_/_Melodrama,genre.Dance,genre.Medical,genre.Disaster
0,0,The Marksman,0.0,0.0,0.0,0.0,Movie,,,,...,False,False,False,False,False,False,False,False,False,False
1,1,Home Sweet Home,0.0,0.0,0.0,0.0,TV Show,,,,...,False,False,False,False,False,False,False,False,False,False
2,2,America's Book of Secrets,0.0,0.0,0.0,0.0,TV Show,,,United States,...,False,False,False,False,False,False,False,False,False,False
3,3,Beyond Scared Straight,0.0,0.0,0.0,0.0,TV Show,,,United States,...,False,False,False,False,False,False,False,False,False,False
4,4,Hoarders,0.0,0.0,0.0,0.0,TV Show,,,United States,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4980,4980,Brave Little Tailor,1.0,31.0,0.0,0.0,Movie,Bill Roberts,"Walt Disney, Marcelite Garner, Eddie Holden",United States,...,False,False,False,False,False,False,False,False,False,False
4981,4981,Lady and the Tramp II: Scamp's Adventure,0.0,0.0,0.0,0.0,Movie,Darrell Rooney,"Scott Wolf, Alyssa Milano, Chazz Palminteri, J...",United States,...,True,False,False,False,False,False,False,False,False,False
4982,4982,The Great Mouse Detective,0.0,0.0,0.0,0.0,Movie,"John Musker, Ron Clements, Dave Michener","Vincent Price, Barrie Ingham, Val Bettin, Susa...",United States,...,False,False,False,False,False,False,False,False,False,False
4983,4983,X-Men Origins: Wolverine,0.0,0.0,0.0,0.0,Movie,Gavin Hood,"Hugh Jackman, Liev Schreiber, Danny Huston, wi...",United States,...,False,False,False,False,False,False,False,False,False,False


In [3]:
# Hold out a test set. The fixed random_state keeps the split identical
# across runs, so evaluation numbers stay comparable.
tv_train, tv_test = train_test_split(tv_df, random_state=2023)

# Features are every training column except the target; the target is "score".
is_feature_column = tv_train.columns != "score"
X = tv_train.loc[:, is_feature_column]
y = tv_train["score"]

The following function caches fitted models on disk, so they can be reloaded instead of being re-fit on every run.

In [4]:
def fitmodel(model, filename, df = tv_train):
    """Fit ``model`` once and cache it on disk; later calls reload the cache.

    Parameters
    ----------
    model : estimator
        Object with a ``fit(X, y)`` method (typically a scikit-learn
        ``Pipeline``). It is fitted in place when no cache file exists.
    filename : str
        Path of the joblib cache file. Missing parent directories are created.
    df : pandas.DataFrame, default ``tv_train``
        Training frame containing the target column ``"score"``; every other
        column is used as a feature. The default is bound to the ``tv_train``
        that exists when this cell is executed.

    Returns
    -------
    estimator
        ``model`` itself (now fitted) when no cache file existed, otherwise
        the estimator loaded from ``filename``. Callers must use the return
        value: on a cache hit the ``model`` argument is left untouched.
    """

    def _step_names(estimator):
        # Only Pipeline-like objects expose ``steps``; anything else has no
        # step names to compare, which is reported as None.
        steps = getattr(estimator, "steps", None)
        if steps is None:
            return None
        return tuple(step[0] for step in steps)

    if not os.path.isfile(filename):
        # Derive features/target from ``df`` so the argument is honoured
        # instead of silently fitting on the notebook-global X and y.
        features = df.loc[:, df.columns != "score"]
        target = df["score"]
        model.fit(features, target)

        # joblib.dump does not create directories, so make sure it exists.
        cache_dir = os.path.dirname(filename)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        joblib.dump(model, filename)
        return model

    # NOTE: joblib.load unpickles arbitrary objects; only load cache files
    # that this notebook (or another trusted source) produced.
    cached_model = joblib.load(filename)

    # A different estimator type or different pipeline step names means the
    # cache was written for another model definition.
    if (type(model) != type(cached_model)) or \
        (_step_names(model) != _step_names(cached_model)):
        print ("\033[93m Warning: model mismatch. Delete the file {filename} and rerun or risk faulty models.\n \033[0m".format(filename=filename))

    return cached_model

In [5]:
from sklearn.metrics import get_scorer_names
# get_scorer_names()[20:]

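We are going to use the `neg_root_mean_squared_error` metric, because the root mean squared error measures the typical distance, in score points, between our predictions and the true scores, with larger errors weighted more heavily. scikit-learn negates it so that a higher score is always better.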

In [6]:
from numpy import mean, std

# Baseline: predicting the training mean for every title gives an rMSE equal
# to the (population) standard deviation of the target.
score_values = y.values
baseline_mean = mean(score_values)
baseline_rmse = std(score_values)
print("mean: %0.1f, rMSE: %0.4f"% (baseline_mean, baseline_rmse))

mean: 48.1, rMSE: 21.8798


An rMSE of 21.9 here means that if we guessed the mean every time, our predictions would be off by about 21.9 points (in the root-mean-square sense). So, our model should try to get a better rMSE than 21.9.

In [7]:
from ModelHelpers import *

In [8]:
# List the feature columns available to the models: every column of the
# training split except the target "score".
print(X.columns)

Index(['Unnamed: 0', 'title', 'Number_MoviesShows_dir', 'dir_average_score',
       'Number_MoviesShows_cast', 'cast_average_score', 'type', 'director',
       'cast', 'country', 'release_year', 'rating', 'duration', 'description',
       'imdbid', 'genre.Crime', 'genre.Drama', 'genre.Thriller',
       'genre.Action', 'genre.Horror', 'genre.Science_Fiction', 'genre.Music',
       'genre.Reality', 'genre.Romance', 'genre.Comedy', 'genre.Mystery',
       'genre.Documentary', 'genre.History', 'genre.Teen',
       'genre.Health_&_Wellness', 'genre.Lifestyle', 'genre.Culture',
       'genre.Black_Stories', 'genre.News', 'genre.Latino', 'genre.Adventure',
       'genre.Anime', 'genre.Talk_Show', 'genre.Sketch_Comedy', 'genre.Family',
       'genre.Kids', 'genre.Classics', 'genre.LGBTQ', 'genre.Adult_Animation',
       'genre.Sitcom', 'genre.Cooking_&_Food', 'genre.Sports',
       'genre.Game_Shows', 'genre.International', 'genre.Cartoons',
       'genre.Science_&_Technology', 'genre.Stand_Up

Let's start by making a Beta Regression predictor.

In [34]:
# Columns fed to the beta regression: the columns returned by the project
# helper columnstartswith("genre", ...) plus a handful of named columns.
# (presumably columnstartswith returns the list of genre.* column names; confirm in ModelHelpers)
beta_predictors = columnstartswith("genre",df=tv_df) + ["duration","release_year","type","rating","Number_MoviesShows_dir","Number_MoviesShows_cast"]

# Numeric columns go through the project's NumericNAOneHotEncoder, the two
# categorical columns are one-hot encoded, and every other selected column is
# passed through unchanged.
beta_ct = make_column_transformer(
    (NumericNAOneHotEncoder(),make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(),["type","rating"]),
    # Disabled experiment, kept for reference:
    #(PolynomialFeatures(),columnstartswith("genre",df=tv_df)+["release_year"]),
    remainder="passthrough")

# Select predictors -> transform columns -> fit the project's BetaRegression.
beta_pipe = Pipeline(steps =  [
    ("predictors", ColumnSelector(beta_predictors)),
    ("columntransform",beta_ct),
    ("beta", BetaRegression(from_range=(0,100)))
    ])

# Hyper-parameter grid: only the regressor's `scale` parameter is tuned.
beta_grid = dict(
                 beta__scale = [0.01,0.1,1,2]
                 )

# Materialise the folds as a list. KFold.split returns a one-shot generator,
# so after the first fit consumes it, re-running the fit cell or reusing
# tv_folded for another search would see zero splits. KFold without shuffling
# is deterministic, so these are exactly the same folds as before.
tv_folded = list(KFold(n_splits = 5).split(X,y))

beta_grid_search = GridSearchCV(estimator = beta_pipe,
                                param_grid = beta_grid,
                                cv = tv_folded,
                                scoring = 'neg_root_mean_squared_error',
                                verbose = 0
                                )

In [35]:
# Run the cross-validated grid search on the training features and target.
# NOTE(review): the recorded run reported 5 of 20 fits failing, together with
# warnings pointing at BetaRegression's log transform. Presumably one of the
# beta__scale values pushes the target outside that transform's domain --
# TODO confirm (e.g. with error_score='raise') and prune the grid.
beta_grid_result = beta_grid_search.fit(X,y)

  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
5 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/coltonrowe/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py

In [36]:
# Report the winning grid point. The scorer is the *negated* rMSE, so flip the
# sign back before printing. A single f-string replaces the chained
# .format()/% formatting, which would break if the rendered parameters ever
# contained a '%' character.
best_beta_params = beta_grid_result.best_params_
best_beta_rmse = -beta_grid_result.best_score_
print(f"Best parameters: {best_beta_params}\nBest root MSE: {best_beta_rmse:0.4f}.")

# Configure the (still unfitted) pipeline with the best parameters found.
beta_pipe.set_params(**best_beta_params)

Best parameters: {'beta__scale': 1}
Best root MSE: 20.0836.


In [12]:
# Keep the estimator that fitmodel returns. When a cached file exists,
# fitmodel returns the loaded model and leaves its argument untouched, so
# discarding the return value would leave beta_pipe unfitted.
beta_pipe = fitmodel(beta_pipe, "models/beta_regression.joblib")