In [1]:
# Misc and Fundamentals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import joblib

# Setup 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
tv_df = pd.read_csv("../Data/data/streaming_titles_final.csv")
# Convert columns to dummies
tv_rating_values = dict(tv_df["rating"].value_counts())
tv_rating_values[np.nan] = 100
print(tv_rating_values)
tv_country_values = dict(tv_df["country"].value_counts())
tv_country_values[np.nan] = 100
print(tv_country_values)

# There aren't enough values in NC-17 to fit a model so I won't use those values in my model.

tv_df["rating"] = tv_df["rating"].apply(lambda x: x if tv_rating_values[x] > 10 else np.nan)
tv_df["country"] = tv_df["country"].apply(lambda x: x if tv_country_values[x] > 10 else np.nan)

tv_df

{'TV-MA': 998, 'R': 997, 'PG-13': 673, 'TV-14': 668, 'PG': 626, 'G': 286, '16+': 203, '7+': 122, 'NR': 76, 'ALL': 57, 'TV-Y': 35, 'NC-17': 2, nan: 100}
{'United States': 1809, 'India': 301, 'United Kingdom': 239, 'Japan': 143, 'Canada': 99, 'France': 69, 'South Korea': 61, 'Spain': 55, 'Germany': 40, 'China': 32, 'Australia': 32, 'Mexico': 29, 'Italy': 25, 'Indonesia': 24, 'Turkey': 23, 'Brazil': 23, 'Hong Kong': 22, 'Egypt': 19, 'Philippines': 19, 'Argentina': 16, 'Thailand': 15, 'Ireland': 15, 'Taiwan': 14, 'Denmark': 13, 'Poland': 11, 'South Africa': 10, 'Colombia': 9, 'Norway': 9, 'Sweden': 9, 'Netherlands': 8, 'Russia': 7, 'Pakistan': 6, 'Belgium': 6, 'Chile': 6, 'Israel': 6, 'New Zealand': 5, 'Nigeria': 5, 'Bulgaria': 4, 'Finland': 4, 'Switzerland': 4, 'Austria': 4, 'Romania': 4, 'Iceland': 4, 'Singapore': 4, 'Czech Republic': 3, 'Lebanon': 3, 'Greece': 2, 'Saudi Arabia': 2, 'Peru': 2, 'Soviet Union': 1, 'Slovenia': 1, 'Serbia': 1, 'Uruguay': 1, 'Kuwait': 1, 'Cambodia': 1, 'Puert

Unnamed: 0.1,Unnamed: 0,title,Number_MoviesShows_dir,dir_average_score,Number_MoviesShows_cast,cast_average_score,type,director,cast,country,...,genre.Coming_of_Age,genre.Anthology,genre.Buddy,genre.Parody,genre.Spy/Espionage,genre.Survival,genre.Soap_Opera_/_Melodrama,genre.Dance,genre.Medical,genre.Disaster
0,0,The Marksman,,,,,Movie,,,,...,False,False,False,False,False,False,False,False,False,False
1,1,Home Sweet Home,,,,,TV Show,,,,...,False,False,False,False,False,False,False,False,False,False
2,2,America's Book of Secrets,,,,,TV Show,,,United States,...,False,False,False,False,False,False,False,False,False,False
3,3,Beyond Scared Straight,,,,,TV Show,,,United States,...,False,False,False,False,False,False,False,False,False,False
4,4,Hoarders,,,,,TV Show,,,United States,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4980,4980,Brave Little Tailor,1.0,31.0,,,Movie,Bill Roberts,"Walt Disney, Marcelite Garner, Eddie Holden",United States,...,False,False,False,False,False,False,False,False,False,False
4981,4981,Lady and the Tramp II: Scamp's Adventure,,,,,Movie,Darrell Rooney,"Scott Wolf, Alyssa Milano, Chazz Palminteri, J...",United States,...,True,False,False,False,False,False,False,False,False,False
4982,4982,The Great Mouse Detective,,,,,Movie,"John Musker, Ron Clements, Dave Michener","Vincent Price, Barrie Ingham, Val Bettin, Susa...",United States,...,False,False,False,False,False,False,False,False,False,False
4983,4983,X-Men Origins: Wolverine,,,,,Movie,Gavin Hood,"Hugh Jackman, Liev Schreiber, Danny Huston, wi...",United States,...,False,False,False,False,False,False,False,False,False,False


In [3]:
# Train/Test split
tv_train, tv_test = train_test_split(tv_df, random_state = 2023) # random state important for evaluation

X = tv_train.loc[:,tv_train.columns != "score"]
y = tv_train["score"]

X_test = tv_test.loc[:,tv_train.columns != "score"]
y_test = tv_test["score"]

The following function is to be able to save and load models without having to re-compute everything every time.

In [4]:
def fitmodel(model, filename, df = tv_train):

    model.fit(X,y)

    joblib.dump(model, filename)
    
    return model

In [5]:
# from sklearn.metrics import get_scorer_names
# get_scorer_names()[20:]

We are going to use the `neg_root_mean_squared_error` metric, because it essentially gives us the weighted distance our model is from the correct metric.

In [6]:
from numpy import mean, std
print("mean: %0.1f, rMSE: %0.4f"% (mean(y.values),std(y.values)))

mean: 48.1, rMSE: 21.8798


A rMSE of 21.7 here means that if we guessed the mean every time, we would be on average off by 21.7 points. So, our model should try to get a better rMSE than 21.7.

In [7]:
from ModelHelpers import *

In [8]:
print(X.columns)

Index(['Unnamed: 0', 'title', 'Number_MoviesShows_dir', 'dir_average_score',
       'Number_MoviesShows_cast', 'cast_average_score', 'type', 'director',
       'cast', 'country', 'release_year', 'rating', 'duration', 'description',
       'imdbid', 'genre.Crime', 'genre.Drama', 'genre.Thriller',
       'genre.Action', 'genre.Horror', 'genre.Science_Fiction', 'genre.Music',
       'genre.Reality', 'genre.Romance', 'genre.Comedy', 'genre.Mystery',
       'genre.Documentary', 'genre.History', 'genre.Teen',
       'genre.Health_&_Wellness', 'genre.Lifestyle', 'genre.Culture',
       'genre.Black_Stories', 'genre.News', 'genre.Latino', 'genre.Adventure',
       'genre.Anime', 'genre.Talk_Show', 'genre.Sketch_Comedy', 'genre.Family',
       'genre.Kids', 'genre.Classics', 'genre.LGBTQ', 'genre.Adult_Animation',
       'genre.Sitcom', 'genre.Cooking_&_Food', 'genre.Sports',
       'genre.Game_Shows', 'genre.International', 'genre.Cartoons',
       'genre.Science_&_Technology', 'genre.Stand_Up

Let's start by making a Beta Regression predictor.

In [9]:
beta_predictors = columnstartswith("genre",df=tv_df) + ["duration",
                                                        "release_year",
                                                        "type",
                                                        "rating",
                                                        #"Number_MoviesShows_dir",
                                                        #"Number_MoviesShows_cast",
                                                        "dir_average_score",
                                                        "cast_average_score",
                                                        "country"]

beta_ct = make_column_transformer(
    (NumericNAOneHotEncoder(),make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(),["type","rating","country"]),
    remainder="passthrough")

beta_pipe = Pipeline(steps = [
    ("predictors", ColumnSelector(beta_predictors)),
    ("columntransform",beta_ct),
    ("beta", BetaRegression(from_range=(0,100)))
    ])

beta_grid = dict(
                 beta__scale = [0.01,0.1,1,2]
                 )

tv_folded = KFold(n_splits = 5).split(X,y)

beta_grid_search = GridSearchCV(estimator = beta_pipe,
                                param_grid = beta_grid,
                                cv = tv_folded,
                                scoring = 'neg_root_mean_squared_error',
                                verbose = 0
                                )

In [10]:
beta_grid_result = beta_grid_search.fit(X,y)

  y = 1 / (np.exp(-y/self.scale) + 1)
  y = 1 / (np.exp(-y/self.scale) + 1)
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
  y = np.log(y*self.scale / (1 - y*self.scale))
5 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/coltonrowe/Librar

In [11]:
print("Best parameters: {}\nBest root MSE: %0.4f.".format(beta_grid_result.best_params_) % -beta_grid_result.best_score_)
beta_pipe.set_params(**beta_grid_result.best_params_)

Best parameters: {'beta__scale': 1}
Best root MSE: 19.2478.


In [12]:
beta = fitmodel(beta_pipe, "models/beta_regression.joblib")
pass

In [13]:
dtree_predictors = columnstartswith("genre",df=tv_df) + ["duration",
                                                        "release_year",
                                                        "type",
                                                        "rating",
                                                        #"Number_MoviesShows_dir",
                                                        #"Number_MoviesShows_cast",
                                                        "dir_average_score",
                                                        "cast_average_score",
                                                        "country"]

dtree_ct = make_column_transformer(
    (NumericNAOneHotEncoder(),make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(),["type","rating","country"]),
    remainder="passthrough")

dtree_pipe = Pipeline(steps =  [
    ("predictors", ColumnSelector(beta_predictors)),
    ("columntransform",dtree_ct),
    ("dtree", DecisionTreeRegressor())
    ])

dtree_grid = dict(
                 dtree__max_depth = [1,5,10,None],
                 dtree__min_samples_leaf = range(1,10)
                 )

tv_folded = KFold(n_splits = 5).split(X,y)

dtree_grid_search = GridSearchCV(estimator = dtree_pipe,
                                param_grid = dtree_grid,
                                cv = tv_folded,
                                scoring = 'neg_root_mean_squared_error',
                                verbose = 2
                                )

In [14]:
dtree_grid_result = dtree_grid_search.fit(X,y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=1; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=1; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=1; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=1; total time=   0.0s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=1; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=2; total time=   0.2s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=2; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=2; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=2; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=2; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_samples_leaf=3; total time=   0.1s
[CV] END ......dtree__max_depth=1, dtree__min_s

In [15]:
print("Best parameters: {}\nBest root MSE: %0.4f.".format(dtree_grid_result.best_params_) % -dtree_grid_result.best_score_)
dtree_pipe.set_params(**dtree_grid_result.best_params_)
pass

Best parameters: {'dtree__max_depth': 10, 'dtree__min_samples_leaf': 8}
Best root MSE: 20.1962.


In [16]:
dtree = fitmodel(dtree_pipe, "models/decision_tree.joblib")
pass

In [17]:
rf_predictors = columnstartswith("genre",df=tv_df) + ["duration",
                                                        "release_year",
                                                        "type",
                                                        "rating",
                                                        #"Number_MoviesShows_dir",
                                                        #"Number_MoviesShows_cast",
                                                        "dir_average_score",
                                                        "cast_average_score",
                                                        "country"]

rf_ct = make_column_transformer(
    (NumericNAOneHotEncoder(),make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(),["type","rating","country"]),
    remainder="passthrough")

rf_pipe = Pipeline(steps =  [
    ("predictors", ColumnSelector(rf_predictors)),
    ("columntransform",rf_ct),
    ("rf", RandomForestRegressor())
    ])

rf_grid = dict(
                 rf__max_depth = [1,5,10,None],
                 rf__min_samples_leaf = [1,2,3,4,5]
                 rf__n_estimators = [50,100,200,300])

tv_folded = KFold(n_splits = 5).split(X,y)

rf_grid_search = GridSearchCV(estimator = rf_pipe,
                                param_grid = rf_grid,
                                cv = tv_folded,
                                scoring = 'neg_root_mean_squared_error',
                                verbose = 2
                                )

SyntaxError: invalid syntax (2264398423.py, line 25)

In [None]:
rf_grid_result = rf_grid_search.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=1; total time=   0.4s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=1; total time=   0.4s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=1; total time=   0.4s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=1; total time=   0.4s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=1; total time=   0.4s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=2; total time=   0.3s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=2; total time=   0.3s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=2; total time=   0.3s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=2; total time=   0.3s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=2; total time=   0.3s
[CV] END ............rf__max_depth=1, rf__min_samples_leaf=3; total time=   0.4s
[CV] END ............rf__max_depth=1, rf__min_s

In [None]:
print("Best parameters: {}\nBest root MSE: %0.4f.".format(rf_grid_result.best_params_) % -rf_grid_result.best_score_)
rf_pipe.set_params(**rf_grid_result.best_params_)
pass

Best parameters: {'rf__max_depth': 10, 'rf__min_samples_leaf': 2}
Best root MSE: 18.9455.


In [None]:
rf = fitmodel(rf_pipe,"models/random_forest.joblib")
pass

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import TransformedTargetRegressor


knn_predictors = columnstartswith("genre",df=tv_df) + ["duration",
                                                        "release_year",
                                                        "type",
                                                        "rating",
                                                        #"Number_MoviesShows_dir",
                                                        #"Number_MoviesShows_cast",
                                                        "dir_average_score",
                                                        "cast_average_score",
                                                        "country"]

knn_ct = make_column_transformer(
    (NumericNAOneHotEncoder(),make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(),["type","rating","country"]),
    remainder="passthrough")

knn_pipe = Pipeline(steps =  [
    ("predictors", ColumnSelector(knn_predictors)),
    ("columntransform",beta_ct),
    ("knn", TransformedTargetRegressor(regressor=KNeighborsRegressor()))
    ])

knn_grid = dict(
    knn__regressor__n_neighbors=range(1,20),
    knn__regressor__weights=["uniform", "distance"]
)

tv_folded = KFold(n_splits = 5).split(X,y)

knn_grid_search = GridSearchCV(estimator = knn_pipe,
                                param_grid = knn_grid,
                                cv = tv_folded,
                                scoring = 'neg_root_mean_squared_error',
                                verbose = 2
                                )

In [None]:
knn_grid_result = knn_grid_search.fit(X,y)

Fitting 5 folds for each of 38 candidates, totalling 190 fits
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=uniform; total time=   0.3s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=uniform; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=uniform; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=uniform; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=uniform; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=distance; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=distance; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=distance; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=distance; total time=   0.1s
[CV] END knn__regressor__n_neighbors=1, knn__regressor__weights=distance; total time=   0.1s
[CV] END knn_

In [None]:
print("Best parameters: {}\nBest root MSE: %0.4f.".format(knn_grid_result.best_params_) % -knn_grid_result.best_score_)
knn_pipe.set_params(**knn_grid_result.best_params_)
pass

Best parameters: {'knn__regressor__n_neighbors': 19, 'knn__regressor__weights': 'distance'}
Best root MSE: 20.4948.


In [None]:
knn = fitmodel(knn_pipe, "models/knn.joblib")
pass

In [None]:
from numpy import std

print([
std(beta.predict(X_test) - y_test),
std(knn.predict(X_test) - y_test),
std(dtree.predict(X_test) - y_test),
std(rf.predict(X_test) - y_test)])


[19.707860340549054, 20.52818818064173, 19.844683701206268, 19.04890054844948]
