In [137]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import set_config
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import r2_score
import joblib

In [138]:
# Load the titles table; the first CSV column is used as the row index.
dataset_path = 'datasets/basics_knownForTitles_ratings.csv'
df = pd.read_csv(dataset_path, index_col=0)
df

Unnamed: 0,tconst,primaryTitle,isAdult,decade,runtimeMinutes,genres,nconst,averageRating,numVotes
0,tt0002220,Grannie,0,191.0,,,,4.0,44
1,tt0003589,0-18 or A Message from the Sky,0,191.0,,"Crime,Drama","nm0301187,nm0613115,nm0347532",6.7,42
2,tt0003609,Alexandra,0,191.0,,"Drama,Romance","nm0823150,nm1990343,nm1987804,nm0903959",5.1,13
3,tt0008808,"$5,000 Reward",0,191.0,50.0,Mystery,"nm0267916,nm0516166,nm0429453,nm0761492",5.4,21
4,tt0008810,99,0,191.0,,Crime,"nm0819331,nm0288005,nm13967494",4.4,32
...,...,...,...,...,...,...,...,...,...
286801,tt12801814,Violation,0,202.0,107.0,"Drama,Horror","nm11777181,nm3631020,nm13926845,nm9278354,nm10...",5.3,4044
286802,tt1280685,The Banished,0,199.0,52.0,Documentary,"nm2601444,nm3110629,nm1300389,nm1759159,nm1657917",7.2,43
286803,tt1287896,Daijôbu de aruyô ni: Cocco owaranai tabi,0,200.0,,"Documentary,Music",,5.7,50
286804,tt1291125,Au revoir Taipei,0,201.0,85.0,"Comedy,Crime,Drama","nm11470103,nm2543454,nm6598458,nm6599739,nm094...",6.5,1433


In [163]:
def load_model(csv_path='datasets/basics_knownForTitles_ratings.csv',
               min_votes=375, random_state=0):
    """Train the rating-prediction pipeline and return the fitted grid search.

    Despite its name, this function does not load a persisted model: it reads
    the CSV, filters and cleans the rows, builds the preprocessing + gradient
    boosting pipeline and fits it with a (single-candidate) grid search on
    60% of the data.

    Parameters
    ----------
    csv_path : str, default 'datasets/basics_knownForTitles_ratings.csv'
        CSV file to read; its first column is used as the row index.
    min_votes : int, default 375
        Rows with fewer than this many ``numVotes`` are discarded.
    random_state : int, default 0
        Seed used for the train/test split, the SVD reductions and the
        gradient boosting model, so repeated calls return the same model.

    Returns
    -------
    GridSearchCV
        The fitted grid search (the object returned by ``grid.fit``).

    Raises
    ------
    ValueError
        If no row is left after filtering.
    """
    parameters = {'model__n_estimators': range(140, 150, 10), 'model__max_depth': range(6, 7)}

    # Row filters first (votes, missing values, adult titles), then drop the
    # columns that are not features and turn the comma-separated lists into
    # space-separated tokens for CountVectorizer.
    titles = (
        pd.read_csv(csv_path, index_col=0)
        .query('numVotes >= @min_votes')
        .dropna()
        .query('isAdult == 0')
        .drop(columns=['tconst', 'primaryTitle', 'isAdult'])
        .assign(
            nconst=lambda frame: frame['nconst'].str.replace(',', ' ', regex=False),
            genres=lambda frame: frame['genres'].str.replace(',', ' ', regex=False),
        )
    )
    if titles.empty:
        raise ValueError(
            f"No rows left in {csv_path!r} after filtering (min_votes={min_votes})."
        )

    # Numeric features: robust scaling only.
    column_num = ['decade', 'runtimeMinutes', 'numVotes']
    transfo_num = Pipeline(steps=[
        ('scaling', RobustScaler())
    ])

    # Token-list features: token counts followed by a seeded SVD reduction.
    column_tex1 = 'genres'
    column_tex2 = 'nconst'
    transfo_tex = Pipeline(steps=[
        ('countvec', CountVectorizer()),
        ('dr', TruncatedSVD(random_state=random_state))
    ])

    preparation = ColumnTransformer(
        transformers=[
            ('data_tex1', transfo_tex, column_tex1),
            ('data_tex2', transfo_tex, column_tex2),
            ('data_num', transfo_num, column_num)
        ])
    pipe = Pipeline(steps=[('preparation', preparation),
                           ('model', GradientBoostingRegressor(random_state=random_state))])
    grid = GridSearchCV(pipe, parameters, scoring='r2', cv=5, n_jobs=-1, verbose=1)

    y = titles['averageRating']
    X = titles.drop(columns='averageRating')
    # Only the training part is needed here; the split is kept so the model is
    # trained on the same 60% subset as before.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.4, random_state=random_state
    )
    return grid.fit(X_train, y_train)

In [139]:
# Sanity check: True when every `tconst` value occurs exactly once.
rows_per_tconst = df['tconst'].value_counts()
rows_per_tconst.eq(1).all()

True

In [140]:
# Keep only titles with at least 375 votes. The filter is applied in place,
# so `df` itself is narrowed for every later cell.
df.query('numVotes >= 375', inplace=True)
df

Unnamed: 0,tconst,primaryTitle,isAdult,decade,runtimeMinutes,genres,nconst,averageRating,numVotes
7,tt0015145,Monsieur Beaucaire,0,192.0,106.0,"Drama,Romance","nm0909066,nm0948806,nm0497372,nm0245078,nm0401...",6.0,496
9,tt0019286,Storm Over Asia,0,192.0,127.0,"Drama,War","nm0412014,nm0409109,nm0348705,nm0214113,nm2227...",7.0,2188
12,tt0021992,Illicit,0,193.0,79.0,"Drama,Romance","nm0719686,nm1393139,nm1393547,nm1394878,nm1394...",6.1,1062
13,tt0023472,Silver Dollar,0,193.0,83.0,"Biography,Drama,Music","nm0250439,nm0923146,nm0440169",6.3,444
17,tt0026508,I Found Stella Parish,0,193.0,85.0,"Drama,Romance",nm0801209,6.7,523
...,...,...,...,...,...,...,...,...,...
286796,tt1220628,I Hope They Serve Beer in Hell,0,200.0,105.0,Comedy,"nm1910698,nm1901451,nm2056354,nm2103617,nm1924...",5.2,8567
286798,tt1244093,Hisss,0,201.0,98.0,"Comedy,Drama,Horror","nm0816003,nm1335461,nm1324246,nm1997974,nm4147...",2.9,1819
286801,tt12801814,Violation,0,202.0,107.0,"Drama,Horror","nm11777181,nm3631020,nm13926845,nm9278354,nm10...",5.3,4044
286804,tt1291125,Au revoir Taipei,0,201.0,85.0,"Comedy,Crime,Drama","nm11470103,nm2543454,nm6598458,nm6599739,nm094...",6.5,1433


In [141]:
# Drop rows with any missing value (in place) before separating y and X.
df.dropna(inplace=True)
df.shape

(64679, 9)

In [142]:
# Column dtypes and non-null counts of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64679 entries, 7 to 286805
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          64679 non-null  object 
 1   primaryTitle    64679 non-null  object 
 2   isAdult         64679 non-null  int64  
 3   decade          64679 non-null  float64
 4   runtimeMinutes  64679 non-null  float64
 5   genres          64679 non-null  object 
 6   nconst          64679 non-null  object 
 7   averageRating   64679 non-null  float64
 8   numVotes        64679 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 4.9+ MB


In [143]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,isAdult,decade,runtimeMinutes,averageRating,numVotes
count,64679.0,64679.0,64679.0,64679.0,64679.0
mean,0.00116,199.323907,102.619954,6.082126,15630.46
std,0.034033,2.342441,30.591262,1.239214,72231.86
min,0.0,189.0,3.0,1.0,375.0
25%,0.0,198.0,90.0,5.4,663.0
50%,0.0,200.0,98.0,6.3,1412.0
75%,0.0,201.0,111.0,7.0,4671.5
max,1.0,202.0,5220.0,9.9,2697394.0


In [144]:
# Keep only non-adult titles (isAdult == 0); applied in place on `df`.
df.query('isAdult == 0', inplace=True)
df

Unnamed: 0,tconst,primaryTitle,isAdult,decade,runtimeMinutes,genres,nconst,averageRating,numVotes
7,tt0015145,Monsieur Beaucaire,0,192.0,106.0,"Drama,Romance","nm0909066,nm0948806,nm0497372,nm0245078,nm0401...",6.0,496
9,tt0019286,Storm Over Asia,0,192.0,127.0,"Drama,War","nm0412014,nm0409109,nm0348705,nm0214113,nm2227...",7.0,2188
12,tt0021992,Illicit,0,193.0,79.0,"Drama,Romance","nm0719686,nm1393139,nm1393547,nm1394878,nm1394...",6.1,1062
13,tt0023472,Silver Dollar,0,193.0,83.0,"Biography,Drama,Music","nm0250439,nm0923146,nm0440169",6.3,444
17,tt0026508,I Found Stella Parish,0,193.0,85.0,"Drama,Romance",nm0801209,6.7,523
...,...,...,...,...,...,...,...,...,...
286796,tt1220628,I Hope They Serve Beer in Hell,0,200.0,105.0,Comedy,"nm1910698,nm1901451,nm2056354,nm2103617,nm1924...",5.2,8567
286798,tt1244093,Hisss,0,201.0,98.0,"Comedy,Drama,Horror","nm0816003,nm1335461,nm1324246,nm1997974,nm4147...",2.9,1819
286801,tt12801814,Violation,0,202.0,107.0,"Drama,Horror","nm11777181,nm3631020,nm13926845,nm9278354,nm10...",5.3,4044
286804,tt1291125,Au revoir Taipei,0,201.0,85.0,"Comedy,Crime,Drama","nm11470103,nm2543454,nm6598458,nm6599739,nm094...",6.5,1433


In [145]:
# Remove the identifier/title columns and the now-constant `isAdult` flag,
# then turn the comma-separated `nconst` and `genres` lists into
# space-separated tokens.
df = (
    df
    .drop(columns=['tconst', 'primaryTitle', 'isAdult'])
    .assign(
        nconst=lambda frame: frame['nconst'].str.replace(',', ' '),
        genres=lambda frame: frame['genres'].str.replace(',', ' '),
    )
)
df

Unnamed: 0,decade,runtimeMinutes,genres,nconst,averageRating,numVotes
7,192.0,106.0,Drama Romance,nm0909066 nm0948806 nm0497372 nm0245078 nm0401...,6.0,496
9,192.0,127.0,Drama War,nm0412014 nm0409109 nm0348705 nm0214113 nm2227...,7.0,2188
12,193.0,79.0,Drama Romance,nm0719686 nm1393139 nm1393547 nm1394878 nm1394...,6.1,1062
13,193.0,83.0,Biography Drama Music,nm0250439 nm0923146 nm0440169,6.3,444
17,193.0,85.0,Drama Romance,nm0801209,6.7,523
...,...,...,...,...,...,...
286796,200.0,105.0,Comedy,nm1910698 nm1901451 nm2056354 nm2103617 nm1924...,5.2,8567
286798,201.0,98.0,Comedy Drama Horror,nm0816003 nm1335461 nm1324246 nm1997974 nm4147...,2.9,1819
286801,202.0,107.0,Drama Horror,nm11777181 nm3631020 nm13926845 nm9278354 nm10...,5.3,4044
286804,201.0,85.0,Comedy Crime Drama,nm11470103 nm2543454 nm6598458 nm6599739 nm094...,6.5,1433


In [146]:
# Numeric features and their preprocessing (robust scaling only).
column_num = ['decade', 'runtimeMinutes', 'numVotes']
transfo_num = Pipeline([('scaling', RobustScaler())])

In [147]:
# Token-list variables: `genres` and `nconst` are vectorised as token counts
# and then reduced with a truncated SVD.
# NOTE(review): TruncatedSVD is left at its default number of components and
# CountVectorizer at its default tokenizer (which splits on punctuation, so a
# hyphenated label would become several tokens) - confirm both are intended.
column_tex1 = 'genres'
column_tex2 = 'nconst'
transfo_tex = Pipeline(steps=[
    ('countvec', CountVectorizer()),
    # The default SVD solver is randomized; seed it so repeated runs produce
    # the same components and therefore comparable scores.
    ('dr', TruncatedSVD(random_state=0))
])

In [148]:
# ColumnTransformer: apply each preprocessing pipeline to its own column(s).
preparation = ColumnTransformer([
    ('data_tex1', transfo_tex, column_tex1),
    ('data_tex2', transfo_tex, column_tex2),
    ('data_num', transfo_num, column_num),
])

In [149]:
# Switch estimator display to diagrams, then show the ColumnTransformer.
set_config(display="diagram")
preparation

In [150]:
# Declare the pipeline: preprocessing followed by gradient boosting.
# random_state is fixed so that refitting gives reproducible results.
pipe = Pipeline(steps=[('preparation', preparation),
                        ('model', GradientBoostingRegressor(random_state=0))])

In [151]:
# List the pipeline's parameter names (the `step__param` keys a grid search can target).
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preparation', 'model', 'preparation__n_jobs', 'preparation__remainder', 'preparation__sparse_threshold', 'preparation__transformer_weights', 'preparation__transformers', 'preparation__verbose', 'preparation__verbose_feature_names_out', 'preparation__data_tex1', 'preparation__data_tex2', 'preparation__data_num', 'preparation__data_tex1__memory', 'preparation__data_tex1__steps', 'preparation__data_tex1__verbose', 'preparation__data_tex1__countvec', 'preparation__data_tex1__dr', 'preparation__data_tex1__countvec__analyzer', 'preparation__data_tex1__countvec__binary', 'preparation__data_tex1__countvec__decode_error', 'preparation__data_tex1__countvec__dtype', 'preparation__data_tex1__countvec__encoding', 'preparation__data_tex1__countvec__input', 'preparation__data_tex1__countvec__lowercase', 'preparation__data_tex1__countvec__max_df', 'preparation__data_tex1__countvec__max_features', 'preparation__data_tex1__countvec__min_df', 'preparation__data_t

In [152]:
# Hyper-parameter grid for the `model` step of the pipeline.
# Each range currently yields a single value (140 estimators, depth 6),
# so the grid search evaluates exactly one candidate.
parameters = dict(
    model__n_estimators=range(140, 150, 10),
    model__max_depth=range(6, 7),
)

In [153]:
# Grid search over `parameters`: 5-fold cross-validation scored with R²,
# run on all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [154]:
# Separate features and target, hold out 40% of the rows for testing,
# and fit the grid search on the remaining training rows.
target_column = 'averageRating'
X = df.drop(columns=target_column)
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)
model = grid.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [155]:
# Cross-validation R² score of the best parameter combination.
best_cv_score = grid.best_score_
print("CV score:", best_cv_score)

CV score: 0.42223955167471133


In [156]:
# Parameter combination selected by the grid search.
best_cv_params = grid.best_params_
print("CV parameters:", best_cv_params)

CV parameters: {'model__max_depth': 6, 'model__n_estimators': 140}


In [157]:
# Predict ratings for the held-out test split with the fitted grid search.
y_pred = grid.predict(X_test)

In [158]:
# R² of the predictions on the held-out test split.
test_r2 = r2_score(y_test, y_pred)
print("Test score", test_r2)

Test score 0.44195768705067495
