In [35]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import set_config
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import r2_score

In [4]:
# mount GoogleDrive and set the files path
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/NETFLOOX'

Mounted at /content/drive
/content/drive/MyDrive/NETFLOOX


In [5]:
df = pd.read_csv('data_regression.csv', index_col=0)
df = df.drop(['actor_name', 'actress_name'], axis=1).drop_duplicates(subset='tconst')
df

Unnamed: 0,tconst,genres,runtimeMinutes,averageRating,director_name
0,tt19730260,Musical,154.0,9.0,Haricheth
7,tt14691678,Drama,147.0,8.3,S. Mahendar
8,tt12716284,"Comedy,Romance",,5.1,Dinesh Babu
10,tt9130460,"Action,Comedy",,4.7,Om Sai Prakash
11,tt14333040,Drama,153.0,4.3,Om Sai Prakash
...,...,...,...,...,...
900014,tt19712468,Drama,70.0,8.9,S.S. Jishnu Dev
900017,tt1971393,"Action,Drama,Thriller",92.0,5.3,Fernando A. Mico
900020,tt19715754,Documentary,,8.5,Ali Necati Kumcuoglu
900030,tt19727878,"Horror,Mystery,Thriller",118.0,4.8,Any Gacha


In [6]:
df.shape

(196229, 5)

In [7]:
# Exécution du dropna avant de separer y et X
df.dropna(inplace=True)
df.shape

(173812, 5)

In [8]:
# Création d'une colonne rassemblant le genre et le nom du réalisateur
def concat_features(row):
    return(row['genres'].replace(",", " ") + " " + row['director_name'].replace(" ", ""))
df['movie_features'] = df.apply(concat_features, axis=1)
df

Unnamed: 0,tconst,genres,runtimeMinutes,averageRating,director_name,movie_features
0,tt19730260,Musical,154.0,9.0,Haricheth,Musical Haricheth
7,tt14691678,Drama,147.0,8.3,S. Mahendar,Drama S.Mahendar
11,tt14333040,Drama,153.0,4.3,Om Sai Prakash,Drama OmSaiPrakash
12,tt14338500,Drama,155.0,6.9,V. Ravichandran,Drama V.Ravichandran
13,tt4903314,Drama,142.0,3.0,S.V. Suresh Raj,Drama S.V.SureshRaj
...,...,...,...,...,...,...
900011,tt19711858,Drama,81.0,5.9,Fabian Hernández,Drama FabianHernández
900014,tt19712468,Drama,70.0,8.9,S.S. Jishnu Dev,Drama S.S.JishnuDev
900017,tt1971393,"Action,Drama,Thriller",92.0,5.3,Fernando A. Mico,Action Drama Thriller FernandoA.Mico
900030,tt19727878,"Horror,Mystery,Thriller",118.0,4.8,Any Gacha,Horror Mystery Thriller AnyGacha


In [9]:
# Suppression des colonnes 'genres' et 'director_name'
df.drop(columns=['tconst', 'genres', 'director_name'], inplace=True)
df

Unnamed: 0,runtimeMinutes,averageRating,movie_features
0,154.0,9.0,Musical Haricheth
7,147.0,8.3,Drama S.Mahendar
11,153.0,4.3,Drama OmSaiPrakash
12,155.0,6.9,Drama V.Ravichandran
13,142.0,3.0,Drama S.V.SureshRaj
...,...,...,...
900011,81.0,5.9,Drama FabianHernández
900014,70.0,8.9,Drama S.S.JishnuDev
900017,92.0,5.3,Action Drama Thriller FernandoA.Mico
900030,118.0,4.8,Horror Mystery Thriller AnyGacha


In [19]:
# Categorical variables
column_tex = 'movie_features'
transfo_tex = Pipeline(steps=[
    ('countvec', CountVectorizer()),
    ('dr', TruncatedSVD())    
    ])

In [20]:
# Numerical variables
column_num = ['runtimeMinutes']
transfo_num = Pipeline(steps=[
    ('scaling', RobustScaler())
])

In [21]:
# Class ColumnTransformer : apply alls steps on the whole dataset
preparation = ColumnTransformer(
    transformers=[
        ('data_tex', transfo_tex , column_tex),
        ('data_num', transfo_num , column_num)
    ])

In [22]:
# Display diagram
set_config(display="diagram")
preparation

In [23]:
# Declare the pipeline
pipe = Pipeline(steps=[('preparation', preparation), # ACP,
                        ('model', AdaBoostRegressor())])

In [24]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preparation', 'model', 'preparation__n_jobs', 'preparation__remainder', 'preparation__sparse_threshold', 'preparation__transformer_weights', 'preparation__transformers', 'preparation__verbose', 'preparation__verbose_feature_names_out', 'preparation__data_tex', 'preparation__data_num', 'preparation__data_tex__memory', 'preparation__data_tex__steps', 'preparation__data_tex__verbose', 'preparation__data_tex__countvec', 'preparation__data_tex__dr', 'preparation__data_tex__countvec__analyzer', 'preparation__data_tex__countvec__binary', 'preparation__data_tex__countvec__decode_error', 'preparation__data_tex__countvec__dtype', 'preparation__data_tex__countvec__encoding', 'preparation__data_tex__countvec__input', 'preparation__data_tex__countvec__lowercase', 'preparation__data_tex__countvec__max_df', 'preparation__data_tex__countvec__max_features', 'preparation__data_tex__countvec__min_df', 'preparation__data_tex__countvec__ngram_range', 'preparation__

In [64]:
# Declare model and parameter for Grid Search
parameters = {'model__n_estimators': range(50, 301, 50)}

In [65]:
# Declare the Grid Search method
grid = GridSearchCV(pipe, parameters, scoring='r2', cv = 5, n_jobs =-1, verbose = 1)

In [66]:
# Fit the model
y = df['averageRating']
X = df.drop(columns='averageRating')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [67]:
# Evaluate cross validation performance 
print("CV score:", grid.best_score_)

CV score: 0.0780169887269383


In [68]:
# Make predictions
y_pred = grid.predict(X_test)

In [69]:
# Evaluate model performance
print("Test score", r2_score(y_test, y_pred))

Test score 0.09203097792620907
