In [24]:
import pandas as pd
import numpy as np

# Read the exported CSV and drop the 'tconst' and 'numVotes' columns in one
# chained expression, then show the resulting frame.
# NOTE(review): on_bad_lines='skip' silently discards malformed rows; confirm
# that losing them is acceptable for this analysis.
dataset = pd.read_csv(
    'export/datasetWithoutOutlier.csv',
    delimiter=',',
    on_bad_lines='skip',
    low_memory=False,
).drop(columns=['tconst', 'numVotes'])
print(dataset)

       year        genre parentalAdvisor  duree  themaScore  imdbRating
0      2008    Adventure              PG  115.0    0.307559         1.1
1      2011      Romance       Not Rated  143.0    0.389000         1.1
2      2012  Documentary    Tous publics  100.0    0.286879         1.1
3      2012       Family       Not Rated   70.0    0.303512         1.1
4      1992       Comedy         Unrated   71.0    0.243649         1.2
...     ...          ...             ...    ...         ...         ...
34179  1992        Drama               R   84.0    0.308009         9.0
34180  2014          War           PG-13  117.0    0.103751         9.1
34181  2014       Family       Not Rated   60.0    0.247137         9.1
34182  1993       Family              PG   97.0    0.209361         9.1
34183  2015       Horror       Not Rated   75.0    0.383355         9.1

[34184 rows x 6 columns]


In [25]:
# Name of the column used as the prediction target by the cells below.
target_name = "imdbRating"

## Graph de correlation 

In [26]:
import plotly.figure_factory as ff

# Correlation is only defined for numeric columns. Select them explicitly:
# pandas >= 2.0 raises on the string columns ('genre', 'parentalAdvisor')
# instead of silently ignoring them as older versions did.
corr_matrix = dataset.select_dtypes(include=np.number).corr().round(4)

# Annotated heatmap: every cell displays its rounded coefficient.
fig = ff.create_annotated_heatmap(
    corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
)

fig.show()

## Separation Variables / Target

In [27]:
## Split the frame into explanatory variables (X) and the target (Y)
X = dataset.drop(columns=[target_name])
Y = dataset[target_name]

## Train / test split
from sklearn.model_selection import train_test_split

# 15 % of the rows are held out for testing; random_state fixes the shuffle.
# stratify=Y keeps the same ratio of target values in both splits as in Y.
# NOTE(review): the target is a rating, so stratifying on it requires every
# distinct value to occur at least twice; confirm this holds for new exports.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=0,
    stratify=Y,
)
print(X)
print(Y)

       year        genre parentalAdvisor  duree  themaScore
0      2008    Adventure              PG  115.0    0.307559
1      2011      Romance       Not Rated  143.0    0.389000
2      2012  Documentary    Tous publics  100.0    0.286879
3      2012       Family       Not Rated   70.0    0.303512
4      1992       Comedy         Unrated   71.0    0.243649
...     ...          ...             ...    ...         ...
34179  1992        Drama               R   84.0    0.308009
34180  2014          War           PG-13  117.0    0.103751
34181  2014       Family       Not Rated   60.0    0.247137
34182  1993       Family              PG   97.0    0.209361
34183  2015       Horror       Not Rated   75.0    0.383355

[34184 rows x 5 columns]
0        1.1
1        1.1
2        1.1
3        1.1
4        1.2
        ... 
34179    9.0
34180    9.1
34181    9.1
34182    9.1
34183    9.1
Name: imdbRating, Length: 34184, dtype: float64


## Pipeline Préprocessor 

In [28]:
## Numeric columns: median imputation; categorical columns: most frequent value

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

# Column groups are derived from the dtypes of X instead of being hard-coded.
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include="object").columns

# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. drop='first' removes one indicator column per feature
# so the encoded columns are not perfectly collinear.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training split only, then apply the already-fitted transform to
# the test split.
# NOTE(review): X_train / X_test are overwritten with their transformed
# versions, so this cell is not idempotent; re-run the split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


## Training
### Random Forest

In [29]:
# https://machinelearningmastery.com/multi-core-machine-learning-in-python/
from sklearn.ensemble import RandomForestRegressor

# n_jobs=-1      -> use every available core
# verbose=2      -> log detailed progress while fitting and predicting
# random_state=0 -> fixed seed (same value as the train/test split) so that
#                   bootstrap sampling and feature selection are reproducible
model = RandomForestRegressor(n_jobs=-1, verbose=2, random_state=0)

## Prediction
### Random Forest model : Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid of hyper-parameter values to be tested exhaustively.
# max_features=None makes every feature a candidate at each split. It
# replaces the former 'auto' option, which meant the same thing for
# regressors but was deprecated in scikit-learn 1.1 and removed in 1.3
# (where it raises an invalid-parameter error).
params = {
    'max_depth': [4, 6, 8, 10, 12, 14],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 5, 7, 10],
    'min_samples_split': [2, 4, 8, 16, 32, 64],
    'n_estimators': [10, 20, 40, 60, 80, 100, 200]
}
# cv: the number of folds used for cross-validating each combination
gridsearch = GridSearchCV(model, param_grid=params, cv=3)
gridsearch.fit(X_train, Y_train)
# Recorded run time: 36m 6.5s
# NOTE(review): this timing may refer to an earlier, smaller grid; confirm.

In [35]:
# Signal that the search has finished and show the parameter combination
# that the grid search selected.
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)

...Done.
Best hyperparameters :  {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 80}
Best validation accuracy :  0.21268367771698468



[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.2s


Accuracy on training set :  0.2995596798739404
Accuracy on test set :  0.22639763677240876


[Parallel(n_jobs=8)]: Done  80 out of  80 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  80 out of  80 | elapsed:    0.0s finished


In [31]:
# Predict with the fitted grid search on both splits; X_train and X_test are
# the preprocessed matrices at this point.
Y_train_pred = gridsearch.predict(X_train) # Predictions on the training set
Y_test_pred = gridsearch.predict(X_test) # Predictions on the test set

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  80 out of  80 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  80 out of  80 | elapsed:    0.0s finished


#### Grid search : Random Forest

In [38]:
from sklearn.metrics import ( accuracy_score, f1_score, r2_score, mean_squared_error )
from sklearn import metrics

# Coefficient of determination on both splits; a large gap between the two
# indicates over-fitting.
print("r2-score on training set : ", r2_score(Y_train, Y_train_pred))
print("r2-score on test set : ", r2_score(Y_test, Y_test_pred))

# Mean squared error on the test split
mse = mean_squared_error(Y_test, Y_test_pred)
print("Mean squared error : ", mse)
# The square root of the MSE is the root mean squared error (expressed in
# rating points), not the mean error.
print("Root mean squared error : ", mse**0.5)


r2-score on training set :  0.2995596798739404
r2-score on test set :  0.22639763677240876
Mean squared error :  1.1452965692514276
Mean error :  1.0701852966899834


In [62]:
# plotly.express is not imported anywhere else in the notebook, so import it
# here to keep the cell runnable on a fresh kernel.
import plotly.express as px

# Actual vs. predicted ratings on the test split; the violin in the right
# margin shows the distribution of the predictions.
fig = px.scatter(x = Y_test, y = Y_test_pred, opacity=0.2, width=650, height=600, marginal_y='violin')
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    xaxis_title='Actual imdbRating',
    yaxis_title='Predicted imdbRating',
)
fig.show()

## Save Model

In [33]:
import joblib

# The forest was trained on the output of `preprocessor`, so it cannot score
# raw rows by itself: persist the fitted preprocessor next to the model.
joblib.dump(preprocessor, "export/preprocessor.joblib", compress=3)

# Save the best estimator found by the grid search. This call stays last so
# the cell still displays the path of the model file.
joblib.dump(gridsearch.best_estimator_, "export/random_forest.joblib", compress=3)

['export/random_forest.joblib']

In [34]:
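# Load the saved model; the object comes back already fitted, so no
# initialisation is needed. The path must match the one used when saving.
# loaded_rf = joblib.load("export/random_forest.joblib")

# Inputs must have gone through the same fitted preprocessing as the training data.
# loaded_rf.predict(X_test)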