In [32]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('export/dataset.csv', delimiter=',', on_bad_lines='skip')

In [33]:
dataset = dataset.drop(columns=['title', 'director', 'resume'])

In [34]:
dataset.head(5)

Unnamed: 0,year,imdbRating,parentalAdvisor,duree,genre,themaScore
0,1894,5.3,,45.0,Romance,0.151441
1,1906,6.0,Not Rated,70.0,Biography,0.304189
2,1907,4.7,,90.0,Drama,0.086365
3,1914,6.2,,90.0,Drama,0.107717
4,1913,6.1,,88.0,Adventure,0.128053


In [35]:
dataset.dtypes

year                 int64
imdbRating         float64
parentalAdvisor     object
duree              float64
genre               object
themaScore         float64
dtype: object

In [36]:
target_name = 'imdbRating'

In [37]:
# ReIndex the dataset for put the TARGET on last column. 
dataset = dataset.reindex(columns = [col for col in dataset.columns if col != target_name] + [target_name])

# Suppression des Nan sur la TARGET.
dataset = dataset.dropna(subset=[target_name])

print('Valeurs manquantes en % :')
100*dataset.isnull().sum()/dataset.shape[0]

Valeurs manquantes en % :


year                0.000000
parentalAdvisor    85.714286
duree               0.000000
genre               0.000000
themaScore          0.000000
imdbRating          0.000000
dtype: float64

## Graph de correlation 

In [38]:
import plotly.figure_factory as ff

corr_matrix = round(dataset.corr(), 4)

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.values.tolist(),
                                  y = corr_matrix.index.values.tolist(),
                                  )


fig.show()

## Separation Varibles / Target

In [39]:
## Separation valeurs explicative et valeur cible
Y = dataset[:][target_name]
X = dataset.drop(columns= [target_name])

## Separation en train/test
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=0)
# stratify = Y # option pour même ratio de réponse que sur la table Y.

## Pipeline Préprocessor 

In [40]:
## Médiane quantitative & plus fréquent qualitatif

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import( OneHotEncoder, StandardScaler, LabelEncoder )
from sklearn.compose import ColumnTransformer

# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns # Automatically detect positions of numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns # Automatically detect positions of categorical columns
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train) # Preprocessing influenceur
X_test = preprocessor.transform(X_test) # Preprocessing copieur


## Training
### Random Forest

In [41]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

## Prediction
### Random Forest model : Grid search

In [42]:
from sklearn.model_selection import GridSearchCV
# Grid of values to be tested
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [10, 20, 40, 60, 80, 100]
}
gridsearch = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [4, 6, 8, 10],
                         'min_samples_leaf': [1, 2, 5],
                         'min_samples_split': [2, 4, 8],
                         'n_estimators': [10, 20, 40, 60, 80, 100]})

In [43]:
Y_train_pred = gridsearch.predict(X_train) # Predictions on training set
Y_test_pred = gridsearch.predict(X_test) # Prédictions on test set 

#### Grid search : Random Forest

In [44]:
from sklearn.metrics import ( accuracy_score, f1_score, r2_score )
from sklearn import metrics

print("r2-score on training set : ", r2_score(Y_train, Y_train_pred))
print("r2-score on test set : ", r2_score(Y_test, Y_test_pred))

r2-score on training set :  0.6947368421052642
r2-score on test set :  -13.728888888888916


## Save Model

In [50]:
import joblib
# save
joblib.dump(gridsearch.best_estimator_, "export/random_forest.joblib", compress=3)

['export/random_forest.joblib']

In [None]:
# load, no need to initialize the loaded_rf
# loaded_rf = joblib.load("./random_forest.joblib")

# loaded_rf.predict(X)