In [16]:
import pandas as pd
import numpy as np

# Read the outlier-free export and drop the 'tconst' and 'numVotes' columns
# in a single chained expression.
dataset = (
    pd.read_csv(
        'export/datasetWithoutOutlier.csv',
        delimiter=',',
        on_bad_lines='skip',  # malformed rows are skipped instead of aborting the read
        low_memory=False,
    )
    .drop(columns=['tconst', 'numVotes'])
)
print(dataset)

       year        genre parentalAdvisor  duree  themaScore  imdbRating
0      1906    Adventure       Not Rated   70.0    0.157268         6.0
1      1932       Comedy    Tous publics   87.0    0.221232         5.7
2      1916      Fantasy         Unrated   63.0    0.196874         6.3
3      1914        Drama    Tous publics  109.0    0.130555         5.9
4      1932        Drama          Passed   88.0    0.084648         6.4
...     ...          ...             ...    ...         ...         ...
10310  1956        Crime    Tous publics  107.0    0.499591         6.6
10311  1946  Documentary       Not Rated   59.0    0.460585         7.0
10312  1946    Animation          Passed   75.0    0.613953         6.2
10313  1946    Animation          Passed   75.0    0.613953         6.2
10314  1955    Animation        Approved   69.0    0.237465         7.3

[10315 rows x 6 columns]


In [17]:
# Name of the column to predict; later cells use it to separate the target from the features.
target_name = 'imdbRating'

In [18]:
# Drop rare values: for every column, remove the rows whose value occurs fewer
# times than 10% of the count of that column's most frequent value.
# NOTE(review): the loop also runs on the continuous columns and on the target
# column; in the recorded run it shrinks the frame from 10315 to 294 rows.
# Confirm that filtering those columns by exact-value frequency is intended.
print(len(dataset))

for col_name in dataset.columns:
    # Counted on the already-filtered frame, so earlier columns influence later ones.
    values_counts = dataset[col_name].value_counts()
    if values_counts.empty:
        # No non-null value left in this column: there is no reference count to
        # compare against (the original code raised IndexError here).
        continue

    # value_counts() sorts by descending frequency: the first entry is the highest count.
    values_counts_10percents = int(values_counts.iloc[0] / 10)
    # Values seen fewer times than the threshold (boolean mask instead of an index loop).
    values = values_counts[values_counts < values_counts_10percents].index.tolist()

    dataset = dataset[~dataset[col_name].isin(values)]

print(len(dataset))

print(dataset.head(5))

10315
294
     year      genre parentalAdvisor  duree  themaScore  imdbRating
76   1930     Comedy          Passed  102.0    0.368290         5.8
78   1932     Comedy          Passed   88.0    0.137391         7.0
108  1933      Drama          Passed   60.0    0.185819         6.7
120  1935      Drama        Approved   89.0    0.151597         6.2
344  1936  Adventure          Passed   89.0    0.334205         6.5


## Graphe de corrélation

In [19]:
import plotly.figure_factory as ff

# Correlation matrix of the numeric columns, rounded to 4 decimals.
# The frame still holds text columns at this point; selecting the numeric ones
# explicitly keeps the cell working on pandas >= 2.0, where DataFrame.corr()
# no longer drops non-numeric columns silently and raises instead.
corr_matrix = dataset.select_dtypes(include=[np.number]).corr().round(4)

# Annotated heatmap: one labelled cell per pair of numeric columns.
fig = ff.create_annotated_heatmap(
    corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
)

fig.show()

## Separation Variables / Target

In [20]:
from sklearn.model_selection import train_test_split

## Split the explanatory variables from the target value
X = dataset.drop(target_name, axis=1)
Y = dataset[target_name]

## Train/test split: 15% of the rows are held out, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=0,
)
# stratify = Y # option to keep the same response ratio as in Y.
print(X)
print(Y)

       year      genre parentalAdvisor  duree  themaScore
76     1930     Comedy          Passed  102.0    0.368290
78     1932     Comedy          Passed   88.0    0.137391
108    1933      Drama          Passed   60.0    0.185819
120    1935      Drama        Approved   89.0    0.151597
344    1936  Adventure          Passed   89.0    0.334205
...     ...        ...             ...    ...         ...
9921   1945      Drama        Approved   77.0    0.171419
9934   1947     Comedy          Passed   85.0    0.109101
9964   1951      Crime        Approved   88.0    0.384395
9965   1952      Crime          Passed   81.0    0.178925
10119  1951     Comedy        Approved   89.0    0.266576

[294 rows x 5 columns]
76       5.8
78       7.0
108      6.7
120      6.2
344      6.5
        ... 
9921     7.6
9934     6.4
9964     6.7
9965     6.6
10119    6.0
Name: imdbRating, Length: 294, dtype: float64


## Preprocessor pipeline

In [21]:
## Median imputation for quantitative features & most frequent value for qualitative ones

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import( OneHotEncoder, StandardScaler, LabelEncoder )
from sklearn.compose import ColumnTransformer

# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns # Automatically detect the numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # missing values will be replaced by the column's median
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns # Automatically detect the categorical columns
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    # drop='first': first column is dropped to avoid creating correlations between features.
    # handle_unknown='ignore': a category that only appears in the test split is encoded
    # as all zeros (with a warning) instead of making transform() raise.
    # Combining both options requires scikit-learn >= 1.0.
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# NOTE: X_train / X_test are overwritten with their transformed versions, so this
# cell cannot be re-run without first re-running the train/test split cell.
X_train = preprocessor.fit_transform(X_train) # statistics and categories are learned on the training set only
X_test = preprocessor.transform(X_test) # the fitted transformers are re-applied unchanged to the test set


## Training
### Random Forest

In [22]:
# https://machinelearningmastery.com/multi-core-machine-learning-in-python/
from sklearn.ensemble import RandomForestRegressor

# n_jobs=-1      -> use all available CPU cores
# verbose=2      -> print detailed progress information while the forest is built
# random_state=0 -> fixed seed so that bootstrap sampling, and therefore the scores
#                   and the saved model, are reproducible from one run to the next
#                   (same seed as the train/test split)
model = RandomForestRegressor(n_jobs=-1, verbose=2, random_state=0)

## Prediction
### Random Forest model : Grid search

In [23]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to combine exhaustively (4 * 3 * 3 * 6 = 216 candidates)
params = dict(
    max_depth=[4, 6, 8, 10],
    min_samples_leaf=[1, 2, 5],
    min_samples_split=[2, 4, 8],
    n_estimators=[10, 20, 40, 60, 80, 100],
)

# cv=3 -> every candidate is evaluated with 3-fold cross-validation
gridsearch = GridSearchCV(estimator=model, param_grid=params, cv=3)
gridsearch.fit(X_train, Y_train)
# 36m 6.5s

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    6.8s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Paralle

KeyboardInterrupt: 

In [None]:
# Predict with the fitted grid search on the training set, then on the test set.
Y_train_pred, Y_test_pred = (
    gridsearch.predict(features) for features in (X_train, X_test)
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


#### Grid search : Random Forest

In [None]:
from sklearn.metrics import ( accuracy_score, f1_score, r2_score )
from sklearn import metrics

# Coefficient of determination on each partition; a large gap between the two
# values means the model fits the training rows better than unseen rows.
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)

print("r2-score on training set : ", r2_train)
print("r2-score on test set : ", r2_test)

r2-score on training set :  0.931031673127063
r2-score on test set :  0.6097852860912801


## Save Model

In [None]:
import joblib

# The forest was trained on the output of `preprocessor`, so the fitted
# preprocessor is persisted next to it: without it the saved model cannot be
# applied to raw rows in a fresh session.
joblib.dump(preprocessor, "export/preprocessor.joblib", compress=3)

# save (kept as the last expression so the cell still displays the model path)
joblib.dump(gridsearch.best_estimator_, "export/random_forest.joblib", compress=3)

['export/random_forest.joblib']

In [None]:
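# load, no need to initialize the loaded_rf (same path as the joblib.dump call above)
# loaded_rf = joblib.load("export/random_forest.joblib")

# the forest was fitted on preprocessed features, so X must be transformed first
# loaded_rf.predict(preprocessor.transform(X))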