In [70]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('export/table_imdb_paris.csv', delimiter=',', on_bad_lines='skip')
#
# Correction on Type
dataset["debut"] = pd.to_datetime( dataset["debut"], format='%Y-%m-%d')
dataset["fin"] = pd.to_datetime( dataset["fin"], format='%Y-%m-%d')
dataset["postal"] = dataset["postal"].astype(str)
#
# Erase Uniques Raw & Miss Values
dataset = dataset.drop(columns= ['titre', 'réalisateur', 'producteur', 'imdb_search', 'title', 'resume','metascore'])
#
# Erase Second and Third Genre.
dataset["genre"] = [ str(x).replace(' ', '') for x in dataset["genre"][:] ] 
dataset["genre"] = [ x.split(",") for x in dataset["genre"][:] ]

dataset = dataset.explode('genre').reset_index()
frequency = dataset["genre"].value_counts()
dataset["frequency"] = ''

for index in range(len(dataset)):
    for genre in range(len(frequency)):
        if dataset["genre"][index] == frequency.index[genre]:
            dataset["frequency"][index] = frequency.values[genre]

dataset.sort_values(by='frequency', ascending=False)
dataset = dataset.drop_duplicates(subset='index', keep="first")
dataset = dataset.drop(columns= ['index','frequency'])

# 18.9s

In [71]:
# Select TARGET
target_name = 'imdb_note'

In [72]:
# ReIndex the dataset for put the TARGET on last column. 
dataset = dataset.reindex(columns = [col for col in dataset.columns if col != target_name] + [target_name])

# Suppression des Nan sur la TARGET.
dataset = dataset.dropna(subset=[target_name])
pourcentage_valeur_manquante = 100*dataset.isnull().sum()/dataset.shape[0]

print(pourcentage_valeur_manquante)
print(dataset.shape[0])

postal               0.000000
debut                0.000000
fin                  0.000000
latitude             0.000000
longitude            0.000000
date                 0.000000
parental_advisor    13.567581
duree                3.000769
genre                0.000000
vote                 0.000000
imdb_note            0.000000
dtype: float64
3899


## Graph de correlation 

In [73]:
import plotly.figure_factory as ff

corr_matrix = round(dataset.corr(), 4)

# https://plotly.github.io/plotly.py-docs/generated/plotly.figure_factory.create_annotated_heatmap.html
fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.values.tolist(),
                                  y = corr_matrix.index.values.tolist(),
                                  )


fig.show()

## Separation Varibles / Target

In [74]:
## Separation valeurs explicative et valeur cible
Y = dataset[:][target_name]
X = dataset.drop(columns= [target_name])

## Separation en train/test
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=0)
# stratify = Y # option pour même ratio de réponse que sur la table Y.

## Pipeline Préprocessor 

In [75]:
## Médiane quantitative & plus fréquent qualitatif

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import( OneHotEncoder, StandardScaler, LabelEncoder )
from sklearn.compose import ColumnTransformer

# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns # Automatically detect positions of numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns # Automatically detect positions of categorical columns
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train) # Preprocessing influenceur
X_test = preprocessor.transform(X_test) # Preprocessing copieur


### Encodage valeur cible qualitative

In [76]:
if type(target_name) is not np.number:
    print("c'est un objet")
    encoder = LabelEncoder() # Label encoding
    Y_train = encoder.fit_transform(Y_train)
    Y_test = encoder.transform(Y_test)

c'est un objet


## Training
### Random Forest

In [77]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

# model.fit(X_train, Y_train) # Training is always done on train set !!

## Prediction
### Random Forest model : Grid search

In [78]:
from sklearn.model_selection import GridSearchCV
# Grid of values to be tested
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [10, 20, 40, 60, 80, 100]
}
gridsearch = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)


The least populated class in y has only 1 members, which is less than n_splits=3.



GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [4, 6, 8, 10],
                         'min_samples_leaf': [1, 2, 5],
                         'min_samples_split': [2, 4, 8],
                         'n_estimators': [10, 20, 40, 60, 80, 100]})

In [79]:
Y_train_pred = gridsearch.predict(X_train) # Predictions on training set
Y_test_pred = gridsearch.predict(X_test) # Prédictions on test set 

#### Grid search : Random Forest

In [83]:
from sklearn.metrics import accuracy_score

print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))

accuracy on training set :  0.836149668074834
accuracy on test set :  0.747008547008547


In [95]:
# Genre 

# accuracy on training set :  0.8145563310069791
# accuracy on test set :  0.7740112994350282

# Note IMDB

# accuracy on training set :  0.836149668074834
# accuracy on test set :  0.747008547008547

display( dataset.drop(columns= [target_name]).columns.tolist() )

['postal',
 'debut',
 'fin',
 'latitude',
 'longitude',
 'date',
 'parental_advisor',
 'duree',
 'genre',
 'vote']