In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('src/table_imdb_paris.csv', delimiter=',', on_bad_lines='skip')
#
# Correction on Type
dataset["debut"] = pd.to_datetime( dataset["debut"], format='%Y-%m-%d')
dataset["fin"] = pd.to_datetime( dataset["fin"], format='%Y-%m-%d')
dataset["postal"] = dataset["postal"].astype(str)
#
# Erase Uniques Raw & Miss Values
# dataset = dataset.drop(columns= ['titre', 'réalisateur', 'producteur', 'imdb_search', 'title', 'resume','metascore'])
dataset = dataset.drop(columns= ['titre','réalisateur', 'producteur', 'imdb_search','metascore', 'title'])

# Convert string Genre to list
dataset["genre"] = [ str(x).replace(' ', '') for x in dataset["genre"][:] ] 
dataset["genre"] = [ x.split(",") for x in dataset["genre"][:] ]

### Encodage valeur cible qualitative (Random Forest)

In [2]:
# https://www.sbert.net/docs/quickstart.html#comparing-sentence-similarities
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
from unittest import result

dataset['genres'] = dataset['genre']
dataset['genre'] = ''
dataset['themaScore'] = 0.0

for i in range( len(dataset) ):
    resume = dataset.iloc[i]['resume']
    genres = dataset.iloc[i]['genres']
    bestScore = 0.0
    bestGenre = ''
    for genre in genres:
        emb1 = model.encode(str(genre))
        emb2 = model.encode(str(resume))
        cos_sim = util.cos_sim(emb1, emb2)
        result = cos_sim.tolist()[0][0]
        if result > bestScore:
            bestScore = result
            bestGenre = genre
    # print(i, '/', len(dataset))
    dataset.themaScore[i] = bestScore
    dataset.genre[i] = bestGenre
# 5m 17s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.themaScore[i] = bestScore
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.genre[i] = bestGenre


In [4]:
dataset = dataset.drop(columns= ['genres', 'resume'])
dataset.head(2)

Unnamed: 0,postal,debut,fin,latitude,longitude,date,parental_advisor,duree,genre,imdb_note,vote,themaScore
0,75013,2020-08-20,2020-08-21,48.83566,2.348315,2021,Tous publics,113.0,Drama,6.8,1572.0,0.056418
1,75010,2020-07-25,2020-07-25,48.881901,2.353763,2021,Tous publics,113.0,Drama,6.8,1572.0,0.056418


In [5]:
dataset.dtypes

postal                      object
debut               datetime64[ns]
fin                 datetime64[ns]
latitude                   float64
longitude                  float64
date                         int64
parental_advisor            object
duree                      float64
genre                       object
imdb_note                  float64
vote                       float64
themaScore                 float64
dtype: object

In [6]:
# Select TARGET
target_name = 'imdb_note'

In [7]:
# ReIndex the dataset for put the TARGET on last column. 
dataset = dataset.reindex(columns = [col for col in dataset.columns if col != target_name] + [target_name])

# Suppression des Nan sur la TARGET.
dataset = dataset.dropna(subset=[target_name])

dataset.isna().sum()

postal                0
debut                 0
fin                   0
latitude              0
longitude             0
date                  0
parental_advisor    529
duree               117
genre                 0
vote                  0
themaScore            0
imdb_note             0
dtype: int64

## Graph de correlation 

In [8]:
import plotly.figure_factory as ff

corr_matrix = round(dataset.corr(), 4)

# https://plotly.github.io/plotly.py-docs/generated/plotly.figure_factory.create_annotated_heatmap.html
fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.values.tolist(),
                                  y = corr_matrix.index.values.tolist(),
                                  )


fig.show()

## Separation Varibles / Target

In [9]:
## Separation valeurs explicative et valeur cible
Y = dataset[:][target_name]
X = dataset.drop(columns= [target_name])

## Separation en train/test
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=0)
# stratify = Y # option pour même ratio de réponse que sur la table Y.

## Pipeline Préprocessor 

In [10]:
## Médiane quantitative & plus fréquent qualitatif

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import( OneHotEncoder, StandardScaler, LabelEncoder )
from sklearn.compose import ColumnTransformer

# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns # Automatically detect positions of numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns # Automatically detect positions of categorical columns
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train) # Preprocessing influenceur
X_test = preprocessor.transform(X_test) # Preprocessing copieur


### Encodage valeur cible qualitative

In [11]:
# encoder = LabelEncoder() # Label encoding
# Y_train = encoder.fit_transform(Y_train)
# Y_test = encoder.transform(Y_test)

## Training
### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

#model = RandomForestClassifier()
model = RandomForestRegressor()

# model.fit(X_train, Y_train) # Training is always done on train set !!

## Prediction
### Random Forest model : Grid search

In [13]:
from sklearn.model_selection import GridSearchCV
# Grid of values to be tested
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [10, 20, 40, 60, 80, 100]
}
gridsearch = GridSearchCV(model, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [4, 6, 8, 10],
                         'min_samples_leaf': [1, 2, 5],
                         'min_samples_split': [2, 4, 8],
                         'n_estimators': [10, 20, 40, 60, 80, 100]})

In [14]:
Y_train_pred = gridsearch.predict(X_train) # Predictions on training set
Y_test_pred = gridsearch.predict(X_test) # Prédictions on test set 

#### Grid search : Random Forest

In [15]:
from sklearn.metrics import ( accuracy_score, f1_score, r2_score )
from sklearn import metrics

#print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
#print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))

print("r2-score on training set : ", r2_score(Y_train, Y_train_pred))
print("r2-score on test set : ", r2_score(Y_test, Y_test_pred))

r2-score on training set :  0.9243004771959339
r2-score on test set :  0.9155969921021707


In [16]:
#  OLD Note IMDB without Ponderation on Genre
# accuracy on training set :  0.836149668074834
# accuracy on test set :  0.747008547008547

# Note IMDB with Ponderation Score on Genre
# r2-score on training set :  0.9244979178400918
# r2-score on test set :  0.9149046103964305