In [69]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('export/table_imdb_paris.csv', delimiter=',', on_bad_lines='skip')

dataset.head(1)

Unnamed: 0,titre,réalisateur,producteur,postal,debut,fin,latitude,longitude,imdb_search,title,date,parental_advisor,duree,genre,imdb_note,metascore,vote,resume
0,TOUT S'EST BIEN PASSE,Francois OZON,MANDARIN PRODUCTION,75013,2020-08-20,2020-08-21,48.83566,2.348315,https://www.imdb.com/search/title/?title=TOUT+...,Everything Went Fine,2021,Tous publics,113.0,Drama,6.8,67.0,1572.0,"When André, 85, has a stroke, Emmanuelle hurri..."


In [70]:
dataset["debut"] = pd.to_datetime( dataset["debut"], format='%Y-%m-%d')
dataset["fin"] = pd.to_datetime( dataset["fin"], format='%Y-%m-%d')
dataset["postal"] = dataset["postal"].astype(str)

In [71]:
dataset = dataset.drop(columns= ['titre', 'réalisateur', 'producteur', 'imdb_search', 'title', 'resume'])

# dataset.dtypes
dataset = dataset.reindex(columns = [col for col in dataset.columns if col != 'genre'] + ['genre'])

In [72]:
pourcentage_valeur_manquante = 100*dataset.isnull().sum()/dataset.shape[0]

pourcentage_valeur_manquante

postal               0.000000
debut                0.000000
fin                  0.000000
latitude             0.000000
longitude            0.000000
date                 0.000000
parental_advisor    28.389831
duree               11.207627
imdb_note           17.394068
metascore           78.516949
vote                17.394068
genre                1.186441
dtype: float64

## Separation Varibles / Target

In [73]:
target_name = 'genre'

dataset = dataset.dropna(subset=[target_name]).reset_index()

## Separation valeurs explicative et valeur cible
Y = dataset[:][target_name]
X = dataset.drop(columns= [target_name])

## Separation en train/test
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=0)
# stratify = Y # option pour même ratio de réponse que sur la table Y.

In [74]:
pourcentage_valeur_manquante = 100*dataset.isnull().sum()/dataset.shape[0]

pourcentage_valeur_manquante

index                0.000000
postal               0.000000
debut                0.000000
fin                  0.000000
latitude             0.000000
longitude            0.000000
date                 0.000000
parental_advisor    27.530017
duree               10.398799
imdb_note           16.402230
metascore           78.259005
vote                16.402230
genre                0.000000
dtype: float64

## Pipeline Préprocessor 

In [75]:
## Médiane quantitative & plus fréquent qualitatif

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import( OneHotEncoder, StandardScaler, LabelEncoder )
from sklearn.compose import ColumnTransformer

# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns # Automatically detect positions of numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns # Automatically detect positions of categorical columns
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train) # Preprocessing influenceur
X_test = preprocessor.transform(X_test) # Preprocessing copieur


### Encodage valeur cible qualitative

In [76]:
encoder = LabelEncoder() # Label encoding
Y_train = encoder.fit_transform(Y_train)
Y_test = encoder.transform(Y_test)

## Training

In [77]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', max_iter=1000) # Train model

In [78]:
model.fit(X_train, Y_train) # Training is always done on train set !!

LogisticRegression(max_iter=1000)

## Prediction

In [79]:
Y_train_pred = model.predict(X_train) # Predictions on training set
Y_test_pred = model.predict(X_test) # Prédictions on test set 

## Accuracy
__Qualitatif (Classification)__

In [80]:
from sklearn.metrics import accuracy_score

print("Accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("Accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))

Accuracy on training set :  0.5688698284561049
Accuracy on test set :  0.5614285714285714


## Classement valeurs explicatives

In [81]:
from sklearn.feature_selection import f_regression

feature_importance = f_regression(X_train, Y_train)

# Create DataFrame with feature importance
feature_ranking = pd.DataFrame(columns=dataset.columns[:-1], data=feature_importance, index=["f-score", "p-value"])
# Reshape DataFrame and sort by f-score
feature_ranking = feature_ranking.transpose().sort_values(["f-score", "p-value"], ascending=False)
# Create column with feature names
feature_ranking = feature_ranking.reset_index().rename(columns = {'index': 'feature'})

px.bar(feature_ranking.sort_values(["f-score", "p-value"]), x = 'f-score', y = 'feature')

from matplotlib import pyplot

importance = model.coef_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

IndentationError: expected an indented block (<ipython-input-81-6fbb1cf91457>, line 19)