In [22]:
# Imports

%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from utils import *
import pickle
import ctypes

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing

from dataset_prepare import load_dataset, pred_thres

import imblearn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Load the working dataset through the project helper imported from dataset_prepare.
df_2 = load_dataset()
# NOTE(review): 'mortel' differs from the 'mortal' column used as the target in the
# split cell, and `label` is not referenced in the cells visible here — confirm
# which spelling is intended.
label = 'mortel'

In [24]:
# Categorical columns (order matters: it fixes the column order fed to the encoder).
categorical_features = [
    'trajet', 'catr', 'circ', 'nbv', 'prof',
    'plan', 'surf', 'vma', 'lum', 'agg',
    'int', 'atm', 'col', 'catv', 'obs',
    'obsm', 'choc', 'pieton',
    'sexe_conducteur', 'infra', 'situ',
]

# Numerical columns.
numerical_features = [
    'mois',
    'dep',
    'age',
]

# Echo both lists so the selected features are visible in the notebook output.
print("numerical : ", numerical_features)
print("categorical : ", categorical_features)

numerical :  ['mois', 'dep', 'age']
categorical :  ['trajet', 'catr', 'circ', 'nbv', 'prof', 'plan', 'surf', 'vma', 'lum', 'agg', 'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'pieton', 'sexe_conducteur', 'infra', 'situ']


In [25]:
from sklearn.model_selection import train_test_split

# Split at the accident level so that all vehicles involved in the same accident
# end up on the same side of the split; train and test rows never share an accident.

unique_accidents = df_2['Num_Acc'].unique()  # distinct accident identifiers

# Not referenced in the cells visible here; kept in case other cells rely on it.
df_3 = df_2.drop(columns=['mortal'])

# Split the accident identifiers (not the rows). They get their own names so that
# X_train / X_test only ever refer to feature frames.
train_accidents, test_accidents = train_test_split(
    unique_accidents, test_size=0.33, random_state=42)

# Recover the vehicle rows belonging to each group of accidents.
train_df = df_2[df_2['Num_Acc'].isin(train_accidents)]
test_df = df_2[df_2['Num_Acc'].isin(test_accidents)]

# Target is the 'mortal' column; features are every column except the target
# and the accident identifier.
y_train = train_df['mortal']
y_test = test_df['mortal']
X_train = train_df.drop(columns=['mortal', 'Num_Acc'])
X_test = test_df.drop(columns=['mortal', 'Num_Acc'])

# NOTE: this rebinding removes 'Num_Acc' from df_2, so this cell cannot be re-run
# without first re-running the cell that loads df_2.
df_2 = df_2.drop(columns='Num_Acc')

In [26]:
# Numerical columns are standardised.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' makes categories
# not seen during fit encode as all zeros instead of raising at transform time.
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each group of columns to its transformer.
transformations = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [32]:
from utils import custom_RFC
import imblearn

# Classifier provided by the project's utils module.
# NOTE(review): custom_RFC may carry its own source of randomness that is not
# visible from this notebook — confirm it is seeded if exact reproducibility matters.
dt = custom_RFC()

# imblearn's Pipeline is used because it accepts a sampler step: the SMOTE
# oversampling is applied while fitting only, never when predicting or scoring.
clf = imblearn.pipeline.Pipeline(
    [
        ('preprocessor', transformations),
        # Seeded so the synthetic minority samples are the same on every run
        # (same seed as the train/test split).
        ("resample", imblearn.over_sampling.SMOTE(random_state=42)),
        ('classifier', dt)
    ])
clf = clf.fit(X_train, y_train)

# Predictions on the held-out accidents, reused by the confusion-matrix cells.
preds = clf.predict(X_test)

# (train score, test score) as reported by the pipeline's score method.
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.9967062754121095, 0.8777022455620349)

In [33]:
# Persist the fitted pipeline to disk. pickle.dump returns None, so its result must
# not be assigned back to `clf` (that would discard the model and make a re-run of
# this cell overwrite the file with a pickled None). The context manager closes
# the file handle once the dump is done.
with open('models/rfc_model.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [34]:
from sklearn.metrics import confusion_matrix

# Rows of the matrix are the true classes and columns the predicted classes, so
# for a two-class problem the rows unpack as (tn, fp) and (fn, tp).
conf_mat = confusion_matrix(y_test, preds)
(tn, fp), (fn, tp) = conf_mat

# Display the four counts as the cell output.
tn, fp, fn, tp

(26595, 2740, 1056, 648)

In [35]:
import plotly.express as px

# Heatmap of the confusion counts: one row per true class, one column per
# predicted class, with the counts written inside the tiles.
matrix = [[tn, fp], [fn, tp]]
axis_titles = {"y": "Truth", "x": "Pred"}

fig = px.imshow(
    matrix,
    text_auto=True,
    labels=axis_titles,
    x=["False", "True"],
    y=["False", "True"],
)
fig.show()