# Data drift P7
- Analyse : data drift de la TARGET et des features sur le dataset Credit Risk
- Objectif : détecter une dérive (variation importante) dans le comportement des clients
- Moyen : analyse de la différence de distribution statistique de chaque variable entre les données de référence et les données courantes, via le package Evidently

# Import

In [1]:
# Data manipulation
import pandas as pd
import re # Traitement des caractères spéciaux

# Pre-processing
from sklearn.model_selection import train_test_split

# Evidently for data drift reports
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

#from evidently.dashboard import Dashboard
#from evidently.dashboard.tabs import Dashboard
#from evidently.metrics import *

#from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab
#from evidently.model_profile import Profile
#from evidently.profile_sections import DataDriftProfileSection, CatTargetDriftProfileSection

import warnings 
# Silence every warning for the rest of the session.
# NOTE(review): both calls install a catch-all 'ignore' filter, so the second
# one looks redundant — confirm before removing either.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Dataset pre-processing

In [2]:
# Load the pre-processed credit-risk data and drop the 'Unnamed: 0' column
# (presumably an index saved by an earlier export — confirm).
df = pd.read_csv('df_scenario_0.csv').drop(columns='Unnamed: 0')
print(f'Dimensions : {df.shape}')

Dimensions : (356251, 803)


In [3]:
# Fonction de nettoyage des NaN du dataset
# Nettoyage des valeurs NaN et des noms de colonnes contenant des caractères spéciaux
# Fill NaN --> 0 : correspond aux valeurs sans correspondance dans les jointures gauche (left join)
def fct_clean_data(df):
    """Fill missing feature values and make the column names model-safe.

    The frame is modified in place and the same object is returned.

    Steps:
      1. NaN in every column except 'TARGET' is replaced by 0 (per the
         original author's note, these NaN come from rows without a match
         in upstream left joins). 'TARGET' itself is left untouched.
      2. Every character outside ``[A-Za-z0-9_]`` is removed from the column
         names, because LightGBM rejects special JSON characters in feature
         names.
      3. Names that collide after step 2 are made unique: the first
         occurrence keeps its name, later ones get a ``_<position>`` suffix,
         where position is the 0-based column index. If that suffixed name is
         itself already taken, the suffix number is incremented until the
         name is free, so the resulting names are always unique.

    Args:
        df: pandas DataFrame whose column labels are strings.

    Returns:
        The same DataFrame, with NaN filled and columns renamed.
    """
    # Fill NaN with 0 everywhere except in the label column.
    feat = [col for col in df.columns if col != 'TARGET']
    if feat:
        df[feat] = df[feat].fillna(0)

    # Strip special characters (' ', ',', ...) from every column name.
    cleaned = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]

    # Track the names already handed out (not just the cleaned ones), so a
    # generated 'name_i' can never duplicate another column.
    seen = set()
    unique_names = []
    for i, name in enumerate(cleaned):
        candidate = name
        suffix = i
        while candidate in seen:
            candidate = f'{name}_{suffix}'
            suffix += 1
        seen.add(candidate)
        unique_names.append(candidate)

    df.columns = unique_names

    # Return the cleaned dataframe
    return df

# Keep only the rows whose TARGET is known (0/1), then clean the result.
df = df[df['TARGET'].notna()]
df = fct_clean_data(df)

In [4]:
# Keep the 50 features ranked most important by the selected LightGBM model,
# plus the TARGET label.
feats_importance = (
    pd.read_csv('df_lgbm_feat_importance.csv')['feat_importance']
    .values
    .tolist()
)
df = df[[*feats_importance, 'TARGET']]
df.head()

Unnamed: 0,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_3,DAYS_BIRTH,DAYS_ID_PUBLISH,EXT_SOURCE_2,CODE_GENDER,NAME_EDUCATION_TYPE_Secondarysecondaryspecial,CC_CNT_DRAWINGS_CURRENT_MAX,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,...,CC_CNT_DRAWINGS_ATM_CURRENT_MAX,PREV_CODE_REJECT_REASON_SCOFR_MEAN,FLOORSMAX_MODE,REG_CITY_NOT_WORK_CITY,FLAG_EMP_PHONE,TR_AGE,NAME_INCOME_TYPE_Pensioner,NAME_EDUCATION_TYPE_Highereducation,ORGANIZATION_TYPE_XNA,TARGET
0,-637.0,0.083037,0.139376,-9461,-2120,0.262949,1,1,0.0,0.0,...,0.0,0.0,0.0833,0,1,0,0,0,0,1.0
1,-1188.0,0.311267,0.0,-16765,-291,0.622246,0,0,0.0,0.0,...,0.0,0.0,0.2917,0,1,2,0,1,0,0.0
2,-225.0,0.0,0.729567,-19046,-2531,0.555912,1,1,0.0,0.0,...,0.0,0.0,0.0,0,1,3,0,0,0,0.0
3,-3039.0,0.0,0.0,-19005,-2437,0.650442,0,1,0.0,0.0,...,0.0,0.0,0.0,0,1,3,0,0,0,0.0
4,-3038.0,0.0,0.0,-19932,-3458,0.322738,1,1,0.0,0.0,...,0.0,0.0,0.0,1,1,3,0,0,0,0.0


In [5]:
# Train/test split used to simulate the reference data (df_ref) and the
# current data (df_cur).

# No stratification, so that the TARGET distribution is free to differ
# between the two sets.
df_ref, df_cur = train_test_split(
    df,
    test_size=0.02,
    random_state=0,
    stratify=None,
)
print(f'Dimension Reference set : {df_ref.shape}')
print(f'Dimension Current set : {df_cur.shape}')

Dimension Reference set : (301356, 51)
Dimension Current set : (6151, 51)


In [6]:
# Compare the TARGET class counts of the two sets
print(f"Distribution du REFERENCE set :\n{df_ref['TARGET'].value_counts()}")
print(50 * '-')
print(f"Distribution du CURRENT set :\n{df_cur['TARGET'].value_counts()}")

Distribution du REFERENCE set :
0.0    277031
1.0     24325
Name: TARGET, dtype: int64
--------------------------------------------------
Distribution du CURRENT set :
0.0    5651
1.0     500
Name: TARGET, dtype: int64


# Data drift

In [7]:
# Drift report restricted to the TARGET column, saved as HTML
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=df_ref[['TARGET']],
    current_data=df_cur[['TARGET']],
    column_mapping=None,
)
data_drift_report.save_html('data_drift_target.html')

In [8]:
# Drift report on the features only: TARGET is removed from both sets first
df_ref = df_ref.drop(columns='TARGET')
df_cur = df_cur.drop(columns='TARGET')
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=df_ref,
    current_data=df_cur,
    column_mapping=None,
)
data_drift_report.save_html('data_drift_feats.html')

# Conclusion
- On n'observe pas de dérive dans les données (pas de changement de comportement)
- C'est attendu : les données de référence et les données courantes proviennent du même dataset (prises au même moment) et sont séparées par un tirage aléatoire (train_test_split)