# Recherche des features importantes

# Chargement des librairies

In [1]:
import pandas as pd
import numpy as np
import  matplotlib.pyplot as plt
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Chargement du dataset

In [2]:
# X = pd.read_csv('../credit_train.csv', nrows=150000, index_col=0)
# Lecture du fichier par lot afin d'éviter les crashes
TextFileReader = pd.read_csv('../data_models/credit_train.csv',
                             chunksize=10000,
                             nrows=None,
                             index_col=0)

In [3]:
dfList = []
for df in TextFileReader:
    dfList.append(df)

X = pd.concat(dfList,sort=False)

In [4]:
# Il faut remplacer les valeurs infinies par nan, autrement XGBoost plante
X = X.replace(np.inf, np.nan)

In [5]:
# Suppression des espaces dans le nom des features
X.columns = X.columns.str.replace(' ', '_')

In [6]:
X.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,...,,,,,,,,,,
1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,...,,,,,,,,,,
2,100004,0.0,0,1,0,0,67500.0,135000.0,6750.0,135000.0,...,,,,,,,,,,
3,100006,0.0,1,0,0,0,135000.0,312682.5,29686.5,297000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,100007,0.0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,...,,,,,,,,,,


In [7]:
X.shape

(307511, 797)

In [8]:
# Création de la target
y = X['TARGET']

In [9]:
# Suppression de la target de la matrice X
X = X.drop(columns='TARGET')

In [10]:
X.head()

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,100002,0,0,0,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,,,,,,,,,,
1,100003,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,,,,,,,,,,
2,100004,0,1,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,,,,,,,,,,
3,100006,1,0,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,100007,0,0,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,,,,,,,,,,


# Préparation des données

## Suppression des variables peu renseignées

Un module utile  
https://github.com/WillKoehrsen/feature-selector/blob/master/feature_selector/feature_selector.py

### Suppression des variables ayant plus de 50% de valeurs manquantes

In [11]:
# pct de valeurs manquantes
percent_missing = X.isnull().sum() * 100 / len(X)

In [12]:
# Tri par ordre décroissant du pct de valeurs manquantes
missing_value_X = pd.DataFrame({'percent_missing': percent_missing})

missing_value_X.sort_values('percent_missing', inplace=True, ascending=False)
missing_value_X.head()

Unnamed: 0,percent_missing
REFUSED_AMT_DOWN_PAYMENT_MIN,85.311095
REFUSED_RATE_DOWN_PAYMENT_MAX,85.311095
REFUSED_AMT_DOWN_PAYMENT_MAX,85.311095
REFUSED_RATE_DOWN_PAYMENT_MIN,85.311095
REFUSED_AMT_DOWN_PAYMENT_MEAN,85.311095


In [13]:
# Effacer les colonnes avec plus de 50% de valeurs manquantes
col_low_missing = missing_value_X[missing_value_X.percent_missing>50].index

In [14]:
# Nom des colonnes à supprimer
col_low_missing

Index(['REFUSED_AMT_DOWN_PAYMENT_MIN', 'REFUSED_RATE_DOWN_PAYMENT_MAX',
       'REFUSED_AMT_DOWN_PAYMENT_MAX', 'REFUSED_RATE_DOWN_PAYMENT_MIN',
       'REFUSED_AMT_DOWN_PAYMENT_MEAN', 'REFUSED_RATE_DOWN_PAYMENT_MEAN',
       'REFUSED_APP_CREDIT_PERC_VAR', 'CC_AMT_PAYMENT_CURRENT_VAR',
       'CC_CNT_DRAWINGS_POS_CURRENT_VAR', 'CC_AMT_DRAWINGS_OTHER_CURRENT_VAR',
       ...
       'CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN', 'APARTMENTS_MEDI',
       'APARTMENTS_MODE', 'APARTMENTS_AVG', 'ENTRANCES_AVG', 'ENTRANCES_MODE',
       'ENTRANCES_MEDI', 'LIVINGAREA_AVG', 'LIVINGAREA_MODE',
       'LIVINGAREA_MEDI'],
      dtype='object', length=236)

In [15]:
# Suppression de ces colonnes
X = X.drop(columns=col_low_missing)

In [16]:
X.shape

(307511, 560)

In [17]:
X.columns

Index(['SK_ID_CURR', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       ...
       'INSTAL_AMT_INSTALMENT_MEAN', 'INSTAL_AMT_INSTALMENT_SUM',
       'INSTAL_AMT_PAYMENT_MIN', 'INSTAL_AMT_PAYMENT_MAX',
       'INSTAL_AMT_PAYMENT_MEAN', 'INSTAL_AMT_PAYMENT_SUM',
       'INSTAL_DAYS_ENTRY_PAYMENT_MAX', 'INSTAL_DAYS_ENTRY_PAYMENT_MEAN',
       'INSTAL_DAYS_ENTRY_PAYMENT_SUM', 'INSTAL_COUNT'],
      dtype='object', length=560)

## Utilisation des méthodes de filtre

### Suppression des features trop corrélées

Dans un modèle de Machine Learning, il est recommandé d'utiliser des variables faiblement corrélées. On va donc supprimer les variables dont la valuer absolue du coefficient de corrélation de Pearson est supérieur à 0.5.

In [18]:
# Create correlation matrix
corr_matrix = X.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.5
to_drop = [column for column in upper.columns if any(upper[column] > 0.5)]

In [19]:
# Nom des colonnes à supprimer
to_drop

['AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'FLAG_EMP_PHONE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'LIVE_REGION_NOT_WORK_REGION',
 'LIVE_CITY_NOT_WORK_CITY',
 'YEARS_BEGINEXPLUATATION_MODE',
 'FLOORSMAX_MODE',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'FLOORSMAX_MEDI',
 'TOTALAREA_MODE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'FLAG_DOCUMENT_6',
 'NAME_CONTRACT_TYPE_Revolving_loans',
 'NAME_TYPE_SUITE_Unaccompanied',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_INCOME_TYPE_Working',
 'NAME_EDUCATION_TYPE_Secondary_/_secondary_special',
 'NAME_FAMILY_STATUS_Married',
 'NAME_FAMILY_STATUS_Single_/_not_married',
 'NAME_HOUSING_TYPE_Municipal_apartment',
 'NAME_HOUSING_TYPE_With_parents',
 'ORGANIZATION_TYPE_Medicine',
 'ORGANIZATION_TYPE_Security',
 'ORGANIZATION_TYPE_XNA',
 'HOUSETYPE_MODE_block_of_flats',
 'WALLSMATERIAL_MODE_Panel',
 'EMERGENCYSTATE_MODE_No',
 'DAYS_EMPLOYED_PERC',
 'INCOME_CREDIT_PERC',
 'INCOME_PER_PERSON',
 'PAYMENT_RATE',
 'BUR

In [20]:
# Drop features 
X.drop(to_drop, axis=1, inplace=True)

In [21]:
X.shape  # Après suppression

(307511, 351)

### Suppression des features dont la variance est trop faible

https://openclassrooms.com/fr/courses/6401081-improve-the-performance-of-a-machine-learning-model/6539936-improve-your-feature-selection

La fonction VarianceThreshold de scikit-Learn permet de supprimer les features peu importantes. Elle utilise un critère de variance comme critère de tri. Suppression des features qui ont une variance très faible, c'est à dire des valeurs qui varient très peu.

In [22]:
# Perform feature selection using a variance threshold
# Suppression des features dont variance < 2%
sel = VarianceThreshold(threshold=(0.02))
sel.fit(X)

print("Feature selection", sel.get_support())
print("Selected features:", list(X.columns[sel.get_support()]))
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print("Removed features:", list(X.columns[~sel.get_support()]))

Feature selection [ True  True  True  True  True  True  True False  True  True  True  True
 False  True False  True  True  True False  True  True  True  True  True
 False  True  True  True  True False  True False False False  True False
 False False False False False False False False False False False False
 False False  True  True  True  True  True False  True False False False
  True False  True False  True False False False  True  True False  True
  True False  True False  True False False  True False False  True  True
 False  True False  True False  True  True False False  True False  True
 False  True  True  True  True  True  True  True False False False False
  True  True False  True False False False  True False False False False
 False False False False False False False False False False False False
  True False False False  True False False False False False  True False
  True False False False False False False False False  True False False
 False False False False False  T

In [23]:
# Transform (remove low variance features)
X = pd.DataFrame(sel.transform(X), columns=X.columns[sel.get_support()])

In [24]:
# Après transformation il reste 200 colonnes.
X.shape

(307511, 130)

### Suppression des lignes peu renseignées

In [25]:
# Analyse du taux de remplissage des lignes
pct_missing_per_line = (X.isnull().sum(axis=1)/len(X.columns)).sort_values(ascending=False)
pct_missing_per_line

269492    0.530769
258474    0.530769
53550     0.530769
133770    0.530769
215458    0.530769
            ...   
177301    0.000000
177296    0.000000
177291    0.000000
177286    0.000000
307510    0.000000
Length: 307511, dtype: float64

Les lignes les moins bien renseignées ont 60% de valeurs manquantes.

In [26]:
pct_missing_per_line.unique()

array([0.53076923, 0.52307692, 0.51538462, 0.50769231, 0.5       ,
       0.49230769, 0.48461538, 0.47692308, 0.46923077, 0.46153846,
       0.45384615, 0.44615385, 0.43846154, 0.43076923, 0.42307692,
       0.41538462, 0.40769231, 0.4       , 0.39230769, 0.38461538,
       0.37692308, 0.36923077, 0.36153846, 0.35384615, 0.34615385,
       0.33846154, 0.33076923, 0.32307692, 0.31538462, 0.30769231,
       0.3       , 0.29230769, 0.28461538, 0.27692308, 0.26923077,
       0.26153846, 0.25384615, 0.24615385, 0.23846154, 0.23076923,
       0.22307692, 0.21538462, 0.20769231, 0.2       , 0.19230769,
       0.18461538, 0.17692308, 0.16923077, 0.16153846, 0.15384615,
       0.14615385, 0.13846154, 0.13076923, 0.12307692, 0.11538462,
       0.10769231, 0.1       , 0.09230769, 0.08461538, 0.07692308,
       0.06923077, 0.06153846, 0.05384615, 0.04615385, 0.03846154,
       0.03076923, 0.02307692, 0.01538462, 0.00769231, 0.        ])

In [27]:
# Si on retient un seuil max de 30% de valeurs manquantes
# On élimine 16097 lignes sur 307511, soit environ 5%
pct_missing_per_line[pct_missing_per_line>=0.3].index

Int64Index([269492, 258474,  53550, 133770, 215458, 233257, 116937,  54457,
            138913,  12087,
            ...
            196186,  15339,  18079,  61055, 174862,  82015, 287161, 151286,
             47573,  11385],
           dtype='int64', length=15449)

In [28]:
# Stockage de l'index des lignes peu renseignées
index_to_del = pct_missing_per_line[pct_missing_per_line>=0.3].index

In [29]:
# Target associée aux lignes peu renseignées
y[index_to_del]

269492    0.0
258474    0.0
53550     0.0
133770    0.0
215458    0.0
         ... 
82015     0.0
287161    0.0
151286    0.0
47573     0.0
11385     0.0
Name: TARGET, Length: 15449, dtype: float64

In [30]:
# Nombre de lignes pour lesquelles y=1 parmis les lignes à supprimer
y[index_to_del].sum()

902.0

In [31]:
# On supprime plus de lignes de la classe majoritaire (y=0)
16097/947

16.997888067581837

In [32]:
# Suppression des lignes dans matrice X
X.drop(index_to_del, inplace=True)

In [33]:
X.shape

(292062, 130)

In [34]:
# Suppression des lignes dans matrice y
y.drop(index_to_del, inplace=True)

In [35]:
y.shape

(292062,)

In [36]:
# Comptage de la répartition après suppression
Counter(y)

Counter({1.0: 23923, 0.0: 268139})

In [37]:
weight = int(Counter(y)[0]/Counter(y)[1])
weight

11

Le rapport entre les classes 0 et 1 n'est pas modifié.

## Création des jeux de test et d'entrainement post cleaning

In [38]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1, stratify=y)

In [39]:
X_train.shape

(195681, 130)

In [40]:
X_train.head()

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,PREV_NAME_YIELD_GROUP_low_normal_MEAN,PREV_NAME_YIELD_GROUP_middle_MEAN,POS_SK_DPD_MAX,POS_SK_DPD_DEF_MAX,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,INSTAL_DPD_MAX,INSTAL_DBD_MAX,INSTAL_PAYMENT_PERC_MAX,INSTAL_PAYMENT_DIFF_MAX,INSTAL_PAYMENT_DIFF_MEAN
215169,349328.0,1.0,0.0,1.0,0.0,99000.0,508495.5,-20255.0,-335.0,-8392.0,...,0.0,0.5,0.0,0.0,2.0,0.0,16.0,1.0,0.0,0.0
275171,418907.0,1.0,0.0,1.0,0.0,184500.0,728460.0,-21989.0,-5862.0,-11270.0,...,0.142857,0.142857,0.0,0.0,2.0,0.0,56.0,1.0,0.0,0.0
139526,261775.0,1.0,0.0,0.0,0.0,112500.0,239850.0,-25054.0,,-2306.0,...,0.111111,0.333333,0.0,0.0,3.0,0.0,44.0,1.0,0.0,0.0
283387,428191.0,0.0,0.0,0.0,1.0,360000.0,450000.0,-18862.0,-7093.0,-4303.0,...,0.285714,0.357143,0.0,0.0,3.0,22.0,44.0,1.0,28395.09,315.81
14341,116728.0,1.0,1.0,0.0,0.0,112500.0,1350000.0,-21817.0,,-1363.0,...,0.25,0.25,0.0,0.0,3.0,27.0,25.0,1.0,4342.5,34.615385


In [41]:
X_train.head()

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,PREV_NAME_YIELD_GROUP_low_normal_MEAN,PREV_NAME_YIELD_GROUP_middle_MEAN,POS_SK_DPD_MAX,POS_SK_DPD_DEF_MAX,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,INSTAL_DPD_MAX,INSTAL_DBD_MAX,INSTAL_PAYMENT_PERC_MAX,INSTAL_PAYMENT_DIFF_MAX,INSTAL_PAYMENT_DIFF_MEAN
215169,349328.0,1.0,0.0,1.0,0.0,99000.0,508495.5,-20255.0,-335.0,-8392.0,...,0.0,0.5,0.0,0.0,2.0,0.0,16.0,1.0,0.0,0.0
275171,418907.0,1.0,0.0,1.0,0.0,184500.0,728460.0,-21989.0,-5862.0,-11270.0,...,0.142857,0.142857,0.0,0.0,2.0,0.0,56.0,1.0,0.0,0.0
139526,261775.0,1.0,0.0,0.0,0.0,112500.0,239850.0,-25054.0,,-2306.0,...,0.111111,0.333333,0.0,0.0,3.0,0.0,44.0,1.0,0.0,0.0
283387,428191.0,0.0,0.0,0.0,1.0,360000.0,450000.0,-18862.0,-7093.0,-4303.0,...,0.285714,0.357143,0.0,0.0,3.0,22.0,44.0,1.0,28395.09,315.81
14341,116728.0,1.0,1.0,0.0,0.0,112500.0,1350000.0,-21817.0,,-1363.0,...,0.25,0.25,0.0,0.0,3.0,27.0,25.0,1.0,4342.5,34.615385


## Sauvegarde des tables pour la prochaine étape (voir Notebook 5)

In [42]:
X_train.to_csv('../data_models/X_train_post_preprocess.csv')

In [43]:
X_test.to_csv('../data_models/X_test_post_preprocess.csv')

In [44]:
y_train.to_csv('../data_models/y_train_post_preprocess.csv')
y_test.to_csv('../data_models/y_test_post_preprocess.csv')