In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.pipeline import Pipeline

# Retraitement du dataset

In [41]:
# Chargement des données intégrales imputées
X = pd.read_csv('../data_models/X_top10_imputed.csv', index_col=0)

In [42]:
X.head()

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,PREV_DAYS_DECISION_MIN,CODE_GENDER,DAYS_EMPLOYED,PREV_APP_CREDIT_PERC_MIN,INSTAL_DPD_MAX,AMT_CREDIT,DAYS_BIRTH,FLAG_OWN_CAR,NAME_EDUCATION_TYPE_Higher_education,TARGET
0,0.574447,0.09225,1229.0,1.0,335.0,0.627786,0.0,508495.5,20255.0,0.0,0.0,0.0
1,0.7463,0.77968,2688.0,1.0,5862.0,0.906651,0.0,728460.0,21989.0,0.0,0.0,0.0
2,0.15952,0.475448,1632.0,1.0,224.0,0.938086,0.0,239850.0,25054.0,0.0,0.0,0.0
3,0.45611,0.698863,2500.0,0.0,7093.0,0.0,22.0,450000.0,18862.0,0.0,1.0,0.0
4,0.761026,0.658295,2608.0,1.0,224.0,0.0,27.0,1350000.0,21817.0,1.0,0.0,0.0


In [43]:
y = X['TARGET']
X = X.iloc[:, :-1]

In [44]:
X.shape

(292062, 11)

In [45]:
# Split du datset en train et test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2, stratify=y)

In [46]:
X_train

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,PREV_DAYS_DECISION_MIN,CODE_GENDER,DAYS_EMPLOYED,PREV_APP_CREDIT_PERC_MIN,INSTAL_DPD_MAX,AMT_CREDIT,DAYS_BIRTH,FLAG_OWN_CAR,NAME_EDUCATION_TYPE_Higher_education
206999,0.659406,0.433854,635.0,1.0,1060.0,1.111111,0.0,180000.0,7704.0,0.0,0.0
46389,0.698668,0.576064,940.0,1.0,5219.0,1.000000,0.0,1174977.0,13738.0,1.0,0.0
239843,0.746300,0.315776,233.0,1.0,3304.0,0.853534,0.0,170640.0,21865.0,0.0,0.0
80975,0.742182,0.503096,2298.0,1.0,108.0,0.940019,0.0,1288350.0,20871.0,0.0,0.0
273794,0.000527,0.029398,645.0,0.0,1044.0,0.934244,0.0,198666.0,8691.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
150763,0.538863,0.624346,2440.0,0.0,3522.0,0.000000,10.0,1006920.0,15976.0,0.0,0.0
233880,0.652897,0.722165,275.0,0.0,2728.0,0.909918,0.0,159264.0,18358.0,1.0,1.0
110449,0.629674,0.370502,848.0,1.0,230.0,0.904466,0.0,1032093.0,15200.0,0.0,0.0
253042,0.639708,0.664129,2884.0,1.0,4566.0,0.000000,11.0,1078200.0,16060.0,0.0,0.0


In [47]:
X_train.shape

(204443, 11)

In [48]:
X_train['TARGET']=y_train

In [49]:
X_train.to_csv('X_train.csv')

In [50]:
# Sauvegarde du train set en format pickle pour le dashboard
X_train.to_pickle('X_train.pkl')

## Tests de sampling pour les calculs d'explicabilité

In [51]:
def subsample_data(X, y, n_sample=100, seed_temp=1234):
    """Subsample data, stratified by target variable y
    https://github.com/Chancylin/shap_loss/blob/master/helper_functions/shap_help.py
    https://towardsdatascience.com/use-shap-loss-values-to-debug-monitor-your-model-83f7808af40f
    """
    frac = n_sample / X.shape[0]

    data = X.copy(deep=True)
    data["label"] = y
    strata = ["label"]

    data_subsample = data.groupby(strata, group_keys=False)\
        .apply(lambda x: x.sample(frac=frac, replace=False, random_state=seed_temp))

    # assert
    #print("imbalance ratio()")

    return data_subsample[X.columns]

In [52]:
X_train.iloc[:,:-1]

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,PREV_DAYS_DECISION_MIN,CODE_GENDER,DAYS_EMPLOYED,PREV_APP_CREDIT_PERC_MIN,INSTAL_DPD_MAX,AMT_CREDIT,DAYS_BIRTH,FLAG_OWN_CAR,NAME_EDUCATION_TYPE_Higher_education
206999,0.659406,0.433854,635.0,1.0,1060.0,1.111111,0.0,180000.0,7704.0,0.0,0.0
46389,0.698668,0.576064,940.0,1.0,5219.0,1.000000,0.0,1174977.0,13738.0,1.0,0.0
239843,0.746300,0.315776,233.0,1.0,3304.0,0.853534,0.0,170640.0,21865.0,0.0,0.0
80975,0.742182,0.503096,2298.0,1.0,108.0,0.940019,0.0,1288350.0,20871.0,0.0,0.0
273794,0.000527,0.029398,645.0,0.0,1044.0,0.934244,0.0,198666.0,8691.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
150763,0.538863,0.624346,2440.0,0.0,3522.0,0.000000,10.0,1006920.0,15976.0,0.0,0.0
233880,0.652897,0.722165,275.0,0.0,2728.0,0.909918,0.0,159264.0,18358.0,1.0,1.0
110449,0.629674,0.370502,848.0,1.0,230.0,0.904466,0.0,1032093.0,15200.0,0.0,0.0
253042,0.639708,0.664129,2884.0,1.0,4566.0,0.000000,11.0,1078200.0,16060.0,0.0,0.0


In [53]:
y_train.shape

(204443,)

In [54]:
X_train_sample = subsample_data(X_train, y_train, n_sample=10000)

In [55]:
X_train_sample.shape

(10000, 12)

In [56]:
len(X_train_sample[X_train_sample.TARGET==0])/len(X_train_sample[X_train_sample.TARGET==1])

11.21001221001221

In [57]:
X_train_sample.describe()

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,PREV_DAYS_DECISION_MIN,CODE_GENDER,DAYS_EMPLOYED,PREV_APP_CREDIT_PERC_MIN,INSTAL_DPD_MAX,AMT_CREDIT,DAYS_BIRTH,FLAG_OWN_CAR,NAME_EDUCATION_TYPE_Higher_education,TARGET
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.544732,0.513462,1547.0658,0.6631,1985.4694,0.789689,19.0309,583529.9,16034.9933,0.3326,0.227,0.0819
std,0.210742,0.190432,865.933485,0.472674,2232.396532,0.355928,113.319812,387469.8,4358.852949,0.471168,0.418914,0.274226
min,0.000527,0.000127,2.0,0.0,15.0,0.0,0.0,45000.0,7685.0,0.0,0.0,0.0
25%,0.401407,0.389132,729.0,0.0,305.75,0.812348,0.0,270000.0,12439.75,0.0,0.0,0.0
50%,0.591977,0.565526,1513.5,1.0,1213.0,0.89842,1.0,508495.5,15732.0,0.0,0.0,0.0
75%,0.7463,0.66202,2407.0,1.0,2780.25,1.0,9.0,792162.0,19654.25,1.0,0.0,0.0
max,0.872456,0.821714,2922.0,1.0,15860.0,3.166553,2751.0,2700000.0,25196.0,1.0,1.0,1.0


In [58]:
X_train_sample.to_csv('X_train_sample.csv')

In [59]:
X_train_sample.to_pickle('X_train_sample.pkl')

# chargement des données retraitées avec undersampling

In [60]:
X_rus_train = pd.read_csv('../data_models/X_train_imputed.csv', index_col=0)

In [64]:
X_rus_test = pd.read_csv('../data_models/X_test_imputed.csv', index_col=0)

In [65]:
X_rus_train.shape

(33492, 12)

In [66]:
X_rus_train

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,PREV_DAYS_DECISION_MIN,CODE_GENDER,DAYS_EMPLOYED,PREV_APP_CREDIT_PERC_MIN,INSTAL_DPD_MAX,AMT_CREDIT,DAYS_BIRTH,FLAG_OWN_CAR,NAME_EDUCATION_TYPE_Higher_education,TARGET
0,0.547810,0.464989,2467.0,1.0,1161.0,1.026477,25.0,700830.0,15920.0,0.0,0.0,0.0
1,0.684828,0.284672,617.0,0.0,6830.0,1.000000,0.0,1042560.0,14970.0,1.0,1.0,0.0
2,0.340906,0.682258,1307.0,1.0,1045.0,0.834716,6.0,112500.0,10975.0,0.0,1.0,0.0
3,0.778904,0.750705,2823.0,0.0,8100.0,0.000000,80.0,808650.0,14948.0,1.0,0.0,0.0
4,0.713631,0.419807,422.0,1.0,751.0,0.825616,0.0,509602.5,11680.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
33487,0.162442,0.018309,1715.0,0.0,465.0,1.000000,0.0,153000.0,10364.0,1.0,0.0,1.0
33488,0.429424,0.550870,900.0,0.0,1689.0,0.810180,1.0,101880.0,11919.0,0.0,0.0,1.0
33489,0.000527,0.474772,923.0,1.0,234.0,0.920967,0.0,568800.0,21127.0,0.0,0.0,1.0
33490,0.218859,0.721287,705.0,1.0,358.0,0.844024,0.0,521280.0,11065.0,0.0,0.0,1.0


In [67]:
y_rus_train = X_rus_train['TARGET']
y_rus_test = X_rus_test['TARGET']

In [68]:
X_rus_train_sample = subsample_data(X_rus_train, y_rus_train, n_sample=700)
X_rus_test_sample = subsample_data(X_rus_test, y_rus_test, n_sample=300)

In [69]:
# Sauvegarde en format pickle pour utilisation dans le dashboard pour la partie explicabilité
X_rus_train_sample.to_csv('X_rus_train_sample.csv')
X_rus_train_sample.to_pickle('X_rus_train_sample.pkl')
X_rus_test_sample.to_csv('X_rus_test_sample.csv')

In [22]:
X_rus_train.to_csv('X_train_raw.csv')
X_rus_train.to_pickle('X_train_raw.pkl')

In [133]:
steps = [('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
                    ('scaler', StandardScaler()),
                    ('model', LogisticRegression(penalty='l2',
                                                 C=0.01,
                                                 solver='liblinear'))]
pipeline = Pipeline(steps=steps)

In [134]:
pipeline.fit(X_rus_train, y_rus_train)

In [135]:
### Create a Pickle file using serialization 
pickle_out = open("logreg_shap.pkl","wb")
pickle.dump(pipeline, pickle_out)
pickle_out.close()

In [77]:
# Imputation des valeurs manquantes
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_imputed = imputer.fit_transform(X_rus_train)

In [78]:
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)

In [79]:
X_train_imputed

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,CODE_GENDER,FLAG_DOCUMENT_3,INSTAL_DPD_MEAN,FLAG_OWN_CAR,PREV_NAME_CONTRACT_STATUS_Approved_MEAN,NAME_CONTRACT_TYPE_Cash loans,...,PREV_NAME_YIELD_GROUP_low_normal_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_MAX,PREV_NAME_PRODUCT_TYPE_walk-in_MEAN,OCCUPATION_TYPE_Drivers,ACTIVE_DAYS_CREDIT_MAX,CLOSED_AMT_CREDIT_SUM_MAX,APPROVED_AMT_ANNUITY_MEAN,FLOORSMAX_AVG,INSTAL_DPD_MAX,INSTAL_DAYS_ENTRY_PAYMENT_SUM
0,0.455763,0.615644,0.0,0.0,1.0,0.0,0.000000,0.0,1.000000,1.0,...,0.5,-36.0,0.0,0.0,-374.994296,474778.067388,13915.8225,0.166700,0.0,-14815.0
1,0.106156,0.372754,0.0,0.0,1.0,1.0,9.543860,0.0,0.800000,1.0,...,0.2,-33.0,0.0,0.0,-54.000000,135000.000000,11192.6025,0.333300,122.0,-71899.0
2,0.746300,0.265107,0.0,0.0,1.0,1.0,0.421053,0.0,0.714286,1.0,...,0.0,-607.0,0.0,0.0,-254.000000,540000.000000,17082.7470,0.216534,6.0,-28967.0
3,0.452534,0.360245,1.0,0.0,1.0,1.0,32.507463,0.0,0.600000,1.0,...,0.2,-21.0,0.5,0.0,-65.000000,87795.000000,5189.3250,0.166700,2070.0,-39276.0
4,0.513694,0.685332,1.0,1.0,1.0,1.0,0.444444,0.0,1.000000,1.0,...,0.0,-1916.0,0.0,0.0,-253.000000,630000.000000,14764.5000,0.216534,6.0,-62859.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34749,0.576209,0.578252,0.0,1.0,1.0,1.0,0.000000,1.0,1.000000,1.0,...,1.0,-289.0,0.0,0.0,-327.000000,91677.870000,9767.5200,0.216534,0.0,-5392.0
34750,0.563835,0.592945,1.0,0.0,0.0,1.0,18.200000,0.0,0.200000,1.0,...,0.2,-2260.0,0.0,0.0,-1103.000000,87471.000000,14634.5850,0.216534,91.0,-11897.0
34751,0.522697,0.371891,1.0,0.0,1.0,1.0,0.000000,0.0,1.000000,1.0,...,0.0,-1216.0,0.0,0.0,-727.000000,474778.067388,5240.9700,0.216534,0.0,-16537.0
34752,0.455763,0.594701,0.0,0.0,1.0,1.0,0.296774,0.0,0.666667,1.0,...,0.5,-14.0,0.0,0.0,-374.994296,474778.067388,30709.4850,0.208300,19.0,-209777.0


In [81]:
X_train_imputed = X_train_imputed[features_names]

In [96]:
X_train_imputed['TARGET'] = y_rus_train.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_imputed['TARGET'] = y_rus_train.astype(int)


In [97]:
X_train_imputed

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,AMT_CREDIT,FLAG_DOCUMENT_3,AMT_GOODS_PRICE,CODE_GENDER,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,DAYS_EMPLOYED,NAME_INCOME_TYPE_Working,TARGET
0,0.455763,0.615644,450000.0,0.0,450000.0,1.0,-36.0,-352.738095,-1063.000000,0.0,0
1,0.106156,0.372754,254700.0,1.0,225000.0,1.0,-33.0,-1261.385965,-2106.798239,0.0,0
2,0.746300,0.265107,1129500.0,1.0,1129500.0,1.0,-607.0,-1524.578947,-2811.000000,0.0,0
3,0.452534,0.360245,612000.0,1.0,612000.0,1.0,-21.0,-586.208955,-432.000000,1.0,0
4,0.513694,0.685332,166810.5,1.0,144000.0,1.0,-1916.0,-2328.111111,-7516.000000,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
34749,0.576209,0.578252,545040.0,1.0,450000.0,1.0,-289.0,-449.333333,-154.000000,0.0,1
34750,0.563835,0.592945,239850.0,1.0,225000.0,0.0,-2260.0,-2379.400000,-308.000000,1.0,1
34751,0.522697,0.371891,499221.0,1.0,373500.0,1.0,-1216.0,-1378.083333,-282.000000,1.0,1
34752,0.455763,0.594701,311877.0,1.0,252000.0,1.0,-14.0,-1353.400000,-2106.798239,0.0,1


In [98]:
X_train_imputed.columns

Index(['EXT_SOURCE_3', 'EXT_SOURCE_2', 'AMT_CREDIT', 'FLAG_DOCUMENT_3',
       'AMT_GOODS_PRICE', 'CODE_GENDER', 'INSTAL_DAYS_ENTRY_PAYMENT_MAX',
       'INSTAL_DAYS_ENTRY_PAYMENT_MEAN', 'DAYS_EMPLOYED',
       'NAME_INCOME_TYPE_Working', 'TARGET'],
      dtype='object')

In [99]:
# save imputed data before standardisation
X_train_imputed.to_csv('X_train_imputed.csv')

In [95]:
X_train_imputed.iloc[:, :-1]

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,AMT_CREDIT,FLAG_DOCUMENT_3,AMT_GOODS_PRICE,CODE_GENDER,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,DAYS_EMPLOYED,NAME_INCOME_TYPE_Working
0,0.455763,0.615644,450000.0,0.0,450000.0,1.0,-36.0,-352.738095,-1063.000000,0.0
1,0.106156,0.372754,254700.0,1.0,225000.0,1.0,-33.0,-1261.385965,-2106.798239,0.0
2,0.746300,0.265107,1129500.0,1.0,1129500.0,1.0,-607.0,-1524.578947,-2811.000000,0.0
3,0.452534,0.360245,612000.0,1.0,612000.0,1.0,-21.0,-586.208955,-432.000000,1.0
4,0.513694,0.685332,166810.5,1.0,144000.0,1.0,-1916.0,-2328.111111,-7516.000000,1.0
...,...,...,...,...,...,...,...,...,...,...
34749,0.576209,0.578252,545040.0,1.0,450000.0,1.0,-289.0,-449.333333,-154.000000,0.0
34750,0.563835,0.592945,239850.0,1.0,225000.0,0.0,-2260.0,-2379.400000,-308.000000,1.0
34751,0.522697,0.371891,499221.0,1.0,373500.0,1.0,-1216.0,-1378.083333,-282.000000,1.0
34752,0.455763,0.594701,311877.0,1.0,252000.0,1.0,-14.0,-1353.400000,-2106.798239,0.0


In [100]:
# standardisation
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_imputed.iloc[:, :-1])

In [101]:
X_train_std

array([[-2.99370561e-16,  7.14313515e-01, -3.37933642e-01, ...,
         9.00860113e-01,  5.32952317e-01, -1.12833397e+00],
       [-1.88542733e+00, -4.53871524e-01, -8.55860462e-01, ...,
        -6.43142969e-01,  0.00000000e+00, -1.12833397e+00],
       [ 1.56686002e+00, -9.71602405e-01,  1.46406981e+00, ...,
        -1.09036877e+00, -3.59557955e-01, -1.12833397e+00],
       ...,
       [ 3.60973944e-01, -4.58025319e-01, -2.07401763e-01, ...,
        -8.41438813e-01,  9.31722639e-01,  8.86262424e-01],
       [-2.99370561e-16,  6.13586367e-01, -7.04229629e-01, ...,
        -7.99496115e-01,  0.00000000e+00, -1.12833397e+00],
       [-2.99370561e-16,  9.90381419e-01,  6.56080103e-01, ...,
         1.17767302e+00,  0.00000000e+00, -1.12833397e+00]])

In [106]:
X_train_std[0]

array([-2.99370561e-16,  7.14313515e-01, -3.37933642e-01, -1.67740060e+00,
       -1.83705114e-01,  7.86925496e-01,  5.55543537e-01,  9.00860113e-01,
        5.32952317e-01, -1.12833397e+00])

In [107]:
y_rus_train.shape

(34754,)

In [105]:
# Régression logistique
lr = LogisticRegression(penalty='l2',
                        C=0.01,
                        solver='liblinear')
# Entrainement du modèle
lr.fit(X_train_std, y_rus_train)