# Chargement du modèle

In [26]:
# Load libraries
import sklearn
import joblib

# Get scikit-learn version
scikit_version = sklearn.__version__

# Load the model
pipe = joblib.load("models/model_{version}.pkl".format(version=scikit_version))

# display the model
pipe

Pipeline(memory=None,
         steps=[('preprocess',
                 Pipeline(memory=None,
                          steps=[('scale',
                                  StandardScaler(copy=True, with_mean=True,
                                                 with_std=True))],
                          verbose=False)),
                ('model',
                 LogisticRegression(C=1.3348904828439347,
                                    class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=477,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear',
                                    tol=2.0533801171184104e-05, verbose=0,
                                    warm_start=True))],
         verbose=False)

# Préparation des données transformées

In [27]:
# load libraries
import pandas as pd

data_train_featured = pd.read_csv('data/cleaned/data_train_featured.csv', index_col='SK_ID_CURR')
print('Featured training data set shape: ', data_train_featured.shape)
data_train_featured.head()

Featured training data set shape:  (288028, 216)


Unnamed: 0_level_0,EXT_SOURCE_2,client_installments_AMT_PAYMENT_min_sum,DAYS_BIRTH,AMT_CREDIT,AMT_ANNUITY,DAYS_EMPLOYED,DAYS_ID_PUBLISH,DAYS_REGISTRATION,previous_loans_CNT_PAYMENT_mean,client_cash_CNT_INSTALMENT_FUTURE_min_max,...,previous_loans_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_count_norm,previous_loans_NAME_CONTRACT_STATUS_Approved_count,client_cash_CNT_INSTALMENT_min_max,"WALLSMATERIAL_MODE_Stone, brick",previous_loans_NAME_TYPE_SUITE_Children_count_norm,previous_loans_PRODUCT_COMBINATION_Cash_count_norm,previous_loans_NAME_GOODS_CATEGORY_Sport and Leisure_count_norm,previous_loans_NAME_SELLER_INDUSTRY_Consumer electronics_count,client_cash_CNT_INSTALMENT_min_min,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,0.262949,175783.725,-9461,406597.5,24700.5,-637,-2120,-3648.0,24.0,6.0,...,0.0,1.0,24.0,1,0.0,0.0,0.0,0.0,24.0,1
100003,0.622246,1154108.295,-16765,1293502.5,35698.5,-1188,-291,-1186.0,10.0,1.0,...,0.0,3.0,12.0,0,0.0,0.0,0.0,1.0,6.0,0
100004,0.555912,16071.75,-19046,135000.0,6750.0,-225,-2531,-4260.0,4.0,0.0,...,0.0,1.0,3.0,0,0.0,0.0,0.0,0.0,3.0,0
100006,0.650442,994476.69,-19005,312682.5,29686.5,-3039,-2437,-9833.0,23.0,3.0,...,0.0,5.0,12.0,0,0.0,0.222222,0.0,1.0,1.0,0
100007,0.322738,483756.39,-19932,513000.0,21865.5,-3038,-3458,-4311.0,20.666667,13.0,...,0.0,6.0,24.0,0,0.0,0.0,0.0,3.0,10.0,0


In [28]:
from sklearn.impute import SimpleImputer

# Split features and targets
target_train = data_train_featured['TARGET']
data_train_featured = data_train_featured.drop(columns='TARGET')
feature_names = data_train_featured.columns
index = data_train_featured.index

# Imputations
imp_mean = SimpleImputer(strategy='mean')
imp_mean.fit(data_train_featured)
data_train_featured = pd.DataFrame(
    imp_mean.transform(data_train_featured),
    columns=feature_names,
    index=index
)

# Préparation des données initiales

In [30]:
import pandas as pd

data_train_original = pd.read_csv('data/input/application_train.csv', index_col='SK_ID_CURR')
print('Original training set (application_set) shape: ', data_train_original.shape)

Original training set (application_set) shape:  (307511, 121)


In [34]:
mask = data_train_original.index.isin(data_train_featured.index)

# Keep only points that are in the engineered set
data_train_original = data_train_original[mask]

# Drop target column
data_train_original = data_train_original.drop(columns='TARGET')

In [40]:
# Get the predictions of the model (for positive class: default)
y_train_pred = pipe.predict_proba(data_train_featured)[:,1]
y_train_pred = pd.Series(
    y_train_pred,
    index=data_train_featured.index,
)

# Save the predictions
y_train_pred.to_csv(
    'data/cleaned/target_train_predictions.csv',
    header='TARGET')

In [None]:
from sklearn.impute import SimpleImputer

# Imputation of missing values for numerical features
numerical_features = list(data_train_original.select_dtypes(include='number').columns)
data_train_original[numerical_features] = SimpleImputer(strategy='mean').fit_transform(data_train_original[numerical_features])

# Imputation of missing values for categorical features
categorical_features = list(data_train_original.select_dtypes(include='object').columns)
data_train_original[categorical_features] = SimpleImputer(strategy='constant', fill_value='missing').fit_transform(data_train_original[categorical_features])

# Display results
print("Remaining missing values:", data_train_original.isna().any().any())

In [46]:
# label encoding of each categorical feature
categorical_names = {} # dictionnary of modalities for each cat. feature
label_encoders = {} # dictionnary of encoders for each cat. feature
for feature in categorical_features:
    le = sklearn.preprocessing.LabelEncoder()
    data_train_original.loc[:, feature] = le.fit_transform(data_train_original.loc[:, feature])
    categorical_names[feature] = le.classes_
    label_encoders[feature] = le

# Save the processed original data
data_train_original.to_csv('data/cleaned/data_train_original.csv')

# Modèle de substitution (*Surrogate model*)

## Interprétation globale

In [50]:
from sklearn.tree import DecisionTreeRegressor

# Instanciate a surrogate model without depth limit to overfit
sur_dt = DecisionTreeRegressor()

# Over-fitting the surrogate model on original features
sur_dt.fit(data_train_original, y_train_pred)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [132]:
print("Main features per importance:")

sum_val = 0
for col, val in sorted(zip(data_train_original.columns, sur_dt.feature_importances_,), key=lambda x: x[1], reverse=True,)[:10]:
    print(f"{col:28}{val:10.3f}")
    sum_val += val
    
print("Percentage of the model explained by the 10 first features:", sum_val*100)

Main features per importance:
EXT_SOURCE_2                     0.368
DAYS_BIRTH                       0.103
NAME_EDUCATION_TYPE              0.040
EXT_SOURCE_3                     0.039
DAYS_LAST_PHONE_CHANGE           0.037
CODE_GENDER                      0.035
DAYS_ID_PUBLISH                  0.031
DAYS_REGISTRATION                0.023
DAYS_EMPLOYED                    0.019
AMT_ANNUITY                      0.016
Percentage of the model explained by the 10 first features: 71.16472795408532


## Interprétation locale

In [129]:
application_id = 100002
application_data = data_train_original.loc[application_id:application_id]

print("Application ID:", application_id)
print("Predicted value:", y_train_pred.loc[application_id])

Application ID: 100002
Predicted value: 0.8417595358522602


In [130]:
from treeinterpreter import treeinterpreter as ti

# Computation of the prediction, bias and contribs from surrogate model
prediction, bias, contribs = ti.predict(sur_dt, application_data)

print("Prediction:", prediction[i])

print("Bias (trainset mean):", bias[i])

print("Main features contributions:")
for contrib, feature in sorted(zip(contribs[0], data_train_original.columns), key=lambda x: abs(x[0]), reverse=True,):
    if contrib != 0:
        print("   {:32}{}".format(feature, contrib))

Prediction: [0.84175954]
Bias (trainset mean): 0.4246449135303252
Main features contributions:
   EXT_SOURCE_2                    0.12542015088740555
   DAYS_BIRTH                      0.06348533572265769
   CODE_GENDER                     0.05041459286843275
   DEF_30_CNT_SOCIAL_CIRCLE        0.04755578403967442
   AMT_GOODS_PRICE                 0.027402686138232935
   NAME_EDUCATION_TYPE             0.02258049054571709
   DAYS_LAST_PHONE_CHANGE          0.0183454766541562
   DAYS_ID_PUBLISH                 0.017498736200626253
   HOUR_APPR_PROCESS_START         -0.0136952881865966
   FLAG_DOCUMENT_3                 0.012374783640661469
   NAME_CONTRACT_TYPE              0.009710204595902328
   YEARS_BEGINEXPLUATATION_MODE    0.00941549145422238
   REG_CITY_NOT_LIVE_CITY          -0.009369183776955614
   BASEMENTAREA_MEDI               0.007986378089045343
   REGION_POPULATION_RELATIVE      0.007924409664567533
   ENTRANCES_MEDI                  0.007777792701513575
   OWN_CAR_AGE   

# Description des variables

In [138]:
# Loading the file with descriptions
features_descriptions = pd.read_csv('data/HomeCredit_columns_description.csv', encoding='iso-8859-1')
features_descriptions.head(2)

Unnamed: 0.1,Unnamed: 0,Table,Row,Description,Special
0,1,application_{train|test}.csv,SK_ID_CURR,ID of loan in our sample,
1,2,application_{train|test}.csv,TARGET,Target variable (1 - client with payment diffi...,


In [135]:
# filtering features from 'application_train' table
mask = features_descriptions['Table'] == 'application_{train|test}.csv'
features_descriptions = features_descriptions[mask]

# setting the name of the feature as index
features_descriptions = features_descriptions.set_index('Row')

# keeping only description
features_descriptions = features_descriptions['Description']

# display result
features_descriptions.head()

Row
SK_ID_CURR                                     ID of loan in our sample
TARGET                Target variable (1 - client with payment diffi...
NAME_CONTRACT_TYPE          Identification if loan is cash or revolving
CODE_GENDER                                        Gender of the client
FLAG_OWN_CAR                              Flag if the client owns a car
Name: Description, dtype: object

In [137]:
# Checking the result
for column in data_train_original.columns[:5]:
    print(column, features_descriptions[column])

NAME_CONTRACT_TYPE Identification if loan is cash or revolving
CODE_GENDER Gender of the client
FLAG_OWN_CAR Flag if the client owns a car
FLAG_OWN_REALTY Flag if client owns a house or flat
CNT_CHILDREN Number of children the client has
