# Chargement du modèle

In [None]:
# Load libraries
import sklearn
import joblib

# The pickle file names embed the installed scikit-learn version.
scikit_version = sklearn.__version__

# Read the model object from the project's models folder.
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
pipe = joblib.load(f"models/model_{scikit_version}.pkl")

# Write the same object under web/models so the web app can load it.
joblib.dump(pipe, f"web/models/model_{scikit_version}.pkl")

# Show the loaded object as the cell output.
pipe

# Préparation des données transformées

In [None]:
# load libraries
import pandas as pd

# Read the feature-engineered training table, using SK_ID_CURR as the row index.
data_train_featured = pd.read_csv(
    'data/cleaned/data_train_featured.csv',
    index_col='SK_ID_CURR',
)

# Report its dimensions, then preview the first rows as the cell output.
featured_shape = data_train_featured.shape
print('Featured training data set shape: ', featured_shape)
data_train_featured.head()

In [None]:
from sklearn.impute import SimpleImputer

# Separate the TARGET column from the remaining feature columns.
# Note: this rebinds data_train_featured, so the cell cannot be re-run on its own
# once TARGET has been removed.
target_train = data_train_featured['TARGET']
data_train_featured = data_train_featured.drop(columns='TARGET')
feature_names = data_train_featured.columns
index = data_train_featured.index

# Replace missing values by the column mean, then rebuild a DataFrame
# carrying the original column labels and row index.
imp_mean = SimpleImputer(strategy='mean')
imputed_values = imp_mean.fit_transform(data_train_featured)
data_train_featured = pd.DataFrame(imputed_values, columns=feature_names, index=index)

In [None]:
# Model probabilities for the positive class (default): second column of predict_proba.
positive_class_proba = pipe.predict_proba(data_train_featured)[:, 1]

# Attach the SK_ID_CURR index so each probability can be looked up by applicant.
y_train_pred = pd.Series(positive_class_proba, index=data_train_featured.index)

# Write the predictions to disk.
# NOTE(review): pandas documents `header` as a bool or a list of str; a bare string
# presumably only acts as a truthy flag, so the column may not be named 'TARGET'.
# Confirm what the file consumers expect before switching to header=['TARGET'].
y_train_pred.to_csv('data/cleaned/target_train_predictions.csv', header='TARGET')

## Échantillonnage (pour l'application web)

In [None]:
# Draw a reproducible random subset of 5000 rows (seed 42).
data_processed = data_train_featured.sample(random_state=42, n=5000)

# Export the subset to the web app's data folder.
data_processed.to_csv('web/data/data_processed.csv')

# Show the subset as the cell output.
data_processed

# Préparation des données initiales

In [None]:
import pandas as pd

# Read the raw application table, using SK_ID_CURR as the row index.
data_train_original = pd.read_csv(
    'data/input/application_train.csv',
    index_col='SK_ID_CURR',
)

# Report its dimensions.
original_shape = data_train_original.shape
print('Original training set (application_set) shape: ', original_shape)

In [None]:
# Flag the rows whose SK_ID_CURR also appears in the feature-engineered set.
mask = data_train_original.index.isin(data_train_featured.index)

# Keep only the flagged rows and remove the TARGET column, in a single chain.
data_train_original = data_train_original.loc[mask].drop(columns='TARGET')

In [None]:
# Select the applicants already drawn for `data_processed` instead of re-sampling
# with the same seed: an identical seed only picks identical applicants when both
# frames have exactly the same length and row order, which nothing guarantees here.
# A KeyError means an ID of the processed sample is missing from this table.
data_original = data_train_original.loc[data_processed.index]

# Save the sample for web app
data_original.to_csv('web/data/data_original.csv')

# Display
data_original

In [None]:
from sklearn.impute import SimpleImputer

# Numerical columns: missing values are replaced by the column mean.
numerical_features = data_train_original.select_dtypes(include='number').columns.tolist()
mean_imputer = SimpleImputer(strategy='mean')
data_train_original[numerical_features] = mean_imputer.fit_transform(
    data_train_original[numerical_features]
)

# Object-typed (categorical) columns: missing values are replaced by the constant 'missing'.
categorical_features = data_train_original.select_dtypes(include='object').columns.tolist()
constant_imputer = SimpleImputer(strategy='constant', fill_value='missing')
data_train_original[categorical_features] = constant_imputer.fit_transform(
    data_train_original[categorical_features]
)

# Report whether any missing value is left anywhere in the table.
has_missing = data_train_original.isna().any().any()
print("Remaining missing values:", has_missing)

In [None]:
# Import the encoder explicitly: a bare `import sklearn` does not guarantee that the
# `sklearn.preprocessing` submodule is loaded and reachable as an attribute.
from sklearn.preprocessing import LabelEncoder

# Label encoding of each categorical feature.
categorical_names = {}  # modalities (encoder classes) for each categorical feature
label_encoders = {}  # fitted encoder for each categorical feature
for feature in categorical_features:
    le = LabelEncoder()
    # Plain column assignment replaces the column with the integer codes.
    # NOTE(review): `.loc[:, feature] = ...` writes into the existing column and,
    # depending on the pandas version, may leave it with object dtype.
    data_train_original[feature] = le.fit_transform(data_train_original[feature])
    categorical_names[feature] = le.classes_
    label_encoders[feature] = le

# Save the processed original data
data_train_original.to_csv('data/cleaned/data_train_original.csv')

In [None]:
# Select the applicants already drawn for `data_processed` instead of re-sampling
# with the same seed: an identical seed only picks identical applicants when both
# frames have exactly the same length and row order, which nothing guarantees here.
# A KeyError means an ID of the processed sample is missing from this table.
data_original_le = data_train_original.loc[data_processed.index]

# Save the sample for web app
data_original_le.to_csv('web/data/data_original_le.csv')

# Display
data_original_le

# Modèle de substitution (*Surrogate model*)

## Interprétation globale

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Instantiate a surrogate tree without depth limit so it can overfit the predictions.
# The seed makes the fitted tree (and the importances derived from it) reproducible.
sur_dt = DecisionTreeRegressor(random_state=42)

# scikit-learn pairs rows by position and ignores pandas indexes, so reorder the
# predictions by SK_ID_CURR to match the rows of data_train_original exactly.
# A KeyError means an applicant has no stored prediction.
surrogate_target = y_train_pred.loc[data_train_original.index]

# Over-fitting the surrogate model on original features
sur_dt.fit(data_train_original, surrogate_target)

In [None]:
# Read the installed scikit-learn version; it is embedded in the file name below.
scikit_version = sklearn.__version__

# Pickle the surrogate tree under web/models, tagged with that version.
joblib.dump(sur_dt, f"web/models/surrogate_model_{scikit_version}.pkl")

In [None]:
# Pair every column with its importance in the surrogate tree, largest first.
importance_pairs = list(zip(data_train_original.columns, sur_dt.feature_importances_))
importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
top_ten = importance_pairs[:10]

print("Main features per importance:")

# Print the ten largest importances while accumulating their total.
sum_val = 0
for col, val in top_ten:
    print(f"{col:28}{val:10.3f}")
    sum_val += val

print("Percentage of the model explained by the 10 first features:", sum_val*100)

## Interprétation locale

In [None]:
# Applicant to explain.
SK_ID_CURR = 100002

# Label slice from the ID to itself: keeps the result a DataFrame rather than a Series.
single_id = slice(SK_ID_CURR, SK_ID_CURR)
application_data = data_train_original.loc[single_id]

# Show the ID together with the stored model prediction for it.
predicted_value = y_train_pred.loc[SK_ID_CURR]
print("Application ID:", SK_ID_CURR)
print("Predicted value:", predicted_value)

In [None]:
from treeinterpreter import treeinterpreter as ti

# Ask treeinterpreter for the surrogate tree's prediction, bias and per-feature contributions.
prediction, bias, contribs = ti.predict(sur_dt, application_data)

print("Prediction:", prediction)

print("Bias (trainset mean):", bias)

# Rank the contributions of the first row by absolute value, largest first.
ranked_contribs = sorted(
    zip(contribs[0], data_train_original.columns),
    key=lambda pair: abs(pair[0]),
    reverse=True,
)

print("Main features contributions:")
for contrib, feature in ranked_contribs:
    # Features that contribute exactly zero are not listed.
    if contrib == 0:
        continue
    print(f"   {feature:32}{contrib}")

In [None]:
# Label the contributions of the first row with the column names of the encoded sample.
features_contribs = pd.Series(
    data=contribs[0],
    index=data_original_le.columns,
)

In [None]:
import json
# Serialise the Series to JSON text, then parse that text back into Python objects.
contribs_json_text = features_contribs.to_json()
features_contribs_json = json.loads(contribs_json_text)

In [None]:
from flask import Flask, jsonify

# Build the payload from plain Python floats: the values returned by ti.predict are
# numpy objects, which the JSON encoder cannot serialise.
# NOTE(review): assumes `prediction` is a numpy array holding a single value for the
# single explained row — confirm against the treeinterpreter version in use.
response_payload = {
    'status': 'ok',
    'prediction': float(prediction.flat[0]),
    'bias': float(bias[0]),
    'contribs': features_contribs_json,
}

# jsonify needs an active Flask context; a throwaway test request context lets the
# notebook exercise the same call the web app will make.
with Flask(__name__).test_request_context():
    response = jsonify(response_payload)

# Display the decoded JSON body of the response.
response.get_json()

In [None]:
# Display the first element of the first entry of `prediction` (returned by ti.predict).
prediction[0][0]

## Graphs

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

# initialization
sum_val = 0
labels = []
frequencies = []

# get the labels and importances of the 9 most important features
# (the 10th slice of the pie is the aggregate added below)
for col, val in sorted(zip(data_train_original.columns, sur_dt.feature_importances_,), key=lambda x: x[1], reverse=True,)[:9]:
    labels.append(col)
    frequencies.append(val)
    sum_val += val

# complete the data with the share of all remaining features;
# clamp at zero because floating-point rounding can make the remainder slightly
# negative, and ax.pie rejects negative wedge sizes
labels.append("OTHER FEATURES…")
frequencies.append(max(0.0, 1 - sum_val))

fig, ax = plt.subplots()
ax.axis("equal")
ax.pie(frequencies)
ax.set_title("Features importance")
ax.legend(
    labels,
    loc='center left',
    bbox_to_anchor=(1, 0.5),
)

# Save before showing, with a tight bounding box so the legend placed outside the
# axes is not cropped from the image file.
fig.savefig('plots/FI.png', bbox_inches='tight')
plt.show()

In [None]:
# Inspect the Python type of the surrogate tree's feature_importances_ attribute.
type(sur_dt.feature_importances_)

In [None]:
# TARGET was already removed from data_train_original in an earlier cell, so a strict
# drop raises KeyError on a fresh top-to-bottom run; ignore the column when absent.
data_train_original = data_train_original.drop(columns='TARGET', errors='ignore')

In [None]:
# Column labels of data_train_original.
features_names = data_train_original.columns
# Importance values exposed by the surrogate tree.
features_importance = sur_dt.feature_importances_

In [None]:
# Importances labelled by column name and ordered from largest to smallest.
features_importance = (
    pd.Series(sur_dt.feature_importances_, index=data_train_original.columns)
    .sort_values(ascending=False)
)

# Show the sorted Series as the cell output.
features_importance

# Description des variables

In [None]:
# Read the column-description file, decoded as ISO-8859-1.
features_descriptions = pd.read_csv(
    'data/HomeCredit_columns_description.csv',
    encoding='iso-8859-1',
)

# Preview the first two rows.
features_descriptions.head(2)

In [None]:
# Rows whose 'Table' value is exactly the application train/test entry.
mask = features_descriptions['Table'] == 'application_{train|test}.csv'

# Keep those rows, index them by the 'Row' column (the feature name),
# and retain only the 'Description' column as a Series.
features_descriptions = (
    features_descriptions
    .loc[mask]
    .set_index('Row')['Description']
)

# Preview the result.
features_descriptions.head()

In [None]:
# Spot-check: print the description found for each of the first five columns.
first_columns = data_train_original.columns[:5]
for column in first_columns:
    description = features_descriptions[column]
    print(column, description)

In [None]:
# Export the descriptions Series to the web app's data folder.
# NOTE(review): pandas documents `header` as a bool or a list of str; a bare string
# presumably only acts as a truthy flag, so the column may not be named 'description'.
# Confirm what the web app expects before switching to header=['description'].
descriptions_path = 'web/data/features_descriptions.csv'
features_descriptions.to_csv(descriptions_path, header='description')

In [None]:
# Display the descriptions Series as the cell output.
features_descriptions

# Données agrégées

In [None]:
import pandas as pd

# Read the raw application table again, using SK_ID_CURR as the row index.
# Note: this rebinds data_train_original, replacing the imputed/encoded frame built earlier.
data_train_original = pd.read_csv(
    'data/input/application_train.csv',
    index_col='SK_ID_CURR',
)

# Report its dimensions.
original_shape = data_train_original.shape
print('Original training set (application_set) shape: ', original_shape)

In [None]:
# Preview the first rows of data_train_original.
data_train_original.head()

In [None]:
# Numeric columns are summarised by their mean.
data_agg_num = data_train_original.mean(numeric_only=True)

# Non-numeric columns are summarised by the first row returned by mode().
non_numeric_columns = data_train_original.select_dtypes(exclude='number')
data_agg_cat = non_numeric_columns.mode().iloc[0]

# Stack both summaries into a single Series indexed by column name.
data_agg = pd.concat([data_agg_num, data_agg_cat])

In [None]:
# Remove the TARGET entry from the aggregated Series.
data_agg = data_agg.drop(labels='TARGET')

In [None]:
# Export the aggregated Series, index included, to the web app's data folder.
# NOTE(review): pandas documents `header` as a bool or a list of str; a bare string
# presumably only acts as a truthy flag, so the column may not be named 'mean or mode'.
# Confirm what the web app expects before switching to header=['mean or mode'].
aggregate_path = 'web/data/data_agg.csv'
data_agg.to_csv(aggregate_path, index=True, header='mean or mode')

In [None]:
# Preview the first entries of data_agg.
data_agg.head()

In [None]:
# Read the exported aggregate back from disk (first CSV column as index),
# for comparison with the current applicant.
data_agg = pd.read_csv(
    "web/data/data_agg.csv",
    index_col=0,
)

In [None]:
# Preview the first rows of data_agg as read back from the CSV file.
data_agg.head()