# Create and run model

## Load libraries

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, f1_score

import pandas as pd
import numpy as np

## Set global parameters

In [4]:
data_path = '../data/03_primary/processed_data.csv'
artifact_path = "../data/06_models/"
test_size = 0.2
random_state = 18
cost_fn = 10
cost_fp = 1

## Load data

In [5]:
# Load data
raw_data = pd.read_csv(data_path)
raw_data = raw_data.astype('float32')


In [75]:
data = raw_data[['SK_ID_CURR', 'PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_2', 'DAYS_BIRTH', 'EXT_SOURCE_1', 'DAYS_EMPLOYED', 'DAYS_EMPLOYED_PERC', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'ANNUITY_INCOME_PERC', 'INSTAL_DBD_MEAN', 'AMT_ANNUITY', 'TARGET']].astype('float32')

In [76]:

# Remove lines with TARGET = NaN
data = data.dropna(subset=["TARGET"])

In [77]:
X = data.drop("TARGET", axis=1)
y = data["TARGET"]

## Split data

In [78]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

## Create model

### Set model parameters

In [24]:
params = {
    "n_estimators": 100,
    "random_state": random_state,
}

### Set grid parameters

In [25]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion' :['gini', 'entropy']
}

### Select a model

Models to test:
- RandomForestClassifier
- XGBoost
- LightGBM
- Logistic Regression avec pondération des classes

In [37]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

In [35]:
# Random forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=random_state,)
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Mean CV AUC-ROC: {np.mean(cv_scores)}, Std CV AUC-ROC: {np.std(cv_scores)}")

Mean CV AUC-ROC: 0.7141470598822511, Std CV AUC-ROC: 0.003167640807687861


In [38]:
# XGBoost classifier for binary classification outputing probabilities
model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Mean CV AUC-ROC: {np.mean(cv_scores)}, Std CV AUC-ROC: {np.std(cv_scores)}")

Mean CV AUC-ROC: 0.6926229244684583, Std CV AUC-ROC: 0.002501769040956962


In [39]:
# XGBoost classifier for binary classification outputing scores
model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:hinge')
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Mean CV AUC-ROC: {np.mean(cv_scores)}, Std CV AUC-ROC: {np.std(cv_scores)}")

Mean CV AUC-ROC: 0.5, Std CV AUC-ROC: 0.0


In [40]:
# LightGBM classifier
model = LGBMClassifier(n_estimators=100, random_state=random_state)
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Mean CV AUC-ROC: {np.mean(cv_scores)}, Std CV AUC-ROC: {np.std(cv_scores)}")

[LightGBM] [Info] Number of positive: 15956, number of negative: 180848
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081076 -> initscore=-2.427822
[LightGBM] [Info] Start training from score -2.427822
[LightGBM] [Info] Number of positive: 15956, number of negative: 180848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081076 -> initscore=-2.42782

In [63]:
missing_percentage = data.isnull().mean() * 100
missing_table = pd.DataFrame({'Column': data.columns, 'Missing Percentage': missing_percentage})
missing_table


Unnamed: 0,Column,Missing Percentage
SK_ID_CURR,SK_ID_CURR,0.0
PAYMENT_RATE,PAYMENT_RATE,0.003902
EXT_SOURCE_3,EXT_SOURCE_3,19.825565
EXT_SOURCE_2,EXT_SOURCE_2,0.214629
DAYS_BIRTH,DAYS_BIRTH,0.0
EXT_SOURCE_1,EXT_SOURCE_1,56.381156
DAYS_EMPLOYED,DAYS_EMPLOYED,18.007395
DAYS_EMPLOYED_PERC,DAYS_EMPLOYED_PERC,18.007395
DAYS_REGISTRATION,DAYS_REGISTRATION,0.0
DAYS_ID_PUBLISH,DAYS_ID_PUBLISH,0.0


In [47]:
data['EXT_SOURCE_1'].describe()

count    134131.000000
mean          0.502129
std           0.211061
min           0.014568
25%           0.334007
50%           0.505998
75%           0.675057
max           0.962693
Name: EXT_SOURCE_1, dtype: float64

In [52]:
data['EXT_SOURCE_3'].describe()

count    246542.000000
mean          0.510856
std           0.194831
min           0.000527
25%           0.370650
50%           0.535276
75%           0.669057
max           0.896010
Name: EXT_SOURCE_3, dtype: float64

In [53]:
data['DAYS_EMPLOYED'].describe()

count    252133.000000
mean      -2384.142334
std        2338.247559
min      -17912.000000
25%       -3175.000000
50%       -1648.000000
75%        -767.000000
max           0.000000
Name: DAYS_EMPLOYED, dtype: float64

In [55]:
data['INSTAL_DBD_MEAN'].describe()

count    291639.000000
mean         12.265641
std           9.142044
min           0.000000
25%           6.672414
50%          10.119047
75%          15.138889
max         295.000000
Name: INSTAL_DBD_MEAN, dtype: float64

In [79]:
data_no_na = data.dropna(subset=['PAYMENT_RATE', 'ANNUITY_INCOME_PERC', 'EXT_SOURCE_2', 'AMT_ANNUITY'])


In [80]:
data_no_na.loc[:, 'INSTAL_DBD_MEAN'] = data_no_na['INSTAL_DBD_MEAN'].fillna(data_no_na['INSTAL_DBD_MEAN'].mean())
data_no_na.loc[:, 'EXT_SOURCE_3'] = data_no_na['EXT_SOURCE_3'].fillna(data_no_na['EXT_SOURCE_3'].mean())
data_no_na.loc[:, 'EXT_SOURCE_1'] = data_no_na['EXT_SOURCE_1'].fillna(data_no_na['EXT_SOURCE_1'].mean())
data_no_na.loc[:, 'DAYS_EMPLOYED'] = data_no_na['DAYS_EMPLOYED'].fillna(data_no_na['DAYS_EMPLOYED'].mean())


In [81]:
data_no_na = data_no_na.drop('DAYS_EMPLOYED_PERC', axis=1)

In [82]:
missing_percentage = data_no_na.isnull().mean() * 100
missing_table = pd.DataFrame({'Column': data_no_na.columns, 'Missing Percentage': missing_percentage})
missing_table


Unnamed: 0,Column,Missing Percentage
SK_ID_CURR,SK_ID_CURR,0.0
PAYMENT_RATE,PAYMENT_RATE,0.0
EXT_SOURCE_3,EXT_SOURCE_3,0.0
EXT_SOURCE_2,EXT_SOURCE_2,0.0
DAYS_BIRTH,DAYS_BIRTH,0.0
EXT_SOURCE_1,EXT_SOURCE_1,0.0
DAYS_EMPLOYED,DAYS_EMPLOYED,0.0
DAYS_REGISTRATION,DAYS_REGISTRATION,0.0
DAYS_ID_PUBLISH,DAYS_ID_PUBLISH,0.0
ANNUITY_INCOME_PERC,ANNUITY_INCOME_PERC,0.0


In [83]:
X_no_na = data_no_na.drop("TARGET", axis=1)
y_no_na = data_no_na["TARGET"]

In [84]:
X_no_na_train, X_no_na_test, y_no_na_train, y_no_na_test = train_test_split(X_no_na, y_no_na, test_size=test_size, random_state=random_state)

In [86]:
# Logistic regression classifier with class weight
model = LogisticRegression(class_weight={0: cost_fn, 1: cost_fp}, random_state=random_state, max_iter=1000)
cv_scores = cross_val_score(model, X_no_na_train, y_no_na_train, cv=5, scoring='roc_auc')
print(f"Mean CV AUC-ROC: {np.mean(cv_scores)}, Std CV AUC-ROC: {np.std(cv_scores)}")

Mean CV AUC-ROC: 0.721595330445109, Std CV AUC-ROC: 0.005410793566440281


On choisit LGBMClassifier qui a le meilleur AUC-ROC : 0.75