In [1]:
# Importations
import sys
sys.path.append('..')

import pandas as pd
#import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, classification_report

from preprocessing import preprocessor, preprocessor_without_scaler

In [6]:
# Initialisation
train = pd.read_csv('../02_data/application_train.csv')
test = pd.read_csv('../02_data/application_test.csv')

id_error_msg = lambda x: '`SK_ID_CURR` is not unic for {} set!'.format(x)
assert len(train.SK_ID_CURR.unique()) == train.shape[0], id_error_msg('train')
assert len(test.SK_ID_CURR.unique()) == test.shape[0], id_error_msg('test')
train.set_index('SK_ID_CURR', inplace=True)
test.set_index('SK_ID_CURR', inplace=True)

print('Training set dimensions :', train.shape)

cls_size = train.TARGET.value_counts()
cls_freq = train.TARGET.value_counts(normalize=True)
print(pd.DataFrame({'size': cls_size,
                    'freq': cls_freq.apply(lambda x: '%.3f' % x)}))

X, y = train.iloc[:, 1:], train.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

Training set dimensions : (307511, 121)
     size   freq
0  282686  0.919
1   24825  0.081
X_train: (246008, 120)
y_train: (246008,)
X_test: (61503, 120)
y_test: (61503,)


# Modèle 1 : SGD Classifier

In [6]:
model1 = make_pipeline(preprocessor, SGDClassifier())
model1.fit(X_train, y_train)
print('Score:', model1.score(X_test, y_test))

Score: 0.9189307838642018


In [7]:
y_pred = model1.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[56517     0]
 [ 4986     0]]


# Modèle 2 : Random Forest Classifier

In [8]:
model2 = make_pipeline(preprocessor_without_scaler, RandomForestClassifier())
model2.fit(X_train, y_train)
print('Score:', model2.score(X_test, y_test))

Score: 0.9189633026031251


In [9]:
y_pred = model2.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[56512     5]
 [ 4979     7]]


In [11]:
print(X_train[:5])

           NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
SK_ID_CURR                                                               
320991             Cash loans           F            N               Y   
258600             Cash loans           M            N               Y   
316389             Cash loans           F            N               Y   
239474        Revolving loans           F            N               Y   
135015             Cash loans           M            N               Y   

            CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
SK_ID_CURR                                                            
320991                 0          135000.0    189000.0       9778.5   
258600                 0          112500.0    645889.5      21474.0   
316389                 0           72000.0    315000.0      22954.5   
239474                 0           94500.0    270000.0      13500.0   
135015                 0          270000.0   1110582.0 

In [6]:
model2.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('simpleimputer-1',
                                    SimpleImputer(strategy='median'),
                                    ['CNT_CHILDREN', 'AMT_INCOME_TOTAL',
                                     'AMT_CREDIT', 'AMT_ANNUITY',
                                     'AMT_GOODS_PRICE',
                                     'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
                                     'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                                     'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
                                     'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
                                     'REGION_RAT...
                                                     SimpleImputer(fill_value='Unknown',
                                                                   strategy='constant')),
                                              

# Modèle 3 : LightGBM

In [7]:
model3 = make_pipeline(preprocessor, LGBMClassifier())
model3.fit(X_train, y_train)
print('Score:', model3.score(X_test, y_test))

Score: 0.9192071931450498


In [8]:
y_pred = model3.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[56447    81]
 [ 4888    87]]


In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56528
           1       0.52      0.02      0.03      4975

    accuracy                           0.92     61503
   macro avg       0.72      0.51      0.50     61503
weighted avg       0.89      0.92      0.88     61503



In [None]:
# à faire

# smote tomek
# random search precision des deux classes (privilégier light_gbm)
# 
# choisir optimisation recall(classe 1)
# fonction coût : manque à gagner pour chaque treshold
# treshold = + = + precision - recall
# precision élevée = on accepte tout le monde
# recall élevée = on refuse tout le monde
# regarder crer une colonne intérêts (amt credit - good price),
# optimiser mon threshold % de ça