In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns
import timeit
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [3]:




@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = False):
    # Read data and merge
    df = pd.read_csv('C:/Users/bigdata/Projet7/application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('C:/Users/bigdata/Projet7/application_test.csv', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = df.append(test_df).reset_index()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    del test_df
    gc.collect()
    return df

# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    bureau = pd.read_csv('C:/Users/bigdata/Projet7/bureau.csv', nrows = num_rows)
    bb = pd.read_csv('C:/Users/bigdata/Projet7/bureau_balance.csv', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()
    
    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
    
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    prev = pd.read_csv('C:/Users/bigdata/Projet7/previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
    
    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    pos = pd.read_csv('C:/Users/bigdata/Projet7/POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    ins = pd.read_csv('C:/Users/bigdata/Projet7/installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    cc = pd.read_csv('C:/Users/bigdata/Projet7/credit_card_balance.csv', nrows = num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

In [4]:

def main(debug = False):
    num_rows = 10000 if debug else None
    df = application_train_test(num_rows)
    with timer("Process bureau and bureau_balance"):
        bureau = bureau_and_balance(num_rows)
        print("Bureau df shape:", bureau.shape)
        df = df.join(bureau, how='left', on='SK_ID_CURR')
        del bureau
        gc.collect()
    with timer("Process previous_applications"):
        prev = previous_applications(num_rows)
        print("Previous applications df shape:", prev.shape)
        df = df.join(prev, how='left', on='SK_ID_CURR')
        del prev
        gc.collect()
    with timer("Process POS-CASH balance"):
        pos = pos_cash(num_rows)
        print("Pos-cash balance df shape:", pos.shape)
        df = df.join(pos, how='left', on='SK_ID_CURR')
        del pos
        gc.collect()
    with timer("Process installments payments"):
        ins = installments_payments(num_rows)
        print("Installments payments df shape:", ins.shape)
        df = df.join(ins, how='left', on='SK_ID_CURR')
        del ins
        gc.collect()
    with timer("Process credit card balance"):
        cc = credit_card_balance(num_rows)
        print("Credit card balance df shape:", cc.shape)
        df = df.join(cc, how='left', on='SK_ID_CURR')
        del cc
        gc.collect()
    return df

In [5]:
data = main(debug = False)

Train samples: 307511, test samples: 48744
Bureau df shape: (305811, 116)
Process bureau and bureau_balance - done in 46s
Previous applications df shape: (338857, 249)
Process previous_applications - done in 59s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 27s
Installments payments df shape: (339587, 26)
Process installments payments - done in 123s
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 63s


In [6]:
data.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [7]:
data.shape

(356251, 798)

In [8]:
df = data[data['TARGET'].notnull()]

In [9]:
df.shape

(307507, 798)

In [10]:
df1 = df.replace([np.inf, -np.inf], np.nan)

In [11]:
# Remplacer les valeurs manquantes par le médian de chaque colonne
#df1 = df.apply(lambda x: x.fillna(x.mean()),axis=0)

In [12]:
# Remplacer les valeurs manquantes par le médian de chaque colonne
df2 = df1.apply(lambda x: x.fillna(x.median()),axis=0)

# Premier test sur 1% des données

In [13]:
dataset = df2.sample(frac=0.01)

In [14]:
dataset.shape

(3075, 798)

In [15]:
X= dataset.drop(['TARGET','SK_ID_CURR','index'], axis=1).values

In [16]:
y = dataset['TARGET'].values

In [17]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X, y, test_size=0.3, stratify=y)


In [18]:
# standardiser les données
from sklearn import preprocessing
std_scale = preprocessing.StandardScaler().fit(X_train)

X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

## DummyClassifier

In [19]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score
 

dclf = DummyClassifier(strategy = 'most_frequent', random_state =42) 

start_time = timeit.default_timer()

dclf.fit(X_train_std, y_train)

elapsed0 = timeit.default_timer() - start_time

y_pred_dum = dclf.predict(X_test_std)

y_prob_dum = dclf.predict_proba(X_test_std)[:, 1]


# Calculate roc auc
roc_value = roc_auc_score(y_test, y_prob_dum)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed0))


auc_cv 0.50 time 0.00s


## svm.SVC

In [20]:
# Créer une SVM avec un noyau gaussien de paramètre gamma=0.01
from sklearn import svm
classifier = svm.SVC(kernel='rbf', gamma=0.01, random_state =42)

start_time = timeit.default_timer()

# Entraîner la SVM sur le jeu d'entraînement
classifier.fit(X_train_std, y_train)

elapsed = timeit.default_timer() - start_time

# prédire sur le jeu de test
y_test_pred = classifier.decision_function(X_test_std)

y_proba = (1/(1 + np.exp(-y_test_pred)))



roc_value = roc_auc_score(y_test, y_proba)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed))


auc_cv 0.69 time 8.07s


In [21]:
cv = StratifiedKFold(n_splits=5, random_state=42)

# choisir 6 valeurs pour C, entre 1e-2 et 1e3
C_range = np.logspace(-2, 3, 6)

# choisir 4 valeurs pour gamma, entre 1e-2 et 10
gamma_range = np.logspace(-2, 1, 4)

# grille de paramètres
param_grid = {'C': C_range, 'gamma': gamma_range}

# critère de sélection du meilleur modèle
score = 'roc_auc'

# initialiser une recherche sur grille
grid = model_selection.GridSearchCV(svm.SVC(kernel='rbf'), 
                                    param_grid, 
                                    cv=cv, # 5 folds de validation croisée  
                                    scoring=score)

start_time = timeit.default_timer()

# faire tourner la recherche sur grille
grid.fit(X_train_std, y_train)

elapsed1 = timeit.default_timer() - start_time


# prédire sur le jeu de test avec le modèle optimisé
y_test_pred_cv = grid.decision_function(X_test_std)

y_proba_cv = (1/(1 + np.exp(-y_test_pred_cv)))


roc_value = roc_auc_score(y_test, y_proba_cv)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed1))
# afficher les paramètres optimaux
print("The optimal parameters are {} with a score of {:.2f}".format(grid.best_params_, grid.best_score_))
print(grid.cv_results_)

auc_cv 0.71 time 635.99s
The optimal parameters are {'C': 10.0, 'gamma': 0.01} with a score of 0.61
{'mean_fit_time': array([5.41292744, 5.33147578, 5.69088163, 5.9244565 , 5.57316828,
       6.16864548, 5.35368948, 5.45883307, 4.45637941, 3.41268525,
       3.41311407, 3.41924715, 3.34307775, 3.35224657, 3.48928633,
       3.52639351, 3.3795742 , 3.35413256, 3.53813868, 3.52468162,
       3.37452793, 3.36860905, 3.68375268, 3.5820384 ]), 'std_fit_time': array([0.10569442, 0.28215872, 0.25673387, 0.14952579, 0.4031113 ,
       0.92797125, 0.4705368 , 0.29251867, 0.58027342, 0.1636388 ,
       0.05904785, 0.06370609, 0.0620976 , 0.06758132, 0.07193603,
       0.06550336, 0.10149264, 0.07408017, 0.07443473, 0.07571862,
       0.06497649, 0.07135239, 0.20638264, 0.11497936]), 'mean_score_time': array([1.23530526, 1.37183771, 1.54181986, 1.48023095, 1.48086524,
       1.3524003 , 1.40600023, 1.38777232, 1.01912808, 0.85760183,
       0.85059705, 0.87325559, 0.82199206, 0.80225906, 0.859298

## LogisticRegression

In [22]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

lr = LogisticRegression(solver = 'newton-cg', n_jobs=1)

start_time = timeit.default_timer()

lr.fit(X_train_std,y_train)

# On récupère la prédiction de la valeur positive
y_prob = lr.predict_proba(X_test_std)[:,1]

elapsed2 = timeit.default_timer() - start_time

# On créé un vecteur de prédiction à partir du vecteur de probabilités
y_pred = np.where(y_prob > 0.5, 1, 0)

roc_value = roc_auc_score(y_test, y_prob)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed2))

auc_cv 0.57 time 0.89s


In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

model = LogisticRegression()
# define evaluation
cv = StratifiedKFold(n_splits=5, random_state=42)
# define search space
space = dict()
space['solver'] = ['liblinear']
space['penalty'] = ['l1', 'l2']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
# define search
search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
start_time = timeit.default_timer()

result = search.fit(X_train_std, y_train)

elapsed3 = timeit.default_timer() - start_time



# On récupère la prédiction de la valeur positive
y_prob_cv = result.predict_proba(X_test_std)[:,1] 

# On créé un vecteur de prédiction à partir du vecteur de probabilités
y_pred_cv = np.where(http://localhost:8889/notebooks/P7_GIT_GUI/P7_tous_les_mod%C3%A8les.ipynb#RandomForestClassifiery_prob_cv > 0.5, 1, 0)


roc_value = roc_auc_score(y_test, y_pred_cv)


print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed3))

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
#print(result.cv_results_)

auc_cv 0.50 time 894.78s
Best Score: 0.6968742674836486
Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


## RandomForestClassifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

rfc = RandomForestClassifier(oob_score=True, random_state = 42)

start_time = timeit.default_timer()

model = rfc.fit(X_train_std, y_train)

elapsed4 = timeit.default_timer() - start_time

# Actual class predictions
rf_predictions = model.predict(X_test_std)

# Probabilities for each class
rf_probs = model.predict_proba(X_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y_test, rf_probs)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed4))


auc_cv 0.61 time 2.61s


In [25]:
from sklearn.model_selection import GridSearchCV

cv = StratifiedKFold(n_splits=5, random_state=42)


param_grid = {
    
    'max_depth': [60, 70, 80],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2,3],
    'min_samples_split': [2, 3, 4],
    
    
}

# Create a base model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                                 cv = cv, n_jobs = -1,scoring='roc_auc', return_train_score=False)

start_time = timeit.default_timer()

grid_search.fit(X_train_std, y_train)

elapsed5 = timeit.default_timer() - start_time

predictions = grid_search.predict(X_test_std)
    
# Probabilities for each class
rf_probs = model.predict_proba(X_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y_test, rf_probs)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed5))
    

auc_cv 0.61 time 121.35s


In [26]:
# summarize result
print('Best Score: %s' % grid_search.best_score_)
print('Best Hyperparameters: %s' % grid_search.best_params_)
#print(result.cv_results_)

Best Score: 0.666890256406924
Best Hyperparameters: {'max_depth': 60, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2}


## GradientBoostingClassifier

In [27]:
# import machine learning algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
gb = GradientBoostingClassifier(random_state = 42)
start_time = timeit.default_timer()

gb.fit(X_train_std, y_train)

elapsed7 = timeit.default_timer() - start_time

predictions = gb.predict_proba(X_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y_test, predictions)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed7))

auc_cv 0.69 time 15.92s


In [29]:
from sklearn.model_selection import GridSearchCV

cv = StratifiedKFold(n_splits=5, random_state=42)

param_grid = {
    'n_estimators' : [100, 400, 700],
    
    'max_depth': [60, 70, 80],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2,3],
    'min_samples_split': [100, 300, 500],
    
    
}

# Create a base model
gbc = GradientBoostingClassifier()

# Instantiate the grid search model
grid_search_gbc = GridSearchCV(estimator = gbc, param_grid = param_grid, 
                                 cv = cv, n_jobs = -1, scoring='roc_auc', return_train_score=False)

start_time = timeit.default_timer()

grid_search_gbc.fit(X_train_std, y_train)

elapsed9 = timeit.default_timer() - start_time


predictions = grid_search_gbc.predict(X_test_std)
    
# Probabilities for each class
gbc_probs = grid_search_gbc.predict_proba(X_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y_test, gbc_probs)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed9))

# summarize result
print('Best Score: %s' % grid_search_gbc.best_score_)
print('Best Hyperparameters: %s' % grid_search_gbc.best_params_)
#print(grid_search_gbc.cv_results_)

auc_cv 0.67 time 1914.10s
Best Score: 0.6938194206077556
Best Hyperparameters: {'max_depth': 80, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 500, 'n_estimators': 400}


## LightGBM

In [30]:
lg = LGBMClassifier(random_state = 42)
start_time = timeit.default_timer()

lg.fit(X_train_std, y_train)

elapsed10 = timeit.default_timer() - start_time

predictions = lg.predict_proba(X_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y_test, predictions)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed10))

auc_cv 0.71 time 4.25s


In [35]:

cv = StratifiedKFold(n_splits=5, random_state=42)

 
parameters = {
              'n_estimators' : [100, 400, 700],
              'max_depth': [15, 25, 35],
              'learning_rate': [0.01,  0.05, 0.15],
              'num_leaves': [8, 15, 25]
              
}
lgbm = LGBMClassifier()
 # With gridsearch we don't need the fit function
gsearch_lgbm = GridSearchCV(lgbm, param_grid=parameters, cv=cv, n_jobs = -1, scoring='roc_auc')

start_time = timeit.default_timer()

gsearch_lgbm.fit(X_train_std, y_train)

elapsed11 = timeit.default_timer() - start_time


predictions = gsearch_lgbm.predict(X_test_std)
    
# Probabilities for each class
lgbm_probs = gsearch_lgbm.predict_proba(X_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y_test, lgbm_probs)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed11))

# summarize result
print('Best Score: %s' % gsearch_lgbm.best_score_)
print('Best Hyperparameters: %s' % gsearch_lgbm.best_params_)
#print(gsearch_lgbm.cv_results_)

auc_cv 0.70 time 2707.60s
Best Score: 0.7088586676337998
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 100, 'num_leaves': 8}


In [62]:
dic ={'Temps_normal' : ['8.07s', '0.89s', '2.61s', '15.92s', '4.25s'],
      'Roc_nrmal' : [0.69, 0.57, 0.61, 0.69, 0.71],
      'Temps_grid' : ['635.99s', '894.78s', '121.35s', '1914.10s', '2707.60s'],
      'Roc_grid' : [0.71, 0.50, 0.61, 0.67, 0.70]
    
}

lst = ['svm.', 'L_Regression', 'RandomForest', 'GradientBoosting', 'LGBM' ]

test_models = pd.DataFrame((dic), index=lst)

In [63]:
test_models

Unnamed: 0,Temps_normal,Roc_nrmal,Temps_grid,Roc_grid
svm.,8.07s,0.69,635.99s,0.71
L_Regression,0.89s,0.57,894.78s,0.5
RandomForest,2.61s,0.61,121.35s,0.61
GradientBoosting,15.92s,0.69,1914.10s,0.67
LGBM,4.25s,0.71,2707.60s,0.7


# Deuxième test sur 10% des données

In [36]:
dataset1 = df2.sample(frac=0.1)

In [37]:
X1= dataset1.drop(['TARGET','SK_ID_CURR','index'], axis=1).values
y1 = dataset1['TARGET'].values

In [38]:
from sklearn import model_selection
X1_train, X1_test, y1_train, y1_test = \
    model_selection.train_test_split(X1, y1, test_size=0.3, stratify=y1)

In [39]:
from sklearn import preprocessing
std_scale = preprocessing.StandardScaler().fit(X1_train)

X1_train_std = std_scale.transform(X1_train)
X1_test_std = std_scale.transform(X1_test)

## DummyClassifier

In [40]:
dclf = DummyClassifier(strategy = 'most_frequent', random_state =42) 

start_time = timeit.default_timer()

dclf.fit(X1_train_std, y1_train)

elapsed0 = timeit.default_timer() - start_time

y_pred_dum = dclf.predict(X1_test_std)

y_prob_dum = dclf.predict_proba(X1_test_std)[:, 1]


# Calculate roc auc
roc_value = roc_auc_score(y1_test, y_prob_dum)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed0))


auc_cv 0.50 time 0.00s


## svm.SVC

In [41]:
# Créer une SVM avec un noyau gaussien de paramètre gamma=0.01
from sklearn import svm
classifier = svm.SVC(kernel='rbf', gamma=0.01, random_state =42)

start_time = timeit.default_timer()

# Entraîner la SVM sur le jeu d'entraînement
classifier.fit(X1_train_std, y1_train)

elapsed = timeit.default_timer() - start_time

# prédire sur le jeu de test
y_test_pred = classifier.decision_function(X1_test_std)

y_proba = (1/(1 + np.exp(-y_test_pred)))



roc_value = roc_auc_score(y1_test, y_proba)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed))


auc_cv 0.68 time 2155.60s


## RandomForestClassifier

In [43]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

rfc = RandomForestClassifier(oob_score=True, random_state = 42)

start_time = timeit.default_timer()

model = rfc.fit(X1_train_std, y1_train)

elapsed4 = timeit.default_timer() - start_time

# Actual class predictions
rf_predictions = model.predict(X1_test_std)

# Probabilities for each class
rf_probs = model.predict_proba(X1_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y1_test, rf_probs)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed4))


auc_cv 0.69 time 57.24s


In [45]:
from sklearn.model_selection import GridSearchCV

cv = StratifiedKFold(n_splits=5, random_state=42)


param_grid = {
    
    'max_depth': [60, 70, 80],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2,3],
    'min_samples_split': [2, 3, 4],
    
    
}

# Create a base model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                                 cv = cv, n_jobs = -1,scoring='roc_auc', return_train_score=False)

start_time = timeit.default_timer()

grid_search.fit(X1_train_std, y1_train)

elapsed5 = timeit.default_timer() - start_time

predictions = grid_search.predict(X1_test_std)
    
# Probabilities for each class
rf_probs = model.predict_proba(X1_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y1_test, rf_probs)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed5))
    

auc_cv 0.69 time 2479.78s


## Lightgbm

In [44]:
lg = LGBMClassifier(random_state = 42)
start_time = timeit.default_timer()

lg.fit(X1_train_std, y1_train)

elapsed10 = timeit.default_timer() - start_time

predictions = lg.predict_proba(X1_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y1_test, predictions)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed10))

auc_cv 0.75 time 18.05s


In [46]:

cv = StratifiedKFold(n_splits=5, random_state=42)

 
parameters = {
              'n_estimators' : [100, 400, 700],
              'max_depth': [15, 25, 35],
              'learning_rate': [0.01,  0.05, 0.15],
              'num_leaves': [8, 15, 25]
              
}
lgbm = LGBMClassifier()
 # With gridsearch we don't need the fit function
gsearch_lgbm = GridSearchCV(lgbm, param_grid=parameters, cv=cv, n_jobs = -1, scoring='roc_auc')

start_time = timeit.default_timer()

gsearch_lgbm.fit(X1_train_std, y1_train)

elapsed11 = timeit.default_timer() - start_time


predictions = gsearch_lgbm.predict(X1_test_std)
    
# Probabilities for each class
lgbm_probs = gsearch_lgbm.predict_proba(X1_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y1_test, lgbm_probs)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed11))

# summarize result
print('Best Score: %s' % gsearch_lgbm.best_score_)
print('Best Hyperparameters: %s' % gsearch_lgbm.best_params_)
#print(gsearch_lgbm.cv_results_)

auc_cv 0.77 time 7661.16s
Best Score: 0.7580049193052539
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 25, 'n_estimators': 700, 'num_leaves': 25}


# Troisième test sur 100% des données 

In [60]:
df2.shape

(307507, 798)

In [47]:
X2= df2.drop(['TARGET','SK_ID_CURR','index'], axis=1).values
y2 = df2['TARGET'].values

In [50]:
from sklearn import model_selection
X2_train, X2_test, y2_train, y2_test = \
    model_selection.train_test_split(X2, y2, test_size=0.3, stratify=y2)

In [51]:
from sklearn import preprocessing
std_scale = preprocessing.StandardScaler().fit(X2_train)

X2_train_std = std_scale.transform(X2_train)
X2_test_std = std_scale.transform(X2_test) 

## DummyClassifier

In [55]:
dclf = DummyClassifier(strategy = 'most_frequent', random_state =42) 

start_time = timeit.default_timer()

dclf.fit(X2_train_std, y2_train)

elapsed0 = timeit.default_timer() - start_time

y_pred_dum = dclf.predict(X2_test_std)

y_prob_dum = dclf.predict_proba(X2_test_std)[:, 1]


# Calculate roc auc
roc_value = roc_auc_score(y2_test, y_prob_dum)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed0))


auc_cv 0.50 time 0.06s


## Lightgbm

In [52]:
lg = LGBMClassifier(random_state = 42)
start_time = timeit.default_timer()

lg.fit(X2_train_std, y2_train)

elapsed12 = timeit.default_timer() - start_time

predictions = lg.predict_proba(X2_test_std)[:, 1]

# Calculate roc auc
roc_value = roc_auc_score(y2_test, predictions)

print("auc_cv {:.2f} time {:.2f}s".format(roc_value, elapsed12))

auc_cv 0.78 time 126.35s


In [58]:
lg.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [64]:
df2.to_csv(r'D:\Documents\P7\df2.csv')

# Voir le notebook qui contient le modèle final