# 0. Introduction

# 1. Initialization

## 1.1. (most) package imports

In [1]:
# kaggle/python Docker image: https://github.com/kaggle/docker-python

import gc
import time
import os
import re
from contextlib import contextmanager
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 20, "display.max_columns", 30)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

## 1.2. Global variables

In [2]:
mini = False
offline = True

## 1.3. Global functions

In [3]:
@contextmanager
def timer(title):
    """Print how long the enclosed ``with`` block took to run.

    Parameters
    ----------
    title : str
        Label placed at the start of the printed line.

    Notes
    -----
    The line is printed only when the block finishes normally; if the block
    raises, the exception propagates and nothing is printed.
    """
    started_at = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every ``object``-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``: when True an extra
        ``<column>_nan`` indicator is produced for each encoded column.

    Returns
    -------
    tuple
        ``(encoded_frame, new_columns)`` where ``new_columns`` lists, in
        order, the column names present in the result but not in ``df``.
    """
    columns_before = set(df.columns)
    object_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [4]:
# Smoke test for one_hot_encoder on a tiny frame with two string columns.
df_ohe_test = pd.DataFrame({
    "C": ["R", "G", "B", "B", "R"],
    "L": ["A", "B", "C", "D", "A"]
})

# Without the NaN indicator: show only the names of the generated dummy columns.
display(one_hot_encoder(df_ohe_test, False)[1])
# With the NaN indicator: show the encoded frame, including the *_nan columns.
display(one_hot_encoder(df_ohe_test, True)[0])

['C_B', 'C_G', 'C_R', 'L_A', 'L_B', 'L_C', 'L_D']

Unnamed: 0,C_B,C_G,C_R,C_nan,L_A,L_B,L_C,L_D,L_nan
0,0,0,1,0,1,0,0,0,0
1,0,1,0,0,0,1,0,0,0
2,1,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,1,0
4,0,0,1,0,1,0,0,0,0


## 1.4. Dataset overview

When running offline, the file list is:

<p style="background:black">
<code style="background:black;color:white">filenames = [
    "home-credit-default-risk/application_test.csv",
    "home-credit-default-risk/application_train.csv",
    "home-credit-default-risk/bureau.csv",
    "home-credit-default-risk/bureau_balance.csv",
    "home-credit-default-risk/credit_card_balance.csv",
    "home-credit-default-risk/installments_payments.csv",
    "home-credit-default-risk/POS_CASH_balance.csv",
    "home-credit-default-risk/previous_application.csv",
    "home-credit-default-risk/sample_submission.csv"
    ]
</code>
</p>

When not offline (on Kaggle), the file list is:

<p style="background:black">
<code style="background:black;color:white">filenames = [
    "/kaggle/input/home-credit-default-risk/sample_submission.csv",
    "/kaggle/input/home-credit-default-risk/bureau_balance.csv",
    "/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv",
    "/kaggle/input/home-credit-default-risk/application_train.csv",
    "/kaggle/input/home-credit-default-risk/application_test.csv",
    "/kaggle/input/home-credit-default-risk/previous_application.csv",
    "/kaggle/input/home-credit-default-risk/credit_card_balance.csv",
    "/kaggle/input/home-credit-default-risk/installments_payments.csv",
    "/kaggle/input/home-credit-default-risk/bureau.csv"
    ]
</code>
</p>

In [5]:
# Collect the paths of the competition CSV files. Later cells and the
# preprocessing functions address these files by their position in `filenames`.
filenames = []

# Folder to scan: a relative local folder when `offline` is set, the Kaggle
# input folder otherwise.
if offline:
    le_path = "home-credit-default-risk/"
else:
    le_path = "/kaggle/input"

for dirname, _, filenamess in os.walk(le_path):
    for filenamee in filenamess:
#                        HomeCredit_columns_description.csv is unreadable, so it is skipped.
        if filenamee != "HomeCredit_columns_description.csv":
            filename = os.path.join(dirname, filenamee)
            print(filename)
            filenames.append(filename)
 # Disabled exploration: last row of each file and its SK_ID_CURR value counts.
 #           df = pd.read_csv(filename)
 #           display(df[-1:])
 #           if filenamee != "bureau_balance.csv":
 #               print(set(df.SK_ID_CURR.value_counts()))

home-credit-default-risk/application_test.csv
home-credit-default-risk/application_train.csv
home-credit-default-risk/bureau.csv
home-credit-default-risk/bureau_balance.csv
home-credit-default-risk/credit_card_balance.csv
home-credit-default-risk/installments_payments.csv
home-credit-default-risk/POS_CASH_balance.csv
home-credit-default-risk/previous_application.csv
home-credit-default-risk/sample_submission.csv


In [6]:
# The preprocessing functions address the CSV files by position
# (filenames[0] = application_test ... filenames[8] = sample_submission).
# os.walk returns directory entries in an arbitrary, platform-dependent order,
# so permuting by hard-coded indices only works for one particular listing.
# Sorting case-insensitively by base name produces that same layout for any
# listing order; it is a no-op for the offline listing shown above and gives
# the same result as the former index permutation on the Kaggle listing.
filenames = sorted(filenames, key=lambda path: os.path.basename(path).lower())

In [7]:
%%script echo skipped
for filename in (filenames[:4] + filenames[5:]):
    df = pd.read_csv(filename)
    print(filename)
    display(df.head(2))
    #display(df.info())
    print("shape: ", df.shape)
    display(df.nunique())
    #display(df.isna().mean().sort_values())
    print("")

Couldn't find program: 'echo'


In [8]:
#pd.read_csv(filenames[4])

In [9]:
1234567890123456789012345678901234567890123456789012345678901234567890123456789

1234567890123456789012345678901234567890123456789012345678901234567890123456789

# 2. Preprocessing

## 2.1. application_train_test

### 2.1.1. tests

In [10]:
%%script echo skipped
num_rows = None
nan_as_category = False
df = pd.read_csv("../input/home-credit-default-risk/application_train.csv", nrows=num_rows)
test_df = pd.read_csv("../input/home-credit-default-risk/application_test.csv", nrows=num_rows)
print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
df = df.append(test_df).reset_index()
display(df.head(2))

Couldn't find program: 'echo'


In [11]:
%%script echo skipped
pd.factorize(df["CODE_GENDER"])

Couldn't find program: 'echo'


### 2.1.2. preprocessing function

In [12]:
def application_train_test(num_rows=None, nan_as_category=False):
    """Load the application train and test files and build one feature table.

    The train file (``filenames[1]``) and test file (``filenames[0]``) are
    stacked, the binary categoricals are factorized, the remaining object
    columns are one-hot encoded and a few ratio features are added.

    Parameters
    ----------
    num_rows : int or None
        Forwarded to ``pd.read_csv`` as ``nrows``; ``None`` reads everything.
    nan_as_category : bool, default False
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Combined frame; ``reset_index`` keeps the pre-stacking row labels in
        an ``index`` column.

    Notes
    -----
    Reads the module-level ``filenames`` and ``mini``. When ``mini`` is truthy
    only the last 10 train rows and last 2 test rows are kept.
    """
    # Read data and merge
    train_df = pd.read_csv(filenames[1], nrows=num_rows)
    test_df = pd.read_csv(filenames[0], nrows=num_rows)
    if mini:
        train_df = train_df[-10:]
        test_df = test_df[-2:]
    print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way and also works on older pandas versions.
    df = pd.concat([train_df, test_df]).reset_index()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df["CODE_GENDER"] != "XNA"]

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY"]:
        df[bin_feature], _ = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, _ = one_hot_encoder(df, nan_as_category)

    # DAYS_EMPLOYED uses 365243 as a placeholder value -> NaN. Assign the
    # result back instead of a chained in-place replace, which is not
    # guaranteed to reach the parent frame.
    df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace(365243, np.nan)
    # Some simple new features (percentages)
    df["DAYS_EMPLOYED_PERC"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
    df["INCOME_CREDIT_PERC"] = df["AMT_INCOME_TOTAL"] / df["AMT_CREDIT"]
    df["INCOME_PER_PERSON"] = df["AMT_INCOME_TOTAL"] / df["CNT_FAM_MEMBERS"]
    df["ANNUITY_INCOME_PERC"] = df["AMT_ANNUITY"] / df["AMT_INCOME_TOTAL"]
    df["PAYMENT_RATE"] = df["AMT_ANNUITY"] / df["AMT_CREDIT"]
    del train_df, test_df
    gc.collect()
    return df

## 2.2. bureau_and_balance

### 2.2.1. tests

In [13]:
%%script echo skipped
bureau = pd.read_csv("../input/home-credit-default-risk/bureau.csv", nrows=num_rows)
bb = pd.read_csv("../input/home-credit-default-risk/bureau_balance.csv", nrows=num_rows)
bb, bb_cat = one_hot_encoder(bb, nan_as_category)
bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
bb_aggregations = {"MONTHS_BALANCE": ["min", "max", "size"]}
for col in bb_cat:
    bb_aggregations[col] = ["mean"]
print(bb_aggregations, "\n", bb.columns, "\n", bureau_cat, "\n", bureau.columns)

Couldn't find program: 'echo'


In [14]:
%%script echo skipped
bb_agg = bb.groupby("SK_ID_BUREAU").agg(bb_aggregations)
bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
bureau = bureau.join(bb_agg, how="left", on="SK_ID_BUREAU")
bureau.drop(["SK_ID_BUREAU"], axis=1, inplace=True)
bureau.head(2)

Couldn't find program: 'echo'


In [15]:
%%script echo skipped
num_aggregations = {
    "DAYS_CREDIT": ["min", "max", "mean", "var"],
    "DAYS_CREDIT_ENDDATE": ["min", "max", "mean"],
    "DAYS_CREDIT_UPDATE": ["mean"],
    "CREDIT_DAY_OVERDUE": ["max", "mean"],
    "AMT_CREDIT_MAX_OVERDUE": ["mean"],
    "AMT_CREDIT_SUM": ["max", "mean", "sum"],
    "AMT_CREDIT_SUM_DEBT": ["max", "mean", "sum"],
    "AMT_CREDIT_SUM_OVERDUE": ["mean"],
    "AMT_CREDIT_SUM_LIMIT": ["mean", "sum"],
    "AMT_ANNUITY": ["max", "mean"],
    "CNT_CREDIT_PROLONG": ["sum"],
    "MONTHS_BALANCE_MIN": ["min"],
    "MONTHS_BALANCE_MAX": ["max"],
    "MONTHS_BALANCE_SIZE": ["mean", "sum"]
}
# Bureau and bureau_balance categorical features
cat_aggregations = {}
for cat in bureau_cat: cat_aggregations[cat] = ["mean"]
for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ["mean"]

bureau_agg = bureau.groupby("SK_ID_CURR").agg({**num_aggregations, **cat_aggregations})
bureau_agg.columns = pd.Index(["BURO_" + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
# Bureau: Active credits - using only numerical aggregations
active = bureau[bureau["CREDIT_ACTIVE_Active"] == 1]
active.head(2)

Couldn't find program: 'echo'


### 2.2.2. preprocessing function

In [16]:
def bureau_and_balance(num_rows=None, nan_as_category=True):
    """Aggregate bureau.csv and bureau_balance.csv to one row per SK_ID_CURR.

    bureau_balance is first aggregated per ``SK_ID_BUREAU`` and joined onto
    bureau; the result is aggregated per ``SK_ID_CURR`` (``BURO_`` columns),
    then the numeric aggregations are repeated for the rows flagged
    ``CREDIT_ACTIVE_Active`` (``ACTIVE_``) and ``CREDIT_ACTIVE_Closed``
    (``CLOSED_``) and left-joined.

    Parameters
    ----------
    num_rows : int or None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder`` for both files.

    Returns
    -------
    pandas.DataFrame
        Aggregated features indexed by ``SK_ID_CURR``.

    Notes
    -----
    Reads the module-level ``filenames`` and ``mini``; when ``mini`` is truthy
    each file is cut down to the slice ``[-5:-3]``.
    """
    def _flat_names(prefix, agg_frame):
        # Collapse the (column, statistic) pairs into PREFIX + COLUMN_STAT names.
        return pd.Index([prefix + col + "_" + stat.upper()
                         for col, stat in agg_frame.columns.tolist()])

    def _status_stats(frame, flag_column, prefix):
        # Numeric aggregations restricted to the rows where `flag_column` is 1.
        subset = frame[frame[flag_column] == 1]
        stats = subset.groupby("SK_ID_CURR").agg(num_aggregations)
        stats.columns = _flat_names(prefix, stats)
        return stats

    bureau = pd.read_csv(filenames[2], nrows=num_rows)
    bb = pd.read_csv(filenames[3], nrows=num_rows)
    if mini:
        bureau = bureau[-5:-3]
        bb = bb[-5:-3]
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: one row per SK_ID_BUREAU, then attach to bureau.
    bb_aggregations = {
        "MONTHS_BALANCE": ["min", "max", "size"],
        **{col: ["mean"] for col in bb_cat},
    }
    bb_agg = bb.groupby("SK_ID_BUREAU").agg(bb_aggregations)
    bb_agg.columns = _flat_names("", bb_agg)
    bureau = bureau.join(bb_agg, how="left", on="SK_ID_BUREAU")
    bureau.drop(["SK_ID_BUREAU"], axis=1, inplace=True)
    del bb, bb_agg
    gc.collect()

    # Numeric features of bureau and of the aggregated bureau balance.
    num_aggregations = {
        "DAYS_CREDIT": ["min", "max", "mean", "var"],
        "DAYS_CREDIT_ENDDATE": ["min", "max", "mean"],
        "DAYS_CREDIT_UPDATE": ["mean"],
        "CREDIT_DAY_OVERDUE": ["max", "mean"],
        "AMT_CREDIT_MAX_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_DEBT": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM_LIMIT": ["mean", "sum"],
        "AMT_ANNUITY": ["max", "mean"],
        "CNT_CREDIT_PROLONG": ["sum"],
        "MONTHS_BALANCE_MIN": ["min"],
        "MONTHS_BALANCE_MAX": ["max"],
        "MONTHS_BALANCE_SIZE": ["mean", "sum"]
    }
    # Categorical (one-hot) features: bureau dummies and the per-bureau means
    # of the bureau_balance dummies.
    cat_aggregations = {
        **{cat: ["mean"] for cat in bureau_cat},
        **{cat + "_MEAN": ["mean"] for cat in bb_cat},
    }

    bureau_agg = bureau.groupby("SK_ID_CURR").agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = _flat_names("BURO_", bureau_agg)

    # Active credits - numeric aggregations only.
    active_agg = _status_stats(bureau, "CREDIT_ACTIVE_Active", "ACTIVE_")
    bureau_agg = bureau_agg.join(active_agg, how="left", on="SK_ID_CURR")
    del active_agg
    gc.collect()

    # Closed credits - numeric aggregations only.
    closed_agg = _status_stats(bureau, "CREDIT_ACTIVE_Closed", "CLOSED_")
    bureau_agg = bureau_agg.join(closed_agg, how="left", on="SK_ID_CURR")
    del closed_agg, bureau
    gc.collect()
    return bureau_agg

### 2.2.3. more tests

In [17]:
%%script echo skipped
df = pd.read_csv(filenames[2])[-5:]
df

Couldn't find program: 'echo'


In [18]:
%%script echo skipped
prev.columns

Couldn't find program: 'echo'


## 2.3. previous_applications

### 2.3.1. preprocessing function

In [19]:
def previous_applications(num_rows=None, nan_as_category=True):
    """Aggregate previous_application.csv to one row per SK_ID_CURR.

    Produces ``PREV_`` aggregates over all previous applications plus numeric
    aggregates restricted to approved (``APPROVED_``) and refused
    (``REFUSED_``) applications.

    Parameters
    ----------
    num_rows : int or None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default is unchanged, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Aggregated features indexed by ``SK_ID_CURR``.

    Notes
    -----
    Reads the module-level ``filenames`` and ``mini``; when ``mini`` is truthy
    the file is cut down to the slice ``[-7:-5]``.
    """
    prev = pd.read_csv(filenames[7], nrows=num_rows)
    if mini:
        prev = prev[-7:-5]
    prev, cat_cols = one_hot_encoder(prev, nan_as_category=nan_as_category)
    # The DAYS_* columns use 365243 as a placeholder value -> NaN. One
    # assignment replaces five chained in-place calls, which are not
    # guaranteed to reach the parent frame.
    days_columns = [
        "DAYS_FIRST_DRAWING",
        "DAYS_FIRST_DUE",
        "DAYS_LAST_DUE_1ST_VERSION",
        "DAYS_LAST_DUE",
        "DAYS_TERMINATION",
    ]
    prev[days_columns] = prev[days_columns].replace(365243, np.nan)
    # Add feature: value ask / value received percentage
    prev["APP_CREDIT_PERC"] = prev["AMT_APPLICATION"] / prev["AMT_CREDIT"]
    # Previous applications numeric features
    num_aggregations = {
        "AMT_ANNUITY": ["min", "max", "mean"],
        "AMT_APPLICATION": ["min", "max", "mean"],
        "AMT_CREDIT": ["min", "max", "mean"],
        "APP_CREDIT_PERC": ["min", "max", "mean", "var"],
        "AMT_DOWN_PAYMENT": ["min", "max", "mean"],
        "AMT_GOODS_PRICE": ["min", "max", "mean"],
        "HOUR_APPR_PROCESS_START": ["min", "max", "mean"],
        "RATE_DOWN_PAYMENT": ["min", "max", "mean"],
        "DAYS_DECISION": ["min", "max", "mean"],
        "CNT_PAYMENT": ["mean", "sum"],
    }
    # Previous applications categorical features
    cat_aggregations = {cat: ["mean"] for cat in cat_cols}

    prev_agg = prev.groupby("SK_ID_CURR").agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(["PREV_" + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev["NAME_CONTRACT_STATUS_Approved"] == 1]
    approved_agg = approved.groupby("SK_ID_CURR").agg(num_aggregations)
    approved_agg.columns = pd.Index(["APPROVED_" + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how="left", on="SK_ID_CURR")
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev["NAME_CONTRACT_STATUS_Refused"] == 1]
    refused_agg = refused.groupby("SK_ID_CURR").agg(num_aggregations)
    refused_agg.columns = pd.Index(["REFUSED_" + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how="left", on="SK_ID_CURR")
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

### 2.3.2. tests

In [20]:
%%script echo skipped
df = pd.read_csv(filenames[7])[-7:]
df.NAME_CONTRACT_STATUS

Couldn't find program: 'echo'


In [21]:
%%script echo skipped
df.columns

Couldn't find program: 'echo'


## 2.4. pos_cash

### 2.4.1. preprocessing function

In [22]:
def pos_cash(num_rows=None, nan_as_category=True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default is unchanged, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``POS_``-prefixed aggregates plus ``POS_COUNT`` (rows per client),
        indexed by ``SK_ID_CURR``.

    Notes
    -----
    Reads the module-level ``filenames`` and ``mini``; when ``mini`` is truthy
    only the last 2 rows of the file are kept.
    """
    pos = pd.read_csv(filenames[6], nrows=num_rows)
    if mini:
        pos = pos[-2:]
    pos, cat_cols = one_hot_encoder(pos, nan_as_category=nan_as_category)
    # Features
    aggregations = {
        "MONTHS_BALANCE": ["max", "mean", "size"],
        "SK_DPD": ["max", "mean"],
        "SK_DPD_DEF": ["max", "mean"]
    }
    for cat in cat_cols:
        aggregations[cat] = ["mean"]

    # Group once and reuse the grouping for both the aggregates and the count.
    per_client = pos.groupby("SK_ID_CURR")
    pos_agg = per_client.agg(aggregations)
    pos_agg.columns = pd.Index(["POS_" + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg["POS_COUNT"] = per_client.size()
    del pos, per_client
    gc.collect()
    return pos_agg

## 2.5. installment_payments

### 2.5.1. preprocessing function

In [23]:
def installments_payments(num_rows=None, nan_as_category=True):
    """Aggregate installments_payments.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default is unchanged, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``INSTAL_``-prefixed aggregates plus ``INSTAL_COUNT`` (rows per
        client), indexed by ``SK_ID_CURR``.

    Notes
    -----
    Reads the module-level ``filenames`` and ``mini``; when ``mini`` is truthy
    only the last 2 rows of the file are kept.
    """
    ins = pd.read_csv(filenames[5], nrows=num_rows)
    if mini:
        ins = ins[-2:]
    ins, cat_cols = one_hot_encoder(ins, nan_as_category=nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins["PAYMENT_PERC"] = ins["AMT_PAYMENT"] / ins["AMT_INSTALMENT"]
    ins["PAYMENT_DIFF"] = ins["AMT_INSTALMENT"] - ins["AMT_PAYMENT"]
    # Days past due and days before due (no negative values).
    # Vectorised form of `x if x > 0 else 0`: anything that is not strictly
    # positive, including NaN, becomes 0. This avoids a Python call per row.
    days_late = ins["DAYS_ENTRY_PAYMENT"] - ins["DAYS_INSTALMENT"]
    days_early = ins["DAYS_INSTALMENT"] - ins["DAYS_ENTRY_PAYMENT"]
    ins["DPD"] = days_late.where(days_late > 0, 0)
    ins["DBD"] = days_early.where(days_early > 0, 0)
    # Features: Perform aggregations
    aggregations = {
        "NUM_INSTALMENT_VERSION": ["nunique"],
        "DPD": ["max", "mean", "sum"],
        "DBD": ["max", "mean", "sum"],
        "PAYMENT_PERC": ["max", "mean", "sum", "var"],
        "PAYMENT_DIFF": ["max", "mean", "sum", "var"],
        "AMT_INSTALMENT": ["max", "mean", "sum"],
        "AMT_PAYMENT": ["min", "max", "mean", "sum"],
        "DAYS_ENTRY_PAYMENT": ["max", "mean", "sum"]
    }
    for cat in cat_cols:
        aggregations[cat] = ["mean"]
    # Group once and reuse the grouping for both the aggregates and the count.
    per_client = ins.groupby("SK_ID_CURR")
    ins_agg = per_client.agg(aggregations)
    ins_agg.columns = pd.Index(["INSTAL_" + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg["INSTAL_COUNT"] = per_client.size()
    del ins, per_client
    gc.collect()
    return ins_agg

## 2.6. credit_card_balance

### 2.6.1. preprocessing function

In [24]:
def credit_card_balance(num_rows=None, nan_as_category=True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    Every remaining column is aggregated with min, max, mean, sum and var.

    Parameters
    ----------
    num_rows : int or None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default is unchanged, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``CC_``-prefixed aggregates plus ``CC_COUNT`` (rows per client),
        indexed by ``SK_ID_CURR``.

    Notes
    -----
    Reads the module-level ``filenames`` and ``mini``; when ``mini`` is truthy
    only the last 2 rows of the file are kept.
    """
    cc = pd.read_csv(filenames[4], nrows=num_rows)
    if mini:
        cc = cc[-2:]
    cc, _ = one_hot_encoder(cc, nan_as_category=nan_as_category)
    # General aggregations; SK_ID_PREV is an identifier and is left out.
    cc = cc.drop(columns=["SK_ID_PREV"])
    # Group once and reuse the grouping for both the aggregates and the count.
    per_client = cc.groupby("SK_ID_CURR")
    cc_agg = per_client.agg(["min", "max", "mean", "sum", "var"])
    cc_agg.columns = pd.Index(["CC_" + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg["CC_COUNT"] = per_client.size()
    del cc, per_client
    gc.collect()
    return cc_agg

# 3. functions from the original notebook

In [25]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """Train LightGBM with K-fold cross-validation and average the test predictions.

    Rows with a non-null ``TARGET`` are used for training and validation. Rows
    with a null ``TARGET`` are scored by every fold model and the fold
    predictions are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing ``TARGET`` and ``SK_ID_CURR``.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
    debug : bool, default False
        When False, a submission CSV is written to the path held in the
        module-level ``submission_file_name``.

    Returns
    -------
    pandas.DataFrame
        One row per (feature, fold) with columns ``feature``, ``importance``
        and ``fold``.
    """
    # Divide in training/validation and test data
    train_df = df[df["TARGET"].notnull()]
    test_df = df[df["TARGET"].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays to store out-of-fold and averaged test predictions
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    # Per-fold importance frames are collected and concatenated once at the
    # end, rather than growing a frame that starts out empty.
    fold_importances = []
    feats = [f for f in train_df.columns if f not in ["TARGET","SK_ID_CURR","SK_ID_BUREAU","SK_ID_PREV","index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df["TARGET"])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df["TARGET"].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df["TARGET"].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )

        # NOTE(review): recent LightGBM releases expect early stopping and
        # logging to be passed as callbacks rather than as fit() keywords -
        # confirm against the installed version before changing this call.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric="auc", verbose=200, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes 1/n_splits of the final test prediction.
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))
        print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)
    print("Full AUC score %.6f" % roc_auc_score(train_df["TARGET"], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        # Build the submission in a new frame instead of assigning into
        # `test_df`, which is a filtered slice of the input.
        submission = test_df[["SK_ID_CURR"]].assign(TARGET=sub_preds)
        submission.to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df

In [26]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Frame with at least the columns ``feature`` and ``importance``; the
        per-row values of the selected features are passed to the bar plot.

    Notes
    -----
    The figure is written to ``lgbm_importances01.png``.
    """
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    top_features = mean_importance.sort_values(by="importance", ascending=False)[:40].index
    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ordered = best_features.sort_values(by="importance", ascending=False)
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=ordered)
    plt.title("LightGBM Features (avg over folds)")
    plt.tight_layout()
    plt.savefig("lgbm_importances01.png")

In [27]:
def main(debug=False):
    """Run every preprocessing step and return the joined feature table.

    Parameters
    ----------
    debug : bool, default False
        When True each CSV is read with ``nrows=1000``; otherwise in full.

    Returns
    -------
    pandas.DataFrame
        The application table with the bureau, previous-application,
        POS-cash, installments and credit-card aggregates left-joined on
        ``SK_ID_CURR``.
    """
    num_rows = 1000 if debug else None
    # Setting `mini = 0` here when debug is on was tried and did not take
    # effect in the preprocessing functions; the author was unsure why.
    # NOTE(review): the likely cause is that assigning `mini` inside this
    # function binds a local name, while the preprocessing functions read the
    # module-level `mini` - a `global mini` declaration would be needed.
    df = application_train_test(num_rows)
    print("Application train test df shape:", df.shape)

    # (timer title, feature builder, label printed before the shape)
    stages = (
        ("Process bureau and bureau_balance", bureau_and_balance, "Bureau df shape:"),
        ("Process previous_applications", previous_applications, "Previous applications df shape:"),
        ("Process POS-CASH balance", pos_cash, "Pos-cash balance df shape:"),
        ("Process installments payments", installments_payments, "Installments payments df shape:"),
        ("Process credit card balance", credit_card_balance, "Credit card balance df shape:"),
    )
    for title, build_features, shape_label in stages:
        with timer(title):
            features = build_features(num_rows)
            print(shape_label, features.shape)
            df = df.join(features, how="left", on="SK_ID_CURR")
            # Release the per-table aggregate before loading the next file.
            del features
            gc.collect()
    # The LightGBM training step of the original notebook is not run here:
    #    with timer("Run LightGBM with kfold"):
    #        feat_importance = kfold_lightgbm(df, num_folds=10, stratified=False, debug=debug)
    return df

In [28]:
1234567890123456789012345678901234567890123456789012345678901234567890123456789

1234567890123456789012345678901234567890123456789012345678901234567890123456789

# 4. Run the preprocessing

## 4.1. A first full run just to measure the target imbalance

In [29]:
%%script echo
# I ran this cell only once, just to get the exact values of zo and oz.
if __name__ == "__main__":
    submission_file_name = "submission_kernel02.csv"
    with timer("Full model run"):
#        main()
        df = main(debug=False)

zeros_full = df.TARGET.value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)[0]
ones_full = df.TARGET.value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)[1]
nans_full = df.TARGET.isna().sum()
print("TARGET has",
      f"{zeros_full:10.0f} zeros,",
      f"{ones_full:10.0f} ones and",
      f"{nans_full:10.0f} NaNs")

Train samples: 307511, test samples: 48744
Application train test df shape: (356251, 248)
Bureau df shape: (305811, 116)
Process bureau and bureau_balance - done in 14s
Previous applications df shape: (338857, 249)
Process previous_applications - done in 16s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 8s
Installments payments df shape: (339587, 26)
Process installments payments - done in 22s
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 11s
Full model run - done in 74s
282682.00   24825.00   48744.00


In [31]:
# Class counts printed by the full run above, hard-coded so that the full run
# does not have to be repeated.
zeros_full, ones_full, nans_full = 282682, 24825, 48744

In [36]:
# Imbalance of the full dataset: zeros per one (zo) and ones per zero (oz).
zo = zeros_full/ones_full
oz = ones_full/zeros_full
print("There is zo =",
      f"{zo:.2f} more zeros than ones in TARGET. (and oz =",
      f"{oz:.2f})")

There is zo = 11.39 more zeros than ones in TARGET. (and oz = 0.09)


## 4.2. Subsampled run

### 4.2.1. kinda the original run code

In [None]:
%%script echo
if __name__ == "__main__":
    submission_file_name = "submission_kernel02.csv"
    with timer("Full model run"):
#        main()
        df = main(debug=True)

zeros = df.TARGET.value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)[0]
ones = df.TARGET.value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)[1]
nans = df.TARGET.isna().sum()
print("TARGET has",
      f"{zeros:10.0f} zeros,",
      f"{ones:10.0f} ones and",
      f"{nans:10.0f} NaNs")

### 4.2.2. An easier to debug version

In [None]:
# Same sequence of steps as main(), written out at cell level so that `mini`
# can be reassigned at module scope and each intermediate frame can be inspected.
if __name__ == "__main__":
    submission_file_name = "submission_kernel02.csv"
    with timer("Full model run"):
        debug = True
        # In debug mode only the first 1000 rows of each CSV are read.
        num_rows = 1000 if debug else None
#        if debug: mini = 0
        # Switch the `mini` slicing off in debug mode; otherwise leave it unchanged.
        mini = 0 if debug else mini
        df = application_train_test(num_rows)
        print("Application train test df shape:", df.shape)
#        print(df.dtypes.value_counts())
        with timer("Process bureau and bureau_balance"):
            bureau = bureau_and_balance(num_rows)
            print("Bureau df shape:", bureau.shape)
            df = df.join(bureau, how="left", on="SK_ID_CURR")
#            print(df.dtypes.value_counts())
            del bureau
            gc.collect()
        with timer("Process previous_applications"):
            prev = previous_applications(num_rows)
            print("Previous applications df shape:", prev.shape)
            df = df.join(prev, how="left", on="SK_ID_CURR")
#            print(df.dtypes.value_counts())
            del prev
            gc.collect()
        with timer("Process POS-CASH balance"):
            pos = pos_cash(num_rows)
            print("Pos-cash balance df shape:", pos.shape)
            df = df.join(pos, how="left", on="SK_ID_CURR")
#            print(df.dtypes.value_counts())
            del pos
            gc.collect()
        with timer("Process installments payments"):
            ins = installments_payments(num_rows)
            print("Installments payments df shape:", ins.shape)
            df = df.join(ins, how="left", on="SK_ID_CURR")
#            print(df.dtypes.value_counts())
            del ins
            gc.collect()
        with timer("Process credit card balance"):
            cc = credit_card_balance(num_rows)
            print("Credit card balance df shape:", cc.shape)
            df = df.join(cc, how="left", on="SK_ID_CURR")
#            print(df.dtypes.value_counts())
            del cc
            gc.collect()

# Class counts of TARGET in the subsampled frame: label 0, label 1 and missing.
zeros = df.TARGET.value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)[0]
ones = df.TARGET.value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)[1]
nans = df.TARGET.isna().sum()
print("TARGET has",
      f"{zeros:10.0f} zeros,",
      f"{ones:10.0f} ones and",
      f"{nans:10.0f} NaNs")

### 4.2.3. Measure of the target imbalance after the subsampling is done.

In [None]:
# Imbalance of the subsampled frame: zeros per one (zo_sub) and ones per zero (oz_sub).
zo_sub = zeros/ones
oz_sub = ones/zeros
print("There is zo_sub =",
      f"{zo_sub:.2f} more zeros than ones in TARGET. (and oz_sub =",
      f"{oz_sub:.2f})")

In [45]:
# Compare the imbalance of the full dataset (zo) with that of the subsample
# (zo_sub). A ratio of at least 3/2 or at most 2/3 is reported as a strong
# change, framed by lines of "!" to make it stand out.
if (zo/zo_sub >= 3/2 or zo/zo_sub <= 2/3):
    for _ in range(8):
        print("!")
    print(
        "L'imbalance des targets a été fortement modifiée par le subsampling.",
        f"Elle est passée de {zo:.2f} à {zo_sub:.2f}.",
    )
    for _ in range(8):
        print("!")
else:
    # Within tolerance: just report both values.
    print(
        f"L'imbalance de {zo:.2f} des targets du dataset est de {zo_sub:.2f}",
        "après subsampling.",
    )

L'imbalance de 11.39 des targets du dataset est de 11.00 après subsampling.


In [None]:
1234567890123456789012345678901234567890123456789012345678901234567890123456789

# 5. Tentatives infructueuses de sauvegarde du dataset nettoyé dans Kaggle

In [None]:
#df.to_csv("df_agg.csv")

In [None]:
#df.to_csv("../input/df_agg.csv")

In [None]:
#df.to_csv("../df_agg.csv")

In [None]:
#df.to_csv("../working/df_agg.csv")

In [None]:
#pd.read_csv('../df_agg.csv')

In [None]:
%%script echo skipped
df = pd.read_csv("df_agg.csv")
print(df.shape)
df.columns

In [None]:
%%script echo skipped
for dirname, _, filenames in os.walk(".."):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
%%script echo skipped
for dirname, _, filenames in os.walk("/kaggle"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
#!pwd

In [None]:
#!ls

In [None]:
#df.dtypes.value_counts()

In [None]:
#df.info()

In [None]:
#df.isna().mean()

# 6. Suppression du caractère illisible

In [None]:
# Snapshot of the current column names as a plain Python list.
cols_of_df = df.columns.tolist()

In [None]:
%%script echo skipped
# This code takes forever to run. Use the cell below instead.
# NOTE(review): every rename looks up the ORIGINAL name j, so as soon as one
# substitution has renamed the column the following ones no longer match it;
# ":" is also handled twice ("_c_" then "_f_").
for j in cols_of_df:
    df = df.rename(columns={j: re.sub(r"[ ]", r"_a_", j)})
    df = df.rename(columns={j: re.sub(r"[-]", r"_b_", j)})
    df = df.rename(columns={j: re.sub(r"[:]", r"_c_", j)})
    df = df.rename(columns={j: re.sub(r"[/]", r"_d_", j)})
    df = df.rename(columns={j: re.sub(r"[,]", r"_e_", j)})
    df = df.rename(columns={j: re.sub(r"[:]", r"_f_", j)})

In [None]:
%%script echo skipped
# Skipped cell: single-pass rename that replaces " ", "-", ":", "/" and ","
# in every column name.
# NOTE(review): the last .replace(":", "_f_") can never match, because ":" has
# already been turned into "_c_" earlier in the chain.
df = df.rename(columns=lambda x: x.replace(" ", "_a_")\
                                  .replace("-", "_b_")\
                                  .replace(":", "_c_")\
                                  .replace("/", "_d_")\
                                  .replace(",", "_e_")\
                                  .replace(":", "_f_"))

In [None]:
# Replace every ":" in the column names with the "_f_" placeholder.
df = df.rename(
    mapper=lambda column_name: column_name.replace(":", "_f_"),
    axis="columns",
)

# 7. Classification run from the original notebook

In [None]:
%%script echo skipped
# Skipped cell: would time a 10-fold, non-stratified kfold_lightgbm run on df
# and keep its return value as feat_importance.
with timer("Run LightGBM with kfold"):
    feat_importance = kfold_lightgbm(df, num_folds=10, stratified=False, debug=False)

# 8. Classifications

## 8.1. train_test_split "à la main"

In [None]:
%%script echo mauvaise idée
#surtout pas de random split car TARGET a des NaN en plus des 0 ou 1
X = df.drop("TARGET", axis="columns")
y = df["TARGET"]
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.2, random_state=42)

In [None]:
# 79-digit ruler, presumably used to check line width; as code it is only an
# integer literal that gets displayed.
1234567890123456789012345678901234567890123456789012345678901234567890123456789

In [None]:
# Manual split on label availability: rows with a TARGET value become the
# training set, rows whose TARGET is NaN become the "test" set.
has_target = df["TARGET"].notnull()

train_df = df[has_target]
test_df = df[~has_target]

# Features are every column except TARGET; the label is TARGET itself.
X_train, y_train = train_df.drop(columns="TARGET"), train_df["TARGET"]
X_test, y_test = test_df.drop(columns="TARGET"), test_df["TARGET"]

In [None]:
# Class counts of the training labels.
y_train.value_counts()

## 8.2. SimpleImputer

In [None]:
# Keep a copy of the un-imputed test features; the imputation cell reads this
# copy and overwrites X_test.
X_test_old = X_test.copy()

In [None]:
from sklearn.impute import SimpleImputer

# Median imputation of missing feature values. The medians are learned on the
# training features only.
imputer = SimpleImputer(strategy="median")
X_train_i = imputer.fit_transform(X_train)
# transform(), not fit_transform(): the test set must be filled with the
# medians learned on the training set, not with medians refitted on itself.
X_test = imputer.transform(X_test_old)

## 8.3. Balancing the target

In [None]:
%%script echo skipped
# Skipped alternative: random undersampling of the majority class of the
# imputed training set.
from imblearn.under_sampling import RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy="majority")
X_train_u, y_train_u = undersampler.fit_resample(X_train_i, y_train)

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the training target by oversampling with SMOTE on the imputed
# features. (A RandomOverSampler(sampling_strategy='minority') was the
# alternative considered.)
# Seeded like the other estimators of this notebook so reruns give the same
# synthetic samples.
oversampler = SMOTE(random_state=42)
X_train_o, y_train_o = oversampler.fit_resample(X_train_i, y_train)

## 8.4. Declaring the classifiers

In [None]:
%%script echo Cette approche ne permet pas le SearchCV multi-estimateurs
# Skipped cell: name -> estimator dictionary. Per the echo message above, this
# layout does not allow the multi-estimator SearchCV.
classifiers = {
    "dummy": DummyClassifier(),
    "naive_bayes": GaussianNB(),
    "sgd": SGDClassifier(),
    "knn": KNeighborsClassifier(),
    "svc": SVC(),
    "random_forest": RandomForestClassifier(),
    "xgb": XGBClassifier(),
    "catboost": CatBoostClassifier(),
    "adaboost": AdaBoostClassifier(),
    "lightgbm": LGBMClassifier(),
}

In [None]:
# Not very pretty, but it works: one named variable per candidate estimator.
# Every estimator that accepts a seed gets random_state=42 for reproducibility.
clf0 = DummyClassifier()
clf1 = LogisticRegression(random_state=42)
# SGDClassifier shuffles the data, so it is seeded like its siblings.
clf2 = SGDClassifier(random_state=42)
clf3 = GaussianNB()
clf4 = MultinomialNB()
clf5 = KNeighborsClassifier()
# probability=True enables predict_proba on the SVC.
clf6 = SVC(probability=True, random_state=42)
clf7 = DecisionTreeClassifier(random_state=42)
clf8 = RandomForestClassifier(random_state=42)
clf9 = GradientBoostingClassifier(random_state=42)
clf10 = AdaBoostClassifier(random_state=42)
clf11 = XGBClassifier(random_state=42)
clf12 = CatBoostClassifier(random_state=42)
clf13 = LGBMClassifier(random_state=42)

In [None]:
# I like this form better, but it might get in the way of the SearchCV if each
# classifier is not stored in its own variable (?)
# The position in this list is the index used by the params[i] search spaces.
classifiers = [
    DummyClassifier(),
    LogisticRegression(random_state=42),
    # Seeded like the other estimators: SGD shuffles the training data.
    SGDClassifier(random_state=42),
    GaussianNB(),
    MultinomialNB(),
    KNeighborsClassifier(),
    SVC(probability=True, random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    XGBClassifier(random_state=42),
    CatBoostClassifier(random_state=42),
    LGBMClassifier(random_state=42),
]
#classifiers
#classifiers

## 8.5. Declaring the classifiers' parameters

In [None]:
# List of search spaces: the following cells append one dict per classifier.
params = []

### 8.5.0. DummyClassifier

In [None]:
# Search space for DummyClassifier (entry 0 of params). The "classifier" key
# carries the estimator itself; "classifier__*" keys are its hyper-parameters.
params.append({
    "classifier__strategy": ["most_frequent", "prior"],
    "classifier": [classifiers[0]],
})

### 8.5.1. LogisticRegression

In [None]:
# Search space for LogisticRegression (entry 1 of params).
params.append({
    "classifier__C": [10**-2,
                      10**-1,
                      10**0,
                      10**1,
                      10**2],
    "classifier__penalty": ["l1",
                            "l2",
                            "elasticnet"],
    # The default "lbfgs" solver only supports the "l2" penalty; "saga" is the
    # solver that handles "l1", "l2" and "elasticnet".
    "classifier__solver": ["saga"],
    # "elasticnet" requires an explicit l1_ratio; the value is not used by the
    # other penalties.
    "classifier__l1_ratio": [0.5],
    # An oversampling has already been done, so the weights should no longer
    # be needed; they are only tested here.
    "classifier__class_weight": [None,
                                 {0: oz, 1: zo}],
    "classifier": [classifiers[1]],
})

### 8.5.2. SGDClassifier

In [None]:
# Search space for SGDClassifier (entry 2 of params).
# NOTE(review): recent scikit-learn releases name the "log" loss "log_loss";
# confirm against the installed version.
params.append({
    "classifier__loss": ["hinge", "log", "squared_hinge", "modified_huber"],
    "classifier__penalty": ["l2", "l1", "elasticnet"],
    "classifier": [classifiers[2]],
})

### 8.5.3. GaussianNB

In [None]:
# Search space for GaussianNB (entry 3 of params): no hyper-parameter is tuned,
# only the estimator itself is listed.
params.append({"classifier": [classifiers[3]]})

### 8.5.4. MultinomialNB

In [None]:
# Search space for MultinomialNB (entry 4 of params).
# NOTE(review): MultinomialNB expects non-negative features; verify that the
# imputed training matrix satisfies this.
params.append({
    "classifier__alpha": [10**0, 10**1, 10**2],
    "classifier": [classifiers[4]],
})

### 8.5.5. KNeighborsClassifier

In [None]:
# Search space for KNeighborsClassifier (entry 5 of params).
params.append({
    # n_neighbors must be an integer: the log-spaced values 10**0.5, 10**1,
    # 10**1.5 and 10**2 are rounded to whole numbers.
    "classifier__n_neighbors": [3,
                                10,
                                32,
                                100],
    "classifier__weights": ["uniform",
                            "distance"],
    "classifier": [classifiers[5]],
})

### 8.5.6. SVC

In [None]:
# Search space for SVC (entry 6 of params).
params.append({
    "classifier__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "classifier__C": [10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3],
    "classifier__gamma": ["auto", "scale"],
    # An oversampling has already been done, so class weights are no longer
    # needed (the {0: oz, 1: zo} weighting is left out).
    "classifier__class_weight": [None],
    "classifier": [classifiers[6]],
})

### 8.5.7. DecisionTreeClassifier

In [None]:
# Search space for DecisionTreeClassifier (entry 7 of params).
params.append({
    "classifier__max_depth": [3,
                              10**1,
                              30,
                              None],
    # min_samples_split must be an integer sample count (a float is read as a
    # fraction of the samples and must be at most 1.0), so 10**0.5 is rounded
    # to 3.
    "classifier__min_samples_split": [3,
                                      10],
    # An oversampling has already been done, so class weights are no longer
    # needed (the {0: oz, 1: zo} weighting is left out).
    "classifier__class_weight": [None],
    "classifier": [classifiers[7]],
})

### 8.5.8. RandomForestClassifier

In [None]:
# Search space for RandomForestClassifier (entry 8 of params).
params.append({
    "classifier__n_estimators": [10**1, 10**2, 10**3],
    "classifier__max_depth": [3, 10**1, 30],
    "classifier__criterion": ["gini", "entropy"],
    # An oversampling has already been done, so class weights are no longer
    # needed (the {0: oz, 1: zo} weighting is left out).
    "classifier__class_weight": [None],
    "classifier": [classifiers[8]],
})

### 8.5.9. GradientBoostingClassifier

In [None]:
# Search space for GradientBoostingClassifier (entry 9 of params).
params.append({
    "classifier__n_estimators": [10**1, 10**2, 10**3],
    "classifier__max_depth": [3, 10**1, 30],
    "classifier": [classifiers[9]],
})

### 8.5.10. AdaBoostClassifier

In [None]:
# Search space for AdaBoostClassifier (entry 10 of params).
params.append({
    "classifier__n_estimators": [10**1, 10**2, 10**3],
    "classifier__learning_rate": [10**-3, 10**-2, 10**-1],
    "classifier": [classifiers[10]],
})

### 8.5.11. XGBClassifier

In [None]:
# Search space for XGBClassifier (entry 11 of params).
params.append({
    "classifier__booster": ["gbtree", "gblinear", "dart"],
    "classifier__learning_rate": [10**-3, 10**-2, 10**-1],
    "classifier__max_depth": [10**0, 3, 10**1],
    "classifier": [classifiers[11]],
})

### 8.5.12. CatBoostClassifier

In [None]:
# Search space for CatBoostClassifier (entry 12 of params).
params.append({
    "classifier__iterations": [10**1, 10**2, 10**3],
    "classifier__learning_rate": [10**-3, 10**-2, 10**-1],
    "classifier__depth": [10**0, 3, 10**1],
    "classifier": [classifiers[12]],
})

### 8.5.13. LGBMClassifier

In [None]:
# Search space for LGBMClassifier (entry 13 of params).
params.append({
    "classifier__boosting_type": ["gbdt",
                                  "dart",
                                  "goss"],
    "classifier__learning_rate": [10**-3,
                                  10**-2,
                                  10**-1],
    # num_leaves must be an integer: the log-spaced value 10**1.5 is rounded
    # to 32.
    "classifier__num_leaves": [10,
                               32,
                               100],
    "classifier": [classifiers[13]],
})

### 8.5.14. Déclaration des paramètres sous la forme d'un dictionnaire

In [None]:
%%script echo
# I would have preferred to declare the inputs like this, but the exact keyword
# "classifier" is always required for the multi-estimator SearchCV to work, so
# a dictionary cannot be used for that.
param_distributions = {
    "dummy": {"class_weight": [{0: 1, 1: 1}, {0: oz, 1: zo}]},
    "sgd": {"loss": ["hinge", "log", "squared_hinge", "modified_huber"],
            "penalty": ["l2", "l1", "elasticnet"],
            "class_weight": [{0: oz, 1: zo}]},
    "naive_bayes": {"class_weight": [{0: oz, 1: zo}]},
    "knn": {"n_neighbors": range(1, 11),
            "weights": ["uniform", "distance"],
            "class_weight": [{0: oz, 1: zo}]},
    "svc": {"kernel": ["linear", "rbf", "poly", "sigmoid"],
            "C": [0.1, 1, 10, 100, 1000],
            "gamma": ["auto", "scale"],
            "class_weight": [{0: oz, 1: zo}]},
    "random_forest": {"n_estimators": [10, 100, 1000],
                      "criterion": ["gini", "entropy"],
                      "class_weight": [{0: oz, 1: zo}]},
    "adaboost": {"n_estimators": [10, 100, 1000],
                 "learning_rate": [0.1, 0.01, 0.001],
                 "class_weight": [{0: oz, 1: zo}]},
    "xgb": {"booster": ["gbtree", "gblinear", "dart"],
            "learning_rate": [0.1, 0.01, 0.001],
            "max_depth": range(1, 11),
            "class_weight": [{0: oz, 1: zo}]},
    "catboost": {"iterations": [10, 100, 1000],
                 "learning_rate": [0.1, 0.01, 0.001],
                 "depth": range(1, 11),
                 "class_weight": [{0: oz, 1: zo}]},
    "lightgbm": {"boosting_type": ["gbdt", "dart", "goss"],
                 "learning_rate": [0.1, 0.01, 0.001],
                 "num_leaves": range(10, 110, 10),
                 "class_weight": [{0: oz, 1: zo}]},
}

In [None]:
%%script echo
# Skipped cell: reduced variant with empty search spaces for two estimators.
param_distributions = {
    "dummy": {},
    "naive_bayes": {},
}

## 8.6. Tentative infructueuse d'itérer sur les classifiers

In [None]:
%%script echo
# No. Pipeline does not work like this at all.
pipe = Pipeline(steps=[(name, estimator) for name, estimator in classifiers.items()])

In [None]:
%%script echo
# No. Pipeline does not work like this either. It only takes a single
# estimator at a time.
# Maybe putting a ColumnTransformer at the end of the pipe, declaring all the
# estimators in parallel, could work, but I doubt it.
rs_cv = RandomizedSearchCV(estimator=Pipeline(classifiers),
                           param_distributions=param_distributions,
                           n_iter=10,
                           cv=5,
                           scoring=["roc_auc", "accuracy"],
                           refit="roc_auc",
                           random_state=42)

In [None]:
%%script echo
# Skipped cell (part of the unsuccessful attempt): fit rs_cv on the oversampled
# training set, then report the best parameters and scores per algorithm.
# NOTE(review): best_params_ / best_score_ are indexed by algorithm name here,
# which does not look like what a fitted SearchCV exposes; confirm before
# re-enabling.
#rs_cv.fit(X_train, y_train)
#rs_cv.fit(X_train_u, y_train_u)
rs_cv.fit(X_train_o, y_train_o)

for algorithm in classifiers.keys():
    print(f"Best parameters for {algorithm}: {rs_cv.best_params_[algorithm]}")
    print(f"Best AUC score for {algorithm}: {rs_cv.best_score_[algorithm]['roc_auc']:.3f}")
    print(f"Best accuracy score for {algorithm}: {rs_cv.best_score_[algorithm]['accuracy']:.3f}")

#best_algorithm = rs_cv.best_estimator_.keys()[0]
best_algorithm = rs_cv.best_estimator_.named_steps.keys()
print(f"Overall best algorithm: {best_algorithm}")
print(f"Best AUC score: {rs_cv.best_score_[best_algorithm]['roc_auc']:.3f}")
print(f"Best accuracy score: {rs_cv.best_score_[best_algorithm]['accuracy']:.3f}")

## 8.7. Itération sur les classifiers avec une méthode étrange
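En fait ici on ne donne au SearchCV qu'une seule pipe de travail ne contenant
qu'un seul estimateur, mais il peut quand même tirer ses candidats parmi tous
les classifiers car ils sont tous renseignés dans `params`. Attention : un
`RandomizedSearchCV` n'évalue que `n_iter` candidats tirés au hasard, donc tous
les classifiers ne sont pas forcément essayés.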

In [None]:
# Single-step pipeline. The step is named "classifier" so that the
# "classifier" / "classifier__*" keys of the search spaces can address it.
pipeline = Pipeline(steps=[("classifier", clf0)])

In [None]:
%%time
rs = RandomizedSearchCV(
    pipeline,
    params,
    n_iter=5,
    cv=3,
    n_jobs=-1,
    scoring="roc_auc"
#).fit(X_train, y_train)
#).fit(X_train_u, y_train_u)
).fit(X_train_o, y_train_o)

## 8.8. Results

In [None]:
# Parameter setting (including the selected classifier) of the best candidate.
rs.best_params_

In [None]:
# Mean cross-validated score (ROC AUC, per the scoring argument) of the best
# candidate.
rs.best_score_

In [None]:
# Value counts of the test labels, NaN included (dropna=False).
y_test.value_counts(dropna=False)

In [None]:
%%script echo
# There are no targets in our test set; it is actually a predict set, not a
# test set, so these scores cannot be computed.
# NOTE(review): scikit-learn metrics take (y_true, y_pred); the predictions are
# passed first here. Confirm the argument order before re-enabling.
print("Precision:",
      precision_score(rs.predict(X_test), y_test))
print("Recall:",
      recall_score(rs.predict(X_test), y_test))
print("ROC AUC Score:",
      roc_auc_score(rs.predict(X_test), y_test))

# 9. Fine tuning with the best classifier

## 9.1. GridSearchCV

## 9.2. Results

# 10. Applying the fine-tuned best classifier for the final results

In [None]:
#y_pred = rs_cv.best_estimator_[best_algorithm].predict(X_test)

# 11. Features' importance

## 11.1. SHAP

## 11.2. LIME