# 0. Introduction

# 1. Initialization

## 1.1. (most) package imports

In [2]:
# kaggle/python Docker image: https://github.com/kaggle/docker-python

import gc
import time
import os
import re
from contextlib import contextmanager
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import math
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 20, "display.max_columns", 30)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import imblearn.pipeline
#from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

## 1.3. Global functions

In [108]:
def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == "object"]
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [109]:
le_path = "home-credit-default-risk/"

In [110]:
filenames = []
for dirname, _, filenamess in os.walk(le_path):
    for filenamee in filenamess:
#                        HomeCredit_columns_description.csv est illisible.
        if filenamee != "HomeCredit_columns_description.csv":
            filename = os.path.join(dirname, filenamee)
#            print(filename)
            filenames.append(filename)
#            df = pd.read_csv(filename)
#            display(df[-1:])

In [3]:
df = pd.DataFrame({
    "A": ["foo", "bar", "foo", "bar", "foo", "xin"],
    "B": [1, 1, 2, 2, 8, -2],
    "C": [1, 2.2, 3.7, 4, 5, 6],
})
display(df)

df_gba = df.groupby("A")

for key, item in df_gba:
    print(df_gba.get_group(key), "\n\n")

result = df_gba.agg({"C": ["mean", "min", "max", "size"],
                     "B": ["min", "max", "size"]})
display(result)

Unnamed: 0,A,B,C
0,foo,1,1.0
1,bar,1,2.2
2,foo,2,3.7
3,bar,2,4.0
4,foo,8,5.0
5,xin,-2,6.0


     A  B    C
1  bar  1  2.2
3  bar  2  4.0 


     A  B    C
0  foo  1  1.0
2  foo  2  3.7
4  foo  8  5.0 


     A  B    C
5  xin -2  6.0 




Unnamed: 0_level_0,C,C,C,C,B,B,B
Unnamed: 0_level_1,mean,min,max,size,min,max,size
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,3.1,2.2,4.0,2,1,2,2
foo,3.233333,1.0,5.0,3,1,8,3
xin,6.0,6.0,6.0,1,-2,-2,1


Comment peut-il join le bb_agg sur le bureau alors que ces deux dfs n'ont ni
lignes ni colonnes en commun ?

# 2. Preprocessing

## 2.1. application_train and application_test

In [111]:
num_rows = 20

In [112]:
df = pd.read_csv(filenames[1], nrows=num_rows)
test_df = pd.read_csv(filenames[0], nrows=num_rows)
bureau = pd.read_csv(filenames[2], nrows=num_rows)
bb = pd.read_csv(filenames[3], nrows=num_rows)
prev = pd.read_csv(filenames[7], nrows=num_rows)
pos = pd.read_csv(filenames[6], nrows=num_rows)
ins = pd.read_csv(filenames[5], nrows=num_rows)
cc = pd.read_csv(filenames[4], nrows=num_rows)

In [117]:
#list(test_df.columns)

In [118]:
test_df = test_df.loc[:,[
    'SK_ID_CURR',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
]]

In [119]:
dfs = [
    df,
    test_df,
    bureau,
    bb,
    prev,
    pos,
    ins,
    cc,
]

In [120]:
columns_in_common = []
for i, d in enumerate(dfs):
    for j, dd in enumerate(dfs[i+1:]):
        for k in d.columns:
            if k in dd.columns:
                columns_in_common.append(k)
                print(i, j+1, "--", end="")
columns_in_common = list(set(columns_in_common))
len(columns_in_common)

0 1 --0 1 --0 1 --0 1 --0 1 --0 2 --0 2 --0 4 --0 4 --0 4 --0 4 --0 4 --0 4 --0 4 --0 4 --0 5 --0 6 --0 7 --1 1 --1 3 --1 4 --1 5 --1 6 --2 1 --2 2 --2 2 --2 3 --2 4 --2 5 --3 2 --3 4 --4 1 --4 1 --4 1 --4 2 --4 2 --4 3 --4 3 --4 3 --5 1 --5 1 --5 2 --5 2 --5 2 --5 2 --5 2 --5 2 --6 1 --6 1 --

18

In [121]:
columns_in_common

['AMT_ANNUITY',
 'CODE_GENDER',
 'NAME_CONTRACT_STATUS',
 'SK_DPD_DEF',
 'MONTHS_BALANCE',
 'SK_ID_CURR',
 'FLAG_OWN_CAR',
 'NAME_TYPE_SUITE',
 'NAME_CONTRACT_TYPE',
 'AMT_CREDIT',
 'WEEKDAY_APPR_PROCESS_START',
 'HOUR_APPR_PROCESS_START',
 'SK_ID_BUREAU',
 'DAYS_BIRTH',
 'SK_ID_PREV',
 'SK_DPD',
 'AMT_GOODS_PRICE',
 'DAYS_EMPLOYED']

In [127]:
list(set(df.columns) & set(columns_in_common))

['AMT_ANNUITY',
 'CODE_GENDER',
 'SK_ID_CURR',
 'FLAG_OWN_CAR',
 'NAME_TYPE_SUITE',
 'NAME_CONTRACT_TYPE',
 'AMT_CREDIT',
 'WEEKDAY_APPR_PROCESS_START',
 'HOUR_APPR_PROCESS_START',
 'DAYS_BIRTH',
 'AMT_GOODS_PRICE',
 'DAYS_EMPLOYED']

In [132]:
df = df.loc[:,list(set(df.columns) & set(columns_in_common))]
df.columns

Index(['AMT_ANNUITY', 'CODE_GENDER', 'SK_ID_CURR', 'FLAG_OWN_CAR',
       'NAME_TYPE_SUITE', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'DAYS_BIRTH',
       'AMT_GOODS_PRICE', 'DAYS_EMPLOYED'],
      dtype='object')

In [128]:
def shrink_df(d):
    cols = list(set(d.columns) & set(columns_in_common))
    if len(cols) >= 2:
        d = d.loc[:,cols]
        return d
    else:
        print("Une seule colonne en commun, voire aucune.")

In [136]:
dfs_2 = []
for d in dfs:
    dfs_2.append(shrink_df(d))

In [163]:
def renames(d):
    for co in d.columns:
        d = d.rename(columns={co: co[:7]})
    return d

In [164]:
dfs_3 = []
for d in dfs_2:
    dfs_3.append(renames(d))

In [165]:
for i, d in enumerate(dfs_3):
    print(i, d.columns)

0 Index(['AMT_ANN', 'CODE_GE', 'SK_ID_C', 'FLAG_OW', 'NAME_TY', 'NAME_CO',
       'AMT_CRE', 'WEEKDAY', 'HOUR_AP', 'DAYS_BI', 'AMT_GOO', 'DAYS_EM'],
      dtype='object')
1 Index(['CODE_GE', 'SK_ID_C', 'FLAG_OW', 'DAYS_BI', 'DAYS_EM'], dtype='object')
2 Index(['AMT_ANN', 'SK_ID_C', 'SK_ID_B'], dtype='object')
3 Index(['MONTHS_', 'SK_ID_B'], dtype='object')
4 Index(['AMT_ANN', 'NAME_CO', 'SK_ID_C', 'NAME_TY', 'NAME_CO', 'AMT_CRE',
       'WEEKDAY', 'HOUR_AP', 'SK_ID_P', 'AMT_GOO'],
      dtype='object')
5 Index(['SK_DPD_', 'MONTHS_', 'SK_ID_C', 'SK_ID_P', 'SK_DPD', 'NAME_CO'], dtype='object')
6 Index(['SK_ID_P', 'SK_ID_C'], dtype='object')
7 Index(['SK_DPD_', 'MONTHS_', 'SK_ID_C', 'SK_ID_P', 'SK_DPD', 'NAME_CO'], dtype='object')


In [167]:
#dfs_3[4]

In [7]:
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = df.append(test_df).reset_index()

    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df["CODE_GENDER"] != "XNA"]

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY"]:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)

    # Aberrant values
    df["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)

    # Some simple new features (percentages)
    df["DAYS_EMPLOYED_PERC"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
    df["INCOME_CREDIT_PERC"] = df["AMT_INCOME_TOTAL"] / df["AMT_CREDIT"]
    df["INCOME_PER_PERSON"] = df["AMT_INCOME_TOTAL"] / df["CNT_FAM_MEMBERS"]
    df["ANNUITY_INCOME_PERC"] = df["AMT_ANNUITY"] / df["AMT_INCOME_TOTAL"]
    df["PAYMENT_RATE"] = df["AMT_ANNUITY"] / df["AMT_CREDIT"]

## 2.2. bureau and bureau_balance

In [8]:
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {"MONTHS_BALANCE": ["min", "max", "size"]}
    for col in bb_cat:
        bb_aggregations[col] = ["mean"]
    bb_agg = bb.groupby("SK_ID_BUREAU").agg(bb_aggregations)
    bb_agg.columns = pd.Index(
        [e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()]
    )
    bureau = bureau.join(bb_agg, how="left", on="SK_ID_BUREAU")
    bureau.drop(["SK_ID_BUREAU"], axis=1, inplace=True)
    del bb, bb_agg
    gc.collect()

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        "DAYS_CREDIT": ["min", "max", "mean", "var"],
        "DAYS_CREDIT_ENDDATE": ["min", "max", "mean"],
        "DAYS_CREDIT_UPDATE": ["mean"],
        "CREDIT_DAY_OVERDUE": ["max", "mean"],
        "AMT_CREDIT_MAX_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_DEBT": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM_LIMIT": ["mean", "sum"],
        "AMT_ANNUITY": ["max", "mean"],
        "CNT_CREDIT_PROLONG": ["sum"],
        "MONTHS_BALANCE_MIN": ["min"],
        "MONTHS_BALANCE_MAX": ["max"],
        "MONTHS_BALANCE_SIZE": ["mean", "sum"]
    }

    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ["mean"]
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ["mean"]

    bureau_agg = bureau.groupby("SK_ID_CURR").agg(
        {**num_aggregations, **cat_aggregations}
    )
    bureau_agg.columns = pd.Index([
        "BURO_" + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()
    ])

    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau["CREDIT_ACTIVE_Active"] == 1]
    active_agg = active.groupby("SK_ID_CURR").agg(num_aggregations)
    active_agg.columns = pd.Index([
        "ACTIVE_" + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()
    ])
    bureau_agg = bureau_agg.join(active_agg, how="left", on="SK_ID_CURR")
    del active, active_agg
    gc.collect()

    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau["CREDIT_ACTIVE_Closed"] == 1]
    closed_agg = closed.groupby("SK_ID_CURR").agg(num_aggregations)
    closed_agg.columns = pd.Index([
        "CLOSED_" + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()
    ])
    bureau_agg = bureau_agg.join(closed_agg, how="left", on="SK_ID_CURR")

## 2.3. previous_applications

In [9]:
prev, cat_cols = one_hot_encoder(prev, nan_as_category=True)

    # Aberrant values
    prev["DAYS_FIRST_DRAWING"].replace(365243, np.nan, inplace=True)
    prev["DAYS_FIRST_DUE"].replace(365243, np.nan, inplace=True)
    prev["DAYS_LAST_DUE_1ST_VERSION"].replace(365243, np.nan, inplace=True)
    prev["DAYS_LAST_DUE"].replace(365243, np.nan, inplace=True)
    prev["DAYS_TERMINATION"].replace(365243, np.nan, inplace=True)

    # Add feature: value ask / value received percentage
    prev["APP_CREDIT_PERC"] = prev["AMT_APPLICATION"] / prev["AMT_CREDIT"]

    # Previous applications numeric features
    num_aggregations = {
        "AMT_ANNUITY": ["min", "max", "mean"],
        "AMT_APPLICATION": ["min", "max", "mean"],
        "AMT_CREDIT": ["min", "max", "mean"],
        "APP_CREDIT_PERC": ["min", "max", "mean", "var"],
        "AMT_DOWN_PAYMENT": ["min", "max", "mean"],
        "AMT_GOODS_PRICE": ["min", "max", "mean"],
        "HOUR_APPR_PROCESS_START": ["min", "max", "mean"],
        "RATE_DOWN_PAYMENT": ["min", "max", "mean"],
        "DAYS_DECISION": ["min", "max", "mean"],
        "CNT_PAYMENT": ["mean", "sum"],
    }

    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ["mean"]

    prev_agg = prev.groupby("SK_ID_CURR").agg({**num_aggregations,
                                               **cat_aggregations})
    prev_agg.columns = pd.Index([
        "PREV_" + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()
    ])

    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev["NAME_CONTRACT_STATUS_Approved"] == 1]
    approved_agg = approved.groupby("SK_ID_CURR").agg(num_aggregations)
    approved_agg.columns = pd.Index([
        "APPROVED_" + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()
    ])
    prev_agg = prev_agg.join(approved_agg, how="left", on="SK_ID_CURR")

    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev["NAME_CONTRACT_STATUS_Refused"] == 1]
    refused_agg = refused.groupby("SK_ID_CURR").agg(num_aggregations)
    refused_agg.columns = pd.Index([
        "REFUSED_" + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()
    ])
    prev_agg = prev_agg.join(refused_agg, how="left", on="SK_ID_CURR")

## 2.4. pos_cash

In [10]:
pos, cat_cols = one_hot_encoder(pos, nan_as_category=True)

    # Features
    aggregations = {
        "MONTHS_BALANCE": ["max", "mean", "size"],
        "SK_DPD": ["max", "mean"],
        "SK_DPD_DEF": ["max", "mean"]
    }
    for cat in cat_cols:
        aggregations[cat] = ["mean"]

    pos_agg = pos.groupby("SK_ID_CURR").agg(aggregations)
    pos_agg.columns = pd.Index([
        "POS_" + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()
    ])

    # Count pos cash accounts
    pos_agg["POS_COUNT"] = pos.groupby("SK_ID_CURR").size()

## 2.5. installment_payments

In [11]:
ins, cat_cols = one_hot_encoder(ins, nan_as_category=True)

    # Features
    # Percentage and difference paid in each installment (amount paid and
    # installment value)
    ins["PAYMENT_PERC"] = ins["AMT_PAYMENT"] / ins["AMT_INSTALMENT"]
    ins["PAYMENT_DIFF"] = ins["AMT_INSTALMENT"] - ins["AMT_PAYMENT"]
    # Days past due and days before due (no negative values)
    ins["DPD"] = ins["DAYS_ENTRY_PAYMENT"] - ins["DAYS_INSTALMENT"]
    ins["DBD"] = ins["DAYS_INSTALMENT"] - ins["DAYS_ENTRY_PAYMENT"]
    ins["DPD"] = ins["DPD"].apply(lambda x: x if x > 0 else 0)
    ins["DBD"] = ins["DBD"].apply(lambda x: x if x > 0 else 0)

    # Features: Perform aggregations
    aggregations = {
        "NUM_INSTALMENT_VERSION": ["nunique"],
        "DPD": ["max", "mean", "sum"],
        "DBD": ["max", "mean", "sum"],
        "PAYMENT_PERC": ["max", "mean", "sum", "var"],
        "PAYMENT_DIFF": ["max", "mean", "sum", "var"],
        "AMT_INSTALMENT": ["max", "mean", "sum"],
        "AMT_PAYMENT": ["min", "max", "mean", "sum"],
        "DAYS_ENTRY_PAYMENT": ["max", "mean", "sum"]
    }
    for cat in cat_cols:
        aggregations[cat] = ["mean"]

    ins_agg = ins.groupby("SK_ID_CURR").agg(aggregations)
    ins_agg.columns = pd.Index([
        "INSTAL_" + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()
    ])

    # Count installments accounts
    ins_agg["INSTAL_COUNT"] = ins.groupby("SK_ID_CURR").size()

## 2.6. credit_card_balance

In [12]:
cc, cat_cols = one_hot_encoder(cc, nan_as_category=True)

    # General aggregations
    cc.drop(["SK_ID_PREV"], axis=1, inplace =True)

    cc_agg = cc.groupby("SK_ID_CURR").agg(["min", "max", "mean", "sum", "var"])
    cc_agg.columns = pd.Index([
        "CC_" + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()
    ])

    # Count credit card lines
    cc_agg["CC_COUNT"] = cc.groupby("SK_ID_CURR").size()

# 3. functions from the original notebook

In [13]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df["TARGET"].notnull()]
    test_df = df[df["TARGET"].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape
    ))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds,
                      shuffle=True,
                      random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ["TARGET",
                                                      "SK_ID_CURR",
                                                      "SK_ID_BUREAU",
                                                      "SK_ID_PREV",
                                                      "index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df["TARGET"])
    ):
        train_x, train_y = train_df[feats].iloc[train_idx],\
                           train_df["TARGET"].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx],\
                           train_df["TARGET"].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric="auc",
                verbose=200,
                early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(
            valid_x,
            num_iteration=clf.best_iteration_
        )[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_
        )[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df],
            axis=0
        )
        print("Fold %2d AUC : %.6f" % (
            n_fold + 1,
            roc_auc_score(valid_y, oof_preds[valid_idx])
        ))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Write submission file and plot feature importance
    print("Full AUC score %.6f" % roc_auc_score(train_df["TARGET"], oof_preds))
    if not debug:
        test_df["TARGET"] = sub_preds
        test_df[["SK_ID_CURR", "TARGET"]].to_csv(submission_file_name,
                                                 index=False)
    display_importances(feature_importance_df)
    return feature_importance_df

In [14]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby(
        "feature"
    ).mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)
    ]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title("LightGBM Features (avg over folds)")
    plt.tight_layout()

    plt.savefig("lgbm_importances01.png")

# 4. Run the preprocessing

In [15]:
df = df.join(bureau, how="left", on="SK_ID_CURR")
df = df.join(prev, how="left", on="SK_ID_CURR")
df = df.join(pos, how="left", on="SK_ID_CURR")
df = df.join(ins, how="left", on="SK_ID_CURR")
df = df.join(cc, how="left", on="SK_ID_CURR")

In [19]:
if __name__ == "__main__":
    with timer("preproc_subsampled"):
        zeros, ones, nans, df = preproc(debug=30)

Train samples: 10000, test samples: 10000
Application train test df shape: (20000, 246)
Process application train test - done in 1s
Bureau df shape: (2011, 108)
Process bureau and bureau_balance - done in 0s
Previous applications df shape: (9734, 242)
Process previous_applications - done in 0s
Pos-cash balance df shape: (9494, 15)
Process POS-CASH balance - done in 0s
Installments payments df shape: (8893, 26)
Process installments payments - done in 0s
Credit card balance df shape: (9520, 131)
Process credit card balance - done in 0s
-----------------------------------------------------------------------
subsampled df's TARGET has       9225 zeros,        775 ones and      10000 NaNs
preproc_subsampled - done in 2s


# 6. Suppression du caractère illisible

In [22]:
cols_of_df = list(df.columns)

In [24]:
df = df.rename(columns=lambda x: x.replace(" ", "_a_")\
                                  .replace("-", "_b_")\
                                  .replace(":", "_c_")\
                                  .replace("/", "_d_")\
                                  .replace(",", "_e_")\
                                  .replace(":", "_f_"))

Couldn't find program: 'echo'


In [25]:
#df = df.rename(columns=lambda x: x.replace(":", "deuxpoints"))