In [1]:
!pip install autogluon
!pip install --upgrade ipykernel

Collecting autogluon
  Downloading autogluon-0.3.1-py3-none-any.whl (9.9 kB)
Collecting autogluon.features==0.3.1
  Downloading autogluon.features-0.3.1-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 684 kB/s 
[?25hCollecting autogluon.vision==0.3.1
  Downloading autogluon.vision-0.3.1-py3-none-any.whl (38 kB)
Collecting autogluon.tabular[all]==0.3.1
  Downloading autogluon.tabular-0.3.1-py3-none-any.whl (273 kB)
[K     |████████████████████████████████| 273 kB 1.6 MB/s 
[?25hCollecting autogluon.core==0.3.1
  Downloading autogluon.core-0.3.1-py3-none-any.whl (352 kB)
[K     |████████████████████████████████| 352 kB 11.7 MB/s 
[?25hCollecting autogluon.text==0.3.1
  Downloading autogluon.text-0.3.1-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.1 MB/s 
[?25hCollecting autogluon.mxnet==0.3.1
  Downloading autogluon.mxnet-0.3.1-py3-none-any.whl (33 kB)
Collecting autogluon.extra==0.3.1
  Downloading autogluo

# Load dependencies
---

In [2]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold
from sklearn.calibration import CalibratedClassifierCV

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import shap

import torch
from sklearn.metrics import f1_score, roc_auc_score

# Imports from our package
from autogluon.tabular import TabularPredictor

# Switch off pandas' chained-assignment check for the rest of the session.
pd.set_option('mode.chained_assignment', None)

In [3]:
# Competition inputs. The 'id' column is read as the index and then discarded, so
# train and test both end up with a default 0..n-1 index.
train = (
    pd.read_csv('../input/porto-seguro-data-challenge/train.csv', index_col='id')
    .reset_index(drop=True)
)
test = (
    pd.read_csv('../input/porto-seguro-data-challenge/test.csv', index_col='id')
    .reset_index(drop=True)
)
sample_submission = pd.read_csv('../input/porto-seguro-data-challenge/submission_sample.csv')
meta = pd.read_csv('../input/porto-seguro-data-challenge/metadata.csv')

# Metadata column 0 holds the variable name and column 1 its type label. The first
# and last metadata rows are excluded from every list (presumably the id and target
# rows -- confirm against metadata.csv).
meta_inner = meta.iloc[1:-1, :]
meta_kind = meta.iloc[:, 1]


def _variables_of_kind(kind_label):
    """Return the variable names of the inner metadata rows whose type equals `kind_label`."""
    return list(meta_inner.loc[meta_kind == kind_label].iloc[:, 0])


cat_nom = _variables_of_kind("Qualitativo nominal")
cat_ord = _variables_of_kind("Qualitativo ordinal")
num_dis = _variables_of_kind("Quantitativo discreto")
num_con = _variables_of_kind("Quantitativo continua")

In [4]:
# Sturges' rule: number of bins = 1 + log2(N), where N is the number of training rows.
np.round(np.log2(len(train)) + 1)

15.0

In [5]:
# Model inputs: every variable listed in the metadata, grouped by type.
feature_cols = cat_nom + cat_ord + num_dis + num_con
X = train[feature_cols]
X_test = test[feature_cols]
y = train['y']

# Cross-validation splitter shared by the SHAP stages: K stratified, shuffled folds
# with a fixed seed. K equals the value printed by the Sturges' rule cell.
K = 15
SEED = 123
kf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)

In [6]:
# KNN feature tables for the train and test rows, read from the feature-extraction dataset.
knn_feat_train, knn_feat_test = map(
    pd.read_csv,
    (
        '../input/porto-seguro-knn-feature-extraction-k-1/knn_feat_train.csv',
        '../input/porto-seguro-knn-feature-extraction-k-1/knn_feat_test.csv',
    ),
)

In [7]:
#def feature_engineering(train, test):
#
#    train.loc[:, 'n_missing'] = train.copy().replace(-999, np.nan).isna().sum(axis=1).astype('int').values
#    train.loc[:, 'std'] = train.copy().replace(-999, np.nan).std(axis=1, ddof=0).astype('float').values
#    train.loc[:, 'mean_orig'] = train.copy().replace(-999, np.nan).mean(axis=1, skipna=True).astype('float').values
#    
#    test.loc[:, 'n_missing'] = test.copy().replace(-999, np.nan).isna().sum(axis=1).astype('int').values
#    test.loc[:, 'std'] = test.copy().replace(-999, np.nan).std(axis=1, ddof=0).astype('float').values
#    test.loc[:, 'mean_orig'] = test.copy().replace(-999, np.nan).mean(axis=1, skipna=True).astype('float').values
#
#    return train, test
#
#X, X_test = feature_engineering(X, X_test)

In [8]:
def get_threshold(y_true, y_pred, thresholds=None):
    """Return the decision threshold that maximises F1 for the given scores.

    Every candidate threshold turns the scores into hard labels (1 where the score
    is strictly greater than the threshold, 0 otherwise), which are then scored
    against `y_true` with `f1_score`.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_pred : array-like
        Predicted scores, e.g. positive-class probabilities.
    thresholds : array-like, optional
        Candidate thresholds to try. Defaults to 0.00, 0.01, ..., 0.99.

    Returns
    -------
    The candidate with the highest F1; on ties the earliest candidate wins.
    """
    if thresholds is None:
        thresholds = np.arange(0.0, 1.0, 0.01)
    else:
        thresholds = np.asarray(thresholds, dtype=float)

    # Convert once so every candidate is applied with a vectorised comparison
    # instead of a Python-level loop over the individual predictions.
    scores = np.asarray(y_pred)
    f1s = np.array([
        f1_score(y_true, np.where(scores > thresh, 1, 0))
        for thresh in thresholds
    ])
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred):
    """Return the F1 score obtained at the F1-maximising threshold.

    The threshold is chosen by `get_threshold` on the same `y_true` / `y_pred`
    that are then scored, so the result is the best F1 reachable on this data.
    """
    cutoff = get_threshold(y_true, y_pred)
    hard_labels = np.where(y_pred > cutoff, 1, 0)
    return f1_score(y_true, hard_labels)

# Stage 1: Compute SHAP values
---


## XGBoost

https://www.kaggle.com/gomes555/porto-seguro-fork-of-xgboost

In [9]:
# Settings held fixed for every XGBoost fit.
fixed_params = dict(
    random_state=9,
    objective="binary:logistic",
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=10000,
)

# Booster hyper-parameters (presumably the result of the tuning study in the
# notebook linked above -- confirm).
study_xgb = {
    'booster': 'gbtree',
    'lambda': 9.012384508756378e-07,
    'alpha': 0.7472040331088792,
    'max_depth': 5,
    'eta': 0.01507605562231303,
    'gamma': 1.0214961302342215e-08,
    'grow_policy': 'lossguide',
    'min_child_weight': 5,
    'subsample': 0.9331005225916879,
    'colsample_bytree': 0.25392142363325004,
    'max_delta_step': 5.685109389498008,
}

# Final classifier configuration, stored under the 'clf' key. The two dicts share
# no keys, so the merge simply places the fixed settings before the tuned ones.
final_params_xgb = {'clf': {**fixed_params, **study_xgb}}

In [10]:
# Containers filled fold by fold: one SHAP value per (row, feature) for train and
# test, plus the out-of-fold (OOF) positive-class probability of every training row.
shap1_oof = np.zeros(X.shape)
shap1_test = np.zeros(X_test.shape)
model_shap1_oof = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    start = time.time()

    # The validation fold is also passed as the early-stopping evaluation set.
    model = XGBClassifier(**final_params_xgb['clf'])
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=150,
        verbose=False,
    )

    model_shap1_oof[val_idx] += model.predict_proba(X_val)[:, 1]

    # Per-fold metrics, computed on this fold's slice of the OOF vector.
    fold_pred = model_shap1_oof[val_idx]
    for label, metric in (
        ("Final F1     :", custom_f1),
        ("Final AUC    :", roc_auc_score),
        ("Final LogLoss:", log_loss),
    ):
        print(label, metric(y_val, fold_pred))

    explainer = shap.TreeExplainer(model)
    # Validation rows receive SHAP values from the model that did not train on them.
    shap1_oof[val_idx] = explainer.shap_values(X_val)
    # Test-set SHAP values are averaged over the K fold models.
    shap1_test += explainer.shap_values(X_test) / K

    print(f"elapsed: {time.time()-start:.2f} sec\n")

# Wrap the arrays as frames whose columns are the feature names plus a "_shap1" suffix.
shap1_oof = pd.DataFrame(shap1_oof, columns=[col + "_shap1" for col in X.columns])
shap1_test = pd.DataFrame(shap1_test, columns=[col + "_shap1" for col in X_test.columns])

# Metrics over the complete OOF vector.
for label, metric in (
    ("Final F1     :", custom_f1),
    ("Final AUC    :", roc_auc_score),
    ("Final LogLoss:", log_loss),
):
    print(label, metric(y, model_shap1_oof))

➜ FOLD :0
Final F1     : 0.7032967032967034
Final AUC    : 0.902330627099664
Final LogLoss: 0.2953604946129216


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 62.58 sec

➜ FOLD :1
Final F1     : 0.6193853427895981
Final AUC    : 0.8613101903695408
Final LogLoss: 0.34227429854659686


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 45.96 sec

➜ FOLD :2
Final F1     : 0.6793478260869567
Final AUC    : 0.8945898656215007
Final LogLoss: 0.3085819148842589


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 58.84 sec

➜ FOLD :3
Final F1     : 0.7073791348600509
Final AUC    : 0.9058020716685331
Final LogLoss: 0.2881665477053405


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 62.24 sec

➜ FOLD :4
Final F1     : 0.7239583333333334
Final AUC    : 0.9053121500559911
Final LogLoss: 0.29320601468396107


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 93.74 sec

➜ FOLD :5
Final F1     : 0.7009803921568627
Final AUC    : 0.9076567749160134
Final LogLoss: 0.2872539995859452


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 73.34 sec

➜ FOLD :6
Final F1     : 0.6736292428198434
Final AUC    : 0.8822788353863381
Final LogLoss: 0.320014158050091


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 55.16 sec

➜ FOLD :7
Final F1     : 0.7135416666666666
Final AUC    : 0.9016657334826428
Final LogLoss: 0.29617989833438774


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 74.49 sec

➜ FOLD :8
Final F1     : 0.7135135135135134
Final AUC    : 0.8893825776158104
Final LogLoss: 0.29351621553572266


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 93.71 sec

➜ FOLD :9
Final F1     : 0.7391304347826086
Final AUC    : 0.9064054944284814
Final LogLoss: 0.28033187155768635


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 95.65 sec

➜ FOLD :10
Final F1     : 0.684863523573201
Final AUC    : 0.9031046324199313
Final LogLoss: 0.29823173886367804


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 64.70 sec

➜ FOLD :11
Final F1     : 0.704225352112676
Final AUC    : 0.8882052000840984
Final LogLoss: 0.30525241732057884


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 50.06 sec

➜ FOLD :12
Final F1     : 0.6666666666666666
Final AUC    : 0.8905529469479291
Final LogLoss: 0.313654842143217


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 78.45 sec

➜ FOLD :13
Final F1     : 0.6500000000000001
Final AUC    : 0.8745111780783517
Final LogLoss: 0.3300786509821235


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 59.54 sec

➜ FOLD :14
Final F1     : 0.7135416666666666
Final AUC    : 0.9063284042329526
Final LogLoss: 0.29314716930177404


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 70.28 sec

Final F1     : 0.6822461331540014
Final AUC    : 0.8945288307257988
Final LogLoss: 0.30301717097927483


In [11]:
#train = pd.concat([train, shap1_oof], axis=1)
#test = pd.concat([test, shap1_test], axis=1)

In [12]:
# Build the feature matrices from the base metadata columns rather than from the
# current X / X_test, so this cell is idempotent: re-running it yields the same
# columns instead of appending another copy of the KNN features.
base_features = cat_nom + cat_ord + num_dis + num_con
X = pd.concat([train[base_features], knn_feat_train], axis=1)
X_test = pd.concat([test[base_features], knn_feat_test], axis=1)

## CatBoost

In [13]:
# Containers filled fold by fold: one SHAP value per (row, feature) for train and
# test, plus the out-of-fold (OOF) positive-class probability of every training row.
shap2_oof = np.zeros(X.shape)
shap2_test = np.zeros(X_test.shape)
model_shap2_oof = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    start = time.time()

    model = CatBoostClassifier(
        random_seed=SEED,
        verbose=0,
        n_estimators=10000,
        loss_function='Logloss',
        use_best_model=True,
        eval_metric='Logloss',
    )
    # The validation fold is also passed as the early-stopping evaluation set.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        verbose=False,
    )

    model_shap2_oof[val_idx] += model.predict_proba(X_val)[:, 1]

    # Per-fold metrics, computed on this fold's slice of the OOF vector.
    fold_pred = model_shap2_oof[val_idx]
    for label, metric in (
        ("Final F1     :", custom_f1),
        ("Final AUC    :", roc_auc_score),
        ("Final LogLoss:", log_loss),
    ):
        print(label, metric(y_val, fold_pred))

    explainer = shap.TreeExplainer(model)
    # Validation rows receive SHAP values from the model that did not train on them.
    shap2_oof[val_idx] = explainer.shap_values(X_val)
    # Test-set SHAP values are averaged over the K fold models.
    shap2_test += explainer.shap_values(X_test) / K

    print(f"elapsed: {time.time()-start:.2f} sec\n")

# Wrap the arrays as frames whose columns are the feature names plus a "_shap" suffix.
shap2_oof = pd.DataFrame(shap2_oof, columns=[col + "_shap" for col in X.columns])
shap2_test = pd.DataFrame(shap2_test, columns=[col + "_shap" for col in X_test.columns])

# Metrics over the complete OOF vector.
for label, metric in (
    ("Final F1     :", custom_f1),
    ("Final AUC    :", roc_auc_score),
    ("Final LogLoss:", log_loss),
):
    print(label, metric(y, model_shap2_oof))

➜ FOLD :0
Final F1     : 0.6972010178117048
Final AUC    : 0.8954157334826428
Final LogLoss: 0.29952314366911725
elapsed: 22.84 sec

➜ FOLD :1
Final F1     : 0.6348448687350835
Final AUC    : 0.8628429451287795
Final LogLoss: 0.3407490151943705
elapsed: 12.59 sec

➜ FOLD :2
Final F1     : 0.6809651474530831
Final AUC    : 0.8949538073908175
Final LogLoss: 0.3066089330852162
elapsed: 18.03 sec

➜ FOLD :3
Final F1     : 0.702247191011236
Final AUC    : 0.9107992721164613
Final LogLoss: 0.2877216893570601
elapsed: 15.66 sec

➜ FOLD :4
Final F1     : 0.7131367292225201
Final AUC    : 0.9018687010078387
Final LogLoss: 0.2976481761596595
elapsed: 29.35 sec

➜ FOLD :5
Final F1     : 0.7055837563451777
Final AUC    : 0.909231522956327
Final LogLoss: 0.28834373773423566
elapsed: 15.35 sec

➜ FOLD :6
Final F1     : 0.6631578947368421
Final AUC    : 0.8796402575587906
Final LogLoss: 0.32303153676573987
elapsed: 19.13 sec

➜ FOLD :7
Final F1     : 0.6997389033942559
Final AUC    : 0.90163773796192

In [14]:
# Append the XGBoost and CatBoost SHAP features to train / test in one pass.
# SHAP columns left over from an earlier execution of this cell are dropped first,
# so re-running it does not produce duplicated column names. On a fresh run nothing
# is dropped and the column order is: existing columns, shap1 columns, shap2 columns.
shap_train_cols = [*shap1_oof.columns, *shap2_oof.columns]
shap_test_cols = [*shap1_test.columns, *shap2_test.columns]

train = pd.concat(
    [train.drop(columns=shap_train_cols, errors='ignore'), shap1_oof, shap2_oof],
    axis=1,
)
test = pd.concat(
    [test.drop(columns=shap_test_cols, errors='ignore'), shap1_test, shap2_test],
    axis=1,
)

# Stage 2: AutoGluon

In [15]:
# Stage-2 learner: AutoGluon fitted on `train` with "y" as the binary target and
# log loss as the evaluation metric. Model artefacts are written under ./AutoGlon/.
predictor_config = dict(
    label="y",
    problem_type='binary',
    eval_metric="log_loss",
    path='./AutoGlon/',
    verbosity=1,
)
predictor = TabularPredictor(**predictor_config)

# Training budget of 7.5 hours, expressed in seconds.
predictor.fit(train, presets='best_quality', time_limit=7.5 * 60 * 60)

results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2  -0.299310      30.410467   8888.826963                0.001654           2.456810            2       True         14
1           CatBoost_BAG_L1  -0.301038       3.051793   2376.887100                3.051793        2376.887100            1       True          7
2       WeightedEnsemble_L3  -0.301722     194.034947  22907.669139                0.001541           2.008858            3       True         26
3         LightGBMXT_BAG_L2  -0.302135     131.534432  17201.299530                1.378576         389.400378            2       True         15
4         LightGBMXT_BAG_L1  -0.302562       3.570399    969.385833                3.570399         969.385833            1       True          3
5           CatBoost_BAG_L2  -0.302646     131.912474  17619.9

In [16]:
# Out-of-fold probabilities for the training rows and predicted probabilities for
# the test rows. Column 1 of each table is kept (presumably the positive class -- confirm).
oof_proba = predictor.get_oof_pred_proba()
test_proba = predictor.predict_proba(test)
y_oof = oof_proba.iloc[:, 1]
y_pred = test_proba.iloc[:, 1]

In [17]:
# F1-maximising decision threshold, found on the out-of-fold predictions.
final_threshold = get_threshold(train['y'], y_oof)
final_threshold

0.31

In [18]:
# Metrics of the AutoGluon out-of-fold predictions against the training target.
for label, metric in (
    ("Final F1     :", custom_f1),
    ("Final AUC    :", roc_auc_score),
    ("Final LogLoss:", log_loss),
):
    print(label, metric(y, y_oof))

Final F1     : 0.6846193682030037
Final AUC    : 0.8961328807692966
Final LogLoss: 0.2993098559321765


# Submission
---

In [19]:
# Write predictions to sub
sample_submission['predicted'] = np.where(y_pred>final_threshold, 1, 0).astype('int64')
sample_submission.to_csv('autogluon_shap_sub.csv',index=False)

In [20]:
# Write predictions to stack
sample_submission['predicted'] = y_pred

sample_submission.to_csv('autogluon_shap_sub_probs.csv',index=False)
pd.DataFrame({'id':train.index, 'autogluon_shap_oof':y_oof}).to_csv('autogluon_oof.csv',index=False)

In [21]:
import shutil
from pathlib import Path

# Delete the AutoGluon model directory. The existence check lets this cell be
# re-run after the directory has already been removed, instead of raising
# FileNotFoundError.
autogluon_dir = Path('./AutoGlon/')
if autogluon_dir.is_dir():
    shutil.rmtree(autogluon_dir)