In [1]:
!pip install autogluon
!pip install --upgrade ipykernel

Collecting autogluon
  Downloading autogluon-0.3.1-py3-none-any.whl (9.9 kB)
Collecting autogluon.vision==0.3.1
  Downloading autogluon.vision-0.3.1-py3-none-any.whl (38 kB)
Collecting autogluon.extra==0.3.1
  Downloading autogluon.extra-0.3.1-py3-none-any.whl (28 kB)
Collecting autogluon.features==0.3.1
  Downloading autogluon.features-0.3.1-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 626 kB/s 
[?25hCollecting autogluon.core==0.3.1
  Downloading autogluon.core-0.3.1-py3-none-any.whl (352 kB)
[K     |████████████████████████████████| 352 kB 602 kB/s 
[?25hCollecting autogluon.text==0.3.1
  Downloading autogluon.text-0.3.1-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 818 kB/s 
[?25hCollecting autogluon.tabular[all]==0.3.1
  Downloading autogluon.tabular-0.3.1-py3-none-any.whl (273 kB)
[K     |████████████████████████████████| 273 kB 2.8 MB/s 
[?25hCollecting autogluon.mxnet==0.3.1
  Downloading autogluon

# Load dependencies and data
---

In [2]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold
from sklearn.calibration import CalibratedClassifierCV

# AutoGluon tabular API
from autogluon.tabular import TabularPredictor

# Everything for graphs
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Competition files: training rows, test rows, the submission template and the
# per-column metadata table. For train/test the `id` column is read as the index
# and then discarded, leaving both frames on a fresh 0..n-1 RangeIndex.
DATA_DIR = '../input/porto-seguro-data-challenge'
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id').reset_index(drop=True)
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id').reset_index(drop=True)
sample_submission = pd.read_csv(f'{DATA_DIR}/submission_sample.csv')
meta = pd.read_csv(f'{DATA_DIR}/metadata.csv')

# Group the entries of metadata column 0 by the type label found in column 1.
# The first and last metadata rows are excluded
# (presumably the id and target rows -- TODO confirm against metadata.csv).
feature_rows = meta.iloc[1:-1]
type_labels = feature_rows.iloc[:, 1]
cat_nom, cat_ord, num_dis, num_con = (
    list(feature_rows.loc[type_labels == label].iloc[:, 0])
    for label in (
        "Qualitativo nominal",
        "Qualitativo ordinal",
        "Quantitativo discreto",
        "Quantitativo continua",
    )
)

In [4]:
# Stacking input: the second column (position 1) of the stck2_oof / stck2_sub
# files produced by another notebook is added to train / test as feature `sub1`.
# Assignment aligns on the row index, so the files must be in the same row order.
# A second blend ('sub2', from porto-seguro-blending-power-averaging/stck3_*)
# was wired in the same way but is currently not used.
BLEND_DIR = '../input/fork-of-porto-seguro-blending-2'
train['sub1'] = pd.read_csv(f'{BLEND_DIR}/stck2_oof.csv').iloc[:, 1]
test['sub1'] = pd.read_csv(f'{BLEND_DIR}/stck2_sub.csv').iloc[:, 1]

In [5]:
# Pre-computed KNN features, one file per split, appended column-wise to the
# matching frame (rows are matched on the index).
KNN_DIR = '../input/porto-seguro-knn-feature-extraction-k-1'
knn_feat_train = pd.read_csv(f'{KNN_DIR}/knn_feat_train.csv')
knn_feat_test = pd.read_csv(f'{KNN_DIR}/knn_feat_test.csv')

train = pd.concat((train, knn_feat_train), axis='columns')
test = pd.concat((test, knn_feat_test), axis='columns')

# Prepare
---

In [8]:
# Cross-validation setup: a shuffled K-fold splitter with a fixed seed so the
# fold assignment is repeatable.
K, SEED = 10, 25
kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

In [9]:
def get_threshold(y_true, y_pred):
    """Return the score cut-off that maximises F1 on the given data.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A sample is labelled
    positive (1) when its score is strictly greater than the threshold,
    otherwise negative (0).

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (1 = positive).
    y_pred : array-like
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        The candidate threshold with the highest F1; ties resolve to the
        smallest threshold.
    """
    thresholds = np.arange(0.0, 1.0, 0.01)
    # Convert once, outside the loop, so each threshold costs a single
    # vectorised comparison instead of a Python-level pass over every sample.
    scores = np.asarray(y_pred)
    f1s = np.array([
        f1_score(y_true, (scores > thresh).astype(int))
        for thresh in thresholds
    ])
    # argmax returns the first maximum, i.e. the lowest of any tied thresholds.
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred):
    """F1 score of ``y_pred`` after binarising it at its own F1-optimal cut-off.

    The cut-off is chosen by ``get_threshold`` on the very same
    ``y_true`` / ``y_pred`` pair, so the result is the best F1 reachable on
    this data over the candidate thresholds, not an out-of-sample estimate.
    """
    best_cut = get_threshold(y_true, y_pred)
    # Strictly-greater comparison, matching the rule used inside get_threshold.
    hard_labels = np.where(y_pred > best_cut, 1, 0)
    return f1_score(y_true, hard_labels)

# AutoGluon
---

In [10]:
# Configure AutoGluon for binary classification of column `y`, evaluated by
# log-loss. Model artefacts are written under ./AutoGlon/ (the spelling must
# match the clean-up cell at the end of the notebook, which deletes this path).
predictor_config = dict(
    label="y",
    problem_type='binary',
    eval_metric="log_loss",
    path='./AutoGlon/',
    verbosity=1,
)
predictor = TabularPredictor(**predictor_config)

# Training budget in seconds (6.5 hours).
TIME_LIMIT_SECONDS = 60*60*6.5
predictor.fit(train, presets='best_quality', time_limit=TIME_LIMIT_SECONDS)

# fit_summary() prints the per-model table shown below; its return value is kept.
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3  -0.309011      99.267147  14099.278236                0.002562           1.977791            3       True         26
1           CatBoost_BAG_L2  -0.309320      95.421898  13845.696623                1.338343         498.184760            2       True         19
2           LightGBM_BAG_L2  -0.309853      95.497803  13588.285393                1.414248         240.773530            2       True         16
3       WeightedEnsemble_L2  -0.310374      26.411617   7841.399439                0.001563           2.439550            2       True         14
4         LightGBMXT_BAG_L2  -0.310379      95.588387  13565.106750                1.504831         217.594887            2       True         15
5            XGBoost_BAG_L2  -0.310656      96.499739  13842.9

In [11]:
# Take the second probability column (position 1; presumably the positive
# class -- TODO confirm) from both prediction tables:
#   y_oof  - AutoGluon's out-of-fold predictions
#   y_pred - predictions of the fitted predictor on the test frame
oof_proba = predictor.get_oof_pred_proba()
test_proba = predictor.predict_proba(test)
y_oof = oof_proba.iloc[:, 1]
y_pred = test_proba.iloc[:, 1]

In [12]:
# F1-maximising cut-off measured on the out-of-fold predictions; the bare name
# on the last line displays the chosen value as the cell output.
final_threshold = get_threshold(train['y'], y_oof)
final_threshold

0.37

In [13]:
# Out-of-fold quality report: each metric is evaluated on (train.y, y_oof) and
# printed next to its label, in this order.
oof_report = (
    ("Final F1     :", custom_f1),
    ("Final AUC    :", roc_auc_score),
    ("Final LogLoss:", log_loss),
)
for label, metric in oof_report:
    print(label, metric(train.y, y_oof))

Final F1     : 0.6855903091665161
Final AUC    : 0.8879617966232001
Final LogLoss: 0.30901108559355006


# Submission
---

In [14]:
# Write predictions to sub
# Hard 0/1 labels: 1 where the test probability is strictly above final_threshold.
hard_labels = np.asarray(y_pred > final_threshold).astype('int64')
sample_submission['predicted'] = hard_labels
sample_submission.to_csv('autogluon_pseudo_sub.csv', index=False)

In [15]:
# Write predictions to stack
# The template's `predicted` column is overwritten with the raw test probabilities.
sample_submission['predicted'] = y_pred
sample_submission.to_csv('autogluon_pseudo_sub_probs.csv', index=False)

# Out-of-fold probabilities for the training rows. NOTE: `id` here is the
# training frame's 0..n-1 row index, not the original `id` column, which was
# dropped when the data was loaded.
oof_frame = pd.DataFrame({'id': train.index, 'autogluon_pseudo_oof': y_oof})
oof_frame.to_csv('autogluon_pseudo_oof.csv', index=False)

In [16]:
import shutil

# Delete the AutoGluon model directory now that the predictions are on disk.
# A missing directory (e.g. when this cell is re-run) is not an error, which
# keeps the clean-up idempotent; any other failure still surfaces.
try:
    shutil.rmtree('./AutoGlon/')
except FileNotFoundError:
    pass