In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score, log_loss
from sklearn.model_selection import KFold
from sklearn.calibration import CalibratedClassifierCV
from scipy.optimize import minimize

import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.express as px

%matplotlib inline

In [2]:
def get_threshold(y_true, y_pred):
    """Return the decision threshold that maximises F1 for the given scores.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. For each candidate a
    score is labelled 1 when it is strictly greater than the threshold and
    0 otherwise, and the resulting labels are scored with ``f1_score``.

    Parameters
    ----------
    y_true : array-like
        Reference labels, passed unchanged to ``f1_score``.
    y_pred : array-like
        Predicted scores, one per entry of ``y_true``.

    Returns
    -------
    numpy.float64
        The candidate with the highest F1. When several candidates tie,
        the smallest one is returned (``argmax`` keeps the first maximum).
    """
    thresholds = np.arange(0.0, 1.0, 0.01)
    # Convert once so that every candidate costs a single vectorised
    # comparison instead of a Python-level loop over all scores.
    scores = np.asarray(y_pred)
    f1s = np.array([
        f1_score(y_true, (scores > thresh).astype(int))
        for thresh in thresholds
    ])
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred):
    """Return the F1 score obtained at the F1-optimal threshold.

    The threshold is chosen by ``get_threshold`` on the very same
    ``y_true``/``y_pred`` pair, the scores are binarised with a strict
    ``>`` comparison against it, and the resulting labels are scored with
    ``f1_score``.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted scores, one per entry of ``y_true``.

    Returns
    -------
    float
        F1 of the thresholded predictions.
    """
    # Coerce up front: a plain Python list does not support ``>`` against a
    # scalar, while ``get_threshold`` happily accepts one.
    scores = np.asarray(y_pred)

    max_f1_threshold = get_threshold(y_true, scores)
    labels = np.where(scores > max_f1_threshold, 1, 0)

    return f1_score(y_true, labels)

In [3]:
def ensembling(main, support, coeff):
    """Blend the predictions of two frames and plot the outcome.

    The second column (position 1) of ``main`` and of ``support`` is
    combined row by row, by position, as
    ``coeff * main + (1 - coeff) * support``.

    Parameters
    ----------
    main : pandas.DataFrame
        Frame whose second column holds the primary predictions. All of its
        other columns are carried over unchanged into the result.
    support : pandas.DataFrame
        Frame whose second column holds the secondary predictions. Must
        have as many rows as ``main``.
    coeff : float
        Weight given to ``main``; ``support`` receives ``1 - coeff``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``main`` whose second column is replaced by the blended
        values (float). ``main`` and ``support`` are not modified.

    Raises
    ------
    ValueError
        If ``main`` and ``support`` differ in length.

    Notes
    -----
    As a side effect the function shows a matplotlib scatter plot (support
    and blend against main) and a plotly distribution plot of the three
    prediction vectors.
    """
    if len(main) != len(support):
        raise ValueError(
            f"main and support must have the same number of rows "
            f"(got {len(main)} and {len(support)})"
        )

    # Work on plain float arrays so the blend is positional and vectorised.
    main_pred = main.iloc[:, 1].to_numpy(dtype=float)
    support_pred = support.iloc[:, 1].to_numpy(dtype=float)
    blended = (main_pred * coeff) + (support_pred * (1.0 - coeff))

    ense = main.copy()
    # Replace the whole column (instead of writing through ``iloc``) so the
    # result is float regardless of the dtype of the source column.
    ense[ense.columns[1]] = blended

    ###############################
    # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and later releases dropped the old names; fall back for older versions.
    try:
        plt.style.use('seaborn-v0_8-whitegrid')
    except OSError:
        plt.style.use('seaborn-whitegrid')
    plt.figure(figsize=(9, 9), facecolor='lightgray')
    plt.title('\nE N S E M B L I N G\n')

    plt.scatter(main_pred, support_pred, s=1.5, label='Support')
    plt.scatter(main_pred, blended, s=1.5, label='Generated')
    # Identity line: where a point would sit if it matched ``main`` exactly.
    plt.scatter(main_pred, main_pred, s=0.1, label='Main(X=Y)')

    plt.legend(fontsize=12, loc=2)
    #plt.savefig('Ensembling_1.png')
    plt.show()
    ###############################
    hist_data = [support_pred, blended, main_pred]
    group_labels = ['Support', 'Ensembling', 'Main']

    fig = ff.create_distplot(hist_data, group_labels, bin_size=.2, show_hist=False, show_rug=False)
    fig.show()
    ###############################

    return ense

In [4]:
def vote(r, columns):
    """Majority vote over the selected entries of one row.

    Adapted from https://www.kaggle.com/belov38/catboost-lb/

    An entry equal to 0 is a vote for class 0; any other value is a vote
    for class 1. The result is 1 only when the ones strictly outnumber the
    zeros, so a tie (or an empty ``columns``) yields 0.
    """
    is_zero = [bool(r[col] == 0) for col in columns]
    zeros = is_zero.count(True)
    ones = len(is_zero) - zeros
    return 1 if ones > zeros else 0

In [5]:
# Competition inputs, in order: training set, test set, sample submission
# and the table describing each column.
train, test, sample_submission, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/porto-seguro-data-challenge/train.csv',
        '../input/porto-seguro-data-challenge/test.csv',
        '../input/porto-seguro-data-challenge/submission_sample.csv',
        '../input/porto-seguro-data-challenge/metadata.csv',
    )
)

In [6]:
# Group the feature names listed in `meta` by their declared variable type.
# Column 1 of `meta` is compared against the type label and column 0 supplies
# the feature name; the first and last rows of `meta` are skipped
# (presumably the id and target entries -- TODO confirm).
# NOTE(review): the trailing figures are carried over from the original
# cell; what they measure is not documented here.
cat_nom, cat_ord, num_dis, num_con = (
    list(meta.iloc[1:-1, :].loc[meta.iloc[:, 1] == kind].iloc[:, 0])
    for kind in (
        "Qualitativo nominal",    # 0.66 / 0.56
        "Qualitativo ordinal",    # 0.36 / 0.36
        "Quantitativo discreto",  # 0.40
        "Quantitativo continua",  # 0.38
    )
)

In [7]:
# Cross-validation setup: K shuffled folds with a fixed seed.
K, SEED = 10, 314
kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

# Model inputs keep every feature from the four metadata groups, in the
# order nominal, ordinal, discrete, continuous; the target is train's `y`.
X, X_test = (
    frame[cat_nom + cat_ord + num_dis + num_con] for frame in (train, test)
)
y = train['y']

In [8]:
#to_stack = pd.concat([
#    pd.read_csv('../input/porto-seguro-lightautoml-knn-pseudolabel/lightautoml_pseudo_oof.csv', index_col='id').reset_index(drop=True),
#    pd.read_csv('../input/porto-seguro-lightautoml-pseudolabel/lightautoml_pseudo_oof.csv', index_col='id').reset_index(drop=True),
#    pd.read_csv('../input/fork-of-porto-seguro-knn-autogluon-pseudolabel/autogluon_pseudo_oof.csv', index_col='id').reset_index(drop=True)
#], axis=1)
#
#to_pred = pd.concat([
#    pd.read_csv('../input/porto-seguro-lightautoml-knn-pseudolabel/lightautoml_pseudo_sub_probs.csv', index_col='id').reset_index(drop=True),
#    pd.read_csv('../input/porto-seguro-lightautoml-pseudolabel/lightautoml_pseudo_sub_probs.csv', index_col='id').reset_index(drop=True),
#    pd.read_csv('../input/fork-of-porto-seguro-knn-autogluon-pseudolabel/autogluon_pseudo_sub_probs.csv', index_col='id').reset_index(drop=True)
#], axis=1)

In [9]:
#to_pred = pd.concat([
#    pd.read_csv('../input/porto-seguro-lightgbm-shap-sequencial-tun/lgb_seq_sub.csv', index_col='id').reset_index(drop=True),
#    pd.read_csv('../input/porto-seguro-catboost-knn-shap/cat_shap_sub.csv', index_col='id').reset_index(drop=True),
#    pd.read_csv('../input/porto-seguro-catboost-shap/cat_shap_sub.csv', index_col='id').reset_index(drop=True)
#], axis=1)

In [10]:
#def simplex(w, *args):
#    yp_stacked = np.sum(to_stack * [w[0], w[1], w[2]], 1) / np.sum(w)
#    return -custom_f1(y, yp_stacked)

In [11]:
#w0 =  [1, 1, 1]
#res1 = minimize(simplex, w0, method='nelder-mead')
#res1

In [12]:
#y_stacked = np.sum(to_stack * res1.x, 1) / np.sum(res1.x)
#y_pred = np.sum(to_pred * res1.x, 1) / np.sum(res1.x)

In [13]:
#oof1 = pd.read_csv('../input/porto-seguro-lightautoml-knn-pseudolabel/lightautoml_pseudo_oof.csv')
#oof2 = pd.read_csv('../input/porto-seguro-lightautoml-pseudolabel/lightautoml_pseudo_oof.csv')
#oof3 = pd.read_csv('../input/fork-of-porto-seguro-knn-autogluon-pseudolabel/autogluon_pseudo_oof.csv') 

# sub1 = pd.read_csv('../input/porto-seguro-lightautoml-knn-pseudolabel/lightautoml_pseudo_sub_probs.csv')
# sub2 = pd.read_csv('../input/porto-seguro-lightautoml-pseudolabel/lightautoml_pseudo_sub_probs.csv')
# sub3 = pd.read_csv('../input/fork-of-porto-seguro-knn-autogluon-pseudolabel/autogluon_pseudo_sub_probs.csv') 


# Submissions that take part in the vote; only the `predicted` column of each
# file is kept. These are the `*_sub.csv` files, presumably hard 0/1 labels
# (the `*_sub_probs.csv` variants appear only in commented-out lines) -- confirm.
sub1, sub2, sub3, sub4, sub6 = (
    pd.read_csv(csv_path)['predicted']
    for csv_path in (
        '../input/porto-seguro-lightautoml-knn-pseudolabel/lightautoml_pseudo_sub.csv',
        '../input/porto-seguro-lightautoml-pseudolabel/lightautoml_pseudo_sub.csv',
        '../input/fork-of-porto-seguro-knn-autogluon-pseudolabel/autogluon_pseudo_sub.csv',
        '../input/porto-seguro-autogluon-pseudolabel/autogluon_pseudo_sub.csv',
        '../input/porto-seguro-fork-of-lightautoml-pseudolabel/lightautoml_pseudo_sub.csv',
    )
)
#sub5 = pd.read_csv('../input/porto-seguro-catboost-pseudolabel/cat_pseudo_sub.csv').predicted

In [14]:
#to_ens = pd.concat([sub1.predicted, sub2.predicted, sub3.predicted, sub4.predicted, sub5.predicted], axis=1)
# Lay the five submissions side by side, one column per model.
to_ens = pd.concat((sub1, sub2, sub3, sub4, sub6), axis='columns')

In [15]:
# Label each column with the name of the submission variable it came from;
# every concatenated Series was read from a `predicted` column, so without
# this the five columns would all carry the same name.
to_ens.columns = ['sub1', 'sub2', 'sub3', 'sub4','sub6']

In [16]:
#to_ens.groupby(['sub1', 'sub2', 'sub3', 'sub4', 'sub5']).size()

In [17]:
#pd.crosstab(to_ens.sub1, to_ens.sub2)

In [18]:
#pd.crosstab(to_ens.sub1, to_ens.sub3)

In [19]:
#pd.crosstab(to_ens.sub2, to_ens.sub3)

In [20]:
## number of instances that will need a tie-break
#n = to_ens[((to_ens.sub1==0)&(to_ens.sub2==1))|((to_ens.sub1==1)&(to_ens.sub2==0))].shape[0]
#prop = to_ens[((to_ens.sub1==0)&(to_ens.sub2==1))|((to_ens.sub1==1)&(to_ens.sub2==0))].shape[0] / to_ens.shape[0]
#
#print(f"{n} ({np.round(prop*100, 2)}%) instancias onde os dois melhores scores precisarao de um desempate")

In [21]:
# ----------------------

In [22]:
#ens_oof1 = ensembling(oof1, oof3, 0.80)
#ens1 = ensembling(sub1, sub3, 0.80)

In [23]:
#ens_oof2 = ensembling(ens_oof1, oof3, 0.80)
#ens2 = ensembling(ens1, sub3, 0.80)

In [24]:
# -------------------

In [25]:
#w1=0.8
#w2=0.2
#y_stacked = ((w1*to_stack.iloc[:, 0]) + (w2*to_stack.iloc[:, 1])) / (w1+w2)
#y_pred = ((w1*to_pred.iloc[:, 0]) + (w2*to_pred.iloc[:, 1])) / (w1+w2)

In [26]:
#y_stacked = np.mean(to_stack, axis=1)
#y_pred = np.mean(to_pred, axis=1)

In [27]:
#y_stacked = ens_oof1.lightautoml_pseudo_oof
#y_pred = ens1.predicted

In [28]:
#final_threshold = get_threshold(y, y_stacked)
#final_threshold

In [29]:
#print("Final F1     :", custom_f1(y, y_stacked))
#print("Final AUC    :", roc_auc_score(y, y_stacked))
#print("Final LogLoss:", log_loss(y, y_stacked))

In [30]:
## Get predictions
#sample_submission['predicted'] = np.where(y_pred>final_threshold, 1, 0).astype('int64')
#sample_submission.to_csv('stck2_sub.csv',index=False)
#
#sample_submission['predicted'] = y_pred
#sample_submission.to_csv('stck2_sub_probs.csv',index=False)
#
#pd.DataFrame({'id':train.id, 'stck2_oof':y_stacked}).to_csv('stck2_oof.csv',index=False)

In [31]:
# Majority vote per row across all submission columns. The column list is
# built once and handed to `vote` through `args`, rather than being
# recomputed inside a lambda for every row.
ens = to_ens.apply(vote, axis=1, args=(to_ens.columns.tolist(),))

In [32]:
#sample_submission['predicted'] = np.where(y_pred>final_threshold, 1, 0).astype('int64')
# Column assignment aligns on the index: if the vote result and the sample
# submission did not line up, rows would silently become NaN or be dropped.
# Check the alignment before writing the file.
assert ens.index.equals(sample_submission.index), (
    "vote result and sample submission are not aligned row for row"
)
sample_submission['predicted'] = ens.astype('int64')
sample_submission.to_csv('stck2_sub.csv',index=False)