In [1]:
# Importando bibliotecas
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import linear_model
import catboost as cat
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the Santander train and test CSV files
df_train = pd.read_csv("/users/diegobernardo/downloads/base-santander/train-2.csv")
df_test  = pd.read_csv("/users/diegobernardo/downloads/base-santander/test-2.csv")

# The training ID column is not used as a feature, so remove it
del df_train["ID"]

# Move the label out of the training frame so that only features remain in it
train_target = df_train.pop("TARGET")

# Move the test ID out as well so it is not fed to the models
test_id = df_test.pop("ID")

In [3]:
# Building the folds
def train_test_fold(n_folds, n_fold_test):
    """Split the notebook-level training data into contiguous folds, holding one out.

    Reads the module-level ``df_train`` (features) and ``train_target``
    (labels); neither is modified.

    The rows are cut, in their stored order, into ``n_folds`` consecutive
    slices of ``len(df_train) // n_folds`` rows. The last slice additionally
    receives the remainder rows, so every row belongs to exactly one fold even
    when the row count is not a multiple of ``n_folds`` (previously those
    trailing rows were silently left out of every fold).

    Parameters
    ----------
    n_folds : int
        Number of folds to cut the data into.
    n_fold_test : int
        0-based index of the fold to hold out. If it matches no fold, the
        returned ``test`` is an empty DataFrame and all folds go to ``train``.

    Returns
    -------
    train : pandas.DataFrame
        Feature rows of every fold except the held-out one.
    target : numpy.ndarray
        Labels for the rows of ``train``, as a float array.
    test : pandas.DataFrame
        Feature rows of the held-out fold.
    """
    n_rows = df_train.shape[0]
    # With n_folds == 0 the loop below never runs; avoid dividing by zero.
    fold_size = n_rows // n_folds if n_folds > 0 else 0
    labels = np.asarray(train_target)  # positional slicing regardless of index

    train_parts = []
    # The empty float array keeps the label dtype float64, as before.
    target_parts = [np.array([])]
    test = pd.DataFrame()

    for i in range(n_folds):
        inicio = fold_size * i
        # The last fold runs to the end so the remainder rows are not dropped.
        fim = n_rows if i == n_folds - 1 else fold_size * (i + 1)
        if i == n_fold_test:
            test = df_train.iloc[inicio:fim, :]
        else:
            train_parts.append(df_train.iloc[inicio:fim, :])
            target_parts.append(labels[inicio:fim])

    # Concatenate once instead of growing a frame inside the loop.
    train = pd.concat(train_parts) if train_parts else pd.DataFrame()
    target = np.concatenate(target_parts)

    return train, target, test

In [109]:
####################################################################################################################
#######################################          XGBOOST          ##################################################
####################################################################################################################

In [62]:
# Wrap the full train and test sets in the DMatrix structure XGBoost works with.
# Each row is weighted by its own `var15` value.
data_train_xgboost = xgb.DMatrix(data=df_train, label=train_target, weight=df_train['var15'])
# Fix: the test matrix used to take its weights from df_train, whose rows do not
# correspond to the rows of df_test; use the test rows' own `var15` instead.
data_test_xgboost = xgb.DMatrix(data=df_test, weight=df_test['var15'])

# Hyper-parameter grid for XGBoost: one dict per combination of objective,
# booster, tree depth (2..15) and learning rate (0.01..0.46 in steps of 0.05).
# The objective varies slowest and the learning rate fastest.
objective = ['reg:logistic', 'binary:logistic']
booster = ['gbtree', 'gblinear', 'dart']
params_xgboost = [
    {
        'objective': objective_name,
        'booster': booster_name,
        'eta': eta_percent / 100,
        'max_depth': depth,
        'gamma': 2,
        'lambda': 2,
        'subsample': 0.9,
        'silent': True,
        'colsample_bytree': 0.50,
        'colsample_bylevel': 0.20,
        'eval_metric': 'auc',
        'seed': 1990,
    }
    for objective_name in objective
    for booster_name in booster
    for depth in range(2, 16)
    for eta_percent in range(1, 50, 5)
]

#params_xgboost = [
#              { 'objective':'reg:logistic', 'booster':'gbtree', 'eta':0.01, 'max_depth':9, 'subsample':0.9, 'silent':True, 'colsample_bytree':0.50, 'colsample_bylevel':0.20, 'eval_metric':'auc', 'seed': 1990}
#            , { 'objective':'reg:logistic', 'booster':'gbtree', 'eta':0.1,  'max_depth':9, 'subsample':0.9, 'silent':True, 'colsample_bytree':0.50, 'colsample_bylevel':0.20, 'eval_metric':'auc', 'seed': 1990}
#            , { 'objective':'reg:logistic', 'booster':'gbtree', 'eta':0.4,  'max_depth':9, 'subsample':0.9, 'silent':True, 'colsample_bytree':0.50, 'colsample_bylevel':0.20, 'eval_metric':'auc', 'seed': 1990}
#            , { 'objective':'reg:logistic', 'booster':'gbtree', 'eta':0.01, 'max_depth':6, 'subsample':0.9, 'silent':True, 'colsample_bytree':0.50, 'colsample_bylevel':0.20, 'eval_metric':'auc', 'seed': 1990}
#            , { 'objective':'reg:logistic', 'booster':'gbtree', 'eta':0.1,  'max_depth':6, 'subsample':0.9, 'silent':True, 'colsample_bytree':0.50, 'colsample_bylevel':0.20, 'eval_metric':'auc', 'seed': 1990}
#            , { 'objective':'reg:logistic', 'booster':'gbtree', 'eta':0.4,  'max_depth':6, 'subsample':0.9, 'silent':True, 'colsample_bytree':0.50, 'colsample_bylevel':0.20, 'eval_metric':'auc', 'seed': 1990}
#        ]

In [111]:
# Training budget: boosting rounds, number of folds, and seeds averaged per model
num_round = 20
n_folds = 10
n_iteracoes = 30
preds_xgboost_train = pd.DataFrame()
preds_xgboost_test  = pd.DataFrame()

# Each parameter set contributes one meta-feature column to train and to test
for p, param in enumerate(params_xgboost):

    column = f"xgboost_p{p + 1}"
    fold_blocks = []

    # Out-of-fold predictions: every fold is scored by models trained on the other folds
    for f in range(n_folds):
        train, target, test = train_test_fold(n_folds, f)
        train_xgboost = xgb.DMatrix(data=train, label=target, weight=train['var15'])
        test_xgboost  = xgb.DMatrix(data=test, weight=test['var15'])
        preds_fold = np.zeros(test_xgboost.num_row())

        # Average over several seeds to smooth out run-to-run noise
        for seed in range(1990, 1990 + n_iteracoes):
            param['seed'] = seed
            model_xgboost = xgb.train(param, train_xgboost, num_round)
            preds_fold = preds_fold + model_xgboost.predict(test_xgboost)

        fold_blocks.append(preds_fold / n_iteracoes)

    print('XGBoost Param: ', p ,'/', len(params_xgboost))

    # Stitch the folds back together in row order and clamp to [0, 1]
    preds_param = np.concatenate(fold_blocks)
    meta_feature = pd.DataFrame({column: np.clip(preds_param, 0.0, 1.0)})
    preds_xgboost_train = pd.concat([preds_xgboost_train, meta_feature], axis=1)


    ##########################################
    ### BUILDING THE FEATURES ON THE TEST SET ###
    ##########################################

    preds_param = np.zeros(data_test_xgboost.num_row())

    # Same seed averaging, this time training on the full training matrix
    for seed in range(1990, 1990 + n_iteracoes):
        param['seed'] = seed
        model_xgboost = xgb.train(param, data_train_xgboost, num_round)
        preds_param = preds_param + model_xgboost.predict(data_test_xgboost)

    preds_param = preds_param / n_iteracoes
    meta_feature = pd.DataFrame({column: np.clip(preds_param, 0.0, 1.0)})
    preds_xgboost_test = pd.concat([preds_xgboost_test, meta_feature], axis=1)
        

In [112]:
####################################################################################################################
#######################################          LIGHTGBM          #################################################
####################################################################################################################

In [80]:
# Hold out 20% of the training rows as a validation set.
# Fix: random_state is pinned so the split is identical on every run; without it
# the models trained on x_train changed between runs even though their own seeds
# are fixed. 1990 is the base seed used by the model parameters in this notebook.
x_train, x_valid, y_train, y_valid = train_test_split(
    df_train, train_target, test_size=0.2, random_state=1990
)

# Wrap the data in the Dataset structure LightGBM works with
data_train_lgbm = lgb.Dataset(x_train, label=y_train)
data_valid_lgbm = lgb.Dataset(x_valid, label=y_valid)
data_test_lgbm  = lgb.Dataset(df_test)

# Hyper-parameter grid for LightGBM: one dict per combination of boosting type,
# tree depth (2..15) and learning rate (0.01..0.46 in steps of 0.05).
#
# Fix: the former 'categorical_feature=name' entry was removed. That key is not a
# LightGBM parameter name, so it could not mark `var15` as categorical.
# NOTE(review): if `var15` is meant to be categorical, pass
# categorical_feature=['var15'] to lgb.Dataset instead - confirm the intent.
# NOTE(review): some LightGBM releases reject 'goss' combined with
# bagging_fraction < 1 and bagging_freq > 0 - verify against the installed version.
booster = ['gbdt', 'random_forest', 'dart', 'goss']
params_lgbm = [
    {
        'objective': 'regression',
        'boosting': boosting_name,
        'learning_rate': eta_percent / 100,
        'max_depth': depth,
        'lambda_l2': 2,
        'bagging_fraction': 0.9,
        'bagging_freq': 10,
        'colsample_bytree': 0.50,
        'metric': 'auc',
        'seed': 1990,
    }
    for boosting_name in booster
    for depth in range(2, 16)
    for eta_percent in range(1, 50, 5)
]

"""
params_lgbm = [
             { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.01, 'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990}
           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.1,  'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 
           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.4,  'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 
           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.01, 'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990}
           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.1,  'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 
           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.4,  'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 
           , { 'objective':'regression', 'boosting':'random_forest', 'learning_rate':0.01, 'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990}
           , { 'objective':'regression', 'boosting':'random_forest', 'learning_rate':0.1,  'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 
           , { 'objective':'regression', 'boosting':'random_forest', 'learning_rate':0.4,  'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 
           , { 'objective':'regression', 'boosting':'random_forest', 'learning_rate':0.01, 'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990}
           , { 'objective':'regression', 'boosting':'random_forest', 'learning_rate':0.1,  'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 
           , { 'objective':'regression', 'boosting':'random_forest', 'learning_rate':0.4,  'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} 

         ]
"""

"\nparams_lgbm = [\n             { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.01, 'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990}\n           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.1,  'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} \n           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.4,  'max_depth':9, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990} \n           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.01, 'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_bytree':0.50, 'metric':'auc', 'seed': 1990}\n           , { 'objective':'regression', 'boosting':'gbdt',         'learning_rate':0.1,  'max_depth':6, 'bagging_fraction':0.9, 'bagging_freq':10, 'colsample_byt

In [114]:
# Training budget: boosting rounds, number of folds, and seeds averaged per model
num_round = 20
n_folds = 10
n_iteracoes = 30
preds_lgbm_train = pd.DataFrame()
preds_lgbm_test  = pd.DataFrame()

# Parameter loop: each parameter set contributes one meta-feature column
for p in range(len(params_lgbm)):

    param = params_lgbm[p]
    column = "lightgbm_p" + str(p+1)
    preds_param = np.array([])

    # Fold loop: out-of-fold predictions for the training rows
    for f in range(n_folds):

        # Split the data for this fold
        train, target, test = train_test_fold(n_folds, f)

        # Carve a validation set out of the training folds and wrap both parts
        # in LightGBM Datasets.
        # Fix: random_state is pinned so the meta-features are reproducible;
        # the split used to change on every run although the models are seeded.
        x_train, x_valid, y_train, y_valid = train_test_split(
            train, target, test_size=0.2, random_state=1990
        )
        train_lgbm = lgb.Dataset(x_train, label=y_train)
        valid_lgbm = lgb.Dataset(x_valid, label=y_valid)
        preds_fold = np.zeros(len(test))

        # Average over several seeds to smooth out run-to-run noise
        for i in range(n_iteracoes):
            param['seed'] = 1990+i
            model_lgbm = lgb.train(param
                       , train_lgbm
                       , num_round
                       , valid_sets=[train_lgbm, valid_lgbm]
                       , valid_names=['train', 'valid']
                       , early_stopping_rounds=50
                       , verbose_eval=False)
            preds_fold += model_lgbm.predict(test, num_iteration=model_lgbm.best_iteration)

        preds_fold /= n_iteracoes
        preds_param = np.append(preds_param, preds_fold)

    print('LightGBM Param: ', p ,'/', len(params_lgbm))

    # Clamp the out-of-fold predictions to [0, 1] and store them as a new column
    meta_feature = pd.DataFrame(data=preds_param, columns=[column])
    meta_feature.loc[meta_feature[column] > 1, column] = 1.0
    meta_feature.loc[meta_feature[column] < 0, column] = 0.0
    preds_lgbm_train = pd.concat([preds_lgbm_train, meta_feature], axis=1)


    ##########################################
    ### BUILDING THE FEATURES ON THE TEST SET ###
    ##########################################

    preds_param = np.zeros(df_test.shape[0])

    # Same seed averaging, using the notebook-level train/validation Datasets
    for i in range(n_iteracoes):
        param['seed'] = 1990+i
        model_lgbm = lgb.train(param
                                , data_train_lgbm
                                , num_round
                                , valid_sets=[data_train_lgbm, data_valid_lgbm]
                                , valid_names=['train', 'valid']
                                , early_stopping_rounds=50
                                , verbose_eval=False)
        preds_param += model_lgbm.predict(df_test, num_iteration=model_lgbm.best_iteration)

    preds_param /= n_iteracoes
    meta_feature = pd.DataFrame(data=preds_param, columns=[column])
    meta_feature.loc[meta_feature[column] > 1, column] = 1.0
    meta_feature.loc[meta_feature[column] < 0, column] = 0.0
    preds_lgbm_test = pd.concat([preds_lgbm_test, meta_feature], axis=1)

In [None]:
####################################################################################################################
#######################################          CATBOOST         ##################################################
####################################################################################################################

In [None]:

# CatBoost parameter sets. Only two (learning rate, depth) pairs are active;
# the other combinations of learning rate 0.01 / 0.1 / 0.4 with depth
# 6 / 9 / 12 / 15 are currently disabled.
params_cat = [
    {
        'objective': 'RMSE',
        'custom_metric': 'AUC',
        'eval_metric': 'AUC',
        'learning_rate': rate,
        'random_seed': 1990,
        'max_depth': depth,
        'colsample_bylevel': 0.5,
    }
    for rate, depth in [(0.01, 6), (0.4, 15)]
]

# Training budget: boosting rounds, number of folds, and seeds averaged per model
num_round = 20
n_folds = 10
n_iteracoes = 30
preds_catboost_train = pd.DataFrame()
preds_catboost_test  = pd.DataFrame()
data_train_catboost  = cat.Pool(df_train, label=train_target)
data_test_catboost   = cat.Pool(df_test)

# Each parameter set contributes one meta-feature column to train and to test
for p, param in enumerate(params_cat):

    column = f"catboost_p{p + 1}"
    fold_blocks = []

    # Out-of-fold predictions: every fold is scored by models trained on the other folds
    for f in range(n_folds):

        # Split the data for this fold
        train, target, test = train_test_fold(n_folds, f)

        # Wrap the fold data in the Pool structure CatBoost works with
        train_catboost = cat.Pool(train, label=target)
        test_catboost  = cat.Pool(test)
        preds_fold = np.zeros(test_catboost.num_row())

        # Average over several seeds to smooth out run-to-run noise
        for seed in range(1990, 1990 + n_iteracoes):
            param['random_seed'] = seed
            model_catboost = cat.train(params=param, pool=train_catboost, num_boost_round=num_round, logging_level='Silent')
            preds_fold = preds_fold + model_catboost.predict(test_catboost)

        fold_blocks.append(preds_fold / n_iteracoes)

    print('CatBoost Param: ', p ,'/', len(params_cat))

    # Stitch the folds back together in row order and clamp to [0, 1]
    preds_param = np.concatenate(fold_blocks)
    meta_feature = pd.DataFrame({column: np.clip(preds_param, 0.0, 1.0)})
    preds_catboost_train = pd.concat([preds_catboost_train, meta_feature], axis=1)


    ##########################################
    ### BUILDING THE FEATURES ON THE TEST SET ###
    ##########################################

    preds_param = np.zeros(data_test_catboost.num_row())

    # Same seed averaging, this time training on the full training pool
    for seed in range(1990, 1990 + n_iteracoes):
        param['random_seed'] = seed
        model_catboost = cat.train(params=param, pool=data_train_catboost, num_boost_round=num_round, logging_level='Silent')
        preds_param = preds_param + model_catboost.predict(data_test_catboost)

    preds_param = preds_param / n_iteracoes
    meta_feature = pd.DataFrame({column: np.clip(preds_param, 0.0, 1.0)})
    preds_catboost_test = pd.concat([preds_catboost_test, meta_feature], axis=1)


CatBoost Param:  0 / 2



Iteration with suspicious time -1.32e+04 sec ignored in overall statistics.

Iteration with suspicious time -35.6 sec ignored in overall statistics.

Iteration with suspicious time -43.6 sec ignored in overall statistics.

Iteration with suspicious time -0.93 sec ignored in overall statistics.


In [None]:
####################################################################################################################
########################     JUNTANDO AS META FEATURES DO XGBOOST - LIGHTGBM E CATBOOST     ########################
####################################################################################################################

In [147]:
# Place the first-level predictions of XGBoost, LightGBM and CatBoost side by side
meta_features_train = pd.concat(
    [preds_xgboost_train, preds_lgbm_train, preds_catboost_train],
    axis="columns",
)
meta_features_test = pd.concat(
    [preds_xgboost_test, preds_lgbm_test, preds_catboost_test],
    axis="columns",
)

# Write both meta-feature tables to .csv files (no index column)
for frame, path in (
    (meta_features_train, '/users/diegobernardo/downloads/base-santander/meta_features_train_001.csv'),
    (meta_features_test, '/users/diegobernardo/downloads/base-santander/meta_features_test_001.csv'),
):
    frame.to_csv(path, sep=',', index=False)

meta_features_test

Unnamed: 0,xgboost_p1,xgboost_p2,xgboost_p3,xgboost_p4,xgboost_p5,xgboost_p6,lightgbm_p1,lightgbm_p2,lightgbm_p3,lightgbm_p4,...,lightgbm_p9,lightgbm_p10,lightgbm_p11,lightgbm_p12,catboost_p1,catboost_p2,catboost_p3,catboost_p4,catboost_p5,catboost_p6
0,0.415721,0.097202,0.042828,0.415151,0.095824,0.051976,0.038326,0.036761,0.044406,0.038865,...,0.033345,0.037256,0.037256,0.037256,0.009733,0.047941,0.056483,0.008408,0.044714,0.050532
1,0.416529,0.100956,0.052814,0.415314,0.097803,0.055960,0.039659,0.042465,0.063659,0.040003,...,0.043101,0.045748,0.045748,0.045748,0.009770,0.048732,0.057247,0.008675,0.047394,0.054229
2,0.411698,0.075681,0.002523,0.412123,0.078379,0.005697,0.034558,0.009907,0.000417,0.034524,...,0.009579,0.008015,0.008015,0.008015,0.001534,0.005833,0.004568,0.001419,0.004775,0.002751
3,0.418683,0.101366,0.014062,0.420119,0.107787,0.017989,0.038399,0.022037,0.006973,0.038076,...,0.035022,0.030968,0.030968,0.030968,0.002671,0.010831,0.010048,0.002452,0.009966,0.009669
4,0.411532,0.075520,0.003606,0.412088,0.078156,0.006529,0.034534,0.010064,0.001004,0.034572,...,0.009462,0.008250,0.008250,0.008250,0.001609,0.005400,0.003600,0.001456,0.004505,0.002639
5,0.436388,0.207404,0.223897,0.434440,0.197179,0.219029,0.059274,0.171075,0.225850,0.057470,...,0.166743,0.162629,0.162629,0.162629,0.036364,0.191683,0.229837,0.039539,0.197686,0.235108
6,0.417094,0.102855,0.038031,0.417951,0.108334,0.022224,0.043018,0.063572,0.056381,0.043660,...,0.060888,0.065457,0.065457,0.065457,0.008193,0.047765,0.065961,0.011391,0.060371,0.066223
7,0.434541,0.188525,0.165028,0.433910,0.188646,0.164309,0.053870,0.128298,0.159471,0.055220,...,0.131200,0.147904,0.147904,0.147904,0.035928,0.138741,0.158281,0.037659,0.140164,0.157870
8,0.412955,0.084585,0.023091,0.413026,0.084503,0.027572,0.035828,0.022094,0.021774,0.036197,...,0.017801,0.019401,0.019401,0.019401,0.004148,0.025217,0.029418,0.004187,0.022857,0.029823
9,0.413165,0.083132,0.012546,0.413612,0.086278,0.021072,0.035916,0.021287,0.022722,0.036292,...,0.018326,0.020287,0.020287,0.020287,0.004147,0.024443,0.027147,0.004252,0.022383,0.022125


In [None]:
####################################################################################################################
########################################     FIM DO PRIMEIRO NÍVEL     #############################################
####################################################################################################################

In [148]:
####################################################################################################################
########################################     INÍCIO DO SEGUNDO NÍVEL     #############################################
####################################################################################################################

In [163]:
# Second level
# Linear regression fitted on the meta-features only

model = linear_model.LinearRegression()
model.fit(meta_features_train, train_target)

# Predictions are clamped to the [0, 1] range
output = np.clip(model.predict(meta_features_test), 0, 1)

# Write the predictions next to the test IDs
df_output = pd.DataFrame({'ID':test_id, 'TARGET':output})
df_output.to_csv('/users/diegobernardo/downloads/base-santander/output_stacking_001.csv', sep=',', index=False)

In [99]:
# Second level
# Linear regression fitted on the meta-features only

model = linear_model.LinearRegression().fit(meta_features_train, train_target)
pred_linear = model.predict(meta_features_test)
pred_linear[pred_linear < 0] = 0
pred_linear[pred_linear > 1] = 1


# XGBoost fitted on the meta-features together with the original features
param = { 'objective':'reg:logistic', 'booster':'gbtree', 'eta':0.01, 'max_depth':3, 'subsample':0.9, 'silent':True, 'colsample_bytree':0.50, 'colsample_bylevel':0.20, 'eval_metric':'auc', 'seed': 1990}
data_train =  pd.concat([df_train, meta_features_train], axis=1)
data_test  =  pd.concat([df_test,  meta_features_test], axis=1)
data_train_xgboost = xgb.DMatrix(data=data_train, label=train_target, weight=data_train['var15'])
data_test_xgboost = xgb.DMatrix(data=data_test, weight=data_test['var15'])

# Fix: the seed-averaged predictions used to be accumulated in `preds_param`
# while the lines below read `pred_xgboost`, a name that was never defined,
# so the cell raised NameError. One name is now used throughout.
pred_xgboost = np.zeros(data_test_xgboost.num_row())

# Average over several seeds to smooth out run-to-run noise
# (n_iteracoes and num_round come from the first-level cells)
for i in range(n_iteracoes):
    param['seed'] = 1990+i
    model_xgboost = xgb.train(param, data_train_xgboost, num_round)
    pred_xgboost += model_xgboost.predict(data_test_xgboost)

pred_xgboost /= n_iteracoes
pred_xgboost[pred_xgboost < 0] = 0
pred_xgboost[pred_xgboost > 1] = 1

# Blend linear regression and XGBoost
# Linear regression has weight 2 and XGBoost weight 1
output = ((pred_linear*2) + pred_xgboost) /3
output[output < 0] = 0
output[output > 1] = 1

# Write the blended predictions next to the test IDs
df_output = pd.DataFrame({'ID':test_id, 'TARGET':output})
df_output.to_csv('/users/diegobernardo/downloads/base-santander/output_stacking_001.csv', sep=',', index=False)

NameError: name 'meta_features_train' is not defined

In [9]:
# Scratch check of a 2:1 weighted average on two small arrays
teste1 = np.array([3, 6, 9, 12])
teste2 = np.array([1, 2, 3, 4])
teste = (2 * teste1 + teste2) / 3
#teste
# Number of feature columns in the training data
x_train = np.array(df_train)
x_train.shape[1]

369

In [92]:
#############
### Keras ###
#############

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

# Convert the data to the NumPy arrays passed to Keras below
x_teste = np.array(df_test)
x_train = np.array(df_train)
y_train = np.array(train_target)

# Sequential model, declared as an ordered list of layers
model_nn = Sequential([
    # 1st layer - 32 nodes, input width equal to the number of training features
    Dense(32, input_dim=x_train.shape[1]),
    # 2nd layer - 128 nodes with tanh activation, followed by 20% dropout
    Dense(128, activation='tanh'),
    Dropout(0.2),
    # 3rd layer - 64 nodes with tanh activation, followed by 20% dropout
    Dense(64, activation='tanh'),
    Dropout(0.2),
    # 4th layer - fully connected output with a single node
    Dense(1),
    # 5th layer - sigmoid activation
    Activation('sigmoid'),
])

# Compile the model
model_nn.compile(loss="mean_squared_error", optimizer="sgd", metrics = ["accuracy"])

# Train the model
model_nn.fit(x_train, y_train, epochs=50, verbose=0, batch_size=64)

# Predict on the test set
pred_neural_network = model_nn.predict(x_teste, batch_size=64, verbose=1)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [74]:
# Select the strictly positive network predictions and display them
teste = pred_neural_network[np.greater(pred_neural_network, 0)]
len(teste)
teste

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.