# Model Tuning Process

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from scipy.stats import gmean
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import DMatrix


from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingClassifier

from sklearn.metrics import roc_auc_score

In [2]:
xgb.__version__

'0.90'

In [3]:
import warnings
warnings.filterwarnings('ignore')

# Design a CV to evaluate the model

In [4]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split


def acc(model):
    kf = KFold(n_splits=5, shuffle=True, random_state=42).get_n_splits(X_train.values)
    cv_auc = cross_val_score(model,X_train ,y_train, cv = kf, scoring = "roc_auc")
    return (cv_auc)


In [5]:
X_train = pd.read_csv('xtrain6.csv')
y_train = pd.read_csv('ytrain.csv')

In [6]:
X_train.drop('Unnamed: 0',axis=1, inplace = True)

In [7]:
X_train.head()

Unnamed: 0,var15,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,ind_var1,ind_var5,ind_var8_0,...,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,var38
0,23,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,34,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,300.0,122.22,300.0,240.75,0.0,0.0,0.0,49278.03
2,23,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,37,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,39,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,85501.89,85501.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [8]:
X_train.shape

(76020, 111)

In [14]:
y_train = np.ravel(y_train)

# Logistic model Tuning

In [15]:
# log model
logmodel = Pipeline([
        ("scl", StandardScaler()),
        ("clf", LogisticRegressionCV())
    ])



#logmodel.fit(X_train,y_train)


Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='auto',
                                      n_jobs=None, penalty='l2',
                                      random_state=None, refit=True,
                                      scoring=None, solver='lbfgs', tol=0.0001,
                                      verbose=0))],
         verbose=False)

In [16]:
log_score = np.mean(acc(logmodel))

In [17]:
print(log_score)

0.7734144228579438


In [18]:

from sklearn.linear_model import LogisticRegression
steps = [
         ('scaler', StandardScaler()),
         ('model', LogisticRegression())]

params = {
   'model__penalty' : ['l1', 'l2'],
    'model__C' :np.logspace(0, 4, 10)}

pipe = Pipeline(steps)

GridSearch = GridSearchCV(pipe, param_grid=params, scoring = "roc_auc")

logmodel2 = GridSearch.fit(X_train,y_train)



log2_score = np.mean(acc(logmodel2))

In [19]:
logmodel2.best_params_

{'model__C': 3593.813663804626, 'model__penalty': 'l2'}

In [20]:
print(log2_score)

0.786097797268995


# XGBoost Model Tuning

0.03

In [21]:
xgbc = xgb.XGBClassifier(max_depth=5, n_estimators=200, learning_rate=0.03, nthread=4, 
                          subsample=0.6815, colsample_bytree=0.701, seed=1234)
xgbc.fit(X_train, y_train, eval_metric="auc", verbose=2)

xgc1_score = np.mean(acc(xgbc))

In [22]:
print(xgc1_score)

0.8361782076072346


### 1, adjust mac_depth and min_child_weight

In [23]:

params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)}

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
     min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
     objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
     param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch1.fit(X_train,y_train)




#xgbnew1_score = np.mean(acc(xgbnew1))

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=5, min_child_weight=1,
                                     missing=None, n_estimators=140, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=27, silent=None,
                                     subsample=0.8, verbosity=1),
             iid=False, n_jobs=4,
             param_grid={'max_depth': range(3, 10, 2),
                         'min_child_weight': range(1, 6, 2)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=Fa

In [24]:
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 3, 'min_child_weight': 1}, 0.8347904707445188)

In [25]:
params = {
    'max_depth':[2,3,4],
    'min_child_weight':[1,2,3]}

gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
     min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
     objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
     param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch1.fit(X_train,y_train)



gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 3, 'min_child_weight': 2}, 0.8351032251979087)

### 2,adjust gamma

In [26]:
params = {
    'gamma':[i/10.0 for i in range(0,5)]}

gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=3,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'gamma': 0.1}, 0.8351047357795581)

### 3,Tune subsample and colsample_bytree

In [28]:
params = {
    'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]}

gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=3,
 min_child_weight=2, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'colsample_bytree': 0.6, 'subsample': 0.7}, 0.8353054767145917)

### 4,Tuning Regularization Parameters  #1e-5, 1e-7

In [30]:
params = {
    'reg_alpha':[1e-9,1e-8,1e-7,1e-6]}

gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=3,
 min_child_weight=2, gamma=0.1, subsample=0.7, colsample_bytree=0.6,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'reg_alpha': 1e-06}, 0.8353055108427159)

### 5,Tuning Learning Rate  #

In [116]:
params = {
    'learning_rate':[0.08,0.1,0.3,0.5]}

gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=2,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'learning_rate': 0.1}, 0.8323960975026541)

best parameters: 'max_depth': 2, 'min_child_weight': 2,'colsample_bytree': 0.8, gamma:0, 'subsample': 0.8,'reg_alpha': 1e-07, 'learning_rate': 0.1

In [266]:
xgb_model = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=2,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27)


In [267]:
xgb1score= np.mean(acc(xgb_model))

In [268]:
print(xgb1score)

0.8343810818254859


# RF Model Tuning

In [31]:
rfc = RandomForestClassifier(n_estimators=10, criterion="entropy", max_features=None, max_depth=7,
                             min_samples_leaf=9, n_jobs=4, random_state=1)
#rfc.fit(X_train, y_train)

rf_score = np.mean(acc(rfc))

In [32]:
print(rf_score)

0.8311620942159887


### 1, Max_depth and min_sample_leaf tuning

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [33]:
params = {
   'max_depth': [4,5,6,7]}

gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=10, criterion="entropy", max_features=None, max_depth=7,
                             min_samples_leaf=9, n_jobs=4, random_state=1),
                        param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'max_depth': 6}, 0.8312632695687527)

In [34]:
params = {
   'min_samples_leaf': [3,4,5],
    'max_depth': [5,6,7]}

gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=10, criterion="entropy", max_features=None, max_depth=6,
                             min_samples_leaf=5, n_jobs=4, random_state=1),
                        param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'max_depth': 6, 'min_samples_leaf': 4}, 0.8313834069215551)

### 2, Min_Sample_Split tuning

In [37]:
params = {
   'min_samples_split': [1,2,3]}

gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=10, criterion="entropy", max_features=None, max_depth=6,
                             min_samples_leaf=4, n_jobs=4, random_state=1, min_samples_split = 4),
                        param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'min_samples_split': 2}, 0.8313834069215551)

### 3, n_estimators tuning

In [38]:
params = {
   'n_estimators': [8,9,10,11,12]}

gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=11, criterion="entropy", max_features=None, max_depth=6,
                             min_samples_leaf=4, n_jobs=4, random_state=1, min_samples_split = 2),
                        param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'n_estimators': 12}, 0.8320846245664354)

best parameters for RF: {'max_depth': 6, 'min_samples_leaf': 5},{'min_samples_split': 2},{'n_estimators': 11}

In [41]:
rf_model =RandomForestClassifier(n_estimators=12, criterion="entropy", max_features=None, max_depth=6,
                             min_samples_leaf=45, n_jobs=4, random_state=1, min_samples_split = 2)

In [42]:
rfscore= np.mean(acc(rf_model))

In [43]:
print(rfscore)

0.8319959588364888


# LightGBM model tuning

In [45]:
from lightgbm import LGBMClassifier

 'num_leaves': [31, 127],
    'feature_fraction': [0.5, 1.0],
    'bagging_fraction': [0.75, 0.95], 
    'reg_alpha': [0.1, 0.5]

### 1, num_leaves tuning

In [46]:
params = {
    'num_leaves': [48,49,50,51,52]}

gsearch3 = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt',
                                  objective='binary',
                                  bagging_freq=5,
                                  num_boost_round=50,
                                  learning_rate=0.01,
                                  eval_metric='l1'),

                        param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'num_leaves': 52}, 0.8305473356393909)

### 2, feature_fraction and bagging_fraction tuning

In [52]:
params = {
     'feature_fraction': [0.87,0.88,0.89,0.9,0.92],
    'bagging_fraction': [0.96]}

gsearch3 = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt',
                                  objective='binary',
                                  bagging_freq=5,
                                  num_boost_round=50,
                                  learning_rate=0.01,
                                  eval_metric='l1',
                                  num_leaves = 52),

                        param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'bagging_fraction': 0.96, 'feature_fraction': 0.88}, 0.8332757469731522)

### 3, reg_alpha tuning

In [58]:
params = {
    'reg_alpha': [0.48,0.49,0.5,0.51,0.52]}

gsearch3 = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt',
                                  objective='binary',
                                  bagging_freq=5,
                                  num_boost_round=50,
                                  learning_rate=0.01,
                                  eval_metric='l1',
                                  num_leaves = 52,
                                  bagging_fraction=0.96,
                                  feature_fraction= 0.88 
                                                  ),
                                  param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'reg_alpha': 0.5}, 0.8333928157220697)

### 4, learning_rate tuning

In [59]:
params = {
    'learning_rate': [0.02,0.022,0.024,0.023]}

gsearch3 = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt',
                                  objective='binary',
                                  bagging_freq=5,
                                  num_boost_round=50,
                                  learning_rate=0.01,
                                  eval_metric='l1',
                                  num_leaves = 52,
                                  bagging_fraction=0.96,
                                  feature_fraction= 0.88,
                                  reg_alpha = 0.4  
                                                  ),
                                  param_grid = params,scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'learning_rate': 0.024}, 0.8339931531427094)

In [104]:
params = {
    'learning_rate': [0.02,0.031,0.032,0.033,0.034,0.035]}

gsearch3 = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt',
                                  objective='binary',
                                  bagging_freq=5,
                                  num_boost_round=50,
                                  learning_rate=0.022,
                                  eval_metric='l1',
                                  num_leaves = 52,
                                  bagging_fraction=0.96,
                                  feature_fraction= 0.88,
                                  reg_alpha = 0.4 
                                  ),
                                  param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X_train, y_train)


gsearch3.best_params_, gsearch3.best_score_

({'learning_rate': 0.032}, 0.8342345112661758)

In [69]:
light_model = LGBMClassifier(boosting_type='gbdt',
                                  objective='binary',
                                  bagging_freq=5,
                                  num_boost_round=50,
                                  learning_rate=0.032,
                                  eval_metric='l1',
                                  num_leaves = 52,
                                  bagging_fraction=0.96,
                                  feature_fraction= 0.88,
                                  reg_alpha = 0.4)

In [70]:
light_score = np.mean(acc(light_model))

In [71]:
print(light_score)

0.833363317820486


# AVG Model

In [105]:
from sklearn.ensemble import VotingClassifier
estimators = [('xgb', xgbc), ('rf', rf_model),('light', gsearch3)
              ]
av_model = VotingClassifier(estimators = estimators, voting='soft', weights =[0.5,0.2,0.3] )

In [77]:
avg_score = np.mean(acc(av_model))

In [78]:
print(avg_score)

0.8362249133106248


In [75]:
 #weights =[0.5,0.2,0.3] 

# Stacking model

In [80]:
reg_stack = StackingClassifier(classifiers=[rf_model,xgbc,av_model,logmodel2,light_model],
                              meta_classifier=av_model,use_features_in_secondary=True)
stk_score =np.mean(acc(reg_stack))


In [81]:
print(stk_score)

0.8361132202977473


# Predict

In [93]:
X_test = pd.read_csv('xtest6.csv')

In [94]:
X_test.head()

Unnamed: 0.1,Unnamed: 0,var15,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,ind_var1,ind_var5,...,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,var38
0,76020,32,0.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1
1,76021,35,0.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72
2,76022,23,0.0,0.0,60.0,60.0,60.0,60.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95
3,76023,24,0.0,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61
4,76024,23,0.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73


In [95]:
id_test = X_test['Unnamed: 0']

In [96]:
id_test.head()

0    76020
1    76021
2    76022
3    76023
4    76024
Name: Unnamed: 0, dtype: int64

In [97]:
X_test.drop('Unnamed: 0',axis=1, inplace = True)

In [98]:
X_test.head()

Unnamed: 0,var15,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,ind_var1,ind_var5,ind_var8_0,...,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,var38
0,32,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1
1,35,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72
2,23,0.0,0.0,60.0,60.0,60.0,60.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95
3,24,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61
4,23,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73


In [88]:
X_train.head()

Unnamed: 0,var15,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,ind_var1,ind_var5,ind_var8_0,...,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,var38
0,23,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,34,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,300.0,122.22,300.0,240.75,0.0,0.0,0.0,49278.03
2,23,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,37,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,39,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,85501.89,85501.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [106]:
av_model = xgbc.fit(X_train,y_train)

In [107]:
y_pred= av_model.predict_proba(X_test)[:,1]




In [108]:
y_pred

array([0.0429065 , 0.04924943, 0.0041141 , ..., 0.00562191, 0.06823904,
       0.0033214 ], dtype=float32)

In [109]:
samplesubmission = pd.read_csv('sample_submission.csv')

output = pd.DataFrame({'ID': samplesubmission.ID, 'TARGET': y_pred})
output.to_csv('submission.csv', index=False)