In [1]:
import pandas as pd 
import numpy as np 
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
import gc



In [2]:
train = pd.read_csv('../input/merged-train.csv')
test = pd.read_csv('../input/merged-test.csv')

In [3]:
train.dropna(axis=0, inplace=True) # temporary
X_train, X_holdout, y_train, y_holdout = train_test_split(train.drop(['msno', 'is_churn'], axis=1), train['is_churn'], test_size=0.05)

#X_train = train.drop(['msno','is_churn'], axis=1)
#y_train = train['is_churn']

#print(X_train.shape, len(y_train))
#X_train.head()

train_meta = X_train.copy()
train_meta['fold_id'] = -999
train_meta['M1'] = np.nan
train_meta['M2'] = np.nan

(869896, 18) 869896


In [29]:
def fit_lgb_model(seed, train_index, test_index=None):
    d_valid = None
    es_rounds = None
    
    lgb_params = {
        'num_leaves': 108,
        'boosting_type': 'gbdt',
        'objective':'binary',
        'metric':'binary_logloss',
        'learning_rate': 0.01,
        'feature_fraction': 0.8,
        'bagging_freq': 1,
        'max_bin': 128,
        'max_depth': 10,
        'seed':seed,
    }

    d_train = lgb.Dataset(X_train.iloc[train_index], label=y_train.iloc[train_index])
    if test_index is not None:
        d_valid = lgb.Dataset(X_train.iloc[test_index], label=y_train.iloc[test_index], reference=d_train)
 
    lgb_fitted = lgb.train(lgb_params, d_train, num_boost_round=15, valid_sets=d_valid, early_stopping_rounds=es_rounds)
    
    return lgb_fitted

def fit_xgb_model(seed, train_index, test_index=None):
    watchlist = None
    es_rounds = None
    
    xgb_params = {
        'eta': 0.02, #use 0.002
        'max_depth': 7,
        'objective': 'binary:logistic',
        'silent': True,
        'seed': seed,
    }
    
    if test_index is not None:
        watchlist = [(xgb.DMatrix(X_train.iloc[train_index],y_train.iloc[train_index]), 'train'), 
                     (xgb.DMatrix(X_train.iloc[test_index], y_train.iloc[test_index]), 'valid')]
        es_rounds = 5

        
    xgb_fitted = xgb.train(xgb_params, xgb.DMatrix(X_train.iloc[train_index], y_train.iloc[train_index]), 
                           15,  evals=watchlist, maximize=False, verbose_eval=5, early_stopping_rounds=5) #use 1500
    
    return xgb_fitted
    

In [30]:
#X_train['foldID'] = -1
fold_id = 0
kf = StratifiedKFold(n_splits=3, shuffle=True)

for train_index, test_index in kf.split(X_train, y_train):    
    ### lgbm
    print('training lgbm')
    
    lgb_fitted = fit_lgb_model(fold_id, train_index, test_index)
    lgb_preds = lgb_fitted.predict(X_train.iloc[test_index])
    train_meta.iloc[test_index, train_meta.columns.get_loc('M1')] = lgb_preds
    
    ### xgb
    print('training xgbm')

    xgb_fitted = fit_xgb_model(fold_id, train_index, test_index)
    xgb_preds = xgb_fitted.predict(xgb.DMatrix(X_train.iloc[test_index]))
    train_meta.iloc[test_index, train_meta.columns.get_loc('M2')] = xgb_preds
    
    train_meta.iloc[test_index, train_meta.columns.get_loc('fold_id')] = fold_id # test fold
    fold_id = fold_id + 1


training lgbm
[1]	valid_0's binary_logloss: 0.684254
[2]	valid_0's binary_logloss: 0.67551
[3]	valid_0's binary_logloss: 0.666951
[4]	valid_0's binary_logloss: 0.658547
[5]	valid_0's binary_logloss: 0.650346
[6]	valid_0's binary_logloss: 0.642263
[7]	valid_0's binary_logloss: 0.634333
[8]	valid_0's binary_logloss: 0.626546
[9]	valid_0's binary_logloss: 0.61907
[10]	valid_0's binary_logloss: 0.611568
[11]	valid_0's binary_logloss: 0.6042
[12]	valid_0's binary_logloss: 0.597259
[13]	valid_0's binary_logloss: 0.590173
[14]	valid_0's binary_logloss: 0.583191
[15]	valid_0's binary_logloss: 0.576336
training xgbm
[0]	train-error:0.033816	valid-error:0.034294
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 5 rounds.
[5]	train-error:0.033697	valid-error:0.034183
[10]	train-error:0.032685	valid-error:0.033173
training lgbm
[1]	valid_0's binary_logloss: 0.684341
[2]	valid_0's binary_logloss: 0.675669
[3]	vali

In [6]:
'''
### model stacking ###





"\n### model stacking ###\n\n#lgb_preds = lgb_fitted.predict(test.drop(['msno', 'is_churn'], axis=1)).clip(0.0000001, 0.999999)\n\nsubmission = pd.DataFrame()\nsubmission['msno'] = test['msno']\nsubmission['is_churn'] = lgb_preds\n\nsubmission.to_csv('../output/submission.csv', index=False)\n"

In [9]:
train_meta.shape

(869896, 21)

In [32]:
train_meta.head(1)

Unnamed: 0,city,bd,registered_via,registration_init_time,expiration_date,total_order,plan_net_worth,mean_payment_each_transaction,total_actual_payment,cancel_times,...,last_listen_date,dist_last_listen_registration,dist_expiration_last_listen,listens_per_day,female,male,unknown_gender,fold_id,M1,M2
0,18.0,36.0,9.0,20050406.0,20170907.0,2,149,74.5,149,0,...,20170208.0,119802.0,699.0,0.000217,1,0,0,2,0.475003,0.449428


In [None]:
#lgb_preds = lgb_fitted.predict(test.drop(['msno', 'is_churn'], axis=1)).clip(0.0000001, 0.999999)

submission = pd.DataFrame()
submission['msno'] = test['msno']
submission['is_churn'] = lgb_preds

submission.to_csv('../output/submission.csv', index=False)
'''