In [15]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [5]:
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [35]:
cat_features = [a for a in test.columns if a.endswith('cat')]
model_vars = [a for a in train.columns if 'id' not in a and 'target' not in a]
id_train = train['id'].values
id_test = test['id'].values
x_train, x_valid, y_train, y_valid = train_test_split(train[model_vars], train.target, test_size=0.2, random_state=4242)

In [47]:
d_train = lgb.Dataset(x_train, y_train)
d_valid = lgb.Dataset(x_valid, y_valid, reference=d_train)
d_test =  lgb.Dataset(test[model_vars])

params = {
        'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'binary',
        'metric' : {'binary_logloss'},
        'num_leaves' : 100,
        'max_depth' : 6,
        'learning_rate' : 0.1,
        'feature_fraction' : 0.9,
        'bagging_fraction' : 0.9,
        'bagging_freq': 5,
        'min_data_in_leaf' : 20,
        #'max_bin': 200,
        'verbose' : 0
}

#preds = gbm2.predict_proba(test[model_vars], num_iteration=gbm2.best_iteration)[:,1]
# model.fit(params,train[model_vars], train.target, categorical_feature=cat_features)
# preds = model.predict_proba(test[model_vars])[:,1]

In [48]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', 1.0/gini_score,0)]

# This is the data xgboost will test on after eachboosting round
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train the model! We pass in a max of 10,000 rounds (with early stopping after 100)
# and the custom metric (maximize=True tells xgb that higher metric is better)

mdl = lgb.train(params, 
                d_train, 
                num_boost_round=10000, 
                valid_sets = [d_valid,d_train],
                valid_names = ['eval','train'],
                early_stopping_rounds=50, 
                feval=gini_xgb)
#mdl = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, feval=gini_xgb, maximize=True, verbose_eval=10)



[1]	train's binary_logloss: 0.611382	train's gini: 4.51212	eval's binary_logloss: 0.61138	eval's gini: 4.86936
Train until valid scores didn't improve in 50 rounds.
[2]	train's binary_logloss: 0.544497	train's gini: 4.01674	eval's binary_logloss: 0.544509	eval's gini: 4.40752
[3]	train's binary_logloss: 0.488933	train's gini: 3.87803	eval's binary_logloss: 0.488947	eval's gini: 4.15273
[4]	train's binary_logloss: 0.442233	train's gini: 3.76607	eval's binary_logloss: 0.442261	eval's gini: 4.06658
[5]	train's binary_logloss: 0.402645	train's gini: 3.76173	eval's binary_logloss: 0.402685	eval's gini: 4.05332
[6]	train's binary_logloss: 0.36885	train's gini: 3.67834	eval's binary_logloss: 0.368908	eval's gini: 4.02962
[7]	train's binary_logloss: 0.339843	train's gini: 3.64318	eval's binary_logloss: 0.339909	eval's gini: 3.99622
[8]	train's binary_logloss: 0.314833	train's gini: 3.63028	eval's binary_logloss: 0.314928	eval's gini: 3.9957
[9]	train's binary_logloss: 0.293206	train's gini: 3.

[74]	train's binary_logloss: 0.146838	train's gini: 2.51321	eval's binary_logloss: 0.151998	eval's gini: 3.63732
[75]	train's binary_logloss: 0.146772	train's gini: 2.50271	eval's binary_logloss: 0.151989	eval's gini: 3.6327
[76]	train's binary_logloss: 0.146676	train's gini: 2.48963	eval's binary_logloss: 0.151996	eval's gini: 3.63232
[77]	train's binary_logloss: 0.146592	train's gini: 2.4799	eval's binary_logloss: 0.152003	eval's gini: 3.63385
[78]	train's binary_logloss: 0.146525	train's gini: 2.47158	eval's binary_logloss: 0.15199	eval's gini: 3.62882
[79]	train's binary_logloss: 0.146441	train's gini: 2.4668	eval's binary_logloss: 0.151989	eval's gini: 3.63018
[80]	train's binary_logloss: 0.146342	train's gini: 2.45498	eval's binary_logloss: 0.151989	eval's gini: 3.62797
[81]	train's binary_logloss: 0.146261	train's gini: 2.44855	eval's binary_logloss: 0.151987	eval's gini: 3.62989
[82]	train's binary_logloss: 0.14616	train's gini: 2.43833	eval's binary_logloss: 0.151996	eval's gi

In [49]:
# Predict on our test data
p_test = mdl.predict(test[model_vars],num_iteration=mdl.best_iteration)

# Create a submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = p_test
sub.to_csv('xgb1.csv.gz', index=False,compression='gzip')

print(sub.head())

   id    target
0   0  0.026392
1   1  0.030347
2   2  0.030897
3   3  0.017845
4   4  0.038344


In [51]:
train.columns

Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],


In [52]:
[i for i, x in enumerate(train) if x == -1]

[]

In [53]:
np.where(train.values == -1)[0]

array([     0,      1,      1, ..., 595211, 595211, 595211], dtype=int64)