In [2]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing, manifold, ensemble
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

%matplotlib inline

In [3]:
# Read the train/test feature tables (CSV) and the training labels (.npy),
# then preview the first rows of the training table.
train_dat, test_dat = (
    pd.read_csv(csv_path)
    for csv_path in ('features/zeros_ones_train.csv',
                     'features/zeros_ones_test.csv')
)
train_labels = np.load('features/train_labels.npy')
train_dat.head()

Unnamed: 0,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,CoverageField2A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Month,Year,Weekday
0,1,23,0.9403,0.0006,7,1.02,0,17,23,17,...,8,-1,18,-1,10,1,0,8,2013,4
1,5,7,1.0006,0.004,4,1.2433,0,6,8,6,...,11,-1,17,-1,20,1,2,4,2014,1
2,5,7,1.0006,0.004,4,1.2433,0,7,12,7,...,21,-1,11,-1,8,1,2,8,2014,0
3,6,10,0.9769,0.0004,1,1.2665,0,3,2,3,...,10,-1,9,-1,21,1,3,4,2013,0
4,4,23,0.9472,0.0006,3,1.3045,0,8,13,8,...,10,-1,11,-1,12,1,1,1,2014,5


In [None]:
# Baseline: a scikit-learn BaggingClassifier wrapped around an XGBoost
# classifier, fitted on the first half of the training rows.
N = int(0.5 * len(train_labels))

base = xgb.XGBClassifier(nthread=4)
clf = ensemble.BaggingClassifier(
    base,
    n_estimators=10,
    max_samples=1.0,
    max_features=1.0,
)
clf.fit(train_dat.iloc[:N], train_labels[:N])

In [None]:
# Score the fitted classifier on every row from index N onward.
# NOTE(review): N is assigned in more than one cell of this notebook (with
# different fractions), so the split used here depends on which cell ran last.
holdout_features = train_dat.iloc[N:]
holdout_labels = train_labels[N:]
clf.score(holdout_features, holdout_labels)

# Bagging

In [5]:
# Manual bagging of XGBoost boosters.
#
# The first 90% of the training rows are used for fitting and the last 10% are
# held out as an evaluation set. Each bag is trained on a random subset of the
# fitting rows (each row kept with probability `bag_ratio`), saved to disk and
# appended to `models`.
bags = 50
bag_ratio = 0.85
models = []
N = int(0.9*len(train_dat))

# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
xg_test = xgb.DMatrix(train_dat.iloc[N:].values, label=train_labels[N:])

X = train_dat.iloc[:N]
Y = train_labels[:N]

param = {   'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'nthread':4,
            'max_depth':6,
            'colsample_bytree':0.77,
            'subsample':0.85,
            'min_child_weight':4,
            'eta':0.02,
        }
num_round = 1000

# Saved files are numbered starting at `model_offset` rather than 0.
# NOTE(review): presumably this avoids overwriting models saved by an earlier
# run -- confirm before changing.
model_dir = 'bag_models'
model_offset = 49
# Create the output directory up front so save_model() cannot fail on a
# missing folder after a full (and slow) training run.
os.makedirs(model_dir, exist_ok=True)

for i in range(bags):
    print('\rBag {} of {}'.format(i+1,bags),end='')

    # Draw the bag: one uniform number per row, keep rows below `bag_ratio`.
    # The mask is computed once so features and labels stay aligned.
    r = np.random.rand(X.shape[0])
    in_bag = r < bag_ratio
    X_bag = X[in_bag]
    Y_bag = Y[in_bag]
    xg_train = xgb.DMatrix(X_bag.values, label=Y_bag)

    # The held-out set is listed last: per the XGBoost documentation the last
    # entry of the evaluation list is the one used for early stopping.
    watchlist = [(xg_train,'train'),(xg_test,'eval')]
    model = xgb.train(param, xg_train, num_round, watchlist,
                      early_stopping_rounds=100)
    model.save_model('{}/model{}'.format(model_dir, i + model_offset))
    models.append(model)
    # Drop the per-bag DMatrix before building the next one.
    del xg_train

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.905343	train-auc:0.903531
[1]	eval-auc:0.935870	train-auc:0.934311
[2]	eval-auc:0.937232	train-auc:0.935070
[3]	eval-auc:0.939361	train-auc:0.936913
[4]	eval-auc:0.940755	train-auc:0.938322
[5]	eval-auc:0.941774	train-auc:0.939605
[6]	eval-auc:0.941790	train-auc:0.939567
[7]	eval-auc:0.942111	train-auc:0.940042
[8]	eval-auc:0.942891	train-auc:0.940879
[9]	eval-auc:0.943295	train-auc:0.941058
[10]	eval-auc:0.943547	train-auc:0.941161
[11]	eval-auc:0.945002	train-auc:0.942518
[12]	eval-auc:0.945938	train-auc:0.943730
[13]	eval-auc:0.945838	train-auc:0.943702
[14]	eval-auc:0.945830	train-auc:0.943718
[15]	eval-auc:0.946022	train-auc:0.943961
[16]	eval-auc:0.946070	train-auc:0.943945
[17]	eval-auc:0.945991	train-auc:0.943832
[18]	eval-auc:0.946135	train-auc:0.943979
[19]	eval-auc:0.946338	train-auc:0.944152
[20]	eval-auc:0.946342	train-auc:0.944192
[21]	eval-auc:0.946562	train-auc:0.944345
[22]	eval-auc:0.946510	tr

Bag 1 of 50Bag 2 of 50

[999]	eval-auc:0.967381	train-auc:0.976473
Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.921657	train-auc:0.917875
[1]	eval-auc:0.933326	train-auc:0.931086
[2]	eval-auc:0.938252	train-auc:0.935874
[3]	eval-auc:0.942612	train-auc:0.940130
[4]	eval-auc:0.942561	train-auc:0.940059
[5]	eval-auc:0.943574	train-auc:0.941102
[6]	eval-auc:0.945107	train-auc:0.942662
[7]	eval-auc:0.946806	train-auc:0.944325
[8]	eval-auc:0.946108	train-auc:0.943730
[9]	eval-auc:0.945370	train-auc:0.943097
[10]	eval-auc:0.945098	train-auc:0.942944
[11]	eval-auc:0.944896	train-auc:0.942748
[12]	eval-auc:0.945553	train-auc:0.943355
[13]	eval-auc:0.945888	train-auc:0.943552
[14]	eval-auc:0.945767	train-auc:0.943358
[15]	eval-auc:0.946341	train-auc:0.943970
[16]	eval-auc:0.946207	train-auc:0.943878
[17]	eval-auc:0.946036	train-auc:0.943740
[18]	eval-auc:0.946109	train-auc:0.943779
[19]	eval-auc:0.946334	train-auc:0.943968
[20]	eval-auc:0.946889	train-auc:0.944421
[21]	eval-auc:0.947274	t

Bag 3 of 50

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.906400	train-auc:0.903903
[1]	eval-auc:0.939062	train-auc:0.936751
[2]	eval-auc:0.942419	train-auc:0.940230
[3]	eval-auc:0.938614	train-auc:0.935987
[4]	eval-auc:0.944659	train-auc:0.942318
[5]	eval-auc:0.943356	train-auc:0.941236
[6]	eval-auc:0.940752	train-auc:0.938679
[7]	eval-auc:0.943206	train-auc:0.941381
[8]	eval-auc:0.942528	train-auc:0.940253
[9]	eval-auc:0.940895	train-auc:0.938561
[10]	eval-auc:0.940205	train-auc:0.937880
[11]	eval-auc:0.942803	train-auc:0.940616
[12]	eval-auc:0.942513	train-auc:0.940146
[13]	eval-auc:0.944193	train-auc:0.941849
[14]	eval-auc:0.944612	train-auc:0.942376
[15]	eval-auc:0.945789	train-auc:0.943523
[16]	eval-auc:0.946087	train-auc:0.943889
[17]	eval-auc:0.946256	train-auc:0.944142
[18]	eval-auc:0.946594	train-auc:0.944521
[19]	eval-auc:0.946812	train-auc:0.944724
[20]	eval-auc:0.947079	train-auc:0.944940
[21]	eval-auc:0.946905	train-auc:0.944763
[22]	eval-auc:0.947001	tr

Bag 4 of 50

[999]	eval-auc:0.967449	train-auc:0.976314
Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.929092	train-auc:0.925985
[1]	eval-auc:0.933091	train-auc:0.930060
[2]	eval-auc:0.933405	train-auc:0.930667
[3]	eval-auc:0.939533	train-auc:0.937298
[4]	eval-auc:0.941318	train-auc:0.939180
[5]	eval-auc:0.941283	train-auc:0.939319
[6]	eval-auc:0.942738	train-auc:0.940334
[7]	eval-auc:0.944638	train-auc:0.942253
[8]	eval-auc:0.944798	train-auc:0.942393
[9]	eval-auc:0.944458	train-auc:0.942497
[10]	eval-auc:0.944311	train-auc:0.942357
[11]	eval-auc:0.945318	train-auc:0.943378
[12]	eval-auc:0.944909	train-auc:0.943037
[13]	eval-auc:0.944594	train-auc:0.942698
[14]	eval-auc:0.944458	train-auc:0.942742
[15]	eval-auc:0.944738	train-auc:0.943017
[16]	eval-auc:0.945239	train-auc:0.943627
[17]	eval-auc:0.945454	train-auc:0.943878
[18]	eval-auc:0.945598	train-auc:0.944012
[19]	eval-auc:0.945629	train-auc:0.944096
[20]	eval-auc:0.945708	train-auc:0.943990
[21]	eval-auc:0.945500	t

Bag 5 of 50

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.932305	train-auc:0.929568
[1]	eval-auc:0.939863	train-auc:0.937276
[2]	eval-auc:0.941827	train-auc:0.939240
[3]	eval-auc:0.946091	train-auc:0.943406
[4]	eval-auc:0.945847	train-auc:0.943232
[5]	eval-auc:0.945041	train-auc:0.942475
[6]	eval-auc:0.946044	train-auc:0.943390
[7]	eval-auc:0.946810	train-auc:0.944497
[8]	eval-auc:0.946536	train-auc:0.944369
[9]	eval-auc:0.946138	train-auc:0.944006
[10]	eval-auc:0.946175	train-auc:0.944084
[11]	eval-auc:0.945802	train-auc:0.943674
[12]	eval-auc:0.946276	train-auc:0.944022
[13]	eval-auc:0.946037	train-auc:0.943807
[14]	eval-auc:0.945837	train-auc:0.943713
[15]	eval-auc:0.945663	train-auc:0.943533
[16]	eval-auc:0.945791	train-auc:0.943674
[17]	eval-auc:0.945605	train-auc:0.943418
[18]	eval-auc:0.945973	train-auc:0.943869
[19]	eval-auc:0.946083	train-auc:0.944107
[20]	eval-auc:0.946194	train-auc:0.944235
[21]	eval-auc:0.946094	train-auc:0.944124
[22]	eval-auc:0.946470	tr

Bag 6 of 50

[999]	eval-auc:0.967497	train-auc:0.976463
Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.925698	train-auc:0.923400
[1]	eval-auc:0.941735	train-auc:0.940440
[2]	eval-auc:0.943606	train-auc:0.941813
[3]	eval-auc:0.942916	train-auc:0.941482
[4]	eval-auc:0.942180	train-auc:0.940560
[5]	eval-auc:0.942473	train-auc:0.940905
[6]	eval-auc:0.942793	train-auc:0.940968
[7]	eval-auc:0.943778	train-auc:0.941762
[8]	eval-auc:0.945593	train-auc:0.943651
[9]	eval-auc:0.945261	train-auc:0.943292
[10]	eval-auc:0.945275	train-auc:0.943312
[11]	eval-auc:0.945302	train-auc:0.943414
[12]	eval-auc:0.945089	train-auc:0.943168
[13]	eval-auc:0.945040	train-auc:0.943142
[14]	eval-auc:0.945305	train-auc:0.943436
[15]	eval-auc:0.945556	train-auc:0.943577
[16]	eval-auc:0.945726	train-auc:0.943794
[17]	eval-auc:0.945682	train-auc:0.943695
[18]	eval-auc:0.945572	train-auc:0.943622
[19]	eval-auc:0.945573	train-auc:0.943615
[20]	eval-auc:0.945618	train-auc:0.943677
[21]	eval-auc:0.945669	t

Bag 7 of 50

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.904085	train-auc:0.901472
[1]	eval-auc:0.937956	train-auc:0.935552
[2]	eval-auc:0.941109	train-auc:0.938289
[3]	eval-auc:0.941309	train-auc:0.938677
[4]	eval-auc:0.941516	train-auc:0.938781
[5]	eval-auc:0.941450	train-auc:0.938585
[6]	eval-auc:0.941088	train-auc:0.938188
[7]	eval-auc:0.940893	train-auc:0.937946
[8]	eval-auc:0.941863	train-auc:0.939080
[9]	eval-auc:0.941624	train-auc:0.938904
[10]	eval-auc:0.942614	train-auc:0.940135
[11]	eval-auc:0.943286	train-auc:0.940845
[12]	eval-auc:0.943781	train-auc:0.941328
[13]	eval-auc:0.944073	train-auc:0.941610
[14]	eval-auc:0.943960	train-auc:0.941493
[15]	eval-auc:0.944286	train-auc:0.941782
[16]	eval-auc:0.944419	train-auc:0.941963
[17]	eval-auc:0.944552	train-auc:0.942050
[18]	eval-auc:0.944723	train-auc:0.942150
[19]	eval-auc:0.944593	train-auc:0.942044
[20]	eval-auc:0.945546	train-auc:0.943016
[21]	eval-auc:0.945449	train-auc:0.942953
[22]	eval-auc:0.945778	tr

Bag 8 of 50

[999]	eval-auc:0.967493	train-auc:0.976618
Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.932110	train-auc:0.929681
[1]	eval-auc:0.934380	train-auc:0.932039
[2]	eval-auc:0.934375	train-auc:0.932440
[3]	eval-auc:0.938153	train-auc:0.936365
[4]	eval-auc:0.939393	train-auc:0.937606
[5]	eval-auc:0.939630	train-auc:0.937748
[6]	eval-auc:0.939213	train-auc:0.937341
[7]	eval-auc:0.939019	train-auc:0.937277
[8]	eval-auc:0.939962	train-auc:0.938048
[9]	eval-auc:0.941688	train-auc:0.939982
[10]	eval-auc:0.942622	train-auc:0.940890
[11]	eval-auc:0.942346	train-auc:0.940508
[12]	eval-auc:0.941989	train-auc:0.940215
[13]	eval-auc:0.942961	train-auc:0.941201
[14]	eval-auc:0.943760	train-auc:0.942303
[15]	eval-auc:0.943911	train-auc:0.942437
[16]	eval-auc:0.943758	train-auc:0.942275
[17]	eval-auc:0.944102	train-auc:0.942533
[18]	eval-auc:0.944325	train-auc:0.942661
[19]	eval-auc:0.944442	train-auc:0.942715
[20]	eval-auc:0.944945	train-auc:0.943052
[21]	eval-auc:0.944729	t

Bag 9 of 50

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.903253	train-auc:0.901978
[1]	eval-auc:0.937742	train-auc:0.936109
[2]	eval-auc:0.938861	train-auc:0.937485
[3]	eval-auc:0.940730	train-auc:0.938725
[4]	eval-auc:0.942011	train-auc:0.939829
[5]	eval-auc:0.944057	train-auc:0.942084
[6]	eval-auc:0.943856	train-auc:0.941899
[7]	eval-auc:0.943515	train-auc:0.941608
[8]	eval-auc:0.943568	train-auc:0.941809
[9]	eval-auc:0.944783	train-auc:0.943173
[10]	eval-auc:0.944615	train-auc:0.942926
[11]	eval-auc:0.944467	train-auc:0.942804
[12]	eval-auc:0.944271	train-auc:0.942508
[13]	eval-auc:0.944426	train-auc:0.942590
[14]	eval-auc:0.944296	train-auc:0.942397
[15]	eval-auc:0.944522	train-auc:0.942643
[16]	eval-auc:0.944657	train-auc:0.942787
[17]	eval-auc:0.944579	train-auc:0.942676
[18]	eval-auc:0.944475	train-auc:0.942532
[19]	eval-auc:0.944435	train-auc:0.942516
[20]	eval-auc:0.944853	train-auc:0.943176
[21]	eval-auc:0.945586	train-auc:0.943876
[22]	eval-auc:0.945980	tr

Bag 10 of 50

[999]	eval-auc:0.967400	train-auc:0.976725
Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.932693	train-auc:0.929903
[1]	eval-auc:0.938242	train-auc:0.934726
[2]	eval-auc:0.942234	train-auc:0.939920
[3]	eval-auc:0.942758	train-auc:0.940442
[4]	eval-auc:0.944673	train-auc:0.942490
[5]	eval-auc:0.944199	train-auc:0.941939
[6]	eval-auc:0.943853	train-auc:0.941525
[7]	eval-auc:0.943714	train-auc:0.941521
[8]	eval-auc:0.944136	train-auc:0.941936
[9]	eval-auc:0.944038	train-auc:0.941857
[10]	eval-auc:0.943831	train-auc:0.941558
[11]	eval-auc:0.943851	train-auc:0.941567
[12]	eval-auc:0.944010	train-auc:0.941785
[13]	eval-auc:0.944114	train-auc:0.941903
[14]	eval-auc:0.944271	train-auc:0.942057
[15]	eval-auc:0.944493	train-auc:0.942192
[16]	eval-auc:0.944585	train-auc:0.942327
[17]	eval-auc:0.944541	train-auc:0.942363
[18]	eval-auc:0.944922	train-auc:0.942801
[19]	eval-auc:0.946189	train-auc:0.943937
[20]	eval-auc:0.946529	train-auc:0.944339
[21]	eval-auc:0.946593	t

Bag 11 of 50

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.926290	train-auc:0.922500
[1]	eval-auc:0.931079	train-auc:0.927859
[2]	eval-auc:0.933989	train-auc:0.930670
[3]	eval-auc:0.938283	train-auc:0.934835
[4]	eval-auc:0.938544	train-auc:0.936058
[5]	eval-auc:0.938701	train-auc:0.936280
[6]	eval-auc:0.938305	train-auc:0.935797
[7]	eval-auc:0.941718	train-auc:0.938993
[8]	eval-auc:0.942987	train-auc:0.940390
[9]	eval-auc:0.943360	train-auc:0.940695
[10]	eval-auc:0.943035	train-auc:0.940381
[11]	eval-auc:0.944789	train-auc:0.942472
[12]	eval-auc:0.944558	train-auc:0.942207
[13]	eval-auc:0.944625	train-auc:0.942337
[14]	eval-auc:0.944841	train-auc:0.942521
[15]	eval-auc:0.944799	train-auc:0.942481
[16]	eval-auc:0.945671	train-auc:0.943202
[17]	eval-auc:0.945591	train-auc:0.943161
[18]	eval-auc:0.945771	train-auc:0.943328
[19]	eval-auc:0.945832	train-auc:0.943410
[20]	eval-auc:0.945826	train-auc:0.943441
[21]	eval-auc:0.945611	train-auc:0.943333
[22]	eval-auc:0.946358	tr

Bag 12 of 50

[999]	eval-auc:0.967332	train-auc:0.976627
Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.905458	train-auc:0.903730
[1]	eval-auc:0.936567	train-auc:0.935668
[2]	eval-auc:0.938473	train-auc:0.937294
[3]	eval-auc:0.940495	train-auc:0.939310
[4]	eval-auc:0.942668	train-auc:0.940848
[5]	eval-auc:0.943440	train-auc:0.941578
[6]	eval-auc:0.943076	train-auc:0.941198
[7]	eval-auc:0.942838	train-auc:0.940936
[8]	eval-auc:0.942643	train-auc:0.940743
[9]	eval-auc:0.942388	train-auc:0.940500
[10]	eval-auc:0.942908	train-auc:0.940904
[11]	eval-auc:0.942781	train-auc:0.940766
[12]	eval-auc:0.943216	train-auc:0.941153
[13]	eval-auc:0.943178	train-auc:0.941164
[14]	eval-auc:0.942900	train-auc:0.940903
[15]	eval-auc:0.942496	train-auc:0.940672
[16]	eval-auc:0.943049	train-auc:0.941245
[17]	eval-auc:0.943419	train-auc:0.941560
[18]	eval-auc:0.944493	train-auc:0.942450
[19]	eval-auc:0.944532	train-auc:0.942495
[20]	eval-auc:0.945127	train-auc:0.943195
[21]	eval-auc:0.945334	t

Bag 13 of 50

Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.928726	train-auc:0.925819
[1]	eval-auc:0.933569	train-auc:0.930917
[2]	eval-auc:0.933355	train-auc:0.930719
[3]	eval-auc:0.933107	train-auc:0.930525
[4]	eval-auc:0.936123	train-auc:0.934305
[5]	eval-auc:0.936172	train-auc:0.934303
[6]	eval-auc:0.936126	train-auc:0.934288
[7]	eval-auc:0.941350	train-auc:0.938673
[8]	eval-auc:0.941444	train-auc:0.938948
[9]	eval-auc:0.941347	train-auc:0.938788
[10]	eval-auc:0.941293	train-auc:0.938745
[11]	eval-auc:0.941908	train-auc:0.939473
[12]	eval-auc:0.941705	train-auc:0.939333
[13]	eval-auc:0.942544	train-auc:0.940114
[14]	eval-auc:0.942697	train-auc:0.940260
[15]	eval-auc:0.942345	train-auc:0.940089
[16]	eval-auc:0.942833	train-auc:0.940508
[17]	eval-auc:0.943274	train-auc:0.941067
[18]	eval-auc:0.943331	train-auc:0.941115
[19]	eval-auc:0.943509	train-auc:0.941278
[20]	eval-auc:0.943786	train-auc:0.941450
[21]	eval-auc:0.943699	train-auc:0.941321
[22]	eval-auc:0.943949	tr

Bag 14 of 50

[999]	eval-auc:0.967511	train-auc:0.976718
Will train until train error hasn't decreased in 100 rounds.
[0]	eval-auc:0.929023	train-auc:0.925561
[1]	eval-auc:0.929106	train-auc:0.925733
[2]	eval-auc:0.937082	train-auc:0.934610
[3]	eval-auc:0.938382	train-auc:0.936245
[4]	eval-auc:0.939616	train-auc:0.937236
[5]	eval-auc:0.939507	train-auc:0.937448
[6]	eval-auc:0.938996	train-auc:0.936682
[7]	eval-auc:0.939403	train-auc:0.936956


KeyboardInterrupt: 

Helpful for bagging xgb models: https://github.com/andyh47/higgs

- Model0 is reaching 0.9683+ after 1750 rounds. Should verify this model's score on the LB
- Model0 peaked at 0.968262 after round 1999 (didn't hit early stop of 100)
- Model1 peaks around 0.9684, then drops on the test data; it may be overfitting the training set
- Model2 peaks around 0.9682
- Model3 peaks around 0.9682
- Model4 peaks around 0.9682

In [6]:
def predict(models, data):
    """Average the predictions of several models over the same data.

    Parameters
    ----------
    models : sequence
        Trained models. Each must expose ``predict(data)`` returning one
        value per row of ``data``.
    data : object
        Data container handed unchanged to every model. It must expose
        ``num_row()`` (e.g. an ``xgb.DMatrix``).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``data.num_row()`` holding, for each row, the
        mean of the individual model predictions.

    Raises
    ------
    ValueError
        If ``models`` is empty. Previously this silently produced an
        all-NaN result.
    """
    if len(models) == 0:
        raise ValueError('predict() needs at least one model')

    # One column per model; NaN-filled so an unwritten column would be visible.
    predicted = np.full((data.num_row(), len(models)), np.nan)
    for i, m in enumerate(models):
        print('\rPredicting {} of {}'.format(i+1,len(models)),end='')
        predicted[:, i] = m.predict(data)
    # Row-wise mean across models.
    return predicted.mean(axis=1)

In [7]:
# Ensemble predictions for the fitting split (X, Y) and the held-out split.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
xg_train = xgb.DMatrix(X.values, label=Y)
p_train = predict(models, xg_train)
p_test = predict(models, xg_test)

Predicting 13 of 13

In [8]:
# Accuracy of the averaged predictions after rounding them to 0/1:
# the fraction of rows whose rounded prediction equals the label.
train_acc = np.mean(np.round(p_train) == Y)
test_acc = np.mean(np.round(p_test) == train_labels[N:])

print(len(models))
print(train_acc)
print(test_acc)

13
0.937271228114
0.927749654855


29 models at ~0.968 each: 0.96746 on the LB

In [9]:
# Predict the test set with the whole bag and write the submission file.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
xg_pred = xgb.DMatrix(test_dat.values)

pred = predict(models, xg_pred)
sample = pd.read_csv('sample_submission.csv')
# Item assignment always writes a column; attribute-style assignment would
# only set a Python attribute if the column were missing from the CSV.
sample['QuoteConversion_Flag'] = pred
sample.to_csv('bagging_submission03.csv', index=False)

Predicting 13 of 13

In [None]:
# Predict the test set with a single booster (index 1 of `models`) and write
# it as its own submission file.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
xg_pred = xgb.DMatrix(test_dat.values)

pred = models[1].predict(xg_pred)
sample = pd.read_csv('sample_submission.csv')
# Item assignment always writes a column; attribute-style assignment would
# only set a Python attribute if the column were missing from the CSV.
sample['QuoteConversion_Flag'] = pred
sample.to_csv('ensemble_model1.csv', index=False)

In [None]:
# (No code in this cell: it previously held a stray `j`, a name not defined in the visible cells, which would raise NameError on "Run All".)

In [None]:
# Walk the directory tree under /mydir and print the full path of every
# file whose name ends in ".txt".
for folder, _subdirs, filenames in os.walk("/mydir"):
    txt_names = [name for name in filenames if name.endswith(".txt")]
    for name in txt_names:
        print(os.path.join(folder, name))