In [1]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

In [2]:
def reduce_dimen(dataset,column,toreplace):
    for index,i in dataset[column].duplicated(keep=False).iteritems():
        if i==False:
            dataset.set_value(index,column,toreplace)
    return dataset
    
def act_data_treatment(dsname):
    dataset = dsname
    
    for col in list(dataset.columns):
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if dataset[col].dtype == 'object':
                dataset[col].fillna('type 0', inplace=True)
                dataset[col] = dataset[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif dataset[col].dtype == 'bool':
                dataset[col] = dataset[col].astype(np.int8)
    
    dataset['year'] = dataset['date'].dt.year
    dataset['month'] = dataset['date'].dt.month
    dataset['day'] = dataset['date'].dt.day
    dataset['isweekend'] = (dataset['date'].dt.weekday >= 5).astype(int)
    dataset = dataset.drop('date', axis = 1)
    
    return dataset

In [3]:
act_train_data = pd.read_csv("act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("act_test.csv", dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])

act_train_data=act_train_data.drop('char_10',axis=1)
act_test_data=act_test_data.drop('char_10',axis=1)

print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

act_train_data  = act_data_treatment(act_train_data)
act_test_data   = act_data_treatment(act_test_data)
people_data = act_data_treatment(people_data)

train = act_train_data.merge(people_data, on='people_id', how='left', left_index=True)
test  = act_test_data.merge(people_data, on='people_id', how='left', left_index=True)

del act_train_data
del act_test_data
del people_data

Train data shape: (2197291, 14)
Test data shape: (498687, 13)
People data shape: (189118, 41)


In [4]:
train=train.sort_values(['people_id'], ascending=[1])
test=test.sort_values(['people_id'], ascending=[1])

train_columns = train.columns.values
test_columns = test.columns.values
features = list(set(train_columns) & set(test_columns))

train.fillna('NA', inplace=True)
test.fillna('NA', inplace=True)

y = train.outcome
train=train.drop('outcome',axis=1)

whole=pd.concat([train,test],ignore_index=True)
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
for category in categorical:
    whole=reduce_dimen(whole,category,9999999)
    
X=whole[:len(train)]
X_test=whole[len(train):]

del train
del whole

In [5]:
X=X.sort_values(['people_id'], ascending=[1])

X = X[features].drop(['people_id', 'activity_id'], axis = 1)
X_test = X_test[features].drop(['people_id', 'activity_id'], axis = 1)

categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
not_categorical=[]
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)

enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(pd.concat([X[categorical],X_test[categorical]]))
X_cat_sparse=enc.transform(X[categorical])
X_test_cat_sparse=enc.transform(X_test[categorical])

from scipy.sparse import hstack
X_sparse=hstack((X[not_categorical], X_cat_sparse))
X_test_sparse=hstack((X_test[not_categorical], X_test_cat_sparse))

print("Training data: " + format(X_sparse.shape))
print("Test data: " + format(X_test_sparse.shape))
print("###########")
print("One Hot enconded Test Dataset Script")

Training data: (2197291, 31271)
Test data: (498687, 31271)
###########
One Hot enconded Test Dataset Script


In [7]:
dtrain = xgb.DMatrix(X_sparse,label=y)
dtest = xgb.DMatrix(X_test_sparse)

param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.7
param['min_child_weight'] = 0
param['booster'] = "gblinear"
param['seed'] = 12

watchlist  = [(dtrain,'train')]
num_round = 300
early_stopping_rounds=10
bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds, verbose_eval=1)

Will train until train error hasn't decreased in 10 rounds.
[0]	train-auc:0.887189
[1]	train-auc:0.895166
[2]	train-auc:0.903316
[3]	train-auc:0.911758
[4]	train-auc:0.920093
[5]	train-auc:0.928073
[6]	train-auc:0.935426
[7]	train-auc:0.942196
[8]	train-auc:0.948470
[9]	train-auc:0.954289
[10]	train-auc:0.959628
[11]	train-auc:0.964457
[12]	train-auc:0.968740
[13]	train-auc:0.972471
[14]	train-auc:0.975681
[15]	train-auc:0.978409
[16]	train-auc:0.980709
[17]	train-auc:0.982641
[18]	train-auc:0.984256
[19]	train-auc:0.985606
[20]	train-auc:0.986739
[21]	train-auc:0.987693
[22]	train-auc:0.988503
[23]	train-auc:0.989200
[24]	train-auc:0.989805
[25]	train-auc:0.990337
[26]	train-auc:0.990810
[27]	train-auc:0.991233
[28]	train-auc:0.991615
[29]	train-auc:0.991961
[30]	train-auc:0.992276
[31]	train-auc:0.992565
[32]	train-auc:0.992830
[33]	train-auc:0.993074
[34]	train-auc:0.993299
[35]	train-auc:0.993507
[36]	train-auc:0.993701
[37]	train-auc:0.993881
[38]	train-auc:0.994048
[39]	train-auc

In [57]:
param['seed'] = 102
bst1 = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

Will train until train error hasn't decreased in 10 rounds.
[0]	train-auc:0.887108
[1]	train-auc:0.895113
[2]	train-auc:0.903312
[3]	train-auc:0.911787
[4]	train-auc:0.920134
[5]	train-auc:0.928104
[6]	train-auc:0.935459
[7]	train-auc:0.942222
[8]	train-auc:0.948490
[9]	train-auc:0.954308
[10]	train-auc:0.959647
[11]	train-auc:0.964475
[12]	train-auc:0.968757
[13]	train-auc:0.972487
[14]	train-auc:0.975696
[15]	train-auc:0.978421
[16]	train-auc:0.980720
[17]	train-auc:0.982651
[18]	train-auc:0.984264
[19]	train-auc:0.985614
[20]	train-auc:0.986745
[21]	train-auc:0.987698
[22]	train-auc:0.988508
[23]	train-auc:0.989204
[24]	train-auc:0.989809
[25]	train-auc:0.990340
[26]	train-auc:0.990813
[27]	train-auc:0.991236
[28]	train-auc:0.991617
[29]	train-auc:0.991963
[30]	train-auc:0.992278
[31]	train-auc:0.992567
[32]	train-auc:0.992832
[33]	train-auc:0.993076
[34]	train-auc:0.993301
[35]	train-auc:0.993509
[36]	train-auc:0.993702
[37]	train-auc:0.993882
[38]	train-auc:0.994050
[39]	train-auc

In [58]:
param['seed'] = 42
bst2 = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

Will train until train error hasn't decreased in 10 rounds.
[0]	train-auc:0.887213
[1]	train-auc:0.895159
[2]	train-auc:0.903336
[3]	train-auc:0.911805
[4]	train-auc:0.920151
[5]	train-auc:0.928123
[6]	train-auc:0.935468
[7]	train-auc:0.942230
[8]	train-auc:0.948500
[9]	train-auc:0.954316
[10]	train-auc:0.959654
[11]	train-auc:0.964481
[12]	train-auc:0.968761
[13]	train-auc:0.972491
[14]	train-auc:0.975699
[15]	train-auc:0.978424
[16]	train-auc:0.980722
[17]	train-auc:0.982653
[18]	train-auc:0.984266
[19]	train-auc:0.985615
[20]	train-auc:0.986746
[21]	train-auc:0.987699
[22]	train-auc:0.988509
[23]	train-auc:0.989205
[24]	train-auc:0.989810
[25]	train-auc:0.990341
[26]	train-auc:0.990814
[27]	train-auc:0.991237
[28]	train-auc:0.991618
[29]	train-auc:0.991964
[30]	train-auc:0.992279
[31]	train-auc:0.992568
[32]	train-auc:0.992833
[33]	train-auc:0.993076
[34]	train-auc:0.993301
[35]	train-auc:0.993509
[36]	train-auc:0.993702
[37]	train-auc:0.993882
[38]	train-auc:0.994050
[39]	train-auc

In [59]:
ypred0 = bst.predict(dtest)
ypred1 = bst1.predict(dtest)
ypred2 = bst2.predict(dtest)

ypred = (ypred0 + ypred1 + ypred2)/3

output = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred })
output.head()
output.to_csv('without_leak.csv', index = False)

In [60]:
leak = pd.read_csv('Submission.csv')

In [61]:
output.sort_values(by=['activity_id']).head()

Unnamed: 0,activity_id,outcome
90322,act1_1,0.9985339
28834,act1_100006,2.029203e-06
136144,act1_100050,0.5123992
21012,act1_100065,8.192633e-07
43014,act1_100068,1.055876e-06


In [62]:
leak.sort_values(by=['activity_id']).head()

Unnamed: 0,activity_id,outcome
240682,act1_1,1.0
79698,act1_100006,0.0
358220,act1_100050,0.505659
59778,act1_100065,0.0
117803,act1_100068,0.0


In [63]:
total = ((output.sort_values(by=['activity_id']).outcome.values) + (leak.sort_values(by=['activity_id']).outcome.values))/2

In [64]:
result = pd.DataFrame({ 'activity_id' : test['activity_id'].sort_values(), 'outcome': total })
result.head()
result.to_csv('without_leak.csv', index = False)

In [68]:
good = pd.read_csv('good.csv')

In [69]:
good.head()

Unnamed: 0,activity_id,outcome
0,act1_1,1.0
1,act1_100006,0.0
2,act1_100050,0.842327
3,act1_100065,0.0
4,act1_100068,0.0


In [70]:
result.head()

Unnamed: 0,activity_id,outcome
90322,act1_1,1.0
28834,act1_100006,0.0
136144,act1_100050,0.509029
21012,act1_100065,0.0
43014,act1_100068,0.0


In [71]:
result['outcome'] = result['outcome'] * 0.1 + good['outcome'] * 0.9

TypeError: Could not operate 0.1 with block values can't multiply sequence by non-int of type 'float'

In [65]:
result.loc[result.outcome<0.01,'outcome'] = 0

In [66]:
result.loc[result.outcome>0.99,'outcome'] = 1

In [67]:
result.to_csv('without_leak.csv', index = False)