In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
from sklearn.preprocessing import OneHotEncoder

In [3]:
def reduce_dimen(dataset,column,toreplace):
    for index,i in dataset[column].duplicated(keep=False).iteritems():
        if i==False:
            dataset.set_value(index,column,toreplace)
    return dataset

In [4]:
def act_data_treatment(dsname):
    dataset = dsname
    
    for col in list(dataset.columns):
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if dataset[col].dtype == 'object':
                dataset[col].fillna('type 0', inplace=True)
                dataset[col] = dataset[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif dataset[col].dtype == 'bool':
                dataset[col] = dataset[col].astype(np.int8)
    
    dataset['year'] = dataset['date'].dt.year
    dataset['month'] = dataset['date'].dt.month
    dataset['day'] = dataset['date'].dt.day
    dataset['isweekend'] = (dataset['date'].dt.weekday >= 5).astype(int)
    dataset = dataset.drop('date', axis = 1)
    
    return dataset

In [5]:
act_train_data = pd.read_csv("data/act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("data/act_test.csv", dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("data/people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])

In [6]:
act_train_data = act_train_data.drop('char_10', axis = 1)
act_test_data=act_test_data.drop('char_10',axis=1)

In [7]:
print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

Train data shape: (2197291, 14)
Test data shape: (498687, 13)
People data shape: (189118, 41)


In [8]:
act_train_data  = act_data_treatment(act_train_data)
act_test_data   = act_data_treatment(act_test_data)
people_data = act_data_treatment(people_data)

In [9]:
train = act_train_data.merge(people_data, on='people_id', how='left', left_index=True)
test  = act_test_data.merge(people_data, on='people_id', how='left', left_index=True)

In [10]:
train.columns

Index([u'people_id', u'activity_id', u'activity_category', u'char_1_x',
       u'char_2_x', u'char_3_x', u'char_4_x', u'char_5_x', u'char_6_x',
       u'char_7_x', u'char_8_x', u'char_9_x', u'outcome', u'year_x',
       u'month_x', u'day_x', u'isweekend_x', u'char_1_y', u'group_1',
       u'char_2_y', u'char_3_y', u'char_4_y', u'char_5_y', u'char_6_y',
       u'char_7_y', u'char_8_y', u'char_9_y', u'char_10', u'char_11',
       u'char_12', u'char_13', u'char_14', u'char_15', u'char_16', u'char_17',
       u'char_18', u'char_19', u'char_20', u'char_21', u'char_22', u'char_23',
       u'char_24', u'char_25', u'char_26', u'char_27', u'char_28', u'char_29',
       u'char_30', u'char_31', u'char_32', u'char_33', u'char_34', u'char_35',
       u'char_36', u'char_37', u'char_38', u'year_y', u'month_y', u'day_y',
       u'isweekend_y'],
      dtype='object')

In [11]:
act_test_data.columns

Index([u'people_id', u'activity_id', u'activity_category', u'char_1',
       u'char_2', u'char_3', u'char_4', u'char_5', u'char_6', u'char_7',
       u'char_8', u'char_9', u'year', u'month', u'day', u'isweekend'],
      dtype='object')

In [12]:
del act_train_data
del act_test_data
del people_data

In [13]:
train=train.sort_values(['people_id'], ascending=[1])
test=test.sort_values(['people_id'], ascending=[1])

In [19]:
train.head()

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_y,month_y,day_y,isweekend_y
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0


In [14]:
train_columns = train.columns.values
test_columns = test.columns.values
features = list(set(train_columns) & set(test_columns))

In [15]:
train.fillna('NA', inplace=True)
test.fillna('NA', inplace=True)

In [16]:
y = train.outcome
train=train.drop('outcome',axis=1)

In [17]:
whole=pd.concat([train,test],ignore_index=True)

In [21]:
whole.head()

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_y,month_y,day_y,isweekend_y
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
1,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
2,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
3,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
4,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0


In [22]:
whole=pd.concat([train,test],ignore_index=True)
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
for category in categorical:
    whole=reduce_dimen(whole,category,9999999)
    
X=whole[:len(train)]
X_test=whole[len(train):]

In [23]:
whole.head()

Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_y,month_y,day_y,isweekend_y
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
1,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
2,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
3,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
4,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0


In [18]:
del train
del whole

In [19]:
X=X.sort_values(['people_id'], ascending=[1])

In [20]:
X = X[features].drop(['people_id', 'activity_id'], axis = 1)
X_test = X_test[features].drop(['people_id', 'activity_id'], axis = 1)

In [21]:
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
not_categorical=[]
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)

In [22]:
enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(pd.concat([X[categorical],X_test[categorical]]))

In [23]:
enc

OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown='ignore', n_values='auto', sparse=True)

In [24]:
X_cat_sparse=enc.transform(X[categorical])
X_test_cat_sparse=enc.transform(X_test[categorical])

In [25]:
from scipy.sparse import hstack
X_sparse=hstack((X[not_categorical], X_cat_sparse))
X_test_sparse=hstack((X_test[not_categorical], X_test_cat_sparse))

In [26]:
print("Training data: " + format(X_sparse.shape))
print("Test data: " + format(X_test_sparse.shape))
print("###########")
print("One Hot enconded Test Dataset Script")

Training data: (2197291, 31271)
Test data: (498687, 31271)
###########
One Hot enconded Test Dataset Script


In [27]:
dtrain = xgb.DMatrix(X_sparse,label=y)
dtest = xgb.DMatrix(X_test_sparse)

In [28]:
param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.7
param['min_child_weight'] = 0
param['booster'] = "gblinear"

In [29]:
watchlist  = [(dtrain,'train')]
num_round = 300
early_stopping_rounds=10
bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

[0]	train-auc:0.887099
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.895149
[2]	train-auc:0.903351
[3]	train-auc:0.911834
[4]	train-auc:0.920202
[5]	train-auc:0.928199
[6]	train-auc:0.935564
[7]	train-auc:0.942332
[8]	train-auc:0.948622
[9]	train-auc:0.954424
[10]	train-auc:0.959765
[11]	train-auc:0.964583
[12]	train-auc:0.968846
[13]	train-auc:0.972557
[14]	train-auc:0.975743
[15]	train-auc:0.978457
[16]	train-auc:0.980763
[17]	train-auc:0.98268
[18]	train-auc:0.984286
[19]	train-auc:0.985632
[20]	train-auc:0.98676
[21]	train-auc:0.987711
[22]	train-auc:0.98852
[23]	train-auc:0.989213
[24]	train-auc:0.989816
[25]	train-auc:0.990348
[26]	train-auc:0.990819
[27]	train-auc:0.99124
[28]	train-auc:0.991622
[29]	train-auc:0.991968
[30]	train-auc:0.992282
[31]	train-auc:0.992571
[32]	train-auc:0.992835
[33]	train-auc:0.993078
[34]	train-auc:0.993303
[35]	train-auc:0.993511
[36]	train-auc:0.993703
[37]	train-auc:0.993884
[38]	train-auc:0.994051
[39]	train-auc:0.9942

In [30]:
ypred = bst.predict(dtest)
output = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred })
output.head()
output.to_csv('without_leak.csv', index = False)

In [31]:
output.head()

Unnamed: 0,activity_id,outcome
3,act1_249281,0.001523
3,act2_230855,0.001628
5,act2_688604,0.999501
5,act2_659237,0.999497
5,act2_649143,0.99949
