In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
def LeaveOneOut(data1, data2, columnName, useLOO=False):
    grpOutcomes = data1.groupby(columnName).mean().reset_index()
    outcomes = data2['outcome'].values
    x = pd.merge(data2[[columnName, 'outcome']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['outcome']
    if(useLOO):
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
    return x.fillna(x.mean())

In [3]:
directory = 'data/'
train = pd.read_csv(directory+'act_train.csv')
test = pd.read_csv(directory+'act_test.csv')
people = pd.read_csv(directory+'people.csv')

In [4]:
train = pd.merge(train, people,how='left',on='people_id',left_index=True)
train.fillna('-999', inplace=True)

In [17]:
features = train.columns.tolist()
features.remove('activity_id')
features.remove('people_id')
features

['date_x',
 'activity_category',
 'char_1_x',
 'char_2_x',
 'char_3_x',
 'char_4_x',
 'char_5_x',
 'char_6_x',
 'char_7_x',
 'char_8_x',
 'char_9_x',
 'char_10_x',
 'outcome',
 'char_1_y',
 'group_1',
 'char_2_y',
 'date_y',
 'char_3_y',
 'char_4_y',
 'char_5_y',
 'char_6_y',
 'char_7_y',
 'char_8_y',
 'char_9_y',
 'char_10_y',
 'char_11',
 'char_12',
 'char_13',
 'char_14',
 'char_15',
 'char_16',
 'char_17',
 'char_18',
 'char_19',
 'char_20',
 'char_21',
 'char_22',
 'char_23',
 'char_24',
 'char_25',
 'char_26',
 'char_27',
 'char_28',
 'char_29',
 'char_30',
 'char_31',
 'char_32',
 'char_33',
 'char_34',
 'char_35',
 'char_36',
 'char_37',
 'char_38']

In [18]:
lootrain = pd.DataFrame()
for col in features:
    if(col != 'outcome'):
        print(col)
        lootrain[col] = LeaveOneOut(train, train, col, True).values

date_x
activity_category
char_1_x
char_2_x
char_3_x
char_4_x
char_5_x
char_6_x
char_7_x
char_8_x
char_9_x
char_10_x
char_1_y
group_1
char_2_y
date_y
char_3_y
char_4_y
char_5_y
char_6_y
char_7_y
char_8_y
char_9_y
char_10_y
char_11
char_12
char_13
char_14
char_15
char_16
char_17
char_18
char_19
char_20
char_21
char_22
char_23
char_24
char_25
char_26
char_27
char_28
char_29
char_30
char_31
char_32
char_33
char_34
char_35
char_36
char_37
char_38


In [19]:
lr = LogisticRegression(C=100000.0)
lr.fit(lootrain, train['outcome'])
preds = lr.predict_proba(lootrain)[:, 1]
print('roc', roc_auc_score(train.outcome, preds))

('roc', 0.99703402430903532)


In [22]:
lootrain.head()

Unnamed: 0,date_x,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,0.443469,0.489206,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,...,0.402735,0.650801,0.652109,0.360108,0.388654,0.658206,0.65678,0.668789,0.356352,0.0
1,0.281099,0.510324,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,...,0.402735,0.650801,0.652109,0.360108,0.388654,0.658206,0.65678,0.668789,0.356352,0.0
2,0.281099,0.510324,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,...,0.402735,0.650801,0.652109,0.360108,0.388654,0.658206,0.65678,0.668789,0.356352,0.0
3,0.442657,0.510324,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,...,0.402735,0.650801,0.652109,0.360108,0.388654,0.658206,0.65678,0.668789,0.356352,0.0
4,0.443469,0.510324,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,0.446476,...,0.402735,0.650801,0.652109,0.360108,0.388654,0.658206,0.65678,0.668789,0.356352,0.0
