# Feature Engineering - Non-Purchase Events data

In [2]:
import pandas as pd
from datetime import datetime

In [3]:
# load data
events = pd.read_pickle("data/LeanPlum/events.pkl")
events.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,5558845121177764917,45,1542215484895,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,2201961907282901522,4,1543713091129,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,2201961907282901522,6,1543713093116,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [6]:
# load user_id to user_id_hash dictionary and map to user_id to save memory
users = pd.read_csv('data/user_dict.csv', header=None)
user_dict = {row[1][0]:int(row[1][1]) for row in users.iterrows()}
events['user_id'] = events['user_id_hash'].map(user_dict)
events = events.drop('user_id_hash', axis=1)
events.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id
0,5558845121177764917,45,1542215397132,0.0,554721
1,5558845121177764917,45,1542215484895,0.0,554721
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,554721
3,2201961907282901522,4,1543713091129,0.0,554721
4,2201961907282901522,6,1543713093116,0.0,554721


In [7]:
events.event.value_counts()

45                           75493931
1                             5363926
5                             4887922
6                             4068474
14                            3491117
4                             3435363
40                            2775997
7                             1970903
41                            1969223
3                             1828494
42                            1598212
.a5027911885258752             618037
.a5400102822346752             604699
44                             495590
.a5516611293544448             411053
0                              407118
63                             347198
47                             310038
57                             271268
8                              265034
55                             194341
.m5295687445250048             181875
.a5061295285075968              92645
50                              88641
9                               85832
.m6311698772393984              74149
11          

In [9]:
# load target labels
labels7 = pd.read_csv('labels7.csv').set_index('user_id')
events = events.set_index('user_id')
events = events.join(labels7)
events_full = events
events.head()

Unnamed: 0_level_0,session_id,event,event_timestamp,event_value,label
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4792293076973185176,.a5400102822346752,1540291785435,0.0,0
0,4792293076973185176,.a5027911885258752,1540291785435,0.0,0
0,4792293076973185176,44,1540291811064,0.0,0
0,4792293076973185176,5,1540291811886,0.0,0
0,4792293076973185176,45,1540291856328,0.0,0


In [27]:
# filter to remove data during timeframe that we are predicting
events_full['datetime'] = events_full['event_timestamp']\
                        .apply(lambda x:datetime.fromtimestamp(x/1000))
events = events_full[events_full.datetime < '2018-12-01']
events.head()

Unnamed: 0_level_0,session_id,event,event_timestamp,event_value,label,datetime
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4792293076973185176,.a5400102822346752,1540291785435,0.0,0,2018-10-23 03:49:45.435
0,4792293076973185176,.a5027911885258752,1540291785435,0.0,0,2018-10-23 03:49:45.435
0,4792293076973185176,44,1540291811064,0.0,0,2018-10-23 03:50:11.064
0,4792293076973185176,5,1540291811886,0.0,0,2018-10-23 03:50:11.886
0,4792293076973185176,45,1540291856328,0.0,0,2018-10-23 03:50:56.328


In [28]:
# get events from last 7 days and 14 days events
events7 = events[events.datetime >= '2018-11-24']
events14 = events[events.datetime >= '2018-11-17']

# same but for prediction (includes full timeframe)
events7_full = events_full[events_full.datetime >= '2018-12-08']
events14_full = events_full[events_full.datetime >= '2018-12-01']

## Compute event specific count features for training and testing

In [30]:
e3_7 = events7[events7.event == '3'].groupby('user_id').size()\
        .reset_index(name='e3_count_1_week').set_index('user_id')
e3_14 = events14[events14.event == '3'].groupby('user_id').size()\
        .reset_index(name='e3_count_2_week').set_index('user_id')
e4_7 = events7[events7.event == '4'].groupby('user_id').size()\
        .reset_index(name='e4_count_1_week').set_index('user_id')
e4_14 = events14[events14.event == '4'].groupby('user_id').size()\
        .reset_index(name='e4_count_2_week').set_index('user_id')
e5_7 = events7[events7.event == '5'].groupby('user_id').size()\
        .reset_index(name='e5_count_1_week').set_index('user_id')
e5_14 = events14[events14.event == '5'].groupby('user_id').size()\
        .reset_index(name='e5_count_2_week').set_index('user_id')
e6_7 = events7[events7.event == '6'].groupby('user_id').size()\
        .reset_index(name='e6_count_1_week').set_index('user_id')
e6_14 = events14[events14.event == '6'].groupby('user_id').size()\
        .reset_index(name='e6_count_2_week').set_index('user_id')
e7_7 = events7[events7.event == '7'].groupby('user_id').size()\
        .reset_index(name='e7_count_1_week').set_index('user_id')
e7_14 = events14[events14.event == '7'].groupby('user_id').size()\
        .reset_index(name='e7_count_2_week').set_index('user_id')
e40_7 = events7[events7.event == '40'].groupby('user_id').size()\
        .reset_index(name='e40_count_1_week').set_index('user_id')
e40_14 = events14[events14.event == '40'].groupby('user_id').size()\
        .reset_index(name='e40_count_2_week').set_index('user_id')
e41_7 = events7[events7.event == '41'].groupby('user_id').size()\
        .reset_index(name='e41_count_1_week').set_index('user_id')
e41_14 = events14[events14.event == '41'].groupby('user_id').size()\
        .reset_index(name='e41_count_2_week').set_index('user_id')
e42_7 = events7[events7.event == '42'].groupby('user_id').size()\
        .reset_index(name='e42_count_1_week').set_index('user_id')
e42_14 = events14[events14.event == '42'].groupby('user_id').size()\
        .reset_index(name='e42_count_2_week').set_index('user_id')

In [36]:
e3_7_full = events7_full[events7_full.event == '3'].groupby('user_id').size()\
        .reset_index(name='e3_count_1_week').set_index('user_id')
e3_14_full = events14_full[events14_full.event == '3'].groupby('user_id').size()\
        .reset_index(name='e3_count_2_week').set_index('user_id')
e4_7_full = events7_full[events7_full.event == '4'].groupby('user_id').size()\
        .reset_index(name='e4_count_1_week').set_index('user_id')
e4_14_full = events14_full[events14_full.event == '4'].groupby('user_id').size()\
        .reset_index(name='e4_count_2_week').set_index('user_id')
e5_7_full = events7_full[events7_full.event == '5'].groupby('user_id').size()\
        .reset_index(name='e5_count_1_week').set_index('user_id')
e5_14_full = events14_full[events14_full.event == '5'].groupby('user_id').size()\
        .reset_index(name='e5_count_2_week').set_index('user_id')
e6_7_full = events7_full[events7_full.event == '6'].groupby('user_id').size()\
        .reset_index(name='e6_count_1_week').set_index('user_id')
e6_14_full = events14_full[events14_full.event == '6'].groupby('user_id').size()\
        .reset_index(name='e6_count_2_week').set_index('user_id')
e7_7_full = events7_full[events7_full.event == '7'].groupby('user_id').size()\
        .reset_index(name='e7_count_1_week').set_index('user_id')
e7_14_full = events14_full[events14_full.event == '7'].groupby('user_id').size()\
        .reset_index(name='e7_count_2_week').set_index('user_id')
e40_7_full = events7_full[events7_full.event == '40'].groupby('user_id').size()\
        .reset_index(name='e40_count_1_week').set_index('user_id')
e40_14_full = events14_full[events14_full.event == '40'].groupby('user_id').size()\
        .reset_index(name='e40_count_2_week').set_index('user_id')
e41_7_full = events7_full[events7_full.event == '41'].groupby('user_id').size()\
        .reset_index(name='e41_count_1_week').set_index('user_id')
e41_14_full = events14_full[events14_full.event == '41'].groupby('user_id').size()\
        .reset_index(name='e41_count_2_week').set_index('user_id')
e42_7_full = events7_full[events7_full.event == '42'].groupby('user_id').size()\
        .reset_index(name='e42_count_1_week').set_index('user_id')
e42_14_full = events14_full[events14_full.event == '42'].groupby('user_id').size()\
        .reset_index(name='e42_count_2_week').set_index('user_id')

In [19]:
# e1 = events.loc[events.event == '1'].groupby('user_id').size().reset_index(name='e1_count')
# e5 = events.loc[events.event == '5'].groupby('user_id').size().reset_index(name='e5_count')
# e6 = events.loc[events.event == '6'].groupby('user_id').size().reset_index(name='e6_count')
# e14 = events.loc[events.event == '14'].groupby('user_id').size().reset_index(name='e14_count')
# e4 = events.loc[events.event == '4'].groupby('user_id').size().reset_index(name='e4_count')
# e40 = events.loc[events.event == '40'].groupby('user_id').size().reset_index(name='e40_count')
# e7 = events.loc[events.event == '7'].groupby('user_id').size().reset_index(name='e7_count')
# e41 = events.loc[events.event == '41'].groupby('user_id').size().reset_index(name='e41_count')
# e3 = events.loc[events.event == '3'].groupby('user_id').size().reset_index(name='e3_count')
# e42 = events.loc[events.event == '42'].groupby('user_id').size().reset_index(name='e42_count')
# ea50 = events.loc[events.event == '.a5027911885258752'].groupby('user_id').size().reset_index(name='e.a50_count')
# e44 = events.loc[events.event == '44'].groupby('user_id').size().reset_index(name='e44_count')
# ea54 = events.loc[events.event == '.a5400102822346752'].groupby('user_id').size().reset_index(name='e.a54_count')
# ea55 = events.loc[events.event == '.a5516611293544448'].groupby('user_id').size().reset_index(name='e.a55_count')
# e0 = events.loc[events.event == '0'].groupby('user_id').size().reset_index(name='e0_count')

In [20]:
# events_feat_lst = [e1, e5, e6, e14, e4, e40, e7, e41, e3, e42, ea50, e44, ea54, ea55, e0]
# events_feat = labels7
# for e in events_feat_lst:
#     e = e.set_index('user_id')
#     events_feat = events_feat.join(e)

## Compute correlation of events features with target labels

In [21]:
corr = events_feat.corr()
corr.style.background_gradient()

Unnamed: 0,label,e1_count,e5_count,e6_count,e14_count,e4_count,e40_count,e7_count,e41_count,e3_count,e42_count,e.a50_count,e44_count,e.a54_count,e.a55_count,e0_count
label,1.0,0.0687617,0.158675,0.14755,0.0883078,0.147327,0.138969,0.243401,0.11328,0.111106,0.107725,-0.00015029,0.000334466,-0.000184326,0.00116596,0.0104397
e1_count,0.0687617,1.0,0.822344,0.808274,0.736403,0.766358,0.730259,0.248757,0.696752,0.693115,0.68794,0.00151282,-0.00035984,0.00170697,0.00198487,0.0965404
e5_count,0.158675,0.822344,1.0,0.995402,0.76054,0.984073,0.948474,0.523259,0.862381,0.860149,0.857907,0.00235147,0.00139629,0.00219585,0.00264241,0.092971
e6_count,0.14755,0.808274,0.995402,1.0,0.724053,0.991267,0.955854,0.513934,0.87089,0.868134,0.866313,0.00242073,-0.00113056,0.00239108,0.00289431,0.0989411
e14_count,0.0883078,0.736403,0.76054,0.724053,1.0,0.657049,0.631063,0.322053,0.584797,0.582316,0.580744,0.00315661,0.00584149,0.0030799,0.00437019,0.0907272
e4_count,0.147327,0.766358,0.984073,0.991267,0.657049,1.0,0.95985,0.52557,0.871337,0.868725,0.866777,0.00211573,-0.00116145,0.00247652,0.00276616,0.0990012
e40_count,0.138969,0.730259,0.948474,0.955854,0.631063,0.95985,1.0,0.484443,0.908932,0.906347,0.903085,0.00208059,-0.00148118,0.00244809,0.00308457,0.0887609
e7_count,0.243401,0.248757,0.523259,0.513934,0.322053,0.52557,0.484443,1.0,0.368041,0.369999,0.369827,0.00196981,-0.000518538,0.00166057,0.00250666,0.0416687
e41_count,0.11328,0.696752,0.862381,0.87089,0.584797,0.871337,0.908932,0.368041,1.0,0.996,0.98921,-0.000153201,-0.00256315,0.000105997,0.000952944,0.080748
e3_count,0.111106,0.693115,0.860149,0.868134,0.582316,0.868725,0.906347,0.369999,0.996,1.0,0.993377,-5.73108e-05,-0.00252401,0.000222945,0.00127908,0.0800137


In [24]:
labels14 = pd.read_csv('labels14.csv').set_index('user_id')

In [25]:
events_feat14 = labels14
for e in events_feat_lst:
    e = e.set_index('user_id')
    events_feat14 = events_feat14.join(e)

In [26]:
corr14 = events_feat14.corr()
corr14.style.background_gradient()

Unnamed: 0,label,e1_count,e5_count,e6_count,e14_count,e4_count,e40_count,e7_count,e41_count,e3_count,e42_count,e.a50_count,e44_count,e.a54_count,e.a55_count,e0_count
label,1.0,0.0799583,0.178831,0.166512,0.100155,0.165837,0.156778,0.266991,0.128703,0.127246,0.123703,-0.000854038,0.000329071,-0.000893892,0.000350985,0.0112208
e1_count,0.0799583,1.0,0.822344,0.808274,0.736403,0.766358,0.730259,0.248757,0.696752,0.693115,0.68794,0.00151282,-0.00035984,0.00170697,0.00198487,0.0965404
e5_count,0.178831,0.822344,1.0,0.995402,0.76054,0.984073,0.948474,0.523259,0.862381,0.860149,0.857907,0.00235147,0.00139629,0.00219585,0.00264241,0.092971
e6_count,0.166512,0.808274,0.995402,1.0,0.724053,0.991267,0.955854,0.513934,0.87089,0.868134,0.866313,0.00242073,-0.00113056,0.00239108,0.00289431,0.0989411
e14_count,0.100155,0.736403,0.76054,0.724053,1.0,0.657049,0.631063,0.322053,0.584797,0.582316,0.580744,0.00315661,0.00584149,0.0030799,0.00437019,0.0907272
e4_count,0.165837,0.766358,0.984073,0.991267,0.657049,1.0,0.95985,0.52557,0.871337,0.868725,0.866777,0.00211573,-0.00116145,0.00247652,0.00276616,0.0990012
e40_count,0.156778,0.730259,0.948474,0.955854,0.631063,0.95985,1.0,0.484443,0.908932,0.906347,0.903085,0.00208059,-0.00148118,0.00244809,0.00308457,0.0887609
e7_count,0.266991,0.248757,0.523259,0.513934,0.322053,0.52557,0.484443,1.0,0.368041,0.369999,0.369827,0.00196981,-0.000518538,0.00166057,0.00250666,0.0416687
e41_count,0.128703,0.696752,0.862381,0.87089,0.584797,0.871337,0.908932,0.368041,1.0,0.996,0.98921,-0.000153201,-0.00256315,0.000105997,0.000952944,0.080748
e3_count,0.127246,0.693115,0.860149,0.868134,0.582316,0.868725,0.906347,0.369999,0.996,1.0,0.993377,-5.73108e-05,-0.00252401,0.000222945,0.00127908,0.0800137


In [None]:
# so that we get features for all users, first create a  dummy df with all user_ids
dummy = pd.DataFrame({'user_id': list(user_dict.values()), 'dummy': [0 for _ in range(len(user_dict.values()))]})

In [32]:
# join with events features, but only including the highest correlated features
features_lst = [e3_7, e4_7, e5_7, e6_7, e7_7, e40_7, e41_7, e42_7,
                e3_14, e4_14, e5_14, e6_14, e7_14, e40_14, e41_14, e42_14]
features = dummy.join(features_lst[:]).drop(axis=1, columns='dummy').fillna(value=0)
features.head()

Unnamed: 0,user_id,e3_count_1_week,e4_count_1_week,e5_count_1_week,e6_count_1_week,e7_count_1_week,e40_count_1_week,e41_count_1_week,e42_count_1_week,e3_count_2_week,e4_count_2_week,e5_count_2_week,e6_count_2_week,e7_count_2_week,e40_count_2_week,e41_count_2_week,e42_count_2_week
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [37]:
features_lst_full = [e3_7_full, e4_7_full, e5_7_full, e6_7_full, e7_7_full, e40_7_full, e41_7_full, e42_7_full,
                e3_14_full, e4_14_full, e5_14_full, e6_14_full, e7_14_full, e40_14_full, e41_14_full, e42_14_full]
features_full = dummy.join(features_lst_full[:]).drop(axis=1, columns='dummy').fillna(value=0)
features_full.head()

Unnamed: 0,user_id,e3_count_1_week,e4_count_1_week,e5_count_1_week,e6_count_1_week,e7_count_1_week,e40_count_1_week,e41_count_1_week,e42_count_1_week,e3_count_2_week,e4_count_2_week,e5_count_2_week,e6_count_2_week,e7_count_2_week,e40_count_2_week,e41_count_2_week,e42_count_2_week
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save features to csv to be loaded into ML modeling notebook

In [33]:
features.to_csv(path_or_buf='features_events2.csv', index=False)

In [39]:
features_full.to_csv(path_or_buf='features_events2_full.csv', index=False)