# Feature Engineering - Attributes dataset

In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# load attributes and sessions
attr = pd.read_pickle("data/LeanPlum/attr.pkl")
session = pd.read_pickle("data/LeanPlum/session.pkl")
session.head()

In [4]:
# load mapping of user_id_hash to user_id
users = pd.read_csv('data/user_dict.csv', header=None)
user_dict = {row[1][0]:int(row[1][1]) for row in users.iterrows()}

In [5]:
attr['user_id'] = attr['user_id_hash'].map(user_dict)
attr = attr.dropna(subset=['user_id'])\
    .drop('user_id_hash', axis=1)
attr['user_id'] = attr['user_id'].astype(int)
attr = attr.set_index('session_id')
attr.head()

Unnamed: 0_level_0,attr_0,attr_1,attr_2,attr_3,attr_4,attr_5,attr_6,attr_7,attr_8,attr_9,...,attr_20,attr_22,attr_23,attr_24,attr_25,attr_26,attr_27,attr_28,attr_29,user_id
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2662933751290886254,,,,0.029999,romance_the_royal_romance_01:3,3.0,romance_the_royal_romance_01:3,,3.0,,...,,,,,2.0,,2.0,,,550123
5793726669394503241,,,,0.0,romance_the_royal_romance_01:2,2.0,romance_the_royal_romance_01:2,,2.0,,...,,,,,0.0,,0.0,,,550123
6545122651035219326,0.0,0.0,0.0,0.059998,romance_the_royal_romance_01:6,6.0,romance_the_royal_romance_01:6,1.0,6.0,romance_the_royal_romance_01:1,...,picker ya_romance_vampire,,,0.0,5.0,0.0,5.0,1.0,,550123
2302333863698307661,0.0,0.0,0.0,0.0,romance_rules_of_engagement_01:1,0.0,,4.0,0.0,,...,picker ya_romance_vampire,,,,0.0,0.0,0.0,1.0,,54564
215362132811035045,0.0,0.0,0.0,0.0,romance_high_school_story_01:2,1.0,romance_high_school_story_01:1,1.0,1.0,romance_high_school_story_01:1,...,picker ya_romance_vampire,,,,0.0,0.0,0.0,1.0,,96049


In [6]:
# join attributes with with sessions on session_id to get timestamp
session = session[['session_id', 'session_index', 'start_timestamp']].set_index('session_id')
attr2 = attr.join(session)

In [8]:
print(sum(attr2['session_index'].isna()))
print(len(attr2))

561699
3945208


In [9]:
# drop rows that don't have a corresponding session index
attr_pred = attr2.dropna(subset=['session_index'])

In [13]:
# split into train and test (predict) based on timestamp
attr_pred['datetime'] = attr_pred['start_timestamp']\
                        .apply(lambda x:datetime.fromtimestamp(x/1000))
attr_train = attr_pred.loc[attr_pred.datetime < '2018-12-01']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# compute mean attribute for each user_id
attr_pred2 = attr_pred.groupby('user_id').mean()
attr_train2 = attr_train.groupby('user_id').mean()
attr_train2.head()

Unnamed: 0_level_0,attr_0,attr_1,attr_2,attr_3,attr_5,attr_7,attr_8,attr_10,attr_13,attr_14,...,attr_18,attr_19,attr_24,attr_25,attr_26,attr_27,attr_28,attr_29,session_index,start_timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.029999,3.0,2.0,3.0,0.0,0.0,2.0,...,0.0,0.0,,2.0,0.0,2.0,1.0,,1.0,1540292000000.0
1,0.0,0.0,0.0,0.005001,2.333984,2.0,2.333984,0.0,0.0,2.333984,...,0.5,1.0,1.0,0.5,0.0,0.5,1.0,,3.75,1541099000000.0
2,,,,0.029999,5.800781,1.5,5.800781,,,0.0,...,0.5,2.0,,2.25,0.0,2.25,1.0,,4.5,1541314000000.0
3,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,...,0.0,2.0,,0.0,0.0,0.0,1.0,,1.0,1542540000000.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,2.0,,0.0,0.0,0.0,1.0,,2.0,1543090000000.0


In [16]:
# join attributes features with labels and compute correlation with target
labels14 = pd.read_csv('labels14.csv').set_index('user_id')
attr_train3 = labels14.join(attr_train2)
attr_train3 = attr_train3.fillna(0)
corr = attr_train3.corr()
corr.style.background_gradient()

  return umr_minimum(a, axis, None, out, keepdims, initial)
  return umr_maximum(a, axis, None, out, keepdims, initial)
  xa[xa < 0] = -1


Unnamed: 0,label,attr_0,attr_1,attr_2,attr_3,attr_5,attr_7,attr_8,attr_10,attr_13,attr_14,attr_15,attr_16,attr_17,attr_18,attr_19,attr_24,attr_25,attr_26,attr_27,attr_28,attr_29,session_index,start_timestamp
label,1.0,0.0130477,0.124648,0.0852359,0.0339555,0.0361812,0.0268783,0.0252081,-0.00786661,0.0666471,0.074491,0.00168875,-0.0370178,0.0302278,0.0721385,-0.0349272,0.0706809,0.0572429,,0.0572429,-0.0265638,0.0112506,0.156851,0.0436372
attr_0,0.0130477,1.0,0.119872,0.0823338,0.879212,0.90103,0.757705,0.971788,0.72265,0.477823,0.585649,0.532556,-0.0971941,0.357646,0.195453,-0.10808,0.00261575,0.68516,,0.68516,0.070074,-0.00756411,0.102393,0.0539128
attr_1,0.124648,0.119872,1.0,0.454566,0.101805,0.116804,0.0929994,0.125721,0.0243754,0.12794,0.301554,0.0702778,-0.0535999,0.017808,0.0814497,-0.0541992,0.0172418,0.131412,,0.131412,0.032757,-0.00276145,0.119557,0.0403878
attr_2,0.0852359,0.0823338,0.454566,1.0,0.0573361,0.0806202,0.0685575,0.0904977,0.00610893,0.110839,0.20787,0.0657626,-0.0394608,0.0186237,0.0196571,-0.0428648,-0.000128487,0.0811196,,0.0811196,0.0239614,-0.00264029,0.0779927,0.0288578
attr_3,0.0339555,0.879212,0.101805,0.0573361,1.0,0.858305,0.732169,0.861654,0.752433,0.272676,0.398249,0.412939,-0.110483,0.4115,0.238697,-0.126437,0.05554,0.833682,,0.833682,0.0804158,-0.00785164,0.186482,0.0694768
attr_5,0.0361812,0.90103,0.116804,0.0806202,0.858305,1.0,0.878801,0.870131,0.756584,0.306207,0.464984,0.320411,-0.139831,0.451147,0.295277,-0.161131,0.0542101,0.622741,,0.622741,0.105048,-0.00942439,0.202968,0.0919437
attr_7,0.0268783,0.757705,0.0929994,0.0685575,0.732169,0.878801,1.0,0.74959,0.634051,0.253634,0.394035,0.259164,-0.126706,0.438672,0.383978,-0.133109,0.1039,0.541486,,0.541486,0.240315,-0.00776531,0.265597,0.208004
attr_8,0.0252081,0.971788,0.125721,0.0904977,0.861654,0.870131,0.74959,1.0,0.659698,0.554811,0.616414,0.612856,-0.109797,0.369226,0.221361,-0.125394,0.0316642,0.71086,,0.71086,0.0810969,-0.00751777,0.139706,0.0695079
attr_10,-0.00786661,0.72265,0.0243754,0.00610893,0.752433,0.756584,0.634051,0.659698,1.0,0.0949916,0.176613,0.235842,-0.080853,0.26943,0.168731,-0.0875154,-0.0288027,0.276223,,0.276223,0.0559943,-0.00584463,-0.0211933,0.0386993
attr_13,0.0666471,0.477823,0.12794,0.110839,0.272676,0.306207,0.253634,0.554811,0.0949916,1.0,0.775087,0.576739,-0.0513934,0.0137809,0.0856259,-0.0545845,-0.00572686,0.328625,,0.328625,0.0342779,-0.00348914,0.0553374,0.0291845


In [17]:
# select best attributes only
attr_lst = ['attr_0', 'attr_1', 'attr_2', 'attr_3',
            'attr_5', 'attr_7', 'attr_8', 'attr_13',
            'attr_14', 'attr_15', 'attr_24', 'attr_25',
            'attr_27']
attr_train3 = attr_train3[attr_lst]
attr_pred3 = attr_pred2[attr_lst]

In [18]:
attr_train3 = attr_train3.fillna(0)
attr_pred3 = attr_pred3.fillna(0)

In [19]:
attr_pred3.head()

Unnamed: 0_level_0,attr_0,attr_1,attr_2,attr_3,attr_5,attr_7,attr_8,attr_13,attr_14,attr_15,attr_24,attr_25,attr_27
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0.0,0.0,0.0,0.029999,3.0,2.0,3.0,0.0,2.0,0.0,0.0,2.0,2.0
1,0.0,0.0,0.0,0.005001,2.333984,2.0,2.333984,0.0,2.333984,0.5,1.0,0.5,0.5
2,0.0,0.0,0.0,0.029999,5.800781,1.5,5.800781,0.0,0.0,0.0,0.0,2.25,2.25
3,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# create a dummy df with all user_ids and then join with attributes features
dummy = pd.DataFrame({'user_id': list(user_dict.values()), 'dummy': [0 for _ in range(len(user_dict.values()))]})
attr_features_full = dummy.join(attr_pred3).drop(axis=1, columns='dummy').fillna(value=0)
attr_features_full.head()

Unnamed: 0,user_id,attr_0,attr_1,attr_2,attr_3,attr_5,attr_7,attr_8,attr_13,attr_14,attr_15,attr_24,attr_25,attr_27
0,0,0.0,0.0,0.0,0.029999,3.0,2.0,3.0,0.0,2.0,0.0,0.0,2.0,2.0
1,1,0.0,0.0,0.0,0.005001,2.333984,2.0,2.333984,0.0,2.333984,0.5,1.0,0.5,0.5
2,2,0.0,0.0,0.0,0.029999,5.800781,1.5,5.800781,0.0,0.0,0.0,0.0,2.25,2.25
3,3,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
attr_features_full.to_csv(path_or_buf='features_attr3_full.csv', index=False)

In [22]:
attr_train3.reset_index().to_csv(path_or_buf='features_attr3.csv', index=False)