In [1]:
import sys
import numpy as np
import pandas as pd

sys.path.append('..')

In [2]:
from utils.data_loading import load_users_data, load_sessions_data

train_users, test_users = load_users_data()
sessions = load_sessions_data()
sessions.replace('-unknown-', np.nan, inplace=True)

In [3]:
users = pd.concat((train_users, test_users), axis=0, ignore_index=True)
users = users.drop('date_first_booking', axis=1)

In [4]:
users['gender'].replace('-unknown-', np.nan, inplace=True)
users['language'].replace('-unknown-', np.nan, inplace=True)

In [5]:
users.loc[users['age'] > 100, 'age'] = np.nan
users.loc[users['age'] < 14, 'age'] = np.nan

In [6]:
categorical_features = [
    'affiliate_channel',
    'affiliate_provider',
    'country_destination',
    'first_affiliate_tracked',
    'first_browser',
    'first_device_type',
    'gender',
    'language',
    'signup_app',
    'signup_method'
]

for categorical_feature in categorical_features:
    users[categorical_feature] = users[categorical_feature].astype('category')

In [7]:
users['date_account_created'] = pd.to_datetime(users['date_account_created'])
users['date_first_active'] = pd.to_datetime(users['timestamp_first_active'], format='%Y%m%d%H%M%S')

In [8]:
weekdays = []
for date in users.date_account_created:
    weekdays.append(date.weekday())
users['weekday_account_created'] = pd.Series(weekdays)

weekdays = []
for date in users.date_account_created:
    weekdays.append(date.weekday())
users['weekday_first_active'] = pd.Series(weekdays)

users['year_account_created'] = pd.DatetimeIndex(users['date_account_created']).year
users['month_account_created'] = pd.DatetimeIndex(users['date_account_created']).month
users['day_account_created'] = pd.DatetimeIndex(users['date_account_created']).day

users['year_first_active'] = pd.DatetimeIndex(users['timestamp_first_active']).year
users['month_first_active'] = pd.DatetimeIndex(users['timestamp_first_active']).month
users['day_first_active'] = pd.DatetimeIndex(users['timestamp_first_active']).day

In [None]:
sessions_lengths = sessions['user_id'].value_counts()

In [35]:
for user, session_length in sessions_lengths.iteritems():
    users.loc[users['id'] == user, 'session_length'] = int(session_length)

2722
2644
2476
2424
2362
2335
2323
2264
2264
2246
2137
2085
2019
1938
1923
1919
1876
1861
1811
1797
1792
1780
1779
1753
1752
1732
1701
1685
1614
1612
1574
1572
1562
1548
1538
1526
1518
1511
1491
1482
1458
1455
1422
1410
1410
1409
1406
1405
1399
1390
1386
1382
1379
1375
1367
1365
1363
1350
1337
1332
1332
1331
1327
1320
1317
1308
1298
1275
1268
1267
1260
1256
1254
1250
1246
1246
1243
1241
1234
1234
1234
1232
1231
1226
1226
1220
1216
1216
1207
1204
1203
1198
1197
1196
1190
1186
1185
1184
1184
1183
1182
1182
1179
1169
1169
1166
1164
1148
1146
1145
1138
1135
1130
1129
1127
1124
1113
1113
1112
1109
1107
1105
1105
1103
1102
1102
1101
1101
1096
1094
1090
1089
1088
1088
1087
1083
1083
1081
1079
1076
1075
1073
1073
1073
1069
1069
1067
1066
1065
1062
1062
1062
1059
1058
1058
1057
1057
1057
1056
1051
1050
1048
1045
1043
1042
1040
1039
1039
1037
1037
1036
1035
1033
1032
1032
1032
1027
1026
1025
1025
1022
1020
1019
1019
1014
1014
1013
1011
1010
1009
1003
1003
1002
999
998
989
989
985
976
974
974
973

KeyboardInterrupt: 