# Preprocessing

In [102]:
import numpy as np
import pandas as pd

In [103]:
# Load the three sample extracts: train users, test users and the session log.
train_users, test_users, sessions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_users_sample.csv',
        '../data/test_users_sample.csv',
        '../data/sessions_sample.csv',
    )
)

In [104]:
# Stack train and test rows into a single frame (fresh 0..n-1 index) so the
# transformations below apply to both, then remove 'date_first_booking'.
users = pd.concat(
    [train_users, test_users], axis=0, ignore_index=True
).drop(['date_first_booking'], axis=1)

In [105]:
# Values above 1000 are treated as a birth year and turned into an age
# relative to 2015.
birth_year_mask = users['age'] > 1000
users.loc[birth_year_mask, 'age'] = 2015 - users.loc[birth_year_mask, 'age']

# Ages above 100 or below 18 are replaced by the -1 sentinel.
users.loc[(users['age'] > 100) | (users['age'] < 18), 'age'] = -1

# Missing ages get the same sentinel. The result is assigned back rather than
# calling fillna(inplace=True) on the selected column: the in-place form acts
# on an intermediate object and is not guaranteed to update `users`.
users['age'] = users['age'].fillna(-1)

# Bucket the ages. With right=True each bucket is (lower, upper], so the -1
# sentinel falls in bucket 0 and (75, 100] in bucket 8.
bins = [-1, 20, 25, 30, 40, 50, 60, 75, 100]
users['age_group'] = np.digitize(users['age'], bins, right=True)

In [106]:
# For every user, count how many of these five fields hold their
# "unknown" placeholder value.
placeholder_flags = [
    users['age'] == -1,
    users['gender'] == '-unknown-',
    users['language'] == '-unknown-',
    users['first_affiliate_tracked'] == 'untracked',
    users['first_browser'] == '-unknown-',
]
users['nans'] = np.array(placeholder_flags).sum(axis=0)

In [107]:
def _week_of_year(index):
    """Return the ISO week number of every entry in a DatetimeIndex.

    Uses ``isocalendar()`` where it exists, because the ``week`` attribute is
    not available on a DatetimeIndex in current pandas; the attribute is kept
    only as a fallback for pandas versions without ``isocalendar()``.

    The result is positional (no index attached), so assigning it to a column
    of ``users`` does not trigger label alignment. It is integer-valued when
    every date is present and float with NaN when some dates are missing.
    """
    if hasattr(index, 'isocalendar'):
        week = index.isocalendar().week
        if week.isna().any():
            return week.to_numpy(dtype='float64', na_value=np.nan)
        return week.to_numpy(dtype='int64')
    return index.week


# Parse both date columns. errors='ignore' is intentionally not used: it only
# returned the unparsed input, which the DatetimeIndex construction below
# cannot use anyway, so a parse failure is now reported here.
users['date_account_created'] = pd.to_datetime(users['date_account_created'])
users['date_first_active'] = pd.to_datetime(users['timestamp_first_active'], format='%Y%m%d%H%M%S')

# Convert to DatetimeIndex
date_account_created = pd.DatetimeIndex(users['date_account_created'])
date_first_active = pd.DatetimeIndex(users['date_first_active'])

# Split dates into day, weekday, week, month, year
for suffix, index in (
    ('account_created', date_account_created),
    ('first_active', date_first_active),
):
    users['day_' + suffix] = index.day
    users['weekday_' + suffix] = index.weekday
    users['week_' + suffix] = _week_of_year(index)
    users['month_' + suffix] = index.month
    users['year_' + suffix] = index.year

# Difference between account creation and first activity, in nanoseconds.
# The unit and the 64-bit width are spelled out because a bare astype(int)
# depends on the platform's default integer size.
lag = date_account_created.values - date_first_active.values
users['time_lag'] = lag.astype('timedelta64[ns]').astype('int64')

In [108]:
# The raw date/timestamp columns have been expanded into numeric features
# above, so they are removed from the frame.
drop_list = ['date_account_created', 'date_first_active', 'timestamp_first_active']

users = users.drop(drop_list, axis=1)

In [109]:
def _most_frequent(values):
    """Return the most common non-null entry of `values`, or NaN if none.

    ``mode()`` returns its result sorted, so ties resolve to the first value
    in sort order.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Use the same key name as the users table so the frames can be joined.
sessions.rename(columns={'user_id': 'id'}, inplace=True)

# Number of session rows per user for each action / action_type /
# action_detail value (one column per distinct value).
action_count = sessions.groupby(['id', 'action']).size().unstack()
action_type_count = sessions.groupby(['id', 'action_type']).size().unstack()
action_detail_count = sessions.groupby(['id', 'action_detail']).size().unstack()

# Total secs_elapsed per user for each device type.
device_type_sum = sessions.groupby(['id', 'device_type'])['secs_elapsed'].sum().unstack()

sessions_data = pd.concat(
    [action_count, action_type_count, action_detail_count, device_type_sum],
    axis=1,
)
# NOTE(review): every column receives the '_count' suffix, including the
# per-device secs_elapsed sums, and a value that occurs in more than one of
# the grouped columns would yield duplicate column names. The names are kept
# as they were so the cached output schema does not change.
sessions_data.columns = sessions_data.columns.map(lambda x: str(x) + '_count')

# Device type seen in the largest number of session rows for each user.
# (Taking .max() of the device names would pick the lexicographically largest
# string, not the most used device.)
sessions_data['most_used_device'] = (
    sessions.groupby('id')['device_type'].agg(_most_frequent)
)

In [110]:
# Attach the per-user session features; users without sessions keep NaN.
users = users.join(sessions_data, on='id', how='left')

In [111]:
# Per-user summary statistics of secs_elapsed.
#
# Named aggregation (output_name=function) is used because passing a
# {name: function} dict to a SeriesGroupBy is rejected by pandas >= 1.0.
# The string aliases select pandas' own group-wise reductions explicitly.
secs_elapsed = sessions.groupby('id')['secs_elapsed'].agg(
    secs_elapsed_sum='sum',
    secs_elapsed_mean='mean',
    secs_elapsed_min='min',
    secs_elapsed_max='max',
    secs_elapsed_median='median',
    secs_elapsed_std='std',
    secs_elapsed_var='var',
    # Rows whose elapsed time exceeds one day (86400 s).
    day_pauses=lambda x: (x > 86400).sum(),
    # Rows whose elapsed time exceeds 300000 s.
    long_pauses=lambda x: (x > 300000).sum(),
    # Rows whose elapsed time is below one hour (3600 s).
    short_pauses=lambda x: (x < 3600).sum(),
    # Number of entries that are not zero.
    session_length=np.count_nonzero,
)

In [112]:
# Attach the per-user secs_elapsed statistics, matched on the user id.
users = users.join(secs_elapsed, on='id', how='left')

In [113]:
# Columns to be expanded into indicator (dummy) columns.
categorical_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'most_used_device',
]

In [114]:
# Replace each listed categorical column by one indicator column per value.
users = pd.get_dummies(data=users, columns=categorical_features)

In [115]:
# Index the frame by user id so rows can be selected by id below.
users = users.set_index('id')

In [117]:
# Select the rows whose ids come from the training file and cache them.
train_rows = users.loc[train_users['id']]
train_rows.to_csv('../cache/train_users.csv')

In [118]:
# Select the rows whose ids come from the test file, remove the
# 'country_destination' column and cache the result.
test_rows = users.loc[test_users['id']].drop(['country_destination'], axis=1)
test_rows.to_csv('../cache/test_users.csv')