# Preprocessing

In [1]:
import numpy as np
import pandas as pd

from kairbnb.preprocessing import one_hot_encoding

In [2]:
# Read only the first 1000 rows of each raw CSV.
train_users, test_users, sessions = (
    pd.read_csv(csv_path, nrows=1000)
    for csv_path in (
        '../data/train_users.csv',
        '../data/test_users.csv',
        '../data/sessions.csv',
    )
)

In [3]:
# Stack train and test users into a single frame with a fresh 0..n-1 index,
# then discard the 'date_first_booking' column.
users = (
    pd.concat((train_users, test_users), axis=0, ignore_index=True)
    .drop(columns='date_first_booking')
)

In [4]:
# Values above 1000 are presumably birth years entered in the age field;
# convert them to an age relative to 2015.
birth_year_mask = users['age'] > 1000
users.loc[birth_year_mask, 'age'] = 2015 - users.loc[birth_year_mask, 'age']

# Ages outside the 18-100 range are treated as unknown (-1).
implausible_age_mask = (users['age'] > 100) | (users['age'] < 18)
users.loc[implausible_age_mask, 'age'] = -1

# Missing ages are unknown as well. Assign the result back instead of calling
# fillna(inplace=True) on a column selection: the chained in-place form is
# deprecated and does not modify `users` when pandas copy-on-write is active.
users['age'] = users['age'].fillna(-1)

# Bucket ages. With right=True a value v falls in group i when
# bins[i-1] < v <= bins[i], so group 0 holds exactly the unknown ages (-1),
# group 1 holds ages up to 20, group 2 ages 21-25, and so on.
bins = [-1, 20, 25, 30, 40, 50, 60, 75, 100]
users['age_group'] = np.digitize(users['age'], bins, right=True)

# Explicit 0/1 indicator for an unknown age.
users['age_unknown'] = (users['age'] == -1).astype(int)

In [5]:
# One boolean flag per field that carries an "unknown" placeholder value.
unknown_flags = [
    users['age'] == -1,
    users['gender'] == '-unknown-',
    users['language'] == '-unknown-',
    users['first_affiliate_tracked'] == 'untracked',
    users['first_browser'] == '-unknown-',
]

# Per-user count of how many of those fields are unknown.
users['nans'] = np.sum(unknown_flags, axis=0)

In [6]:
# Parse the two date columns. errors='ignore' is intentionally not used: it
# silently returns the unparsed input on failure (and is deprecated), which
# would only surface later as a confusing error in the feature extraction.
users['date_account_created'] = pd.to_datetime(users['date_account_created'])
users['date_first_active'] = pd.to_datetime(users['timestamp_first_active'], format='%Y%m%d%H%M%S')

# Split each date into day, weekday, week, month and year features.
# The .dt accessor keeps the results aligned with the index of `users`.
for date_column, suffix in (
    ('date_account_created', 'account_created'),
    ('date_first_active', 'first_active'),
):
    date_parts = users[date_column].dt
    users['day_' + suffix] = date_parts.day
    users['weekday_' + suffix] = date_parts.weekday
    # DatetimeIndex.week / Series.dt.week were removed in pandas 2.0;
    # isocalendar().week is the replacement. Cast from the nullable UInt32
    # it returns to a plain integer column (raises if a date is missing).
    users['week_' + suffix] = date_parts.isocalendar().week.astype('int64')
    users['month_' + suffix] = date_parts.month
    users['year_' + suffix] = date_parts.year

# Timedelta between account creation and first activity
# (account creation minus first activity).
users['time_lag'] = users['date_account_created'] - users['date_first_active']

In [7]:
# Columns to remove from the users table.
drop_list = [
    'date_account_created',
    'date_first_active',
    'timestamp_first_active',
]

users = users.drop(columns=drop_list)

In [8]:
# Name the sessions key 'id' so it matches the users table.
sessions = sessions.rename(columns={'user_id': 'id'})


def _session_counts(frame, field, prefix=''):
    """Count session rows per user for every distinct value of `field`.

    Returns a DataFrame indexed by user id with one column per value of
    `field`, named '<prefix><value>_count'. User/value combinations that
    never occur are filled with 0.
    """
    counts = frame.groupby(['id', field]).size().unstack(fill_value=0)
    counts.columns = [prefix + str(value) + '_count' for value in counts.columns]
    return counts


# The previous code called DataFrame.append and discarded the result, so only
# the per-action counts ever reached the feature table (and append would have
# stacked rows rather than added columns). Build each count table separately
# and combine them column-wise instead.
# Action columns keep their original '<action>_count' names; the other fields
# are prefixed with the field name because their values overlap with each
# other and with action names.
count_tables = [_session_counts(sessions, 'action')]
for field in ('action_type', 'device_type', 'action_detail'):
    count_tables.append(_session_counts(sessions, field, prefix=field + '_'))

# Users missing from one of the tables get 0 for that table's columns.
session_count = pd.concat(count_tables, axis=1).fillna(0)

# Most used device = the device type with the most session rows per user.
# (Taking .max() of the device names would return the alphabetically last
# device, not the most frequent one.) Ties resolve to the first column.
device_usage = sessions.groupby(['id', 'device_type']).size().unstack(fill_value=0)
session_count['most_used_device'] = device_usage.idxmax(axis=1)

In [9]:
# Left-join the per-user session features onto the users table, matching the
# users' 'id' column against the index of session_count. Users without a
# matching row get NaN in the added columns.
users = users.join(session_count, on='id', how='left')

In [10]:
# TODO: add the time spent per device as a feature (not implemented yet), e.g.
# sessions.groupby(['id', 'device_type'])['secs_elapsed'].agg(np.sum).unstack().fillna(0)

In [11]:
# Columns handed to one_hot_encoding as categorical features.
categorical_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'most_used_device',
]

users = one_hot_encoding(users, categorical_features)

Age same people country