# Preprocessing

In [1]:
import numpy as np
import pandas as pd

from kairbnb.preprocessing import one_hot_encoding

In [2]:
train_users = pd.read_csv('../data/train_users.csv', nrows=10000)
test_users = pd.read_csv('../data/test_users.csv', nrows=10000)
sessions = pd.read_csv('../data/sessions.csv', nrows=10000)

In [3]:
users = pd.concat([train_users, test_users], axis=0, ignore_index=True)
users.drop('date_first_booking', axis=1, inplace=True)

In [4]:
users.loc[users['age']>1000, 'age'] = 2015-users.loc[users['age']>1000, 'age']
users.loc[(users['age']>100) | (users['age']<18), 'age'] = -1

users['age'].fillna(-1, inplace=True)

bins = [-1, 20, 25, 30, 40, 50, 60, 75, 100]
users['age_group'] = np.digitize(users['age'], bins, right=True)

users['age_unknown'] = (users['age'] == -1).astype(int)

In [5]:
users['nans'] = np.sum([
    (users['age'] == -1),
    (users['gender'] == '-unknown-'),
    (users['language'] == '-unknown-'),
    (users['first_affiliate_tracked'] == 'untracked'),
    (users['first_browser'] == '-unknown-')], axis=0)

In [6]:
users['date_account_created'] = pd.to_datetime(users['date_account_created'], errors='ignore')
users['date_first_active'] = pd.to_datetime(users['timestamp_first_active'], format='%Y%m%d%H%M%S')

# Convert to DatetimeIndex
date_account_created = pd.DatetimeIndex(users['date_account_created'])
date_first_active = pd.DatetimeIndex(users['date_first_active'])

# Split dates into day, week, month, year
users['day_account_created'] = date_account_created.day
users['weekday_account_created'] = date_account_created.weekday
users['week_account_created'] = date_account_created.week
users['month_account_created'] = date_account_created.month
users['year_account_created'] = date_account_created.year
users['day_first_active'] = date_first_active.day
users['weekday_first_active'] = date_first_active.weekday
users['week_first_active'] = date_first_active.week
users['month_first_active'] = date_first_active.month
users['year_first_active'] = date_first_active.year

users['time_lag'] = date_account_created.values - date_first_active.values

In [7]:
# Drop columns
drop_list = [
    'date_account_created',
    'date_first_active',
    'timestamp_first_active'
]

users.drop(drop_list, axis=1, inplace=True)

In [8]:
sessions.rename(columns = {'user_id': 'id'}, inplace=True)

sessions_data = sessions.groupby(['id', 'action'])['secs_elapsed'].agg(len).unstack()
sessions_data.append(sessions.groupby(['id', 'action_type'])['secs_elapsed'].agg(len).unstack())
sessions_data.append(sessions.groupby(['id', 'device_type'])['secs_elapsed'].agg(len).unstack())
sessions_data.append(sessions.groupby(['id', 'action_detail'])['secs_elapsed'].agg(len).unstack())

sessions_data.columns = sessions_data.columns.map(lambda x: str(x) + '_count')
sessions_data['most_used_device'] = sessions.groupby('id')['device_type'].max()

In [9]:
users = users.join(sessions_data, on='id')

In [10]:
secs_elapsed = sessions.groupby('id')['secs_elapsed']

secs_elapsed = secs_elapsed.agg(
    {
        'secs_elapsed_sum': np.sum,
        'secs_elapsed_mean': np.mean,
        'secs_elapsed_min': np.min,
        'secs_elapsed_max': np.max,
        'secs_elapsed_median': np.median,
        'secs_elapsed_std': np.std,
        'secs_elapsed_var': np.var,
        'day_pauses': lambda x: (x > 86400).sum(),
        'long_pauses': lambda x: (x > 300000).sum(),
        'short_pauses': lambda x: (x < 3600).sum(),
        'session_length' : np.size
    }
)

In [11]:
users = users.join(secs_elapsed, on='id')

In [12]:
# Encode categorical features
categorical_features = [
    'gender', 'signup_method', 'signup_flow', 'language',
    'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
    'signup_app', 'first_device_type', 'first_browser', 'most_used_device'
]

users = one_hot_encoding(users, categorical_features)

In [15]:
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162096 entries, 0 to 162095
Columns: 371 entries, age to first_browser_wOSBrowser
dtypes: float64(355), int32(10), int64(3), object(2), timedelta64[ns](1)
memory usage: 453.9+ MB


Pipeline with selectKbest

Age same people country