In [1]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing

from utils.preprocessing import one_hot_encoding
from utils.preprocessing import get_weekday
from utils.preprocessing import process_user_secs_elapsed
from utils.preprocessing import process_user_session

In [2]:
# Input and output directories, relative to the notebook. Both end with a
# slash because file names are appended with plain string concatenation.
raw_data_path, processed_data_path = '../data/raw/', '../data/processed/'

In [36]:
# NOTE(review): a `global` statement at notebook (module) level has no effect;
# any name assigned in a cell is already a module-level global. It is presumably
# here for the worker function that reads `sessions` -- confirm whether this
# cell can simply be deleted.
global sessions

In [37]:
# Read the two user tables and the session log from the raw data directory.
train_users, test_users = (
    pd.read_csv(raw_data_path + file_name)
    for file_name in ('train_users.csv', 'test_users.csv')
)
# NOTE(review): only the first 10000 session rows are read, so every
# session-derived feature below is computed from that sample -- confirm whether
# this cap is a leftover from development.
sessions = pd.read_csv(raw_data_path + 'sessions.csv', nrows=10000)

In [38]:
# Stack the train and test users into one frame with a fresh 0..n-1 index
users = pd.concat([train_users, test_users], ignore_index=True)

In [39]:
# date_first_booking is empty since the competition's restart, so remove it
users = users.drop(['date_first_booking'], axis='columns')

In [40]:
# Replace NaNs: turn the '-unknown-' placeholder into real missing values.
#
# The result is assigned back instead of calling an inplace method on a column
# selection. `users['gender'].replace(..., inplace=True)` is chained assignment:
# with pandas Copy-on-Write (the default from pandas 3.0) the selected column
# behaves as a copy, so the inplace call would leave `users` unchanged.
users['gender'] = users['gender'].replace('-unknown-', np.nan)
users['language'] = users['language'].replace('-unknown-', np.nan)
sessions = sessions.replace('-unknown-', np.nan)

In [41]:
# Blank out implausible ages: anything above 100 or below 14 becomes NaN
users.loc[(users['age'] > 100) | (users['age'] < 14), 'age'] = np.nan

In [42]:
# Convert the two date columns to datetimes
users = users.assign(
    date_account_created=pd.to_datetime(users['date_account_created']),
    # timestamp_first_active is parsed as a compact YYYYMMDDHHMMSS stamp
    date_first_active=pd.to_datetime(users['timestamp_first_active'],
                                     format='%Y%m%d%H%M%S'),
)

# Weekday of each event, as computed by the project's get_weekday helper
users = users.assign(
    weekday_account_created=users['date_account_created'].apply(get_weekday),
    weekday_first_active=users['date_first_active'].apply(get_weekday),
)

In [43]:
# Break both dates into year / month / day columns.
# Each DatetimeIndex is built once and reused for its three components.
account_created_dates = pd.DatetimeIndex(users['date_account_created'])
first_active_dates = pd.DatetimeIndex(users['date_first_active'])

year_account_created = account_created_dates.year
month_account_created = account_created_dates.month
day_account_created = account_created_dates.day
year_first_active = first_active_dates.year
month_first_active = first_active_dates.month
day_first_active = first_active_dates.day

# Columns are added in the same order as before: account_created first
users['year_account_created'] = year_account_created
users['month_account_created'] = month_account_created
users['day_account_created'] = day_account_created
users['year_first_active'] = year_first_active
users['month_first_active'] = month_first_active
users['day_first_active'] = day_first_active

In [44]:
def c_process_user_session(user, session_data=None):
    """Summarise one user's session log as a flat Series of features.

    Parameters
    ----------
    user : hashable
        Value matched against the ``user_id`` column.
    session_data : pandas.DataFrame, optional
        Frame with ``user_id``, ``action``, ``action_type``,
        ``action_detail``, ``device_type`` and ``secs_elapsed`` columns.
        When omitted, the notebook-level ``sessions`` frame is used, so the
        function can still be called with a single argument (as ``Pool.map``
        does).

    Returns
    -------
    pandas.Series
        Series sorted by label, containing:

        * ``id`` -- the ``user`` argument;
        * ``session_lenght`` -- number of session rows for the user (the
          misspelled label is kept because downstream columns use it);
        * ``most_used_device`` -- the most frequent ``device_type`` (first
          in sorted order on ties), or NaN if the user has none recorded;
        * ``<value>_secs_elapsed`` -- total ``secs_elapsed`` for each distinct
          value of the four grouping columns.
    """
    if session_data is None:
        # Resolved at call time so single-argument callers keep working.
        session_data = sessions

    user_session = session_data.loc[session_data['user_id'] == user]
    suffix = '_secs_elapsed'

    # Total seconds per distinct value of each grouping column.
    per_column_totals = [
        user_session.groupby(column)['secs_elapsed'].sum()
                    .rename(lambda value: value + suffix)
        for column in ['action', 'action_type', 'action_detail', 'device_type']
    ]
    # The same label can come from more than one column (e.g. an action and an
    # action_type with the same name); those totals are added together. Only
    # the numeric features take part in this sum.
    secs_elapsed = pd.concat(per_column_totals).groupby(level=0).sum()

    # The most used device is the most frequent one. `.max()` would return
    # the alphabetically last device name instead.
    device_modes = user_session['device_type'].mode()
    most_used_device = device_modes.iloc[0] if len(device_modes) else np.nan

    summary = pd.Series(
        {
            'session_lenght': len(user_session),
            'id': user,
            'most_used_device': most_used_device,
        },
        dtype=object,
    )

    return pd.concat([summary, secs_elapsed]).sort_index()

In [46]:
from multiprocessing import Pool
import time

p = Pool(8)
%time a = p.map(c_process_user_session, sessions['user_id'].unique())
a = pd.DataFrame(a).set_index('id')

CPU times: user 52 ms, sys: 4 ms, total: 56 ms
Wall time: 451 ms


In [48]:
# Process session data
# For every distinct user_id, that user's rows of `sessions` are handed to
# process_user_session, with the calls spread over one joblib worker per CPU.
# The collected results become one row each, indexed by their 'id' field.
# NOTE(review): `%time` is a line magic, so the whole statement presumably has
# to stay on a single line -- confirm before reformatting.
# NOTE(review): the boolean filter rescans all of `sessions` once per user;
# iterating over `sessions.groupby('user_id')` may be cheaper, but it skips NaN
# user ids, which `unique()` keeps -- confirm before changing.
%time processed_sessions = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(process_user_session)( user, sessions.loc[sessions['user_id'] == user]) for user in sessions['user_id'].unique())
user_sessions = pd.DataFrame(processed_sessions).set_index('id')

CPU times: user 480 ms, sys: 128 ms, total: 608 ms
Wall time: 828 ms


In [None]:
# Join the processed session features onto each user, aligned on the user id
users = pd.concat([users.set_index('id'), user_sessions], axis=1)

In [None]:
# TODO: Classify by device

In [None]:
# Per-user count of non-null values in each session column, with every
# resulting column suffixed by '_count'
user_sessions = sessions.groupby('user_id').count().add_suffix('_count')
users = pd.concat([users, user_sessions], axis=1)

In [None]:
# One delayed call per distinct user_id, each receiving that user's
# secs_elapsed values
secs_elapsed_jobs = (
    delayed(process_user_secs_elapsed)(
        user, sessions.loc[sessions['user_id'] == user, 'secs_elapsed'])
    for user in sessions['user_id'].unique()
)

# Run the calls on one worker per CPU and index the resulting rows by 'id'
processed_secs_elapsed = pd.DataFrame(
    Parallel(n_jobs=multiprocessing.cpu_count())(secs_elapsed_jobs)
).set_index('id')
users = pd.concat([users, processed_secs_elapsed], axis=1)

In [None]:
# Re-key the raw tables by id so their indexes can select rows from `users`
train_users = train_users.set_index('id')
test_users = test_users.set_index('id')

# Split the combined frame back into its train and test parts;
# country_destination is removed from the test part
processed_train_users = users.loc[train_users.index]
processed_test_users = users.loc[test_users.index].drop(
    ['country_destination'], axis=1)

In [None]:
# Save the train/test splits as they are now, before the date columns are
# dropped and the categorical features are one-hot encoded further down.
processed_train_users.to_csv(processed_data_path + 'processed_train_users.csv')
processed_test_users.to_csv(processed_data_path + 'processed_test_users.csv')

In [None]:
# Raw date/timestamp columns; their year, month, day and weekday parts were
# already extracted into separate columns
drop_list = ['date_account_created', 'date_first_active', 'timestamp_first_active']

# Remove them from the combined frame
users = users.drop(drop_list, axis='columns')

In [None]:
# TODO: Try with StandardScaler
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit_transform(users)

In [None]:
# Columns passed to the project's one_hot_encoding helper
categorical_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'most_used_device',
]

users = one_hot_encoding(users, categorical_features)

In [None]:
# Label the index so it is written out as the 'id' column
users.index.name = 'id'

# Split the encoded frame into train and test parts; country_destination is
# removed from the test part
processed_train_users = users.loc[train_users.index]
processed_test_users = users.loc[test_users.index].drop(
    'country_destination', axis=1)

In [None]:
# Write the one-hot-encoded train/test splits to the processed-data directory,
# under 'encoded_*' names so the earlier 'processed_*' files are not overwritten.
processed_train_users.to_csv(processed_data_path + 'encoded_train_users.csv')
processed_test_users.to_csv(processed_data_path + 'encoded_test_users.csv')