In [1]:
# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Parameters added

* Month of Account Creation
* Season of Account Creation
* Year of Account Creation
* Age Bin 
* Language Bin
* Days Between Account Creation and the Data-Collection Cut-off Date (the latest account-creation date in the data)
* Total Number Actions per User
* Number unique devices used per user
* Longest Single Action (maximum `secs_elapsed`) per User
* Total time spent on site per user
* The last action a user made 
* Hour the user first accessed Airbnb
* Count of each action in sessions data 
* Count of each action detail in sessions data

In [2]:
# Load the two user tables (train / test) from their zipped CSVs.
train = pd.read_csv('../zip_files/train_users_2.csv.zip')
test = pd.read_csv('../zip_files/test_users.csv.zip')

# Load the session log. It keys users by 'user_id'; rename that column to 'id'
# so it matches the key column name used by the train and test tables.
ses = pd.read_csv('../zip_files/sessions.csv.zip').rename(columns={'user_id': 'id'})

# Report dimensions (train, sessions, test). Per the original author's note,
# the test table has no country_destination column.
for label, frame in (("train_data", train), ("sessions", ses), ("test", test)):
    print(label + " shape: ", frame.shape)

train_data shape:  (213451, 16)
sessions shape:  (10567737, 6)
test shape:  (62096, 15)


In [3]:
# Distinct user ids present in each of the three tables.
ses_id = ses['id'].unique()
train_id = train['id'].unique()
test_id = test['id'].unique()

# Count how many users of each user table also appear in the session log.
session_users = set(ses_id)
print("sessions and train users:", len(session_users.intersection(train_id)))
print("sessions and test users:", len(session_users.intersection(test_id)))

sessions and train users: 73815
sessions and test users: 61668


In [4]:
# create param: month of account creation
def parse_month(col):
    """Return the month field of a dash-separated date string.

    Parameters
    ----------
    col : str
        A date such as ``"2014-06-30"``; the month is the text between the
        first and second dash.

    Returns
    -------
    str
        The month field exactly as written (e.g. ``"06"``), or an empty
        string when ``col`` contains no dash at all.
    """
    fields = col.split("-")
    # The previous find()-based slicing used the -1 "not found" result as a
    # slice bound, so a value without a second dash ("2014-06") lost its last
    # character. Splitting returns the whole field regardless of what follows.
    return fields[1] if len(fields) > 1 else ""

# Attach the account-creation month to both user tables.
for frame in (train, test):
    frame["month_created"] = frame["date_account_created"].apply(parse_month)

# Show the new dimensions (one extra column each).
for label, frame in (("train", train), ("test", test)):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 17)
test shape:  (62096, 16)


In [5]:
# create param: season of account creation
def parse_season(col):
    """Map a two-digit month string to the season it belongs to.

    Parameters
    ----------
    col : str
        Zero-padded month such as ``"01"`` or ``"12"``.

    Returns
    -------
    str
        ``'Winter'`` (12, 01, 02), ``'Spring'`` (03-05), ``'Summer'`` (06-08),
        and ``'Fall'`` for every other value, including unrecognised input.
    """
    season_months = (
        ('Winter', ('12', '01', '02')),
        ('Spring', ('03', '04', '05')),
        ('Summer', ('06', '07', '08')),
    )
    for season, months in season_months:
        if col in months:
            return season
    # Anything not matched above falls through to Fall.
    return 'Fall'
    
# Derive the season from the previously computed month string.
for frame in (train, test):
    frame["season_created"] = frame["month_created"].apply(parse_season)

# Show the new dimensions (one extra column each).
for label, frame in (("train", train), ("test", test)):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 18)
test shape:  (62096, 17)


In [6]:
# create param: year of account creation
def parse_year(col):
    """Return the year field of a dash-separated date string.

    Parameters
    ----------
    col : str
        A date such as ``"2014-06-30"``; the year is everything before the
        first dash.

    Returns
    -------
    str
        The leading field exactly as written (e.g. ``"2014"``). When ``col``
        contains no dash the whole string is returned.
    """
    # partition() yields the full string as the head when no dash is present.
    # The previous find()-based slice used the -1 "not found" result as the
    # end bound and silently dropped the last character ("2014" -> "201").
    year, _, _ = col.partition("-")
    return year

# Attach the account-creation year to both user tables.
for frame in (train, test):
    frame["year_created"] = frame["date_account_created"].apply(parse_year)

# Show the new dimensions (one extra column each).
for label, frame in (("train", train), ("test", test)):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 19)
test shape:  (62096, 18)


In [7]:
# create param: age bin
def bin_age(col):
    """Assign an age to a coarse, labelled age bucket.

    Parameters
    ----------
    col : number
        Age in years. May be NaN.

    Returns
    -------
    str
        ``"Under30"`` for 0 <= age < 30, ``"30-45"`` for 30 <= age < 45,
        ``"45-65"`` for 45 <= age < 65, ``"65-100"`` for 65 <= age < 100, and
        ``"Unknown"`` for everything else (NaN, negative values, 100 and up).

    Notes
    -----
    The 65-99 bucket was previously labelled ``"100+"``, which contradicted
    its own bounds; the bounds are unchanged, only the label is corrected.
    Anything that matches on the old ``"100+"`` string must be updated.
    """
    if 0 <= col < 30:
        return "Under30"
    if 30 <= col < 45:
        return "30-45"
    if 45 <= col < 65:
        return "45-65"
    if 65 <= col < 100:
        return "65-100"
    # NaN fails every comparison above, so missing ages land here too.
    return "Unknown"
    
# Attach the age bucket to both user tables.
for frame in (train, test):
    frame["bin_age"] = frame["age"].apply(bin_age)

# Show the new dimensions (one extra column each).
for label, frame in (("train", train), ("test", test)):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 20)
test shape:  (62096, 19)


In [8]:
# # create param: book/no-book 
# def booking_flag(col):
#     if type(col) != 'str':
#         return 0
#     else:
#         return 1

# train["booking_flag"] = train.date_first_booking.apply(booking_flag)
# test["booking_flag"] = test.date_first_booking.apply(booking_flag) # will be 0 for all rows
# print("train shape: ", train.shape) 
# print("test shape: ", test.shape)

In [9]:
# create param: language bin
def bin_lang(col):
    """Collapse a language code into one of five buckets.

    Parameters
    ----------
    col : str
        Language code such as ``'en'``.

    Returns
    -------
    str
        ``col`` itself when it is one of 'en', 'zh', 'es', 'fr'; otherwise
        the catch-all label ``'other'``.
    """
    kept_languages = ('en', 'zh', 'es', 'fr')
    return col if col in kept_languages else 'other'

# Attach the language bucket to both user tables.
for frame in (train, test):
    frame["bin_lang"] = frame["language"].apply(bin_lang)

# Show the new dimensions (one extra column each).
for label, frame in (("train", train), ("test", test)):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 21)
test shape:  (62096, 20)


In [10]:
# # create param: days between account creation and first booking
# # some values negative because people created accounts AFTER they booked

# train_x = pd.to_datetime(train["date_first_booking"]) - pd.to_datetime(train["date_account_created"])
# train["days_delta_creation_booking"] = train_x / np.timedelta64(1, 'D')

# test_x = pd.to_datetime(test["date_first_booking"]) - pd.to_datetime(test["date_account_created"])
# test["days_delta_creation_booking"] = test_x / np.timedelta64(1, 'D')

# print("train shape: ", train.shape)
# print("test shape: ", test.shape)

In [11]:
# create param: days between account creation and the data-collection cut-off,
# where the cut-off is the latest account-creation date found in each dataset.
#
# Each table is measured against its OWN cut-off and its OWN creation dates.
# The previous version recomputed the train series for the test table and
# assigned it to test by index alignment, so test rows received the values of
# unrelated train rows.

# The dates are strings, so max() compares them as text.
# NOTE(review): this assumes an ISO 'YYYY-MM-DD' layout, where text order equals
# date order -- confirm against the raw data.
train_last = max(train["date_account_created"])  # '2014-06-30' per the original note
train_y = pd.to_datetime(train_last) - pd.to_datetime(train["date_account_created"])
# Dividing a timedelta series by one day yields a float number of days.
train["days_since_creation"] = train_y / np.timedelta64(1, 'D')

test_last = max(test["date_account_created"])  # '2014-09-30' per the original note
test_y = pd.to_datetime(test_last) - pd.to_datetime(test["date_account_created"])
test["days_since_creation"] = test_y / np.timedelta64(1, 'D')

print("train shape: ", train.shape)
print("test shape: ", test.shape)

train shape:  (213451, 22)
test shape:  (62096, 21)


In [12]:
# Create param: Hour the user first accessed Airbnb
# will be 0 for all test users
def first_hour_(x):
    """Extract characters 9-10 of ``str(x)`` and return them as an int.

    Parameters
    ----------
    x : int or str
        A timestamp; presumably laid out as YYYYMMDDHHMMSS, in which case
        positions 8-9 (0-based) hold the hour -- TODO confirm the layout.

    Returns
    -------
    int
        The integer value of the two-character slice.
    """
    stamp = str(x)
    hour_field = stamp[8:10]
    return int(hour_field)

# Attach the hour of first activity to both user tables.
for frame in (train, test):
    frame['first_hour'] = frame['timestamp_first_active'].apply(first_hour_)

# Show the new dimensions (one extra column each).
for label, frame in (("train", train), ("test", test)):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 23)
test shape:  (62096, 22)


In [13]:
# Create param: total number of actions per user.
# Each row of the session log counts as one action, so the feature is simply
# how often each id occurs in it.
z = ses['id'].value_counts().rename('count_actions').to_frame()
z.insert(1, 'id', z.index)        # expose the user id as a regular column
z = z.reset_index(drop=True)      # plain 0..n-1 row labels

# Left joins keep every user; users absent from the session log get NaN.
train = train.merge(z, on='id', how='left')
test = test.merge(z, on='id', how='left')

print("train shape: ", train.shape)
print("test shape: ", test.shape)

train shape:  (213451, 24)
test shape:  (62096, 23)


In [14]:
# Create param: number of distinct device types used per user.
# Per the original note, an "unknown" device type counts as a device.
y = (
    ses.groupby('id')['device_type']
    .nunique()                    # distinct device_type values per user
    .rename('number_devices')
    .reset_index()                # user id becomes a regular column
)
y = y[['number_devices', 'id']]   # keep the original column order

# Left joins keep every user; users absent from the session log get NaN.
train = train.merge(y, on='id', how='left')
test = test.merge(y, on='id', how='left')

print("train shape: ", train.shape)
print("test shape: ", test.shape)

train shape:  (213451, 25)
test shape:  (62096, 24)


In [15]:
# Create param: longest single action per user.
# For each user, take the largest 'secs_elapsed' value among their session rows.
max_time = (
    ses.groupby('id')['secs_elapsed']
    .max()
    .rename('longest_session')
    .reset_index()                # user id becomes a regular column
)
max_time = max_time[['longest_session', 'id']]   # keep the original column order

# Left joins keep every user; users absent from the session log get NaN.
train = train.merge(max_time, on='id', how='left')
test = test.merge(max_time, on='id', how='left')

print("train shape: ", train.shape)
print("test shape: ", test.shape)

train shape:  (213451, 26)
test shape:  (62096, 25)


In [16]:
# Create param: total time spent on site per user.
# For each user, add up 'secs_elapsed' over all of their session rows.
total_time = (
    ses.groupby('id')['secs_elapsed']
    .sum()
    .rename('total_time')
    .reset_index()                # user id becomes a regular column
)
total_time = total_time[['total_time', 'id']]   # keep the original column order

# Left joins keep every user; users absent from the session log get NaN.
train = train.merge(total_time, on='id', how='left')
test = test.merge(total_time, on='id', how='left')

print("train shape: ", train.shape)
print("test shape: ", test.shape)

train shape:  (213451, 27)
test shape:  (62096, 26)


In [17]:
# Create param: the last action a user made.
# Uses the last 'action_detail' entry in each user's group, in row order.
# NOTE(review): this is only the chronologically last action if the session
# log rows are stored in time order -- confirm.
last_action = (
    ses.groupby('id')['action_detail']
    .last()
    .rename('last_action')
    .reset_index()                # user id becomes a regular column
)
last_action = last_action[['last_action', 'id']]   # keep the original column order

# Left joins keep every user; users absent from the session log get NaN.
train = train.merge(last_action, on='id', how='left')
test = test.merge(last_action, on='id', how='left')

print("train shape: ", train.shape)
print("test shape: ", test.shape)

train shape:  (213451, 28)
test shape:  (62096, 27)


In [18]:
# Create params: count of each action in the sessions data.
# One row per user, one column per distinct 'action' value; the user id is
# moved out of the index so it can serve as the merge key.
actions = pd.crosstab(index=ses["id"], columns=ses["action"]).reset_index()

# Build separate, wider frames so that train / test stay free of the per-action
# count columns. Left joins keep every user.
train_w_actions = train.merge(actions, on='id', how='left')
test_w_actions = test.merge(actions, on='id', how='left')

# The base frames are unchanged; only the *_w_actions frames grow.
for label, frame in (
    ("train", train),
    ("test", test),
    ("train w actions", train_w_actions),
    ("test w actions", test_w_actions),
):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 28)
test shape:  (62096, 27)
train w actions shape:  (213451, 387)
test w actions shape:  (62096, 386)


In [19]:
# Create params: count of each action_detail in the sessions data.
# One row per user, one column per distinct 'action_detail' value; the user id
# is moved out of the index so it can serve as the merge key.
action_detail = pd.crosstab(index=ses["id"], columns=ses["action_detail"]).reset_index()

# Append the per-detail counts to the frames that already hold the per-action counts.
# NOTE(review): if an action_detail value has the same name as an existing
# column, pandas disambiguates with _x / _y suffixes -- consider explicit prefixes.
train_w_actions = train_w_actions.merge(action_detail, on='id', how='left')
test_w_actions = test_w_actions.merge(action_detail, on='id', how='left')

# The base frames are unchanged; only the *_w_actions frames grow.
for label, frame in (
    ("train", train),
    ("test", test),
    ("train w actions", train_w_actions),
    ("test w actions", test_w_actions),
):
    print(label + " shape: ", frame.shape)

train shape:  (213451, 28)
test shape:  (62096, 27)
train w actions shape:  (213451, 542)
test w actions shape:  (62096, 541)


In [20]:
# Drop 'date_first_booking' from every train and test frame.
# The axis is passed by keyword: pandas 2.0 no longer accepts it positionally
# (drop(label, 1) raises TypeError there), while axis=1 works on old and new
# versions alike.
train = train.drop('date_first_booking', axis=1)
test = test.drop('date_first_booking', axis=1)

train_w_actions = train_w_actions.drop('date_first_booking', axis=1)
test_w_actions = test_w_actions.drop('date_first_booking', axis=1)

# Each frame should now be one column narrower.
print("train shape: ", train.shape)
print("test shape: ", test.shape)

print("train w actions shape: ", train_w_actions.shape)
print("test w actions shape: ", test_w_actions.shape)

train shape:  (213451, 27)
test shape:  (62096, 26)
train w actions shape:  (213451, 541)
test w actions shape:  (62096, 540)


In [21]:
# Export the four engineered frames to CSV files in the working directory.
# The row index is written as the first column (to_csv default).
exports = (
    ('./train_combined.csv', train),
    ('./test_combined.csv', test),
    ('./train_combined_actions.csv', train_w_actions),
    ('./test_combined_actions.csv', test_w_actions),
)
for out_path, frame in exports:
    frame.to_csv(out_path, sep=',')

KeyboardInterrupt: 

## Removing users in train set that pre-date sessions data

In [None]:
# Remove users that pre-date the sessions data.
# A user with no session rows has a missing 'count_actions' value after the
# left merge, so keeping only non-null rows keeps users with session activity.
# Boolean indexing returns new frames, so train / train_w_actions are untouched;
# the row labels are then renumbered from 0.
modern_train = train[train['count_actions'].notnull()].reset_index(drop=True)
modern_train_w_actions = (
    train_w_actions[train_w_actions['count_actions'].notnull()]
    .reset_index(drop=True)
)

print("modern train shape: ", modern_train.shape)
print("modern train w actions shape: ", modern_train_w_actions.shape)

In [None]:
# Export the 'modern' (session-era only) training frames to CSV.
# The row index is written as the first column (to_csv default).
modern_exports = (
    ('./modern_train_combined.csv', modern_train),
    ('./modern_train_combined_actions.csv', modern_train_w_actions),
)
for out_path, frame in modern_exports:
    frame.to_csv(out_path, sep=',')
