# Detailed Walkthrough of the Airbnb Dataset

In [112]:
# Setup plot environment
%matplotlib inline

# Imports
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sbs
from datetime import datetime
from airbnb_tools import *

from xgboost.sklearn import XGBClassifier


In [122]:
# Load the training users table
train = pd.read_csv("data/train_users_2.csv")

# Keep only the rows that *have* a booking, i.e. whose destination is not "NDF".
# There is much to be said about users who never book, but this project was
# asked to predict *where* someone books, not *if* they book.
has_booking = train["country_destination"] != "NDF"
train = train[has_booking]

# Preliminary cleaning: use -1 as a placeholder for every missing value for now
train.fillna(-1, inplace=True)

In [123]:
# Create some features from others that may be telling
def days_between(d1, d2):
    """Return the whole number of days from ``d1`` to ``d2``, floored at zero.

    Both arguments are date strings in ``YYYY-MM-DD`` format. If ``d2`` is
    not at least one full day after ``d1`` the result is 0.
    """
    date_format = "%Y-%m-%d"
    start = datetime.strptime(d1, date_format)
    end = datetime.strptime(d2, date_format)
    # Negative gaps (d2 earlier than d1) are clamped to zero.
    return max((end - start).days, 0)

In [124]:
# Days from account creation to first booking, one value per row
# (days_between clamps negative gaps to 0).
d1 = train["date_account_created"].values
d2 = train["date_first_booking"].values
elapsed = [days_between(created, booked) for created, booked in zip(d1, d2)]

# Store the result as a new column on the DataFrame
train["elapsed_booking_time"] = elapsed

In [125]:
# What does the timestamp_first_active say?
def tfa(x):
    """Split a ``timestamp_first_active`` value into fixed-width string pieces.

    ``x`` is converted with ``str`` and cut into six consecutive fields of
    widths 4, 2, 2, 2, 2, 2. The first three are used by the notebook as
    year, month and day; the rest are presumably hour, minute and second
    (i.e. a ``YYYYMMDDHHMMSS`` stamp -- TODO confirm against the data).

    Returns a list of six strings; fields past the end of the input are
    empty strings.
    """
    stamp = str(x)
    # Field boundaries: one 4-character field followed by five pairs.
    cuts = (0, 4, 6, 8, 10, 12, 14)
    return [stamp[lo:hi] for lo, hi in zip(cuts, cuts[1:])]



In [127]:
# Break the first-active timestamp into year / month / day strings. The
# creation delay is the number of days from that first activity (which may
# come before sign-up) to the date the account was created.
tfa_year = []
tfa_month = []
tfa_day = []
creation_delay = []
tfa_vector = train["timestamp_first_active"].values
for stamp, created in zip(tfa_vector, d1):
    year, month, day = tfa(stamp)[:3]
    tfa_year.append(year)
    tfa_month.append(month)
    tfa_day.append(day)
    # Rebuild a YYYY-MM-DD string so days_between can parse it.
    first_active = "-".join([str(year), str(month), str(day)])
    creation_delay.append(days_between(first_active, created))

# The year/month/day lists are collected but currently not added as columns.
#train["tfa_year"] = tfa_year
#train["tfa_month"] = tfa_month
#train["tfa_day"] = tfa_day
train["creation_delay"] = creation_delay

# The raw date columns are dropped now that the derived features exist.
raw_date_columns = ["timestamp_first_active", "date_account_created",
                    "date_first_booking"]
train.drop(raw_date_columns, axis=1, inplace=True)

In [128]:
# Clean up age a bit. Anything with age < 15 or age > 100 is assumed to be
# bogus and is set to -1, the same placeholder used for missing values.
# The update is a single .loc assignment: the chained form
# train['age'][mask] = -1 can write to a temporary copy and leave `train`
# untouched (and never updates `train` under pandas Copy-on-Write).
invalid_age = (train['age'] < 15) | (train['age'] > 100)
train.loc[invalid_age, 'age'] = -1

# ID is likely not super informative, so let's drop it
train.drop(['id'], inplace=True, axis=1)

In [129]:
# Categorical columns to hand to split_categorical_variables.
categorical_variables = [
    'gender',
    'language',
    'signup_method',
    'signup_flow',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

# NOTE(review): split_categorical_variables is not defined in this notebook
# (presumably it comes from airbnb_tools); judging by the columns shown in the
# output below it expands each listed column into per-value indicator columns.
X = split_categorical_variables(train, categorical_variables)

In [130]:
# Preview the first rows of X, the table returned by split_categorical_variables
X.head()

Unnamed: 0,age,country_destination,elapsed_booking_time,creation_delay,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,language_ca,language_cs,...,first_browser_SeaMonkey,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
2,56,US,0,476,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,42,other,278,765,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,41,US,0,280,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,-1,US,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,46,US,3,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
