In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

# Draw inline
# %matplotlib inline

# Set figure aesthetics
# sns.set_style("white", {'ytick.major.size': 5.0})
# sns.set_context("poster", font_scale=0.5)

# Seed NumPy's global random number generator with a fixed value so that
# code drawing from np.random produces the same numbers on every run.
np.random.seed(0)

In [2]:
# Directory that holds the Airbnb CSV files. Set the AIRBNB_DATA_DIR environment
# variable to point somewhere else; when it is unset the original location is used,
# so existing runs behave exactly as before.
DATA_DIR = os.environ.get('AIRBNB_DATA_DIR', '/Users/dominicdebiaso/Development/datasets/airbnb')

## Training set of users
train_users = pd.read_csv(os.path.join(DATA_DIR, 'train_users_2.csv'))
## Test set of users
test_users = pd.read_csv(os.path.join(DATA_DIR, 'test_users.csv'))
## Web sessions log for users
sessions = pd.read_csv(os.path.join(DATA_DIR, 'sessions.csv'))
## Country statistics
countries = pd.read_csv(os.path.join(DATA_DIR, 'countries.csv'))
## User statistics
age_gender_bkts = pd.read_csv(os.path.join(DATA_DIR, 'age_gender_bkts.csv'))

In [3]:
# Preview the first rows of the training users frame to see its columns and sample values
train_users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [4]:
# Display the date_account_created column of the training users (values are YYYY-MM-DD strings)
train_users.date_account_created

0         2010-06-28
1         2011-05-25
2         2010-09-28
3         2011-12-05
4         2010-09-14
5         2010-01-01
6         2010-01-02
7         2010-01-03
8         2010-01-04
9         2010-01-04
10        2010-01-04
11        2010-01-05
12        2010-01-05
13        2010-01-05
14        2010-01-07
15        2010-01-07
16        2010-01-07
17        2010-01-07
18        2010-01-08
19        2010-01-10
20        2010-01-10
21        2010-01-10
22        2010-01-11
23        2010-01-11
24        2010-01-11
25        2010-01-12
26        2010-01-12
27        2010-01-12
28        2010-01-13
29        2010-01-13
             ...    
213421    2014-06-30
213422    2014-06-30
213423    2014-06-30
213424    2014-06-30
213425    2014-06-30
213426    2014-06-30
213427    2014-06-30
213428    2014-06-30
213429    2014-06-30
213430    2014-06-30
213431    2014-06-30
213432    2014-06-30
213433    2014-06-30
213434    2014-06-30
213435    2014-06-30
213436    2014-06-30
213437    201

In [3]:
### Predicting country destinations
# Builds a feature matrix from the train and test user tables, fits a gradient boosted
# classifier on the training rows and writes the five most probable destination
# countries for every test user to a submission CSV.

country_labels = train_users['country_destination'].values
# Removing country_destination after creating a label
df_train = train_users.drop(['country_destination'], axis=1)
test_id = test_users['id']
train_counts = df_train.shape[0]

# Combining train and test data so that every transformation below is applied to both
df_all = pd.concat((df_train, test_users), axis=0, ignore_index=True)
# Remove id and date_first_booking
df_all = df_all.drop(['id', 'date_first_booking'], axis=1)
# Filling in NaNs with the sentinel value -1
df_all = df_all.fillna(-1)

## Feature Engineering
# Split date_account_created (YYYY-MM-DD) into separate year / month / day columns
dac = pd.to_datetime(df_all['date_account_created'], format='%Y-%m-%d')
df_all['dac_year'] = dac.dt.year
df_all['dac_month'] = dac.dt.month
df_all['dac_day'] = dac.dt.day
df_all = df_all.drop(['date_account_created'], axis=1)

# Split timestamp_first_active into separate year / month / day columns.
# The raw value is an integer of the form YYYYMMDDHHMMSS. It is parsed from its digit
# string with the complete format: dividing by 1e6 and parsing the resulting float with
# '%Y%m%d' leaves a fractional part that strict format parsing rejects.
tfa = pd.to_datetime(df_all['timestamp_first_active'].astype('int64').astype(str),
                     format='%Y%m%d%H%M%S')
df_all['tfa_year'] = tfa.dt.year
df_all['tfa_month'] = tfa.dt.month
df_all['tfa_day'] = tfa.dt.day
df_all = df_all.drop(['timestamp_first_active'], axis=1)

# Ages from 14 to 100 (inclusive) are kept; anything below 14 or above 100 is replaced
# with -1, the same sentinel used for missing values above.
# np.logical_or builds the boolean mask of out-of-range ages and np.where picks -1
# where the mask is true and the original age elsewhere.
av = df_all.age.values
df_all['age'] = np.where(np.logical_or(av < 14, av > 100), -1, av)

# One-hot-encoding categorical features
# Each listed feature is replaced by dummy columns named '<feature>_<value>'; the dummy
# columns are appended after the remaining columns, in the order of this list.
ohe_features = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider',
                'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
df_all = pd.get_dummies(df_all, columns=ohe_features)

# Split training and test data based on counts
# Cast to float so the matrix is purely numeric even when the dummy columns are boolean
vals = df_all.values.astype(float)
X = vals[:train_counts]
X_test = vals[train_counts:]
# Encode labels with value between 0 and n_classes-1
le = LabelEncoder()
y = le.fit_transform(country_labels)

## Modeling
# Extreme Gradient Boosting Classifier

estimator_depth = 6 # Maximum depth of tree / individual estimators. The maximum depth limits the number
                    # of nodes in the tree. Tune this for best performance: value depends on interaction of input variables.
learn_rate = 0.3 # Controls the contribution of weak learners in the final combination. There is
                 # a trade-off between learning_rate and n_estimators.
estimator_num = 25 # Controls the number of weak learners
learning_task = 'multi:softprob' # Objective: multiclass classification returning a probability per class
subsample_ratio = 0.5 # Subsample ratio of the training instances. Setting to 0.5 means that XGBoost randomly
                      # collects half of the data instances to grow trees, which helps prevent overfitting.
subsample_ratio_cols = 0.5 # Subsample ratio of columns when constructing each tree
xgb_model = XGBClassifier(max_depth=estimator_depth, learning_rate=learn_rate, n_estimators=estimator_num,
                          objective=learning_task, subsample=subsample_ratio, colsample_bytree=subsample_ratio_cols, seed=0)
# Fit model on the training set and the training set labels
xgb_model.fit(X, y)
y_pred = xgb_model.predict_proba(X_test)

# 5 classes with highest probabilities for every test user, most probable first.
# Capped at the number of classes so ids and countries always have the same length.
n_top = min(5, y_pred.shape[1])
top_classes = np.argsort(y_pred, axis=1)[:, ::-1][:, :n_top]
ids = np.repeat(test_id.values, n_top).tolist() # list of ids, each repeated once per prediction
cts = le.inverse_transform(top_classes.ravel()).tolist() # list of countries, aligned with ids

#Generate submission
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('/Users/dominicdebiaso/Desktop/sub.csv', index=False)

In [None]:
### EDA

# Merge train and test files for visualizations
# users = pd.concat((train_users, test_users), axis=0, ignore_index=True)
# Remove ID's
# users.drop('id', axis=1, inplace=True)

## Missing Data
# Replace unknowns in gender column with nan
# users.gender.replace('-unknown-', np.nan, inplace=True)

# NaN values
# users_nan = (users.isnull().sum() / users.shape[0]) * 100
# users_nan[users_nan > 0]

# Unusual ages; under 18 and over 100
# users.age.describe()
# print sum(users.age < 18)
# print sum(users.age > 100)
# users[users.age < 18]['age'].describe()
# users[users.age > 90]['age'].describe()

# users.loc[users.age < 15, 'age'] = np.nan
# users.loc[users.age > 90, 'age'] = np.nan

## Data Types
# sorted(users.columns.tolist())
# categorical_features = [
#     'affiliate_channel',
#     'affiliate_provider',
#     'country_destination',
#     'first_affiliate_tracked',
#     'first_browser',
#     'first_device_type',
#     'gender',
#     'language',
#     'signup_app',
#     'signup_method'
# ]

# for categorical_feature in categorical_features:
#     users[categorical_feature] = users[categorical_feature].astype('category')

# users['date_account_created'] = pd.to_datetime(users['date_account_created'])
# users['date_first_booking'] = pd.to_datetime(users['date_first_booking'])
# users['date_first_active'] = pd.to_datetime((users.timestamp_first_active / 1000000), format='%Y%m%d')

## Visualizations
# Count of users by gender
# users.gender.value_counts(dropna=False).plot(kind='bar', color='#ff9966', rot=0)
# plt.xlabel('Gender')
# sns.despine()

# Gender and Destinations
# women = sum(users['gender'] == 'FEMALE')
# men = sum(users['gender'] == 'MALE')

# female_destinations = users.loc[users['gender'] == 'FEMALE', 'country_destination'].value_counts() / women * 100
# male_destinations = users.loc[users['gender'] == 'MALE', 'country_destination'].value_counts() / men * 100

# female_destinations.plot(kind='bar', width=0.4, color='#ffccff', position=0, label='Female', rot=0)
# male_destinations.plot(kind='bar', width=0.4, color='#ccccff', position=1, label='Male', rot=0)

# plt.legend()
# plt.xlabel('Destination Country')
# plt.ylabel('Percentage')
# sns.despine()
# plt.show()

# Distribution of users by age
# sns.distplot(users.age.dropna(), color='#00cc99')
# plt.xlabel('Age')
# sns.despine()

# Dates by accounts created
# sns.set_style('whitegrid', {'axes.edgecolor': '0'})
# sns.set_context('poster', font_scale=0.8)
# users.date_account_created.value_counts().plot(kind='line', linewidth=1.2, color='#00ccff')
# sns.despine()

# Dates when users first active on site; this can be before account created
# users.date_first_active.value_counts().plot(kind='line', linewidth=1.2, colors='#6666ff')
# sns.despine()

# Date of first booking
# users.date_first_booking.value_counts().plot(kind='line', linewidth=1.1, colors='#3399ff')
# sns.despine()

In [None]:
### Notes

# df = pd.DataFrame(np.random.randint(10, 20, size=(5,4)), columns=list('ABCD'))
# df.A.astype(str).apply(lambda x: list(map(int, x)))
# df = pd.DataFrame(np.random.randn(10, 4)*10, columns=list('ABCD'))

In [None]:
### References
# https://www.kaggle.com/svpons/airbnb-recruiting-new-user-bookings/script-0-8655/code
# https://www.kaggle.com/davidgasquez/airbnb-recruiting-new-user-bookings/user-data-exploration/notebook
# http://xgboost.readthedocs.org/en/latest/model.html