# Gradient Boosting Model

In [1]:
import sys
import numpy as np
import pandas as pd
# Add the parent directory to the module search path; presumably this is what
# makes the project-local `utils` package importable from this notebook.
sys.path.append('..')

This code is based on [this](https://www.kaggle.com/svpons/airbnb-recruiting-new-user-bookings/script-0-8655) user script, adapted to a notebook. Submitting the output of this code will give you a score of 0.86544.

## Data Loading

In [2]:
from utils.data_loading import load_users_data
# Project helper; its result is unpacked into a (train, test) pair, and both
# halves are handled as pandas DataFrames by the cells that follow.
train_users, test_users = load_users_data()

Extract the target variable (the column to predict) from the training data:

In [3]:
# Pull the target column out as an array, then remove it from the features.
target_column = 'country_destination'
labels = train_users[target_column].values
train_users = train_users.drop(target_column, axis='columns')

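Keep the test user IDs so we can build the submission file later, and record the number of training rows so the combined data can be split back into train and test: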

In [4]:
# Number of training rows: the boundary used later to split the combined data.
piv_train = len(train_users)
# Test user ids, set aside for the submission file.
id_test = test_users['id']

Join train and test users to preprocess both at the same time:

In [5]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index.
users = pd.concat([train_users, test_users], ignore_index=True)

Drop **ID** and **date_first_booking** from the DataFrame, as they are not used to make the predictions:

In [6]:
# These two columns are not used as model features.
unused_columns = ['id', 'date_first_booking']
users = users.drop(unused_columns, axis='columns')

## Preprocessing

In [7]:
# Replace every missing value in the combined frame with the sentinel -1
# (the age clean-up further down reuses -1 for out-of-range ages).
users = users.fillna(-1)

Process dates to take *day*, *month* and *year* in different columns:

In [8]:
import datetime

# Each entry: (source column, suffix of the derived columns, parse format).
# A format of None lets pandas infer it, as for `date_account_created`.
date_columns = [
    ('date_account_created', 'account_created', None),
    ('timestamp_first_active', 'first_active', '%Y%m%d%H%M%S'),
]

for column, suffix, fmt in date_columns:
    parsed = pd.to_datetime(users[column], format=fmt)
    # Derived columns are appended in year, month, day order so the final
    # column layout (and therefore the feature order) stays the same.
    for part in ('year', 'month', 'day'):
        users[part + '_' + suffix] = getattr(parsed.dt, part)
    # The raw date column is no longer needed once it has been expanded.
    users = users.drop([column], axis=1)

Replace implausible ages (below 14 or above 100) with the missing-value marker `-1`:

In [9]:
# Ages below 14 or above 100 are treated as invalid and overwritten with -1;
# everything else is kept unchanged.
age = users['age']
users['age'] = age.mask((age < 14) | (age > 100), -1)

One-hot encode the categorical features:

In [10]:
from utils.preprocessing import one_hot_encoding

# Columns handed to the project's one-hot encoding helper.
categorical_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

users = one_hot_encoding(users, categorical_features)

Split the data into *train* and *test* and encode country labels into integers:

In [11]:
from sklearn.preprocessing import LabelEncoder

# Encode the country labels as integers; `le` is kept so predictions can be
# mapped back to country codes later.
le = LabelEncoder()
y = le.fit_transform(labels)

# The first `piv_train` rows of the combined frame are the training users,
# the remainder are the test users.
values = users.values
X, X_test = values[:piv_train], values[piv_train:]

## Model

In [None]:
from xgboost.sklearn import XGBClassifier

# Hyper-parameters gathered in one mapping so they can be read and tuned in a
# single place; they are passed to the classifier unchanged.
xgb_params = {
    'max_depth': 8,
    'learning_rate': 0.2,
    'n_estimators': 42,
    'objective': "multi:softprob",
    'nthread': -1,
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'base_score': 0.5,
    'seed': 42,
}

# Classifier
xgb = XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on the training matrix and the integer-encoded labels.
xgb.fit(X, y)

In [None]:
# Class probabilities for every test row; the column positions are later
# treated as encoded class ids and decoded with `le`.
y_pred = xgb.predict_proba(X_test)

In [None]:
# Taking the 5 classes with highest probabilities
#
# Done for all rows at once rather than with one argsort/inverse_transform
# call per user. The ids are read positionally, so the pairing of id and
# prediction no longer depends on `id_test` having a default 0..n-1 index.
assert len(id_test) == len(y_pred), "expected one prediction row per test id"

# Never ask for more classes than the model actually predicts.
top_k = min(5, y_pred.shape[1])

# For each row: encoded classes ordered from most to least probable,
# truncated to the `top_k` best.
top_codes = np.argsort(y_pred, axis=1)[:, ::-1][:, :top_k]

# Each id is repeated `top_k` times so it lines up with the flattened
# (row-major) list of decoded countries.
ids = np.repeat(np.asarray(id_test), top_k).tolist()
cts = le.inverse_transform(top_codes.ravel()).tolist()

In [None]:
# Generate Submission
# One row per (user id, predicted country) pair, in the order built above.
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('../datasets/submissions/xgboost.csv', index=False)