In [None]:
# Filter warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# model lib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

## Loading dataset

Load the train, test and session data from the zipped CSV files.

In [None]:
# Read the zipped train/test user tables, then report each (rows, columns) shape.
train_data = pd.read_csv(
    '../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip'
)
test_data = pd.read_csv(
    '../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip'
)
for users in (train_data, test_data):
    print(users.shape)

In [None]:
# Read the zipped sessions table and print its (rows, columns) shape.
session_data= pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip')
print(session_data.shape)

In [None]:
# Column dtypes and non-null counts of the training users.
train_data.info()

In [None]:
# Column dtypes and non-null counts of the test users.
test_data.info()

In [None]:
def _missing_summary(frame):
    """Return the columns of `frame` that contain nulls and, for each of
    them, the share of null rows formatted as a percentage string."""
    n_rows = frame.shape[0]
    null_counts = frame.isnull().sum()
    columns = [col for col in frame.columns if null_counts[col] > 0]
    ratios = ['{:0.1f}%'.format(100 * null_counts[col] / n_rows)
              for col in columns]
    return columns, ratios


# Columns with missing values (and how much is missing) in train_data
missing_value_by_train_cols, missing_ratio_in_train_data = _missing_summary(train_data)

# Same summary for test_data
missing_value_by_test_cols, missing_ratio_in_test_data = _missing_summary(test_data)

for summary in (missing_value_by_train_cols,
                missing_ratio_in_train_data,
                missing_value_by_test_cols,
                missing_ratio_in_test_data):
    print(summary)

In [None]:
# First rows of the raw session log.
session_data.head()

### Insight from session_data
* When `action_type` and `action_detail` are `message_post`, `action` is NaN. `secs_elapsed` is not always present for these rows.
* When `action` is `lookup`, both `action_type` and `action_detail` are NaN.

In [None]:
def plot_action_feature_ratio(before_n_rank):
    """Plot and summarise the share of the most frequent session actions.

    Reads the module-level `session_data` frame. The non-null `action`
    values are counted and sorted in descending order, the first
    `before_n_rank` counts are expressed as a percentage of all non-null
    actions, and a cumulative histogram of those percentages is drawn.

    Parameters
    ----------
    before_n_rank : int
        How many of the most frequent actions to keep.

    Returns
    -------
    str
        'Ratio contains: <sum of the kept percentages>'.
    """
    actions = session_data['action']
    observed = actions[actions.notnull()]
    counts = observed.value_counts().sort_values(ascending=False)
    top_share = 100 * counts[:before_n_rank] / actions.count()
    top_share.hist(bins=50, density=False, cumulative=True)
    return 'Ratio contains: {}'.format(top_share.sum())

# Share of all non-null actions covered by the 35 most frequent ones.
plot_action_feature_ratio(35)

In [None]:
# Column dtypes and memory usage of the raw session log.
session_data.info()

* `date_first_booking` cannot be used as a feature (it is empty for the test users), so we remove it from the data.
* `age` is missing for about half of the rows, so we need to fill the missing values.
* The `-unknown-` placeholder needs to be replaced with `np.nan` so that it is treated as a missing value.

## Feature Engineering

We simply use all the features to build a baseline model. In this step we separate each date column into year and month (the day part is currently left out) and turn the categorical columns into one-hot encoded columns.

In [None]:
# 'df_total' is the feature-engineering matrix: the training users without
# their target column stacked on top of the test users, minus the
# 'date_first_booking' column.
df_train = train_data.drop(columns=['country_destination'])
df_total = (
    pd.concat([df_train, test_data], axis=0, ignore_index=True)
    .drop(columns=['date_first_booking'])
)

In [None]:
# Column dtypes and non-null counts of the combined train + test frame.
df_total.info()

### Total data for Feature Engineering

In [None]:
# Split the 'YYYY-MM-DD' account-creation date into integer parts.
# `date_account_created` stays a 2-D integer array (one row per user, one
# column per date part) because later cells index it positionally.
date_account_created = (
    df_total['date_account_created']
    .astype(str)
    .str.split('-', expand=True)
    .astype(int)
    .to_numpy()
)

# Only year and month become features; the day part is not used, and the
# raw date column is dropped once its parts are extracted.
df_total = df_total.assign(
    date_account_created_year=date_account_created[:, 0],
    date_account_created_month=date_account_created[:, 1],
).drop(columns='date_account_created')

In [None]:
# Split the first-active timestamp into year, month and day. The first eight
# characters of its string form are read as YYYYMMDD; the rest is ignored.
first_active_text = df_total['timestamp_first_active'].astype(str)
first_active = np.column_stack([
    first_active_text.str[:4].astype(int),
    first_active_text.str[4:6].astype(int),
    first_active_text.str[6:8].astype(int),
])

# Fix: these features must come from `first_active`. They were previously
# copied from `date_account_created`, duplicating the account-creation
# year/month.
df_total['first_active_year'] = first_active[:, 0]
df_total['first_active_month'] = first_active[:, 1]
# df_total['first_active_day'] = first_active[:, 2]

# Drop timestamp_first_active because it is no longer used
df_total = df_total.drop('timestamp_first_active', axis=1)

In [None]:
# Values above 1750 are treated as a birth year and converted to an age
# relative to 2015; everything else (including NaN) is kept as is.
# The corrected age is then put on a log2 scale.
raw_age = df_total['age']
corrected_age = raw_age.where(~(raw_age > 1750), np.trunc(2015 - raw_age))
df_total['age'] = np.log2(corrected_age)

In [None]:
# Distribution of the log2-scaled age (missing values are still present here).
df_total.age.hist(bins=100)

In [None]:
# Fill missing ages with the column mean using SimpleImputer.
from sklearn.impute import SimpleImputer

age_freq_imputer = SimpleImputer(strategy='mean')
# SimpleImputer expects a 2-D array: one column holding every user's age.
age = df_total.loc[:, "age"].values.reshape(-1, 1)
# Fix: fit_transform returns a new array and leaves its input untouched, so
# its result (not the original `age`) has to be written back.
age = age_freq_imputer.fit_transform(age)
df_total["age"] = age.ravel()

# Verify the imputation is successful
assert df_total["age"].isnull().sum() == 0

In [None]:
# Fill missing values in the language column with its most frequent value.
lang_freq_imputer = SimpleImputer(strategy='most_frequent')
# SimpleImputer expects a 2-D array: one column holding every user's language.
lang = df_total.loc[:, "language"].values.reshape(-1, 1)
# Fix: fit_transform returns a new array and leaves its input untouched, so
# its result (not the original `lang`) has to be written back.
lang = lang_freq_imputer.fit_transform(lang)
df_total["language"] = lang.ravel()

# Verify the imputation is successful
assert df_total["language"].isnull().sum() == 0

#### Session engineering
* Fill NaN with 'NULL' to mark a missing value
* For each user, calculate
    * the number of actions taken
    * the number of unique actions, action types, action details and devices
    * the sum of the elapsed seconds

In [None]:
# Work on a copy so the cleaning steps below do not modify the raw
# `session_data` frame.
df_session = session_data.copy()

In [None]:
# Share of missing values per session column.
# Fix: the ratio is nulls / all rows, expressed in percent. It was previously
# nulls / non-null rows and never multiplied by 100.
print('action ratio: {:0.2f}%'.format(100 * session_data['action'].isnull().mean()))
print('action_type ratio: {:0.2f}%'.format(100 * session_data['action_type'].isnull().mean()))
print('action_detail ratio: {:0.2f}%'.format(100 * session_data['action_detail'].isnull().mean()))
print('sec_elasped ratio: {:0.2f}%'.format(100 * session_data['secs_elapsed'].isnull().mean()))

# Use 'NULL' as an explicit category for both NaN and the '-unknown-'
# placeholder. 'secs_elapsed' is left out so it keeps its NaN values and can
# still be summed later; this also avoids filling it with the string 'NULL'
# and converting it back.
text_cols = [col for col in df_session.columns if col != 'secs_elapsed']
df_session[text_cols] = (
    df_session[text_cols]
    .replace('-unknown-', 'NULL')
    .fillna('NULL')
)

In [None]:
# Check that every '-unknown-' placeholder was replaced by 'NULL'.
# Fix: `assert` needs a single boolean. Asserting on a whole DataFrame raises
# "The truth value of a DataFrame is ambiguous", so the element-wise
# comparison is reduced to one value first.
assert not (df_session == '-unknown-').any().any()

In [None]:
# Per-user aggregates of the session log. Each result is a two-column frame:
# 'user_id' plus the aggregated value.
# NOTE(review): these are computed from the raw `session_data`, not from the
# cleaned `df_session`, so NaN is not counted as a category — confirm this is
# intended.
sessions_by_user = session_data.groupby('user_id')

# Number of (non-null) actions and number of distinct values per user
action_count = sessions_by_user['action'].count().reset_index()
unq_action_count = sessions_by_user['action'].nunique().reset_index()
unq_action_type = sessions_by_user['action_type'].nunique().reset_index()
unq_action_detail = sessions_by_user['action_detail'].nunique().reset_index()
unq_device = sessions_by_user['device_type'].nunique().reset_index()

# Total elapsed seconds per user
sum_sec_elapsed = sessions_by_user['secs_elapsed'].sum().reset_index()

In [None]:
# Give every aggregate frame the columns ['user_id', <feature name>].
for frame, feature_name in (
    (action_count, 'action_count'),
    (unq_action_count, 'unq_action_count'),
    (unq_action_type, 'unq_action_type'),
    (unq_action_detail, 'unq_action_detail'),
    (unq_device, 'unq_device'),
    (sum_sec_elapsed, 'sum_sec_elapsed'),
):
    frame.columns = ['user_id', feature_name]

In [None]:
# Distribution of the total elapsed seconds per user, before any transformation.
sum_sec_elapsed.hist(bins=100)

In [None]:
def _log_plus_one(values):
    """Return log(v + 1) for every element v of the Series `values`."""
    return values.apply(lambda v: np.log(v + 1))


# Log transformation of the per-user session aggregates
log_action_count = _log_plus_one(action_count.action_count)
log_unq_action_count = _log_plus_one(unq_action_count.unq_action_count)
log_unq_action_type = _log_plus_one(unq_action_type.unq_action_type)
log_unq_action_detail = _log_plus_one(unq_action_detail.unq_action_detail)
log_sum_sec_elapsed = _log_plus_one(sum_sec_elapsed.sum_sec_elapsed)

In [None]:
# Distribution of each session feature after the log transformation.
# One labelled panel per feature. The former bare plt.legend() call was
# removed because no plotted artist carries a label, so it drew nothing.
log_features = {
    'log(action_count + 1)': log_action_count,
    'log(unq_action_count + 1)': log_unq_action_count,
    'log(unq_action_type + 1)': log_unq_action_type,
    'log(unq_action_detail + 1)': log_unq_action_detail,
    'log(sum_sec_elapsed + 1)': log_sum_sec_elapsed,
}

fig, axes = plt.subplots(len(log_features), 1, figsize=(8, 14))
for ax, (label, values) in zip(axes, log_features.items()):
    ax.hist(values, bins=100)
    ax.set_title(label)
    ax.set_ylabel('users')
fig.suptitle('Distributions after Log trans')
# Keep the panel titles from overlapping the neighbouring axes.
fig.tight_layout()
plt.show()

In [None]:
def _with_user_id(frame, logged):
    """Place the 'user_id' column of `frame` next to the Series `logged`."""
    return pd.concat([frame.user_id, logged], axis=1)


# Put the user_id column back next to each log-transformed feature
log_action_count = _with_user_id(action_count, log_action_count)
log_unq_action_count = _with_user_id(unq_action_count, log_unq_action_count)
log_unq_action_type = _with_user_id(unq_action_type, log_unq_action_type)
log_unq_action_detail = _with_user_id(unq_action_detail, log_unq_action_detail)
log_sum_sec_elapsed = _with_user_id(sum_sec_elapsed, log_sum_sec_elapsed)

In [None]:
# Merge the per-user session features back into the main dataframe.
# Fix: each feature frame's key is renamed to 'id' so the join needs no
# `suffixes`. Merging repeatedly with left_on='id'/right_on='user_id' and
# suffixes=('', '_y') produced a second 'user_id_y' column on the third
# merge, which recent pandas rejects with a MergeError. Users without any
# session get NaN for these features (left join).
session_feature_frames = [
    log_action_count,
    log_unq_action_count,
    log_unq_action_type,
    log_unq_action_detail,
    unq_device,
    log_sum_sec_elapsed,
]

for feature_frame in session_feature_frames:
    df_total = df_total.merge(
        feature_frame.rename(columns={'user_id': 'id'}),
        on='id',
        how='left',
        # every feature frame comes from a groupby on user_id, so each key
        # may appear at most once on the right-hand side
        validate='many_to_one',
    )

### Fill NaN in `first_affiliate_tracked` with the most frequent value ('untracked')

In [None]:
# Count, number of unique values, top value and its frequency for every
# object-typed column.
df_total.describe(include='object')

In [None]:
# Missing 'first_affiliate_tracked' values become the 'untracked' category.
tracked = df_total['first_affiliate_tracked']
df_total['first_affiliate_tracked'] = tracked.where(tracked.notnull(), 'untracked')

# Verify that no missing value is left
assert not df_total['first_affiliate_tracked'].isnull().any()

In [None]:
# Column dtypes and non-null counts after the session features were merged in.
df_total.info()

### Impute missing value with KNN

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# Replace the '-unknown-' placeholder in gender with np.nan.
# Fix: assign the result back to the column. Calling
# `df_total.gender.replace(..., inplace=True)` works on an intermediate
# Series and does not reliably update df_total (it is a no-op under pandas
# Copy-on-Write).
df_total['gender'] = df_total['gender'].replace('-unknown-', np.nan)

In [None]:
# Column names currently present in the feature matrix.
df_total.columns

In [None]:
# Impute the missing gender values with a KNN imputer.
#
# The previous version of this step had no effect and then failed:
#   * NaN was label-encoded as the string 'nan', so there was nothing left
#     for the imputer to fill;
#   * every column of df_knn was named 0, so df_knn.loc[:, 0] returned all
#     ten columns instead of the gender column;
#   * the array returned by fit_transform was discarded.
# The encoding, imputation and write-back are kept together here because
# they form one logical step.

ohe_to_fill_gender_na_feature = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser'
]

# Treat the '-unknown-' placeholder as missing even if it is still present.
gender_observed = df_total['gender'].replace('-unknown-', np.nan)
gender_known = gender_observed.notnull()
assert gender_known.any(), 'no known gender value to learn from'
gender_encoder = LabelEncoder().fit(gender_observed[gender_known])

# Label-encode every feature into its own named column. Only gender keeps
# NaN for the rows that have to be imputed; the other features encode a
# missing value as its own label, as before.
encoded_columns = {}
for feature in ohe_to_fill_gender_na_feature:
    if feature == 'gender':
        codes = pd.Series(np.nan, index=df_total.index, dtype=float)
        codes.loc[gender_known] = gender_encoder.transform(gender_observed[gender_known])
    else:
        codes = pd.Series(
            LabelEncoder().fit_transform(df_total[feature].astype(str)),
            index=df_total.index,
        )
    encoded_columns[feature] = codes
df_knn = pd.DataFrame(encoded_columns)

# NOTE(review): the distance is computed on label codes, which imposes an
# arbitrary ordering on the categories, and the search over all rows can be
# slow on the full data set — confirm this is acceptable for a baseline.
KNN_imputer = KNNImputer(n_neighbors=2,
                         missing_values=np.nan,
                         weights='uniform',
                         metric='nan_euclidean',
                         copy=True)

# fit_transform returns the imputed matrix; df_knn itself is left unchanged.
knn_imputed = KNN_imputer.fit_transform(df_knn)

# The imputed gender is the mean of the neighbours' codes; round it to the
# nearest valid code and map it back to the original label.
gender_position = df_knn.columns.get_loc('gender')
gender_codes = np.rint(knn_imputed[:, gender_position]).astype(int)
df_total['gender'] = gender_encoder.inverse_transform(gender_codes)

assert df_total.gender.isnull().sum() == 0

In [None]:
# Categorical features that are replaced by their one-hot (dummy) columns.
one_hot_encoding_feature = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser'
]

# Each listed column is removed and its '<feature>_<value>' dummy columns are
# appended after the remaining columns, in the order of the list.
df_total = pd.get_dummies(df_total, columns=one_hot_encoding_feature)

In [None]:
# Encode the target and cut the feature matrix back into its train and test
# parts. The first len(train_data) rows of df_total are the training users,
# because train was placed first when df_total was concatenated.
le = LabelEncoder()
country_destination = train_data['country_destination']
train_y = le.fit_transform(country_destination)

# The user id is an identifier, not a feature.
df_final = df_total.drop(columns='id')

n_train_rows = train_data.shape[0]
train_X = df_final.iloc[:n_train_rows, :]
test_X = df_final.iloc[n_train_rows:, :]


In [None]:
# Destination labels known to the encoder; the position of a label is its
# integer code in train_y.
le.classes_

In [None]:
# Hyper-parameters of the baseline XGBoost classifier. 'multi:softprob'
# makes the model output one probability per destination class.
xgb_params = {
    'max_depth': 10,
    'learning_rate': 0.01,
    'n_estimators': 50,
    'min_child_weight': 1,
    'objective': 'multi:softprob',
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'seed': 0,
}
xgb = XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on all training users.
model = xgb.fit(train_X, train_y)

In [None]:
# Class probabilities for every test user: one row per user, one column per class.
y_pred = model.predict_proba(test_X)

In [None]:
# Class labels the fitted classifier was trained on.
xgb.classes_

## Verify the NDCG of the prediction

Here is the NDCG function used to measure the XGB performance.

In [None]:
from sklearn.preprocessing import LabelBinarizer
# Only make_scorer is imported from sklearn.metrics: this notebook defines
# its own ndcg_score below, which would shadow sklearn's function of the same
# name. The scorer is built once, after that definition; the earlier
# duplicate built here was overwritten before any use.
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Every sample has exactly one relevant class (its true label), so the
    ideal DCG is 1 and the per-sample NDCG equals 1 / log2(rank + 2), where
    rank is the 0-based position of the true label among the k best-scored
    classes, or 0 when the label is not among them.

    Parameters
    ----------
    ground_truth : array-like, shape = [n_samples]
        Ground truth (true labels represented as integers in
        [0, n_classes)).
    predictions : array-like, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float
        Mean NDCG over all samples.

    Raises
    ------
    ValueError
        If `predictions` is not 2-D or its number of rows differs from the
        length of `ground_truth`.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    predictions = np.asarray(predictions, dtype=float)
    ground_truth = np.asarray(ground_truth)

    if predictions.ndim != 2:
        raise ValueError('predictions must have shape [n_samples, n_classes]')
    if ground_truth.shape[0] != predictions.shape[0]:
        raise ValueError('ground_truth and predictions must have the same number of samples')

    # Fix: the relevance of a class depends on the number of classes
    # (columns of `predictions`), not on the number of samples. The former
    # LabelBinarizer fitted on range(len(predictions) + 1) built an
    # n_samples x (n_samples + 1) matrix and failed whenever there were
    # fewer samples than classes.
    # Class indices of the k best-scored classes, best first, per sample
    top_k = np.argsort(predictions, axis=1)[:, ::-1][:, :k]
    # True where the true label sits in the top-k ranking
    hits = top_k == ground_truth.reshape(-1, 1)

    # Rank r (0-based) is discounted by log2(r + 2)
    discounts = np.log2(np.arange(top_k.shape[1]) + 2)
    scores = np.sum(hits / discounts, axis=1)

    return np.mean(scores)


# NDCG scorer for scikit-learn model selection: wraps the ndcg_score function
# defined in this notebook, is fed predicted probabilities (needs_proba=True)
# and evaluates the top k=5 classes.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method` — confirm against the installed version.
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

### Set up 5-fold cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

# 5-fold splitter shared by later cells; rows are shuffled with a fixed seed
# so the folds are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# One entry per evaluated model, appended by performance_measures().
results = []


def performance_measures(model, store_results=True):
    """Cross-validate `model` on the training data with the NDCG@5 scorer.

    Uses the notebook-level `train_X`, `train_y`, `ndcg_scorer` and `kf`.
    The test users have no labels, so only the training data can be scored;
    the previous version referenced undefined names (`X_train_transformed`,
    `y_train`, `X_test_transformed`, `y_test`) and raised a NameError when
    called.

    Parameters
    ----------
    model : estimator
        Unfitted estimator that provides `fit` and `predict_proba`.
    store_results : bool
        When True, append the mean and standard deviation of the fold scores
        to the module-level `results` list.

    Returns
    -------
    numpy.ndarray
        NDCG score of every fold.
    """
    cv_ndcg = cross_val_score(model, train_X, train_y,
                              scoring=ndcg_scorer, cv=kf, n_jobs=-1)
    print("Mean CV NDCG: {}\nStd CV NDCG: {}".format(cv_ndcg.mean(), cv_ndcg.std()))

    if store_results:
        results.append({
            'model': type(model).__name__,
            'mean_ndcg': cv_ndcg.mean(),
            'std_ndcg': cv_ndcg.std(),
        })
    return cv_ndcg

### Generate the final Submission.csv

In [None]:
# Build the two submission columns: every test id repeated once per
# recommended country, and for each user the 5 most probable countries, most
# probable first.
# The ranking is computed for all users at once instead of calling
# le.inverse_transform once per user inside a Python loop.
top_country_codes = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]

test_id = np.repeat(test_data['id'].to_numpy(), top_country_codes.shape[1]).tolist()
cities_list = le.inverse_transform(top_country_codes.ravel()).tolist()

# Make sure both columns have same rows
print('length of test_id: {}'.format(len(test_id)))
print('length of cities_list: {}'.format(len(cities_list)))
assert len(test_id) == len(cities_list)

In [None]:
# Write the (id, country) pairs to submission.csv without the row index.
sub = pd.DataFrame({'id': test_id, 'country': cities_list},
                   columns=['id', 'country'])
sub.to_csv('submission.csv', index=False)