# Prediction of bookings based on user behavior
Data Scientist – User Profiling, Hotel Search

- Author: Kai Chen
- Date:   Apr, 2018

In [16]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore", DeprecationWarning)

import numpy as np
import pandas as pd
from datetime import datetime
import operator
from collections import OrderedDict
import time
import csv
import gc
from multiprocessing import Pool

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm as lgb

import catboost
from catboost import CatBoostClassifier

from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras.utils import to_categorical

np.random.seed(42)

Using TensorFlow backend.


In [4]:
train_user_df = pd.read_csv('train_user_df.csv')
print(train_user_df.describe())
print(train_user_df.head(3))

target_user_df = pd.read_csv('target_user_df.csv')


            user_id    session_id  referer_code        is_app      agent_id  \
count  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06   
mean   4.612515e+18  4.607873e+18  1.050245e+01  1.247070e-01  7.267086e+00   
std    2.657178e+18  2.656793e+18  2.855244e+01  3.303864e-01  3.802190e+00   
min    3.883091e+14  1.097161e+14  0.000000e+00  0.000000e+00  0.000000e+00   
25%    2.307265e+18  2.310716e+18  0.000000e+00  0.000000e+00  6.000000e+00   
50%    4.624574e+18  4.606553e+18  1.000000e+00  0.000000e+00  9.000000e+00   
75%    6.897454e+18  6.892477e+18  1.000000e+00  0.000000e+00  1.000000e+01   
max    9.223267e+18  9.223359e+18  9.900000e+01  1.000000e+00  1.400000e+01   

       traffic_type   has_booking     action_id     reference          step  
count  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06  
mean   1.882018e+00  1.301094e-01  2.812956e+03  4.897366e+05  5.463159e+01  
std    1.407386e+00  3.364238e-01  1.636151e+03  2.865

In [10]:
for name, group in train_user_df.groupby('session_id'):
    nb_bookings = group[group['has_booking'] == 1].shape[0]
    if (nb_bookings > 0 and nb_bookings != group.shape[0] and np.max(group['step']) != group.shape[0]):
        print(group)

In [11]:
action_id_list = list(train_user_df.action_id.unique())
print(len(action_id_list))

reference_list = list(train_user_df.reference.unique())
print(len(reference_list))

211
121529


In [13]:
nb_steps_booking_list = []

for name, group in train_user_df.groupby('session_id'):
    nb_bookings = group[group['has_booking'] == 1].shape[0]
    if (nb_bookings > 0):
        nb_steps_booking_list.append(group.shape[0])

print('min nb steps: {}'.format(np.min(nb_steps_booking_list)))
print('max nb steps: {}'.format(np.max(nb_steps_booking_list)))
print('std nb steps: {}'.format(np.std(nb_steps_booking_list)))

min nb steps: 1
max nb steps: 2924
std nb steps: 70.27699664977659


In [14]:
# Is there a common action_id in the sessions with bookings?
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

inter_actions = []

for name, group in train_user_df.groupby('session_id'):
    nb_bookings = group[group['has_booking'] == 1].shape[0]
    if (nb_bookings > 0):
        actions = list(group['action_id'].values)
        inter_actions = intersection(inter_actions, actions)
    
print(inter_actions)

[]


In [15]:
# Is there a common reference in the sessions with bookings?
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

inter_actions = []

for name, group in train_user_df.groupby('session_id'):
    nb_bookings = group[group['has_booking'] == 1].shape[0]
    if (nb_bookings > 0):
        actions = list(group['reference'].values)
        inter_actions = intersection(inter_actions, actions)
    
print(inter_actions)

[]


## Feature engineering

In order to predict if a session has a booking or not, it is not sufficient to take only the session information (i.e., referer_code, is_app, agent_id, traffic_type) and the action information (i.e., action_id, reference) of the last step as features. Ideally, we have to take not only the session information but also all the action information in the session as features. Due to the limitation of the computation resource, for each session, I take the last n steps action information with the session information as features. 


In [17]:
# ---
# Define file paths
TRAIN_BOOKING_FILE_PATH = 'data/case_study_bookings_train.csv'    # training sessions for bookings
TARGET_BOOKING_FILE_PATH = 'data/case_study_bookings_target.csv'  # target sessions to predict bookings

TRAIN_ACTION_FILE_PATH = 'data/case_study_actions_train.csv'      # training set of user actions
TARGET_ACTION_FILE_PATH = 'data/case_study_actions_target.csv'    # user actions in the target sessions

# replace the NAN values by a specific value
NA_ACTION_ID = -10
NA_REFERENCE_ID = -10
NA_STEP = 0

# feature_columns = ['ymd', 'referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference', 'step']
feature_columns = ['referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference', 'step']
target_column   = ['has_booking']

In [None]:
def prepare_data(df, nb_pre_steps=1,
                 feature_columns = ['referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference', 'step'],
                 # feature_columns = ['ymd', 'referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference', 'step'],
                 previous_action_names=['action_id', 'reference'],
                 target_column = 'has_booking',
                 default_action_values = [-10, -10]):
    """
    Create a dataframe, such that each row contains the information of a session.
    Since for each session, there is a sequence of information.
    In this dataframe, for each session,
    I take only the last n steps information with its nb_pre_steps number of previous steps information.
    """
    print('\n== prepare data ==')

    total_nb_rows = len(df['session_id'].unique())

    # initialize the column names
    # columns_add = ['duration'] # add new features
    columns_add = [f_name for f_name in feature_columns]

    for i in range(0, nb_pre_steps):
        for previous_action_name in previous_action_names:
            col_name = '{}_{}'.format(previous_action_name, (i+1))
            columns_add.append(col_name)

    if target_column in df.columns:
        columns_add.append(target_column)

    df_new = pd.DataFrame(columns=columns_add)

    start_time = time.time()
    index = 0 # index of each row
    for name, group in df.groupby('session_id'):
        max_step = np.max(group['step'])

        # get start time
        # min_step = np.min(group['step'])
        # start_time = pd.to_datetime(group[group['step'] == max_step]['ymd'].values[0].astype('str'))
        # get end time
        # end_time = pd.to_datetime(group[group['step'] == min_step]['ymd'].values[0].astype('str'))
        # compute the duration of the session
        # duration = (end_time-start_time).total_seconds()

        # for each session, get its information in the last step
        sub_df = group[group['step'] == max_step]

        # set the initial values of this session
        val_add = []

        # duration
        # val_add.append(duration)

        for feature_column in feature_columns:
            val_add.append(sub_df[feature_column].values[0])

        for i in range(0, nb_pre_steps):
            for j, previous_action_name in enumerate(previous_action_names):
                val_add.append(default_action_values[j])

        if target_column in sub_df.columns:
            val_add.append(sub_df[target_column].values[0])

        df_new = df_new.append(pd.DataFrame([val_add], columns=columns_add))

        # get the session previous steps information and add it to the new row
        for i in range(0, nb_pre_steps):
            step = max_step - i - 1
            sub_df = group[group['step'] == step]
            if (not sub_df is None) and (not sub_df.empty):
                for previous_action_name in previous_action_names:
                    col_name = '{}_{}'.format(previous_action_name, step)
                    # print('previous')
                    # print(previous_df[previous_action].values)
                    # print('----')
                    df_new.iloc[index][col_name] = sub_df[previous_action_name].values[0]


        index += 1

        if index % 20000 == 0:
            time_used = time.time() - start_time
            time_needed = time_used / index * (total_nb_rows-index)
            print('\n{} / {}'.format(index, total_nb_rows))
            print('time used (mins): {}'.format(round(time_used / 60)))
            print('time needed (mins): {}'.format(round(time_needed / 60)))
            print(df_new.iloc[random.randint(0, index-1)][columns_add])
            if target_column in df_new.columns:
                print('{}: {}'.format(target_column, df_new.iloc[random.randint(0, index-1)][target_column]))

    return df_new

def prepare_datasets(param_dict):
    train_user_df = param_dict['train']
    target_user_df = param_dict['target']
    nb_prev_step = param_dict['nb_prev_step']

    print('\n{}'.format(nb_prev_step))

    train_user_df_new = prepare_data(train_user_df, nb_pre_steps=nb_prev_step)
    df_path = '{}-{}.csv'.format('train_user_df', nb_prev_step)
    train_user_df_new.to_csv(df_path, index=False)
    print('\nsave train dataframe to {}'.format(df_path))
    print(train_user_df_new.head(2))

    target_user_df_new = prepare_data(target_user_df, nb_pre_steps=nb_prev_step)
    df_path = '{}-{}.csv'.format('target_user_df', nb_prev_step)
    target_user_df_new.to_csv(df_path, index=False)
    print('\nsave test dataframe to {}'.format(df_path))
    print(target_user_df_new.head(2))

    del train_user_df_new
    del target_user_df_new
    gc.collect()

In [None]:
# ----------
# read train data
train_booking_df = pd.read_csv(TRAIN_BOOKING_FILE_PATH, sep='\t')
train_booking_df['ymd'] = pd.to_datetime(train_booking_df['ymd'].astype('str'))
train_action_df = pd.read_csv(TRAIN_ACTION_FILE_PATH, sep='\t')
train_action_df['ymd'] = pd.to_datetime(train_action_df['ymd'].astype('str'))
train_user_df = pd.merge(train_booking_df, train_action_df, on=['ymd', 'user_id', 'session_id'], how='left')
train_user_df = preprocessing(train_user_df)
train_user_df.to_csv('train_user_df.csv')

# ----------
# read test data
target_booking_df = pd.read_csv(TARGET_BOOKING_FILE_PATH, sep='\t')
target_booking_df['ymd'] = pd.to_datetime(target_booking_df['ymd'].astype('str'))
target_action_df = pd.read_csv(TARGET_ACTION_FILE_PATH, sep='\t')
target_action_df['ymd'] = pd.to_datetime(target_action_df['ymd'].astype('str'))
target_user_df = pd.merge(target_booking_df, target_action_df, on=['ymd', 'user_id', 'session_id'], how='left')
target_user_df = preprocessing(target_user_df)
target_user_df.to_csv('target_user_df.csv')

In [None]:
train_user_df = pd.read_csv('train_user_df.csv')
# train_user_df['ymd'] = pd.to_datetime(train_user_df['ymd'].astype('str'))
target_user_df = pd.read_csv('target_user_df.csv')
# target_user_df['ymd'] = pd.to_datetime(target_user_df['ymd'].astype('str'))

# print(train_user_df.shape)
# print(train_user_df.head(3))
# print(target_user_df.head(3))


# nb_prev_step_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
nb_prev_step_list = [32]
param_dict_list = []
for nb_prev_step in nb_prev_step_list:
    param_dict = dict()
    param_dict['train'] = train_user_df
    param_dict['target'] = target_user_df
    param_dict['nb_prev_step'] = nb_prev_step
    param_dict_list.append(param_dict)

n_jobs = 1
with Pool(n_jobs) as p:
    p.map(prepare_datasets, param_dict_list)

## Define machine learning models

In [18]:
feature_columns = ['referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference', 'step']
target_column = ['has_booking']


def get_train_set(df, feature_columns, target_column):
    print('\n === get train set === \n')

    train_df = df[feature_columns + target_column]

    train_x = train_df[feature_columns]
    train_y = train_df[target_column].values

    # https://stackoverflow.com/questions/31995175/scikit-learn-cross-val-score-too-many-indices-for-array
    """
    When we do cross validation in scikit-learn, the process requires an (R,) shape label instead of (R,1). 
    Although they are the same thing to some extend, their indexing mechanisms are different. So in your case, just add:
    c, r = labels.shape
    labels = labels.reshape(c,)
    """
    c, r = train_y.shape
    train_y = train_y.reshape(c, )

    return train_x, train_y


def get_test_set(df, feature_columns):
    print('\n === get test set === \n')

    test_x = df[feature_columns]

    return test_x


def timer(start_time=None):
    # fork from https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))


def train_xgb(X_train, Y_train, hyperparameter_tuning=False, model_path=None, n_jobs=3, folds=3, param_comb=5, n_estimators=100):
    """
    train a xgb model

    Reference
    https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost
    """

    # xgb_clf = XGBClassifier(learning_rate=0.01,
    #                     n_estimators=200,
    #                     objective='binary:logistic',
    #                     silent=True, nthread=nthread)

    print('\n === train a xgb model === \n')

    xgb_clf = XGBClassifier(n_estimators=n_estimators, nthread=n_jobs, objective='binary:logistic', silent=True,)

    if hyperparameter_tuning:
        print('xgb hyperparameter tuning ...')

        params = {
            'n_estimators': [5, 10, 80, 100, 200],
            'min_child_weight': [1, 5, 10],
            # 'gamma': [0.5, 1, 1.5, 2, 5],
            'gamma': [0.5, 1, 1.5, 2],
            # 'subsample': [0.6, 0.8, 1.0],
            'subsample': [0.6, 0.8, 1],
            # 'colsample_bytree': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1],
            # 'max_depth': [3, 4, 5]
            'max_depth': [2, 4, 6],
        }

        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

        random_search = RandomizedSearchCV(xgb_clf, param_distributions=params, n_iter=param_comb, scoring='roc_auc',
                                           n_jobs=n_jobs,
                                           cv=skf.split(X_train, Y_train),
                                           verbose=3, random_state=42)

        start_time = timer(None)
        random_search.fit(X_train, Y_train)
        timer(start_time)

        print('--------------')
        print('\n all results:')
        print(random_search.cv_results_)

        print('\n best estimator:')
        print(random_search.best_estimator_)

        print('\n best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
        print(random_search.best_score_ * 2 - 1)

        print('\n best xgb hyperparameters:')
        print(random_search.best_params_)

        result_csv_path = 'xgb-random-grid-search-results.csv'
        results = pd.DataFrame(random_search.cv_results_)
        results.to_csv(result_csv_path, index=False)
        print('save xgb random search results to {}'.format(result_csv_path))
        print('--------------')

        #xgb_clf = random_search
        xgb_clf = random_search.best_estimator_
    else:
        xgb_clf.fit(X_train, Y_train)

    if model_path is None:
        xgb_model_path = 'xgb.model'
        if hyperparameter_tuning:
            xgb_model_path = 'xgb.ht.model'
    else:
        xgb_model_path = model_path
        # xgb_clf.save_model(xgb_model_path)
    joblib.dump(xgb_clf, xgb_model_path)
    print('\n save the xgb model to {}'.format(xgb_model_path))

    return xgb_clf, xgb_model_path


def train_rf(X_train, Y_train, hyperparameter_tuning=False, model_path=None, n_jobs=3, folds=3, n_estimators=100):
    """
    train a RF classifier

    Reference
    https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
    """
    print('\n === train a random forest model === \n')

    model = RandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=n_jobs)

    if hyperparameter_tuning:
        # Number of trees in random forest
        #n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
        n_estimators = [5, 10, 80, 100, 200]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        #max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth = [4, 6, 8]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        random_grid = {'n_estimators': n_estimators,
                       'max_features': max_features,
                       'max_depth': max_depth,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'bootstrap': bootstrap}
        #print(random_grid)

        rf_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                       n_iter=100, cv=folds, verbose=2, random_state=42, n_jobs=n_jobs)

        rf_random.fit(X_train, X_train)


        print('--------------')
        print('\n all results:')
        print(rf_random.cv_results_)

        print('\n best estimator:')
        print(rf_random.best_estimator_)

        print('\n best rf parameters:')
        print(rf_random.best_params_)

        print('\n best scores:')
        rf_random.best_score_

        result_cv_path = 'rf-random-grid-search-results.csv'
        results = pd.DataFrame(rf_random.cv_results_)
        results.to_csv(result_cv_path, index=False)
        print('\n save rf random search results to {}'.format(result_cv_path))
        print('--------------')

        model = rf_random.best_estimator_
    else:
        model.fit(X_train, Y_train)

    if model_path is None:
        model_path = 'rf.model'
        if hyperparameter_tuning:
            model_path = 'rf.ht.model'


    joblib.dump(model, model_path)
    print('\n save the rf model to {}'.format(model_path))

    return model, model_path


def train_nb(X_train, Y_train, model_path=None):
    """
    train a naive bayes classifier
    """
    # reference https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained/

    print('\n === train a gaussian naive bayes model === \n')

    model = GaussianNB()
    model.fit(X_train, Y_train,)

    if model_path is None:
        model_path = 'nb.model'

    joblib.dump(model, model_path)
    print('\n save the GaussianNB model to {}'.format(model_path))

    return model, model_path


def train_lgbm(X_train, Y_train,
               # categorical_feature=['referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference'],
               categorical_feature='auto',
               model_path=None, n_jobs=3, hyperparameter_tuning=False, num_boost_round=100, folds=3):
    """
    train a lightGBM model

    Reference
    https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
    https://www.kaggle.com/garethjns/microsoft-lightgbm-with-parameter-tuning-0-823?scriptVersionId=1751960
    """

    print('\n === train a lightGBM === \n')

    d_train = lgb.Dataset(X_train, label=Y_train,
                          # categorical_feature=['aisle_id', 'department_id']
                          categorical_feature=categorical_feature,
                          )


    if not hyperparameter_tuning:
        # params = {
        #     'boosting_type': 'gbdt',
        #     'objective': 'binary',
        #     'num_class': 1,                # must be 1 for non-multiclass training
        #     'metric': 'binary_error',
        #     #'metric': 'binary_logloss',
        #     #'n_jobs': n_jobs,
        #     'nthread': n_jobs,
        #     #'num_leaves': 31,
        #
        #     'num_leaves': 64,
        #     'min_child_weight': 1,
        #     'min_child_samples': 5,
        #     'scale_pos_weight': 1,
        #     'reg_alpha': 5,
        #     'learning_rate': 0.05,
        #     'max_bin': 512,
        #
        #     #'feature_fraction': 0.9,
        #     #'bagging_fraction': 0.8,
        #     #'bagging_freq': 5,
        #     #'verbose': 0
        # }

        params = {'boosting_type': 'gbdt',
                  'max_depth': -1,
                  'objective': 'binary',
                  'nthread': n_jobs,  # Updated from nthread
                  'num_leaves': 64,
                  'learning_rate': 0.05,
                  'max_bin': 512,
                  'subsample_for_bin': 200,
                  'subsample': 1,
                  'subsample_freq': 1,
                  'colsample_bytree': 0.8,
                  'reg_alpha': 5,
                  'reg_lambda': 10,
                  'min_split_gain': 0.5,
                  'min_child_weight': 1,
                  'min_child_samples': 5,
                  'scale_pos_weight': 1,
                  'num_class': 1,
                  'metric': 'binary_error'}

        gbm = lgb.train(params,
                        d_train,
                        num_boost_round=num_boost_round,
                        categorical_feature=categorical_feature)

    else:
        params = {'boosting_type': 'gbdt',
                  'max_depth': -1,
                  'objective': 'binary',
                  'nthread': n_jobs,  # Updated from nthread
                  'num_leaves': 64,
                  'learning_rate': 0.05,
                  'max_bin': 512,
                  'subsample_for_bin': 200,
                  'subsample': 1,
                  'subsample_freq': 1,
                  'colsample_bytree': 0.8,
                  'reg_alpha': 5,
                  'reg_lambda': 10,
                  'min_split_gain': 0.5,
                  'min_child_weight': 1,
                  'min_child_samples': 5,
                  'scale_pos_weight': 1,
                  'num_class': 1,
                  'metric': 'binary_error'}

        gridParams = {
            'learning_rate': [0.005],
            'n_estimators': [8, 16, 24],
            'num_leaves': [6, 8, 12, 16],
            'boosting_type': ['gbdt'],
            'objective': ['binary'],
            'random_state': [42],  # Updated from 'seed'
            'colsample_bytree': [0.64, 0.65, 0.66],
            'subsample': [0.7, 0.75],
            'reg_alpha': [1, 1.2],
            'reg_lambda': [1, 1.2, 1.4],
        }

        mdl = lgb.LGBMClassifier(boosting_type='gbdt',
                                 objective='binary',
                                 n_jobs=n_jobs,  # Updated from 'nthread'
                                 silent=True,
                                 max_depth=params['max_depth'],
                                 max_bin=params['max_bin'],
                                 subsample_for_bin=params['subsample_for_bin'],
                                 subsample=params['subsample'],
                                 subsample_freq=params['subsample_freq'],
                                 min_split_gain=params['min_split_gain'],
                                 min_child_weight=params['min_child_weight'],
                                 min_child_samples=params['min_child_samples'],
                                 scale_pos_weight=params['scale_pos_weight'])

        print(mdl.get_params().keys())

        grid = RandomizedSearchCV(estimator=mdl, param_distributions=gridParams,
                                       n_iter=100, cv=folds, verbose=2, random_state=42, n_jobs=n_jobs)

        #grid = GridSearchCV(mdl, gridParams, verbose=2, cv=folds, n_jobs=n_jobs)
        grid.fit(X_train, Y_train)

        print('best parameters:')
        print(grid.best_params_)
        print('best score: ')
        print(grid.best_score_)

        # using parameters already set above, replace in the best from the grid search
        params['colsample_bytree'] = grid.best_params_['colsample_bytree']
        params['learning_rate'] = grid.best_params_['learning_rate']
        #params['max_bin'] = grid.best_params_['max_bin']
        params['num_leaves'] = grid.best_params_['num_leaves']
        params['reg_alpha'] = grid.best_params_['reg_alpha']
        params['reg_lambda'] = grid.best_params_['reg_lambda']
        params['subsample'] = grid.best_params_['subsample']
        #params['subsample_for_bin'] = grid.best_params_['subsample_for_bin']

        print('Fitting with params: ')
        print(params)

        X_train_sub, X_val, Y_train_sub, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

        d_train_sub = lgb.Dataset(X_train_sub, label=Y_train_sub,
                              # categorical_feature=['aisle_id', 'department_id']
                              categorical_feature=categorical_feature,
                              #categorical_feature='auto'
                              )

        d_val_sub = lgb.Dataset(X_val, label=Y_val,
                              # categorical_feature=['aisle_id', 'department_id']
                              categorical_feature=categorical_feature,
                              #categorical_feature='auto'
                              )

        gbm = lgb.train(params,
                        d_train_sub,
                        num_boost_round=1000,
                        valid_sets=[d_train_sub, d_val_sub],
                        early_stopping_rounds=50,
                        verbose_eval=4)

        # Plot importance
        #lgb.plot_importance(gbm)

    if model_path is None:
        model_path = 'lgbm.model'
        if hyperparameter_tuning:
            model_path = 'lgbm.ht.model'

    # save model to file
    gbm.save_model(model_path)
    print('save the lightGBM model to {}'.format(model_path))

    # load model to predict
    # print('Load model to predict')
    # bst = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # y_pred = bst.predict(X_test)

    return gbm, model_path


def train_catboost(X_train, Y_train,
                   categorical_feature=[0, 1, 2, 3, 4, 5],
                   #categorical_feature=['referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference'],
                   model_path=None, hyperparameter_tuning=False, num_boost_round=100):
    """
    train a catboost model

    Reference:
    https://tech.yandex.com/catboost/doc/dg/concepts/python-usages-examples-docpage/
    """

    print('\n === train a catboost === \n')

    model = CatBoostClassifier(loss_function='Logloss',
                               iterations=num_boost_round,
                               #learning_rate=1,
                               #depth=2
                               )
    model.fit(X_train, Y_train, categorical_feature)

    if model_path is None:
        model_path = 'catboost.model'
        if hyperparameter_tuning:
            model_path = 'catboost.ht.model'

    model.save_model(model_path)

    print('\n save the catboost model to {}'.format(model_path))

    return model, model_path

def train_cnn(X_train, Y_train, model_path, maxlen = 400, epochs = 2):
    # max_features = 5000
    max_features = 1000
    batch_size = 64
    embedding_dims = 20
    filters = 250
    kernel_size = 3
    hidden_dims = 100

    # X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    # Y_train = to_categorical(Y_train)

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen))
    model.add(Dropout(0.2))

    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation='relu',
                     strides=1))
    # we use max pooling:
    model.add(GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train,
              batch_size=batch_size,
              epochs=epochs,
              # validation_data=(x_test, y_test),
              verbose=2,
              )

    model.save(model_path)

    return model, model_path


def predict(model_path, X_test, is_lgbm=False, is_catboost=False, is_cnn=False, maxlen=400, lgbm_threshold=0.5):
    """
    load the model and predict unseen data
    """

    print('\n === predict === \n')

    if is_lgbm:
        # lightgbm
        model = lgb.Booster(model_file=model_path)
    elif is_catboost:
        model = CatBoostClassifier()
        model = model.load_model(model_path)
    elif is_cnn:
        model = load_model(model_path)
    else:
        # sklearn
        # xgboost
        model = joblib.load(model_path)

    # y_pred = model.predict_prob(X_test)
    y_pred = model.predict(X_test)

    if is_lgbm:
        #print('==')
        #print(y_pred)
        y_output = []
        for y in y_pred:
            if y > lgbm_threshold:
                y_output.append(1)
            else:
                y_output.append(0)
        #print('==')
        #print(y_output)
        return(np.array(y_output))
        #return np.array([np.argmax(y) for y in y_pred])
    elif is_cnn:
        # X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        y_pred = model.predict(X_test)
        y_pred = [np.argmax(y) for y in y_pred]
        return np.array(y_pred)
    else:
        return y_pred


def blend_predictions(y_pred_list, threshold=0.5):
    """
    blend the predictions
    """

    print('\n === blend predictions === \n')

    y_pred = y_pred_list[0]

    for i in range(1, len(y_pred_list)):
        for j in range(len(y_pred)):
            y_pred[j] += y_pred_list[i][j]

    # average the predictions
    y_pred = y_pred*1.0 / len(y_pred_list)

    y_output = []
    for y in y_pred:
        if y > threshold:
            y_output.append(1)
        else:
            y_output.append(0)

    return np.array(y_output)


def evaluate(y_true, y_pred):
    """
    evaluate the prediction
    """
    print('\n === evaluate === \n')

    nb_bookings_true = 0
    for y in y_true:
        if y == 1:
            nb_bookings_true += 1
    print('\n number of bookings in y_true: {}'.format(nb_bookings_true))
    print('\n y_true shape:')
    print(y_true.shape)

    nb_bookings_pred = 0
    for y in y_pred:
        if y == 1:
            nb_bookings_pred += 1
    print('\n number of bookings in y_pred {}'.format(nb_bookings_pred))
    print('\n y_pred shape:')
    print(y_pred.shape)

    mcc_score = matthews_corrcoef(y_true, y_pred)
    print('\n matthews corrcoef score {}'.format(mcc_score))

    """
      tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
      print(tn)
      print(fp)
      print(fn)
      print(tp)
      print('---')

      mcc = (tp*tn - fp*fn) / np.math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
      print(mcc)
      print(matthews_corrcoef(y_true, y_pred))
      """

    accuracy = accuracy_score(y_true, y_pred)
    print('\n accuracy: {}'.format(accuracy))
    print('\n classification report:')
    print(classification_report(y_true, y_pred))

    f1score = f1_score(y_true, y_pred, average='binary')
    print('\n f1 score: {}'.format(f1score))

    #roc_auc_score(y_true, y_pred)

    return mcc_score, accuracy, f1score


def plot_performance(x_list, y_list, title, x_label, y_label):
    """
    define the plotting function
    """
    plt.plot(x_list, y_list)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

    
def save_prediction(target_user_df, y_pred, file_path):
    """
    save predictions to a csv file
    """
    prediciton_df = pd.DataFrame(columns=['session_id', 'has_booking'])
    prediciton_df['session_id'] = target_user_df['session_id'].values
    prediciton_df['has_booking'] = [int(y) for y in y_pred]

    print('prediction:')
    print(prediciton_df['has_booking'].unique())

    prediciton_df.to_csv(file_path, '\t', index=False)
    print('save prediction to {}'.format(file_path))

## Evaluation

In [20]:
def evaluation(model_name, train_sub_x, val_x, train_sub_y, val_y, nb_prev_step, num_boost_rounds, hyperparameter_tuning,
               category_feature_index_list,):
    """
    define the model evaluation function
    """
    
    mcc_list = []
    acc_list = []
    f1_list  = []
    dict_mcc_score = dict()
        
    for num_boost_round in num_boost_rounds:
        print('\nnum boost round: {}'.format(num_boost_round))

        y_pred = None

        if model_name == 'catboost':
            model_path = 'catboost-[num_boost_round]{}-[ht]{}-[nb_prev]{}-sub.model'.format(num_boost_round,
                                                                                             hyperparameter_tuning,
                                                                                             nb_prev_step)

            model, model_path = train_catboost(train_sub_x, train_sub_y, hyperparameter_tuning=hyperparameter_tuning,
                                               categorical_feature=category_feature_index_list,
                                               model_path=model_path, num_boost_round=num_boost_round)
            y_pred = predict(model_path, val_x, is_catboost=True)
            
        elif model_name == 'xgb':
            model_path = 'xgb-[num_boost_round]{}-[ht]{}-[nb_prev]{}-sub.model'.format(num_boost_round,
                                                                                       hyperparameter_tuning,
                                                                                       nb_prev_step)
            model, model_path = train_xgb(X_train=train_sub_x, Y_train=train_sub_y,
                                          hyperparameter_tuning=hyperparameter_tuning,
                                          model_path=model_path, n_estimators=num_boost_round)
            y_pred = predict(model_path, val_x)

        elif model_name == 'lgbm':
            model_path = 'lgbm-[num_boost_round]{}-[ht]{}-[nb_prev]{}-sub.model'.format(num_boost_round,
                                                                                       hyperparameter_tuning,
                                                                                       nb_prev_step)

            model, model_path = train_lgbm(train_sub_x, train_sub_y, hyperparameter_tuning=hyperparameter_tuning,
                                           categorical_feature=categorical_feature,
                                           model_path=model_path, num_boost_round=num_boost_round)
            
            y_pred = predict(model_path, val_x, is_lgbm=True)
        
        elif model_name == 'rf':
            model_path = 'rf-[num_boost_round]{}-[ht]{}-[nb_prev]{}-sub.model'.format(num_boost_round,
                                                                                      hyperparameter_tuning,
                                                                                      nb_prev_step)
            
            model, model_path = train_rf(X_train=train_sub_x, Y_train=train_sub_y, 
                                         hyperparameter_tuning=hyperparameter_tuning,
                                         model_path=model_path, n_jobs=2, n_estimators=num_boost_round)
            
            y_pred = predict(model_path, val_x)
            
        elif model_name == 'cnn':
            epochs = 100
            model_path = 'cnn-[epochs]{}-[nb_prev]{}-sub.model'.format(epochs, nb_prev_step)
            maxlen = train_sub_x.shape[1]
            model, model_path = train_cnn(train_sub_x, train_sub_y,
                                           model_path=model_path, epochs=epochs, maxlen=maxlen)
            y_pred = predict(model_path, val_x, is_cnn=True, maxlen=maxlen)

        mcc_score, accuracy, f1score = evaluate(val_y, y_pred)
        
        dict_mcc_score[num_boost_round] = mcc_score
        
        mcc_list.append(mcc_score)
        acc_list.append(accuracy)
        f1_list.append(f1score)

    plot_performance(num_boost_rounds, mcc_list, 'mcc score (model: {} #steps: {})'.format(model_name, nb_prev_step), 
                     'num_boost_round', 'mcc score')
    
    plot_performance(num_boost_rounds, acc_list, 'accuracy (model: {} #steps: {})'.format(model_name, nb_prev_step), 
                     'num_boost_round', 'accuracy')
    
    plot_performance(num_boost_rounds, f1_list, 'f1 score (model: {} #steps: {})'.format(model_name, nb_prev_step), 
                     'num_boost_round', 'f1 score')
    
    return dict_mcc_score

In [22]:
# --------------
# Prepare the datasets

nb_prev_step = 32

no_feature_name_list = ['ymd', 'user_id', 'session_id', 'has_booking',]
no_cat_feature_name  = ['step']

target_columns       = ['has_booking']
feature_columns      = ['referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference', 'step']
    
    
train_user_df = pd.read_csv('train_user_df-{}.csv'.format(nb_prev_step))
target_user_df = pd.read_csv('target_user_df-{}.csv'.format(nb_prev_step))

# get all features names
feature_columns = [feature for feature in train_user_df.columns if not (feature in no_feature_name_list)]
print('feature columns: ')
print(feature_columns)

# get all category features names
categorical_feature = [feature for feature in feature_columns if (not feature in no_cat_feature_name)]
print('category feature columns: ')
print(categorical_feature)

category_feature_index_list = [i for i, feature in enumerate(feature_columns) if not (feature in no_cat_feature_name)]
print('category feature index list:')
print(category_feature_index_list)

# shuffle the train set
train_user_df = train_user_df.reindex(np.random.permutation(train_user_df.index))

train_x, train_y = get_train_set(train_user_df, feature_columns, target_columns)
print('\n -----')
print('train set size:')
print(train_x.shape)
print(train_y.shape)

print('feature columns: ')
print(train_x.columns)   

# prepare validation set
train_sub_x, val_x, train_sub_y, val_y = train_test_split(train_x, train_y, test_size=0.1, random_state=42)


feature columns: 
['referer_code', 'is_app', 'agent_id', 'traffic_type', 'action_id', 'reference', 'step', 'action_id_1', 'reference_1', 'action_id_2', 'reference_2', 'action_id_3', 'reference_3', 'action_id_4', 'reference_4', 'action_id_5', 'reference_5', 'action_id_6', 'reference_6', 'action_id_7', 'reference_7', 'action_id_8', 'reference_8', 'action_id_9', 'reference_9', 'action_id_10', 'reference_10', 'action_id_11', 'reference_11', 'action_id_12', 'reference_12', 'action_id_13', 'reference_13', 'action_id_14', 'reference_14', 'action_id_15', 'reference_15', 'action_id_16', 'reference_16', 'action_id_17', 'reference_17', 'action_id_18', 'reference_18', 'action_id_19', 'reference_19', 'action_id_20', 'reference_20', 'action_id_21', 'reference_21', 'action_id_22', 'reference_22', 'action_id_23', 'reference_23', 'action_id_24', 'reference_24', 'action_id_25', 'reference_25', 'action_id_26', 'reference_26', 'action_id_27', 'reference_27', 'action_id_28', 'reference_28', 'action_id_29',

In [None]:
# ---------------
# Plot the performance

num_boost_rounds = [100, 200, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 6000, 10000]
hyperparameter_tuning = False
model_name = 'xgb'

dict_mcc_score_xgb = evaluation(model_name, train_sub_x, val_x, train_sub_y, val_y, nb_prev_step, 
                            num_boost_rounds, hyperparameter_tuning, category_feature_index_list,)


model_name = 'catboost'

dict_mcc_score_catboost = evaluation(model_name, train_sub_x, val_x, train_sub_y, val_y, nb_prev_step, 
                            num_boost_rounds, hyperparameter_tuning, category_feature_index_list,)



num boost round: 100

 === train a xgb model === 


 save the xgb model to xgb-[num_boost_round]100-[ht]False-[nb_prev]32-sub.model

 === predict === 


 === evaluate === 


 number of bookings in y_true: 2001

 y_true shape:
(30768,)

 number of bookings in y_pred 67

 y_pred shape:
(30768,)

 matthews corrcoef score 0.11774788466590541

 accuracy: 0.9357774310972439

 classification report:
             precision    recall  f1-score   support

          0       0.94      1.00      0.97     28767
          1       0.69      0.02      0.04      2001

avg / total       0.92      0.94      0.91     30768


 f1 score: 0.04448742746615087

num boost round: 200

 === train a xgb model === 


 save the xgb model to xgb-[num_boost_round]200-[ht]False-[nb_prev]32-sub.model

 === predict === 


 === evaluate === 


 number of bookings in y_true: 2001

 y_true shape:
(30768,)

 number of bookings in y_pred 127

 y_pred shape:
(30768,)

 matthews corrcoef score 0.17009575297938734

 accuracy: 0.

## Make predictions

In [None]:
test_x = get_test_set(target_user_df, feature_columns)

print('\n -----')
print('test set size:')
print(test_x.shape)        

In [None]:
num_boost_round = 1000

model, model_path = train_catboost(train_x, train_y, hyperparameter_tuning=False,
                                   categorical_feature=category_feature_index_list,
                                   model_path=model_path, num_boost_round=num_boost_round)

print('== make predictions (with historical data) ==')

y_pred = predict(model_path=model_path, X_test=test_x, is_catboost=True)

prediction_file_path = 'prediction-catboost-{}-{}-{}-{}.csv'.format(num_boost_round, nb_prev_step)

save_prediction(target_user_df, y_pred, prediction_file_path)

## Summary

The object of this task is to predict if a session has a booking nor not. This is a classic binary classification problem. The target is 0 or 1, where 0 means no booking and 1 means has a booking. The features I have used in this task are: session information (i.e., referer_code, is_app, agent_id, and traffic_type) and action information (i.e., action_id, reference) of the last n steps. Please note, ideally, for each session all the steps action information should be considered as features. Due to the limitation of the computation resource, I took the last n steps action information. 

We can see that in the experiments, with n=32, by using Gradient Boosting model, we achieved on the validation set. I believe with large n, the performance will be improved. 


## Future work

- Hyperparameter tuning with cross validation. Due to the limitation of time and computation power, I didn't try hyperparameter tuning for each of the models. However, the hyperparameter tuning functions have been implemented already.

- Feature engineering, e.g., increase the number of last steps. Ideally, all the steps should be considered as features.

- Deep learning for booking prediction. We could use RNN (or GRU, LSTM) to model the sequences of actions for the sessions.

## Additional questions

### What makes the classification problem difficult in this task? How do you handle that?
  
  - One difficulty is that we have the categorical features with various values, such as action id, reference.
  One solution is using the tree based models, such as xgboost, catboost, random forest, lightGBM. These model can handle the categorical features automatically. Another solution is using the on-hot encoding approach, by using prior knowledge, we can first group the values into several categories.
  
  - Another difficulty is a user will or will not book a hotel in the current step is not only affect by the current action but also affected by the previous actions. In other words, for each step, with only the current action information, it's not sufficient enough to predict if the user will book the hotel nor not. Therefore, the previous action information should be included as features. Note due to the limitation of time and computation power, only a part of the experiment results are included in this notebook. The details are explained in Section "Second approach: Using historical data".
  

### Evaluate and compare at least 3 classification methods for this task.
This has been shown in Sections "First approach" and "Second approach: Using historical data".
  

### Propose at least 3 features that are significant to predict bookings?

* User historical information
  For example,
    - The historical booking information
    - The historical searching information (this feature I have used in previous section of this notebook)
    
* Hotel information
  If a user select a hotel (e.g., click the button 'View Deal') then the features of the hotel is significant to predict bookings. For example:
    - Images
    - Reviews
    - Rating
    - Location
    - Discount
    - Number of users who viewed this hotel
    
* Time information
  The time information could be also helpful. For example, we could convert the dates into several categories. Because we know that during the tourist season, the number of bookings is large.
   
* Action information
  If a user select a hotel, we can provide a button 'Add to list'. In other words, if a user adds some hotels to the list, the likelihood for him to book a hotel will be high. However, I have checked several hotel/flight booking engines. None of them have this feature. Therefore, I doubt the effectiveness of this feature. Personally I think this feature is useful. But I don't know the reason why this feature is not there.
  

### We can spot a very significant action type. What might this action refer to?

It has been shown in Section "Step 1: Step 1: read and explore the data" that action 2142 has the largest number of bookings, i.e., 173073. This action might refer to clicking the 'View Deal'.