# Prediction of bookings based on user behavior
Data Scientist – User Profiling, Hotel Search

- Author: Kai Chen
- Date:   Apr, 2018

In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore", DeprecationWarning)

import numpy as np
import pandas as pd
from datetime import datetime
import operator
from collections import OrderedDict
import time
import csv
import gc
from multiprocessing import Pool

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm as lgb

import catboost
from catboost import CatBoostClassifier

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [4]:
# Read the training CSV, then print its summary statistics followed by its
# first three rows.
train_user_df = pd.read_csv('train_user_df.csv')
print(train_user_df.describe(), train_user_df.head(3), sep='\n')

# Read the second CSV (`target_user_df.csv`); it is only loaded here.
target_user_df = pd.read_csv('target_user_df.csv')


            user_id    session_id  referer_code        is_app      agent_id  \
count  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06   
mean   4.612515e+18  4.607873e+18  1.050245e+01  1.247070e-01  7.267086e+00   
std    2.657178e+18  2.656793e+18  2.855244e+01  3.303864e-01  3.802190e+00   
min    3.883091e+14  1.097161e+14  0.000000e+00  0.000000e+00  0.000000e+00   
25%    2.307265e+18  2.310716e+18  0.000000e+00  0.000000e+00  6.000000e+00   
50%    4.624574e+18  4.606553e+18  1.000000e+00  0.000000e+00  9.000000e+00   
75%    6.897454e+18  6.892477e+18  1.000000e+00  0.000000e+00  1.000000e+01   
max    9.223267e+18  9.223359e+18  9.900000e+01  1.000000e+00  1.400000e+01   

       traffic_type   has_booking     action_id     reference          step  
count  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06  5.864434e+06  
mean   1.882018e+00  1.301094e-01  2.812956e+03  4.897366e+05  5.463159e+01  
std    1.407386e+00  3.364238e-01  1.636151e+03  2.865

In [10]:
# Print every session in which some, but not all, rows have has_booking == 1
# and whose largest `step` value differs from its number of rows.
for session_id, session_rows in train_user_df.groupby('session_id'):
    n_rows = len(session_rows)
    n_booked = int((session_rows['has_booking'] == 1).sum())

    # Skip sessions without any booking row or where every row is a booking row.
    if n_booked == 0 or n_booked == n_rows:
        continue
    # Skip sessions whose maximum step equals the row count.
    if np.max(session_rows['step']) == n_rows:
        continue

    print(session_rows)

In [11]:
# Distinct values of the `action_id` and `reference` columns of the training
# frame, followed by how many of each there are.
action_id_list = list(train_user_df['action_id'].unique())
reference_list = list(train_user_df['reference'].unique())

print(len(action_id_list), len(reference_list), sep='\n')

211
121529


In [13]:
# Row count of every session that contains at least one row with
# has_booking == 1.
nb_steps_booking_list = [
    len(session_rows)
    for _, session_rows in train_user_df.groupby('session_id')
    if (session_rows['has_booking'] == 1).any()
]

# Spread of those row counts.
print(
    'min nb steps: {}'.format(np.min(nb_steps_booking_list)),
    'max nb steps: {}'.format(np.max(nb_steps_booking_list)),
    'std nb steps: {}'.format(np.std(nb_steps_booking_list)),
    sep='\n',
)

min nb steps: 1
max nb steps: 2924
std nb steps: 70.27699664977659


In [14]:
# Is there a common action_id in the sessions with bookings?
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    The order and any duplicates of ``lst1`` are preserved.

    Parameters
    ----------
    lst1 : sequence
        Values to filter.
    lst2 : sequence
        Values to test membership against.

    Returns
    -------
    list
        Every ``value`` of ``lst1`` for which ``value in lst2`` holds.
    """
    try:
        # A set gives constant-time membership tests instead of rescanning
        # ``lst2`` for every element of ``lst1``.
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable values on either side: fall back to the linear scan.
        return [value for value in lst1 if value in lst2]

# Values of `action_id` that occur in every session containing at least one
# row with has_booking == 1.
#
# `None` means "no booking session seen yet". Starting the running result as
# an empty list would make every intersection empty, so the cell would always
# print [] regardless of the data.
inter_actions = None

for name, group in train_user_df.groupby('session_id'):
    nb_bookings = group[group['has_booking'] == 1].shape[0]
    if (nb_bookings > 0):
        actions = list(group['action_id'].values)
        if inter_actions is None:
            # First booking session: start from its distinct values, keeping
            # their order of appearance.
            inter_actions = list(dict.fromkeys(actions))
        else:
            inter_actions = intersection(inter_actions, actions)
        if not inter_actions:
            # Nothing is shared any more; later sessions cannot add values back.
            break

# No booking session at all: report an empty intersection.
if inter_actions is None:
    inter_actions = []

print(inter_actions)

[]


In [15]:
# Is there a common reference in the sessions with bookings?
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    The order and any duplicates of ``lst1`` are preserved.

    Parameters
    ----------
    lst1 : sequence
        Values to filter.
    lst2 : sequence
        Values to test membership against.

    Returns
    -------
    list
        Every ``value`` of ``lst1`` for which ``value in lst2`` holds.
    """
    try:
        # A set gives constant-time membership tests instead of rescanning
        # ``lst2`` for every element of ``lst1``.
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable values on either side: fall back to the linear scan.
        return [value for value in lst1 if value in lst2]

# Values of `reference` that occur in every session containing at least one
# row with has_booking == 1. The variable names `inter_actions` / `actions`
# are kept as they were, although they hold `reference` values in this cell.
#
# `None` means "no booking session seen yet". Starting the running result as
# an empty list would make every intersection empty, so the cell would always
# print [] regardless of the data.
inter_actions = None

for name, group in train_user_df.groupby('session_id'):
    nb_bookings = group[group['has_booking'] == 1].shape[0]
    if (nb_bookings > 0):
        actions = list(group['reference'].values)
        if inter_actions is None:
            # First booking session: start from its distinct values, keeping
            # their order of appearance.
            inter_actions = list(dict.fromkeys(actions))
        else:
            inter_actions = intersection(inter_actions, actions)
        if not inter_actions:
            # Nothing is shared any more; later sessions cannot add values back.
            break

# No booking session at all: report an empty intersection.
if inter_actions is None:
    inter_actions = []

print(inter_actions)

[]


## Feature engineering

To predict whether a session contains a booking, it is not sufficient to use only the session information (i.e., referer_code, is_app, agent_id, traffic_type) and the action information (i.e., action_id, reference) of the last step as features. Ideally, the features would include the session information together with the action information of every step in the session. Because of limited computational resources, for each session I instead use the action information of only the last n steps, together with the session information, as features.
