# Booking.com WSDM WebTour 2021 Challenge demo

In [1]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
from statistics import mode
import matplotlib.pyplot as plt
import pickle
from collections import Counter
from tensorflow.keras.preprocessing import sequence
from scipy import stats

### Load the test set

In [2]:
# Read the sample test set CSV into a DataFrame
test_set = pd.read_csv('sample_test_set.csv')

In [3]:
# Group the rows by trip id: each entry of t is the array of rows belonging to one trip
t = test_set.groupby("utrip_id").apply(np.array)

In [4]:
# Number of rows in each trip
lens = [len(i) for i in t]

In [5]:
# Shortest and longest trip in the sample, in number of rows
np.min(lens), np.max(lens)

(4, 5)

In [6]:
# Load the encoder objects saved during preprocessing. The pickle holds five objects, unpacked
# in this order: cities, device classes, affiliate ids, hotel countries, booker countries.
# NOTE: pickle.load can execute arbitrary code, so only open pickle files from a trusted source.
with open("Preprocessing_encoders.pkl", "rb") as f:
    encode_cities, encode_devices, encode_affiliate_id, encode_hotel_country, encode_booker_country = pickle.load(f)

In [7]:
# Load the scaling bounds: normalized_values[column] unpacks to (min_value, max_value)
# and is used for min-max scaling in apply_preprocessing.
# NOTE: pickle.load can execute arbitrary code, so only open pickle files from a trusted source.
with open("normalized_values.pkl", "rb") as f:
    normalized_values = pickle.load(f)

In [8]:
def get_season(date):
    """Return the season code of ``date``: 0 winter, 1 spring, 2 summer, 3 autumn.

    Seasons are delimited by the month/day cut-offs 20 March, 20 June,
    22 September and 21 December. Only the month and day of ``date`` are
    looked at, so the result is correct for any year (the previous version
    compared against fixed 2016 dates and returned winter for every date
    outside that window).

    Parameters
    ----------
    date : datetime-like
        Any object exposing ``month`` and ``day`` attributes
        (``datetime.datetime``, ``pandas.Timestamp``, ...).

    Returns
    -------
    int
        Season code in {0, 1, 2, 3}.
    """
    month_day = (date.month, date.day)
    # Winter wraps around the new year: 21 December up to (not including) 20 March
    if month_day < (3, 20) or month_day >= (12, 21):
        return 0
    if month_day < (6, 20):
        return 1
    if month_day < (9, 22):
        return 2
    return 3

# Key dates at which each season starts, from winter 2015 through winter 2016
winter_start = datetime(2015, 12, 21)
spring_start = datetime(2016, 3, 20)
summer_start = datetime(2016, 6, 20)
autumn_start = datetime(2016, 9, 22)
winter_start2 = datetime(2016, 12, 21)

In [9]:
def apply_preprocessing(test_set, datetime_format="%Y-%m-%d", target="?", n_values=normalized_values,
                       skip_duplicates=True):
    """Turn raw reservation rows into per-trip model inputs.

    Parameters
    ----------
    test_set : pandas.DataFrame
        Reservation rows. The columns read here are ``utrip_id``, ``checkin``,
        ``checkout``, ``city_id``, ``hotel_country``, ``device_class``,
        ``affiliate_id`` and ``booker_country``. The frame is not modified.
    datetime_format : str
        ``strptime``-style format of the ``checkin`` / ``checkout`` strings.
    target : object
        Placeholder found in ``city_id`` on the rows to predict. Those rows keep
        their ``city_id`` and ``hotel_country`` unencoded and unscaled.
    n_values : mapping
        ``column name -> (min_value, max_value)`` used for min-max scaling.
        Defaults to the ``normalized_values`` loaded from disk.
    skip_duplicates : bool
        When True, drop a reservation that has the same city as the previous
        reservation of the same trip.

    Returns
    -------
    cities_sequences : list of numpy.ndarray
        One int32 array per trip: the encoded city ids of every row of the trip
        except the last one.
    trip_info : list of list
        One ``[device_class, booker_country, days_in_hotel, month_reservation]``
        entry per trip, taken from the last row of the trip (scaled values).
    lst_indexes : list
        The ``utrip_id`` of each returned trip, in the same order.

    Notes
    -----
    Trips left with a single row are skipped and reported with "Not a trip".
    """
    df = test_set.copy()

    # Parse the dates before sorting: the raw strings need not sort chronologically
    df["checkin"] = pd.to_datetime(df["checkin"], format=datetime_format)
    df["checkout"] = pd.to_datetime(df["checkout"], format=datetime_format)
    df = df.sort_values(["utrip_id", "checkin"])

    # Derived features
    df["days_in_hotel"] = (df["checkout"] - df["checkin"]).dt.days
    df["checkin_season"] = df["checkin"].apply(get_season)
    df["month_reservation"] = df["checkin"].dt.month

    if skip_duplicates:
        # A row is a duplicate when both its trip and its city equal those of the previous row
        same_trip = df["utrip_id"].shift() == df["utrip_id"]
        same_city = df["city_id"].shift() == df["city_id"]
        df = df.loc[~(same_trip & same_city)]

    df = df.set_index("utrip_id")

    # Encode city and hotel country everywhere except on the rows to predict
    mask = df["city_id"] != target
    df.loc[mask, "city_id"] = encode_cities.transform(df.loc[mask, "city_id"])
    df.loc[mask, "hotel_country"] = encode_hotel_country.transform(df.loc[mask, "hotel_country"])

    # Encode the remaining categorical columns; affiliate_id is passed through unencoded
    df["device_class"] = encode_devices.transform(df["device_class"])
    df["booker_country"] = encode_booker_country.transform(df["booker_country"])

    # Keep only the model columns, in the order the trip arrays are indexed below.
    # The copy makes the scaling assignments operate on an independent frame.
    ordered_col = ['city_id', 'device_class', 'affiliate_id',
                   'booker_country', 'hotel_country', "days_in_hotel",
                   'checkin_season', 'month_reservation']
    df = df[ordered_col].copy()

    # Min-max scale every column but city_id; hotel_country is handled separately
    # because the rows to predict must be left untouched.
    for col in ordered_col[1:]:
        if col == "hotel_country":
            continue
        min_value, max_value = n_values[col]
        df[col] = (df[col] - min_value) / (max_value - min_value)

    min_value, max_value = n_values["hotel_country"]
    df.loc[mask, "hotel_country"] = (df.loc[mask, "hotel_country"] - min_value) / (max_value - min_value)

    # One 2-D array per trip, columns ordered as in ordered_col
    trips = df.groupby("utrip_id").apply(np.array)

    city_pos = ordered_col.index("city_id")
    info_pos = [ordered_col.index(name) for name in
                ("device_class", "booker_country", "days_in_hotel", "month_reservation")]

    cities_sequences = []
    trip_info = []
    lst_indexes = []
    for trip_id, trip in zip(trips.index, trips):
        if len(trip) == 1:
            print("Not a trip")
            continue
        lst_indexes.append(trip_id)
        # Every row but the last is an input city; the last row only provides the trip info
        cities_sequences.append(np.array(trip[:-1, city_pos], dtype="int32"))
        trip_info.append([trip[-1, pos] for pos in info_pos])

    return cities_sequences, trip_info, lst_indexes

In [12]:
# Preprocess the sample test set: dates are formatted day/month/year, rows whose city_id is 0
# are left unencoded (target=0), and consecutive same-city reservations are kept.
X_trip, X_info, index = apply_preprocessing(test_set, datetime_format="%d/%m/%Y", target=0, skip_duplicates=False)

In [13]:
# Show the encoded city sequences, one array per trip
X_trip

[array([33371,  6873,  3469], dtype=int32),
 array([29631, 26533, 17285], dtype=int32),
 array([28294, 27497, 31337, 20790], dtype=int32),
 array([33842, 33842,  2925], dtype=int32)]

Save the preprocessed trips, trip info and trip ids to disk.

In [15]:
from numpy import savez_compressed, load
# Trips have different lengths, so the sequences form a ragged list. It is wrapped in an
# explicit object array: letting NumPy build one implicitly from a ragged list is deprecated
# and raises an error on recent NumPy versions.
savez_compressed("sample_trip.npz", np.array(X_trip, dtype=object))
# Trip info and trip ids are regular, so they are stored as they are
savez_compressed("sample_info.npz", X_info)
savez_compressed("sample_index.npz", index)

  return array(a, dtype, copy=False, order=order, subok=True)


In [31]:
from tensorflow.keras.models import load_model
# NOTE(review): K is not referenced in any of the visible cells
import tensorflow.keras.backend as K
# Load the Keras model saved in best_model.hdf5
model = load_model("best_model.hdf5")

In [17]:
import random
from numpy import savez_compressed, load

def predictions_generator(filename1, filename2, batch_size=512):
    """Yield ``[X_trip, X_info]`` batches read from two ``.npz`` files, forever.

    Trips are bucketed by sequence length so that every batch stacks into a
    rectangular array; a batch is yielded as soon as one bucket holds
    ``batch_size`` trips. Trips of any length are supported. When the files are
    exhausted they are read again from the start, and partially filled buckets
    are carried over to the next pass.

    Parameters
    ----------
    filename1 : str
        ``.npz`` file whose ``arr_0`` holds the city sequences, one per trip.
    filename2 : str
        ``.npz`` file whose ``arr_0`` holds the trip info, one row per trip,
        aligned with ``filename1``.
    batch_size : int
        Number of same-length trips per yielded batch.

    Yields
    ------
    list
        ``[X_trip, X_info]`` with ``X_trip`` an int32 array of shape
        ``(batch_size, trip_length)`` and ``X_info`` a float32 array with one
        row per trip.

    Raises
    ------
    ValueError
        If the two files do not hold the same number of trips.
    """
    from collections import defaultdict

    # trip length -> trips waiting for their bucket to fill up
    pending = defaultdict(list)
    while True:
        # Close the archives as soon as the arrays are extracted
        with load(filename1, allow_pickle=True) as trips_file:
            trip_sequences = trips_file["arr_0"]
        with load(filename2, allow_pickle=True) as infos_file:
            trip_infos = infos_file["arr_0"]

        if len(trip_sequences) != len(trip_infos):
            raise ValueError("trip sequences and trip infos must have the same length")
        if len(trip_sequences) == 0:
            # Nothing to yield: stop instead of spinning forever
            return

        for trip_sequence, trip_info in zip(trip_sequences, trip_infos):
            len_trip = len(trip_sequence)
            bucket = pending[len_trip]
            bucket.append((trip_sequence, trip_info))

            if len(bucket) == batch_size:
                X_trip = np.array([seq for seq, _ in bucket], dtype="int32")
                X_info = np.array([info for _, info in bucket], dtype="float32")
                # Hand the batch to the consumer, then start a fresh bucket for this length
                yield [X_trip, X_info]
                pending[len_trip] = []
               
            

# Reload the trip ids from the file written by the save cell
index = load("sample_index.npz", allow_pickle=True)["arr_0"]

In [32]:
# With a batch size of 1 every trip fills its bucket immediately, so the generator
# yields one trip per step, in file order.
batch_size = 1
pred_generator = predictions_generator("sample_trip.npz", "sample_info.npz",
                                        batch_size=batch_size)


In [33]:
# Load the per-city values used by select_highly_consecutive_cities (a mapping city -> number).
# NOTE: pickle.load can execute arbitrary code, so only open pickle files from a trusted source.
with open("consecutive_cities.pkl", "rb") as f:
    consec = pickle.load(f)

In [34]:
def select_highly_consecutive_cities(threshold=0.06):
    """Return the encoded ids of the cities whose ``consec`` value exceeds ``threshold``.

    ``consec`` maps a city to a number; presumably the share of its reservations
    that are followed by another reservation in the same city (TODO confirm
    against the notebook that produced the pickle).

    Parameters
    ----------
    threshold : float
        Cities with a value strictly above this bound are selected.

    Returns
    -------
    The selected cities, transformed with ``encode_cities``.
    """
    frequent_cities = [city for city, value in consec.items() if value > threshold]
    return encode_cities.transform(frequent_cities)

In [35]:
#modify column 0
def adapt_4th_pred(p, s):
    """Pick the value for the lowest-ranked prediction slot of one result row.

    Parameters
    ----------
    p : pandas.Series
        One row of predictions: the model's top-n choices (lowest-ranked first)
        followed, in last position, by the last city of the trip. Works for any
        n, not only n = 4.
    s : collection
        Encoded ids of the cities that are frequently followed by a duplicate.

    Returns
    -------
    The last city of the trip when it belongs to ``s`` and the model has not
    already predicted it; otherwise the model's own lowest-ranked choice.
    """
    last_city = p.iloc[-1]
    model_choices = p.iloc[:-1]
    lowest_ranked = p.iloc[0]

    # Not frequently followed by a duplicate: keep the model's choice
    if last_city not in s:
        return lowest_ranked
    # Already among the model's choices: replacing would only create a duplicate
    if last_city in model_choices.values:
        return lowest_ranked
    return last_city

In [36]:
def predict_from_generator(generator, topn=4, to_pred=4000, use_last=False, last_cities=None):
    """Predict the ``topn`` most likely next cities for the trips produced by ``generator``.

    Parameters
    ----------
    generator : generator
        Yields the model inputs, e.g. the output of ``predictions_generator``.
    topn : int
        Number of predictions kept per trip.
    to_pred : int
        Number of generator steps to run.
    use_last : bool
        When True, the lowest-ranked prediction of a trip may be replaced by the
        last city of that trip (see ``adapt_4th_pred``).
    last_cities : sequence, optional
        Encoded last city of every predicted trip, in prediction order. Defaults
        to the last element of each of the first ``to_pred`` sequences of the
        global ``X_trip``, which matches the generator order only when the
        generator yields one trip per step in file order.

    Returns
    -------
    pandas.DataFrame
        One row per prediction with the decoded city ids, lowest-ranked first.
        Columns are named ``city_id_1`` .. ``city_id_4`` when ``topn`` is 4.
    """
    preds = model.predict(generator, steps=to_pred, batch_size=1)
    # argsort is ascending, so the topn best classes are the last topn columns
    ranked = np.array(preds).argsort(axis=1)
    res_df = pd.DataFrame(ranked[:, -topn:])

    if use_last:
        s = select_highly_consecutive_cities()
        if last_cities is None:
            # The last visited city comes from the city sequences (X_trip);
            # X_info only holds device / booker / stay length / month features.
            last_cities = [trip[-1] for trip in X_trip[:to_pred]]
        # Temporarily append the last city as an extra column for the row-wise rule
        res_df[topn] = last_cities
        res_df[0] = res_df.apply(lambda x: int(adapt_4th_pred(x, s)), axis=1)
        res_df.drop(topn, inplace=True, axis=1)

    # Map the encoded class indices back to the original city ids
    for i in range(topn):
        res_df.iloc[:,i] = encode_cities.inverse_transform(res_df.iloc[:,i])

    if topn == 4:
        res_df.columns=["city_id_1","city_id_2","city_id_3","city_id_4"]
    return res_df

In [37]:
# Predict the top 4 cities for 4 generator steps
res_df = predict_from_generator(pred_generator, use_last=True, to_pred=4)
# Put the trip ids in a utrip_id column next to the predictions and write the submission file
submission = pd.concat([pd.Series(index, name="utrip_id"), res_df], axis=1)
submission.to_csv('submission_sample.csv',index=False)

In [29]:
# Same prediction with the 10 best cities per trip.
# NOTE(review): this cell carries an older execution count than its neighbours and rebinds
# res_df; when the notebook is run top to bottom, the res_df displayed next holds these
# top-10 results rather than the 4 columns shown in the saved output. It also pulls 4 more
# steps from the shared pred_generator.
res_df = predict_from_generator(pred_generator, use_last=True, to_pred=4, topn=10)

In [38]:
# Show the predicted city ids
res_df

Unnamed: 0,city_id_1,city_id_2,city_id_3,city_id_4
0,3809,30018,5797,13471
1,52570,15343,4932,60222
2,45188,35160,47759,17775
3,38912,4932,60222,17013


## Read submission file and ground truth

In [39]:
# Read both files back from disk, using their first column as the index
ground_truth = pd.read_csv('sample_truth.csv', index_col =[0])
submission = pd.read_csv('submission_sample.csv', index_col =[0])

In [40]:
# Show the submission as read back from the CSV file
submission

Unnamed: 0_level_0,city_id_1,city_id_2,city_id_3,city_id_4
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000066_5,3809,30018,5797,13471
1000270_5,52570,15343,4932,60222
1000441_5,45188,35160,47759,17775
100048_5,38912,4932,60222,17013


## Evaluate - use accuracy at 4 to evaluate the prediction

In [41]:
def evaluate_accuracy_at_4(submission,ground_truth):
    '''Return the share of trips whose true city is among the four recommended cities.

    submission: frame with columns city_id_1 .. city_id_4, keyed by utrip_id.
    ground_truth: frame indexed by utrip_id with the true city in column city_id.
    '''
    recommended_cols = ['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4']

    def is_hit(row):
        # True when the true city equals any of the four recommendations of this trip
        return row['city_id'] in (row[recommended_cols].values)

    joined = submission.join(ground_truth,on='utrip_id')
    per_trip_hits = joined.apply(is_hit, axis = 1)
    return per_trip_hits.mean()

In [42]:
# Accuracy at 4 of the sample submission against the ground truth
evaluate_accuracy_at_4(submission,ground_truth)

0.25

In [51]:
# Show the true city and hotel country of each trip
ground_truth

Unnamed: 0_level_0,city_id,hotel_country
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000066_5,41971,Urkesh
1000270_5,23921,Cobra Island
1000441_5,50457,Osterlich
100048_5,17013,Borginia
