In [1]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
from statistics import mode
import matplotlib.pyplot as plt
import pickle
from collections import Counter
from tensorflow.keras.preprocessing import sequence
from scipy import stats

In [2]:
import time
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/NumPy deprecation and chained-assignment
# warnings raised by later cells -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Load the raw test bookings. The "Unnamed: 0" column visible in the preview is
# read from the CSV as an ordinary column (no index_col is given).
test_set = pd.read_csv('booking_test_set.csv')

In [4]:
# Preview the raw frame.
test_set

Unnamed: 0,user_id,checkin,checkout,device_class,affiliate_id,booker_country,utrip_id,row_num,total_rows,city_id,hotel_country
0,1000066,2016-07-21,2016-07-23,desktop,9924,Gondal,1000066_2,1,4,56430,Urkesh
1,1000066,2016-07-23,2016-07-25,desktop,9924,Gondal,1000066_2,2,4,41971,Urkesh
2,1000066,2016-07-25,2016-07-28,desktop,9924,Gondal,1000066_2,3,4,5797,Urkesh
3,1000066,2016-07-28,2016-07-31,mobile,2436,Gondal,1000066_2,4,4,0,
4,1000270,2016-02-08,2016-02-09,mobile,9452,The Devilfire Empire,1000270_1,1,4,50075,The Devilfire Empire
...,...,...,...,...,...,...,...,...,...,...,...
378662,999911,2016-10-07,2016-10-08,desktop,9598,Gondal,999911_1,5,5,0,
378663,999991,2016-08-15,2016-08-17,desktop,8065,Elbonia,999991_3,1,4,29770,Elbonia
378664,999991,2016-08-18,2016-08-19,desktop,8065,Elbonia,999991_3,2,4,36170,Carpathia
378665,999991,2016-08-19,2016-08-20,tablet,3631,Elbonia,999991_3,3,4,52155,Elbonia


In [5]:
# Restore the five encoder objects stored in the pickle, unpacked in this fixed order;
# apply_preprocessing later calls .transform on four of them.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted artifacts.
with open("Preprocessing_encoders.pkl", "rb") as f:
    encode_cities, encode_devices, encode_affiliate_id, encode_hotel_country, encode_booker_country = pickle.load(f)

In [6]:
# Load the mapping column name -> (min, max) that apply_preprocessing uses for min-max scaling.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted artifacts.
with open("normalized_values.pkl", "rb") as f:
    normalized_values = pickle.load(f)

In [7]:
def get_season(date):
    """Return the season code of ``date``: 0 winter, 1 spring, 2 summer, 3 autumn.

    Only the month and day of ``date`` are compared against the month/day of the
    season key dates defined below, so the result does not depend on the year.
    Dates between 2015-12-21 and 2017-03-19 get the same code as a direct
    comparison against the key dates would give; dates outside that window are
    no longer all reported as winter.

    Parameters
    ----------
    date : datetime.datetime (or any object exposing ``month`` and ``day``)

    Returns
    -------
    int
        Season code in {0, 1, 2, 3}.
    """
    month_day = (date.month, date.day)
    if month_day < (spring_start.month, spring_start.day):
        # 0 for winter (January up to the day before spring starts)
        return 0
    if month_day < (summer_start.month, summer_start.day):
        # 1 for spring
        return 1
    if month_day < (autumn_start.month, autumn_start.day):
        # 2 for summer
        return 2
    if month_day < (winter_start2.month, winter_start2.day):
        # 3 for autumn
        return 3
    # from the winter start day until the end of December: winter again
    return 0

#define seasons key dates
winter_start = datetime.strptime("2015-12-21", "%Y-%m-%d")
spring_start = datetime.strptime("2016-03-20", "%Y-%m-%d")
summer_start = datetime.strptime("2016-06-20", "%Y-%m-%d")
autumn_start = datetime.strptime("2016-09-22", "%Y-%m-%d")
winter_start2 = datetime.strptime("2016-12-21", "%Y-%m-%d")

In [8]:
# Inspect the (min, max) scaling bounds per column.
normalized_values

{'device_class': (0, 2),
 'affiliate_id': (0, 3253),
 'booker_country': (0, 4),
 'hotel_country': (0, 194),
 'days_in_hotel': (1, 30),
 'checkin_season': (0, 3),
 'month_reservation': (1, 12)}

In [9]:
def apply_preprocessing(test_set, datetime_format="%Y-%m-%d", target="?", n_values=normalized_values,
                       skip_duplicates=True):
    """Turn raw booking rows into the encoded, min-max scaled feature frame.

    Parameters
    ----------
    test_set : pandas.DataFrame
        Raw bookings. The columns read here are ``utrip_id``, ``checkin``,
        ``checkout``, ``city_id``, ``hotel_country``, ``device_class``,
        ``affiliate_id`` and ``booker_country``. The input is not modified.
    datetime_format : str
        Format of the ``checkin`` / ``checkout`` strings.
    target : object
        Placeholder value found in ``city_id`` on rows whose city is unknown.
        Those rows are excluded from city / hotel-country encoding and from
        hotel-country scaling.
    n_values : dict
        Maps a column name to the ``(min, max)`` pair used to scale it.
        Defaults to the module-level ``normalized_values``.
    skip_duplicates : bool
        When True, drop a row if the previous row (after sorting) belongs to
        the same trip and has the same city.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``utrip_id`` with the columns ``city_id``, ``device_class``,
        ``affiliate_id``, ``booker_country``, ``hotel_country``,
        ``days_in_hotel``, ``checkin_season``, ``month_reservation``.
        Every column except ``city_id`` is min-max scaled.
    """
    df = test_set.copy()

    #Transform dates into datetime format
    df["checkin"] = pd.to_datetime(df["checkin"], format=datetime_format)
    df["checkout"] = pd.to_datetime(df["checkout"], format=datetime_format)

    #sort so that each trip's rows are contiguous and chronological
    df = df.sort_values(["utrip_id", "checkin"])

    #New column: how many days were spent in the hotel
    df["days_in_hotel"] = (df["checkout"] - df["checkin"]).dt.days

    #season and month of the check-in date
    df["checkin_season"] = df["checkin"].apply(get_season)
    df["month_reservation"] = df["checkin"].dt.month

    #remove consecutive reservations: same trip AND same city as the previous row
    if skip_duplicates:
        same_trip = df["utrip_id"].eq(df["utrip_id"].shift())
        same_city = df["city_id"].eq(df["city_id"].shift())
        df = df.loc[~(same_trip & same_city)]

    ordered_col = ['city_id','device_class','affiliate_id',
                   'booker_country','hotel_country', "days_in_hotel",
                   'checkin_season','month_reservation']
    # Keep only the feature columns; the explicit copy makes the assignments
    # below act on an independent frame rather than on a slice.
    df = df.set_index("utrip_id")[ordered_col].copy()

    #encode city_id and hotel country, skipping the rows that hold the target placeholder
    mask = df['city_id'] != target
    df.loc[mask, 'city_id'] = encode_cities.transform(df.loc[mask, 'city_id'])
    df.loc[mask, 'hotel_country'] = encode_hotel_country.transform(df.loc[mask, "hotel_country"])

    #encode the rest of data
    df["device_class"] = encode_devices.transform(df["device_class"])
    df["booker_country"] = encode_booker_country.transform(df["booker_country"])
    # NOTE(review): affiliate_id is scaled below without being encoded first (the
    # encode_affiliate_id call was disabled in the original), so the scaled values
    # are not confined to [0, 1] -- the notebook output shows values such as 3.05.
    # Kept as-is so the produced features do not change; confirm against the
    # pipeline that produced n_values.

    #Scale every feature except city_id (first column) and hotel_country (handled below)
    for col in ordered_col[1:]:
        if col == "hotel_country":
            continue
        min_value, max_value = n_values[col]
        df[col] = (df[col] - min_value) / (max_value - min_value)

    #Normalize hotel country col, only where it was encoded
    min_value, max_value = n_values["hotel_country"]
    df.loc[mask, "hotel_country"] = (df.loc[mask, "hotel_country"] - min_value) / (max_value - min_value)

    return df

In [10]:
# target=0: rows whose city_id is 0 in the raw data (their hotel_country is empty in the
# preview) are treated as the unknown-city rows and are left un-encoded.
df = apply_preprocessing(test_set, datetime_format="%Y-%m-%d", target=0, skip_duplicates=True)

In [11]:
# Preview the preprocessed frame (rows with city_id 0 are still present here).
df

Unnamed: 0_level_0,city_id,device_class,affiliate_id,booker_country,hotel_country,days_in_hotel,checkin_season,month_reservation
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000066_2,33371,0.0,3.050722,0.50,0.927835,0.034483,0.666667,0.545455
1000066_2,24864,0.0,3.050722,0.50,0.927835,0.034483,0.666667,0.545455
1000066_2,3469,0.0,3.050722,0.50,0.927835,0.068966,0.666667,0.545455
1000066_2,0,0.5,0.748847,0.50,,0.068966,0.666667,0.545455
1000270_1,29631,0.5,2.905626,1.00,0.881443,0.000000,0.000000,0.090909
...,...,...,...,...,...,...,...,...
999911_1,0,0.0,2.950507,0.50,,0.000000,1.000000,0.818182
999991_3,17603,0.0,2.479250,0.25,0.237113,0.034483,0.666667,0.636364
999991_3,21382,0.0,2.479250,0.25,0.164948,0.000000,0.666667,0.636364
999991_3,30848,1.0,1.116200,0.25,0.237113,0.000000,0.666667,0.636364


In [12]:
#remove targets (where =0)
# NOTE(review): encoded city ids live in this same column; if encode_cities can emit
# the label 0, rows of that real city are removed here as well -- confirm the
# encoder's label range.
df = df.loc[df.city_id != 0]

In [13]:
# Preview after removing the city_id == 0 rows.
df

Unnamed: 0_level_0,city_id,device_class,affiliate_id,booker_country,hotel_country,days_in_hotel,checkin_season,month_reservation
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000066_2,33371,0.0,3.050722,0.50,0.927835,0.034483,0.666667,0.545455
1000066_2,24864,0.0,3.050722,0.50,0.927835,0.034483,0.666667,0.545455
1000066_2,3469,0.0,3.050722,0.50,0.927835,0.068966,0.666667,0.545455
1000270_1,29631,0.5,2.905626,1.00,0.881443,0.000000,0.000000,0.090909
1000270_1,17285,0.0,0.035659,1.00,0.185567,0.000000,0.000000,0.090909
...,...,...,...,...,...,...,...,...
999911_1,14181,0.0,1.769136,0.50,0.185567,0.034483,1.000000,0.818182
999911_1,12843,0.0,1.395942,0.50,0.185567,0.034483,1.000000,0.818182
999991_3,17603,0.0,2.479250,0.25,0.237113,0.034483,0.666667,0.636364
999991_3,21382,0.0,2.479250,0.25,0.164948,0.000000,0.666667,0.636364


In [14]:
# Group on the utrip_id index and convert each group to an array: one 2-D array per
# trip, with one row per remaining booking and one column per column of df.
trips = df.groupby("utrip_id").apply(np.array)

In [15]:
# Number of rows (bookings) in each trip array.
lens = list(map(len, trips))

In [16]:
# Shortest and longest trip, measured in rows.
np.min(lens), np.max(lens)

(1, 43)

In [17]:
# Average number of rows per trip.
np.mean(lens)

4.060881378958988

In [18]:
def _build_training_samples(trips, min_history=3):
    """Expand every trip into (city history, booking context, next city) samples.

    For a trip with ``n`` rows, every history length ``h`` in
    ``min_history .. n-1`` and every position ``t`` in ``h .. n-1`` yields one
    sample: the cities of rows ``t-h .. t-1`` as history and row ``t`` as the
    target. This is the same set of windows, in the same order, that
    ``TimeseriesGenerator(trip, trip, h)`` produces, obtained with plain
    slicing so the deprecated Keras utility is not needed.

    Parameters
    ----------
    trips : iterable of 2-D arrays
        One array per trip; column 0 is the city, columns 1, 3, 5 and 7 are the
        device, booker country, days in hotel and reservation month.
    min_history : int
        Shortest history to emit. Trips with ``min_history`` rows or fewer are
        discarded because they cannot provide a history plus a target.

    Returns
    -------
    tuple of three lists
        ``(cities_sequences, trips_info, y)``, aligned element by element.
    """
    cities_sequences, trips_info, y = [], [], []
    for trip in trips:
        n_rows = len(trip)
        #discard trips too short to give a history and a target
        if n_rows <= min_history:
            continue
        for history_len in range(min_history, n_rows):
            for target_pos in range(history_len, n_rows):
                target_row = trip[target_pos]
                #cities visited just before the target booking
                cities_sequences.append(trip[target_pos - history_len:target_pos, 0])
                #context of the booking being predicted: device, booker country, days, month
                trips_info.append([target_row[1], target_row[3], target_row[5], target_row[7]])
                #target city
                y.append(target_row[0])
    return cities_sequences, trips_info, y


cities_sequences, trips_info, y = _build_training_samples(trips)

In [19]:
# len(y) counts generated (history -> next city) samples; one trip can contribute several.
print(f'We now have {len(y)} trips to add to our train set, with at least 3 destinations')

We now have 214984 trips to add to our train set, with at least 3 destinations


In [20]:
from numpy import savez_compressed, load
# cities_sequences holds 1-D arrays of different lengths, so it is converted to an
# explicit object array before saving: NumPy >= 1.24 raises ValueError when asked to
# infer an array from ragged input. Reading it back requires
# np.load(..., allow_pickle=True). Each file stores its array under the key "arr_0".
savez_compressed("X_train_cities_3.npz", np.array(cities_sequences, dtype=object))
savez_compressed("X_train_info_3.npz", trips_info)
savez_compressed("y_train_3.npz", y)