In [1]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
from statistics import mode
import matplotlib.pyplot as plt
import pickle
from collections import Counter
from tensorflow.keras.preprocessing import sequence
from scipy import stats

In [2]:
# Extra standard-library helpers for the notebook.
import time
import warnings
# Silences every warning category for the rest of the session, which also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Load the pickled (test_set, y_test) pair.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("test_data.pkl", "rb") as test_data_file:
    test_set, y_test = pickle.load(test_data_file)

In [4]:
# Preview the raw reservations; in the preview, the row flagged is_target
# carries "?" in place of city_id and hotel_country.
test_set

Unnamed: 0,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,is_new_trip,is_target
0,2016-06-01,2016-06-03,20764,tablet,9452,Gondal,Santa Prisca,1174942_1,True,False
1,2016-06-03,2016-06-04,59001,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
2,2016-06-04,2016-06-07,15186,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
3,2016-06-07,2016-06-10,?,desktop,384,Gondal,?,1174942_1,False,True
4,2016-05-11,2016-05-13,60237,desktop,5755,Gondal,Kangan,1311136_1,True,False
...,...,...,...,...,...,...,...,...,...,...
58651,2016-08-09,2016-08-11,36170,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58652,2016-08-11,2016-08-13,17990,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58653,2016-08-13,2016-08-15,62185,desktop,9924,Elbonia,Axphain,97967_1,False,False
58654,2016-08-15,2016-08-16,56503,desktop,9924,Elbonia,Axphain,97967_1,False,False


In [5]:
# Restore the five encoder objects, in the order they were pickled.
# Each one is used later through its .transform method.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("Preprocessing_encoders.pkl", "rb") as encoders_file:
    (
        encode_cities,
        encode_devices,
        encode_affiliate_id,
        encode_hotel_country,
        encode_booker_country,
    ) = pickle.load(encoders_file)

In [6]:
# Load the per-column (min, max) pairs that drive the min-max scaling.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("normalized_values.pkl", "rb") as bounds_file:
    normalized_values = pickle.load(bounds_file)

In [7]:
# Index the labels by trip id and sort them by that id.
# The guard keeps the cell re-runnable: once "utrip_id" has become the index it
# is no longer a column, and calling set_index again would raise a KeyError.
if "utrip_id" in y_test.columns:
    y_test = y_test.set_index("utrip_id")
y_test = y_test.sort_index()

In [8]:
# Season boundaries for the period handled by get_season.
# winter_start is defined for completeness; get_season does not consult it.
winter_start = datetime.strptime("2015-12-21", "%Y-%m-%d")
spring_start = datetime.strptime("2016-03-20", "%Y-%m-%d")
summer_start = datetime.strptime("2016-06-20", "%Y-%m-%d")
autumn_start = datetime.strptime("2016-09-22", "%Y-%m-%d")
winter_start2 = datetime.strptime("2016-12-21", "%Y-%m-%d")


def get_season(date):
    """Return the season code of ``date``.

    Codes: 0 = winter, 1 = spring, 2 = summer, 3 = autumn.

    The boundaries are the fixed 2016 dates defined above. Any date before
    ``spring_start`` and any date on or after ``winter_start2`` is reported
    as winter (0). Each boundary date itself belongs to the season it starts.
    """
    # Upper (exclusive) limit of each season, in chronological order.
    season_limits = (
        (spring_start, 0),
        (summer_start, 1),
        (autumn_start, 2),
        (winter_start2, 3),
    )
    for limit, season_code in season_limits:
        if date < limit:
            return season_code
    # Past the last boundary: winter again.
    return 0

In [9]:
# Display the raw test set again; no cell between the load and this point reassigns it.
test_set

Unnamed: 0,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,is_new_trip,is_target
0,2016-06-01,2016-06-03,20764,tablet,9452,Gondal,Santa Prisca,1174942_1,True,False
1,2016-06-03,2016-06-04,59001,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
2,2016-06-04,2016-06-07,15186,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
3,2016-06-07,2016-06-10,?,desktop,384,Gondal,?,1174942_1,False,True
4,2016-05-11,2016-05-13,60237,desktop,5755,Gondal,Kangan,1311136_1,True,False
...,...,...,...,...,...,...,...,...,...,...
58651,2016-08-09,2016-08-11,36170,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58652,2016-08-11,2016-08-13,17990,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58653,2016-08-13,2016-08-15,62185,desktop,9924,Elbonia,Axphain,97967_1,False,False
58654,2016-08-15,2016-08-16,56503,desktop,9924,Elbonia,Axphain,97967_1,False,False


In [10]:
# Show the per-column (min, max) pairs that apply_preprocessing uses for scaling.
normalized_values

{'device_class': (0, 2),
 'affiliate_id': (0, 3253),
 'booker_country': (0, 4),
 'hotel_country': (0, 194),
 'days_in_hotel': (1, 30),
 'checkin_season': (0, 3),
 'month_reservation': (1, 12)}

In [11]:
def apply_preprocessing(test_set, datetime_format="%Y-%m-%d", target="?", n_values=normalized_values,
                       skip_duplicates=True):
    """Turn raw reservation rows into encoded, min-max scaled features.

    Parameters
    ----------
    test_set : pandas.DataFrame
        Reservations with the columns ``checkin``, ``checkout``, ``city_id``,
        ``device_class``, ``affiliate_id``, ``booker_country``,
        ``hotel_country`` and ``utrip_id``. The input frame is not modified.
    datetime_format : str
        Format of the ``checkin`` / ``checkout`` strings.
    target : str
        Placeholder found in ``city_id`` on rows whose city is unknown. Those
        rows keep their raw ``city_id`` and ``hotel_country`` values.
    n_values : dict
        Maps a column name to the ``(min, max)`` pair used to scale it.
        Defaults to the notebook-level ``normalized_values``.
    skip_duplicates : bool
        When true, drop a reservation that has the same trip and the same
        city as the row immediately before it.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``utrip_id``. Keeps ``city_id`` (encoded, not scaled) and
        the scaled columns ``device_class``, ``affiliate_id``,
        ``booker_country``, ``hotel_country``, ``days_in_hotel``,
        ``checkin_season`` and ``month_reservation``.

    Notes
    -----
    Uses the notebook-level ``get_season`` function and ``encode_*`` objects.
    Unlike a per-element ``strptime``, ``pd.to_datetime`` turns missing dates
    into ``NaT`` instead of raising.
    """
    df = test_set.copy()

    # Parse the date strings once; every date feature is derived from these.
    checkin = pd.to_datetime(df["checkin"], format=datetime_format)
    checkout = pd.to_datetime(df["checkout"], format=datetime_format)

    # Length of stay in whole days, computed column-wise rather than with a
    # row-wise apply that indexes each row by position.
    df["days_in_hotel"] = (checkout - checkin).dt.days
    df["checkin_season"] = checkin.apply(get_season)
    df["month_reservation"] = checkin.dt.month

    # A row is a consecutive duplicate when it shares both the trip and the
    # city with the row just before it. The first row compares against NaN and
    # is therefore never a duplicate.
    same_trip = df["utrip_id"].shift() == df["utrip_id"]
    same_city = df["city_id"].shift() == df["city_id"]
    if skip_duplicates:
        df = df.loc[~(same_trip & same_city)]

    # Drop the raw inputs that are not model features, then index by trip.
    # drop/set_index return new frames, so the filtered slice is never mutated.
    df = df.drop(columns=["checkin", "checkout", "is_new_trip", "is_target"], errors="ignore")
    df = df.set_index("utrip_id")

    # Encode city and hotel country only where the city is known; the target
    # placeholder is not a value the encoders can transform.
    mask = df["city_id"] != target
    df.loc[mask, "city_id"] = encode_cities.transform(df.loc[mask, "city_id"])
    df.loc[mask, "hotel_country"] = encode_hotel_country.transform(df.loc[mask, "hotel_country"])

    # Encode the remaining categorical columns.
    df["device_class"] = encode_devices.transform(df["device_class"])
    df["affiliate_id"] = encode_affiliate_id.transform(df["affiliate_id"])
    df["booker_country"] = encode_booker_country.transform(df["booker_country"])

    # Min-max scale every feature with the bounds passed in n_values.
    # city_id is left as an encoded id; hotel_country is handled below because
    # it still holds the placeholder on target rows.
    for col in df.columns:
        if col in ("city_id", "hotel_country"):
            continue
        min_value, max_value = n_values[col]
        df[col] = (df[col] - min_value) / (max_value - min_value)

    # Scale hotel_country on the rows where it was encoded.
    min_value, max_value = n_values["hotel_country"]
    df.loc[mask, "hotel_country"] = (df.loc[mask, "hotel_country"] - min_value) / (max_value - min_value)

    return df

In [12]:
# Preprocess the raw test set with the default settings of apply_preprocessing.
df = apply_preprocessing(test_set)

In [13]:
# Keep only the reservations whose city is known; "?" marks the target rows.
has_known_city = df["city_id"] != "?"
df = df.loc[has_known_city]

In [14]:
# Inspect the preprocessed frame: indexed by utrip_id, with the encoded city_id
# followed by the scaled feature columns.
df

Unnamed: 0_level_0,city_id,device_class,affiliate_id,booker_country,hotel_country,days_in_hotel,checkin_season,month_reservation
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1174942_1,12264,1.0,0.879188,0.50,0.783505,0.034483,0.333333,0.454545
1174942_1,34900,1.0,0.879188,0.50,0.783505,0.000000,0.333333,0.454545
1174942_1,8950,1.0,0.879188,0.50,0.783505,0.068966,0.333333,0.454545
1311136_1,35618,0.0,0.545650,0.50,0.407216,0.034483,0.333333,0.363636
1311136_1,8493,0.0,0.227790,0.50,0.407216,0.034483,0.333333,0.363636
...,...,...,...,...,...,...,...,...
97967_1,12280,0.0,0.924377,0.25,0.237113,0.000000,0.666667,0.636364
97967_1,21382,0.0,0.924377,0.25,0.164948,0.034483,0.666667,0.636364
97967_1,10628,0.0,0.924377,0.25,0.164948,0.034483,0.666667,0.636364
97967_1,36765,0.0,0.924377,0.25,0.0463918,0.034483,0.666667,0.636364


In [15]:
# Gather the rows of each utrip_id into one array per trip.
# The column order of df is preserved inside each array; later cells index
# these arrays by column position.
trips = df.groupby("utrip_id").apply(np.array)

In [16]:
# Number of rows kept for each trip.
lens = list(map(len, trips))

In [17]:
# Shortest and longest trip, counted in rows kept after preprocessing.
np.min(lens), np.max(lens)

(1, 38)

In [18]:
# Average number of rows per trip.
np.mean(lens)

4.088

In [19]:
# The loop below no longer needs TimeseriesGenerator (it is deprecated in
# Keras); the import is kept only in case another cell still refers to it.
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Column positions inside each trip array, following the column order of df.
CITY_COL = 0
DEVICE_COL = 1
BOOKER_COL = 3
DAYS_COL = 5
MONTH_COL = 7

MIN_TRIP_LENGTH = 4  # trips with fewer rows are discarded
MIN_HISTORY = 3      # shortest city history used to predict the next city

cities_sequences = []
trips_info = []
y = []
for trip in trips:
    if len(trip) < MIN_TRIP_LENGTH:
        continue
    # For every history length, slide a window over the trip: the window holds
    # the visited cities and the row right after it is the one to predict.
    # This yields the same samples, in the same order, as iterating
    # TimeseriesGenerator(trip, trip, history_len) with its default settings.
    for history_len in range(MIN_HISTORY, len(trip)):
        for target_pos in range(history_len, len(trip)):
            target_row = trip[target_pos]

            # Cities visited in the window; copied so the stored sequence does
            # not keep a reference into the trip array.
            cities_sequence = trip[target_pos - history_len:target_pos, CITY_COL].copy()

            # Context features come from the reservation being predicted.
            trip_information = [
                target_row[DEVICE_COL],
                target_row[BOOKER_COL],
                target_row[DAYS_COL],
                target_row[MONTH_COL],
            ]

            cities_sequences.append(cities_sequence)
            trips_info.append(trip_information)
            y.append(target_row[CITY_COL])

In [20]:
# Report how many (city history, next city) samples the previous cell produced.
print(f'We now have {len(y)} trips to add to our train set, with at least 3 destinations')

We now have 35838 trips to add to our train set, with at least 3 destinations


In [21]:
from numpy import savez_compressed, load

# The city histories have different lengths. Recent NumPy versions refuse to
# build such a ragged array implicitly, so it is created explicitly as an
# object array. Reading it back requires np.load(..., allow_pickle=True).
savez_compressed("X_train_cities_2.npz", np.array(cities_sequences, dtype=object))
# Each info row has four entries and each target is a single city id, so these
# two convert to regular arrays.
savez_compressed("X_train_info_2.npz", trips_info)
savez_compressed("y_train_2.npz", y)

_____