In [15]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
from statistics import mode
import matplotlib.pyplot as plt
import pickle
from collections import Counter
from tensorflow.keras.preprocessing import sequence
from scipy import stats

In [16]:
import time
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/NumPy deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [17]:
# Load the pickled (test_set, y_test) pair.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open("test_data.pkl", "rb") as f:
    test_set, y_test = pickle.load(f)

In [18]:
# Display the loaded test frame; target rows carry "?" in city_id and hotel_country.
test_set

Unnamed: 0,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,is_new_trip,is_target
0,2016-06-01,2016-06-03,20764,tablet,9452,Gondal,Santa Prisca,1174942_1,True,False
1,2016-06-03,2016-06-04,59001,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
2,2016-06-04,2016-06-07,15186,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
3,2016-06-07,2016-06-10,?,desktop,384,Gondal,?,1174942_1,False,True
4,2016-05-11,2016-05-13,60237,desktop,5755,Gondal,Kangan,1311136_1,True,False
...,...,...,...,...,...,...,...,...,...,...
58651,2016-08-09,2016-08-11,36170,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58652,2016-08-11,2016-08-13,17990,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58653,2016-08-13,2016-08-15,62185,desktop,9924,Elbonia,Axphain,97967_1,False,False
58654,2016-08-15,2016-08-16,56503,desktop,9924,Elbonia,Axphain,97967_1,False,False


In [19]:
# Load the five fitted encoders; the unpacking order must match the order they were pickled in.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open("Preprocessing_encoders.pkl", "rb") as f:
    encode_cities, encode_devices, encode_affiliate_id, encode_hotel_country, encode_booker_country = pickle.load(f)

In [20]:
# Load the per-column (min, max) pairs used for min-max scaling in apply_preprocessing.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open("normalized_values.pkl", "rb") as f:
    normalized_values = pickle.load(f)

In [21]:
# Index the ground truth by trip id and sort it.
# The guard keeps this cell re-runnable: once "utrip_id" has become the index it is
# no longer a column, and calling set_index again would raise KeyError.
if "utrip_id" in y_test.columns:
    y_test = y_test.set_index("utrip_id")
y_test = y_test.sort_index()

In [22]:
#define seasons key dates
winter_start = datetime.strptime("2015-12-21", "%Y-%m-%d")
spring_start = datetime.strptime("2016-03-20", "%Y-%m-%d")
summer_start = datetime.strptime("2016-06-20", "%Y-%m-%d")
autumn_start = datetime.strptime("2016-09-22", "%Y-%m-%d")
winter_start2 = datetime.strptime("2016-12-21", "%Y-%m-%d")


def get_season(date):
    """Return the season code of ``date``: 0 winter, 1 spring, 2 summer, 3 autumn.

    Only the month and day of ``date`` are compared against the month and day
    of the season key dates above, so the result does not depend on the year.
    For dates from 2015-12-21 through 2017-03-19 this gives the same codes as
    comparing full dates against the 2016 boundaries; outside that window
    dates now get their real season rather than always being labelled winter.

    Parameters
    ----------
    date : object exposing ``month`` and ``day`` (``datetime``, ``date``,
        ``pandas.Timestamp``).

    Returns
    -------
    int
        Season code in ``{0, 1, 2, 3}``.
    """
    month_day = (date.month, date.day)
    # Each entry: (first day of the NEXT season, code of the season ending there).
    boundaries = (
        (spring_start, 0),
        (summer_start, 1),
        (autumn_start, 2),
        (winter_start2, 3),
    )
    for boundary, season in boundaries:
        if month_day < (boundary.month, boundary.day):
            return season
    # On or after the winter start date (Dec 21st): winter again.
    return 0

In [23]:
# Re-display the test frame; no cell since loading has modified it.
test_set

Unnamed: 0,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,is_new_trip,is_target
0,2016-06-01,2016-06-03,20764,tablet,9452,Gondal,Santa Prisca,1174942_1,True,False
1,2016-06-03,2016-06-04,59001,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
2,2016-06-04,2016-06-07,15186,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
3,2016-06-07,2016-06-10,?,desktop,384,Gondal,?,1174942_1,False,True
4,2016-05-11,2016-05-13,60237,desktop,5755,Gondal,Kangan,1311136_1,True,False
...,...,...,...,...,...,...,...,...,...,...
58651,2016-08-09,2016-08-11,36170,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58652,2016-08-11,2016-08-13,17990,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58653,2016-08-13,2016-08-15,62185,desktop,9924,Elbonia,Axphain,97967_1,False,False
58654,2016-08-15,2016-08-16,56503,desktop,9924,Elbonia,Axphain,97967_1,False,False


In [24]:
# Show the stored (min, max) pair of each feature column.
normalized_values

{'device_class': (0, 2),
 'affiliate_id': (0, 3253),
 'booker_country': (0, 4),
 'hotel_country': (0, 194),
 'days_in_hotel': (1, 30),
 'checkin_season': (0, 3),
 'month_reservation': (1, 12)}

In [25]:
def apply_preprocessing(test_set, datetime_format="%Y-%m-%d", target="?", n_values=normalized_values,
                       skip_duplicates=True):
    """Convert the raw reservation frame into per-trip model inputs.

    Parameters
    ----------
    test_set : pandas.DataFrame
        One row per reservation, with at least the columns ``checkin``,
        ``checkout``, ``city_id``, ``device_class``, ``affiliate_id``,
        ``booker_country``, ``hotel_country`` and ``utrip_id``.
        The frame is copied, never modified.
    datetime_format : str
        Format of the ``checkin`` / ``checkout`` strings.
    target : str
        Placeholder found in ``city_id`` on the rows whose city has to be
        predicted. On those rows ``city_id`` and ``hotel_country`` are neither
        encoded nor scaled.
    n_values : dict
        ``{column: (min, max)}`` used for min-max scaling. Every feature
        column except ``city_id`` needs an entry.
    skip_duplicates : bool
        When true, drop a reservation that has the same trip id and the same
        city as the row directly above it.

    Returns
    -------
    cities_sequences : list of numpy.ndarray
        Encoded city ids of each kept trip, last row excluded.
    trip_info : list of list
        ``[device_class, booker_country, days_in_hotel, month_reservation]``
        (scaled values) read from the last row of each kept trip.
    lst_indexes : list
        ``utrip_id`` of each kept trip, in the same order as the two lists
        above.

    Notes
    -----
    Uses the notebook-level encoders (``encode_cities``, ``encode_devices``,
    ``encode_affiliate_id``, ``encode_hotel_country``,
    ``encode_booker_country``) and ``get_season``.
    Assumes the last row of every trip is its target row -- TODO confirm.
    Trips left with a single row are skipped and reported with "Not a trip".
    """
    df = test_set.copy()

    # Parse the date strings (vectorised) and derive the date-based features.
    checkin = pd.to_datetime(df["checkin"], format=datetime_format)
    checkout = pd.to_datetime(df["checkout"], format=datetime_format)
    df["days_in_hotel"] = (checkout - checkin).dt.days
    df["checkin_season"] = checkin.apply(get_season)
    df["month_reservation"] = checkin.dt.month

    # A row repeating both the trip and the city of the row above it is a
    # consecutive reservation in the same city: keep only the first of the run.
    if skip_duplicates:
        same_trip = df["utrip_id"].shift() == df["utrip_id"]
        same_city = df["city_id"].shift() == df["city_id"]
        df = df.loc[~(same_trip & same_city)]

    # Fix the feature layout explicitly so nothing below depends on the column
    # order of the input frame.
    feature_cols = ["city_id", "device_class", "affiliate_id", "booker_country",
                    "hotel_country", "days_in_hotel", "checkin_season", "month_reservation"]
    df = df.set_index("utrip_id")[feature_cols].copy()

    # Encode city_id and hotel country on every row that is not a target row.
    # A plain boolean array is used because the utrip_id index holds duplicates.
    known = (df["city_id"] != target).to_numpy()
    df.loc[known, "city_id"] = encode_cities.transform(df.loc[known, "city_id"])
    df.loc[known, "hotel_country"] = encode_hotel_country.transform(df.loc[known, "hotel_country"])

    # Encode the rest of the categorical data.
    df["device_class"] = encode_devices.transform(df["device_class"])
    df["affiliate_id"] = encode_affiliate_id.transform(df["affiliate_id"])
    df["booker_country"] = encode_booker_country.transform(df["booker_country"])

    # Min-max scale every feature except city_id, using the statistics passed in.
    for col in df.columns:
        if col == "city_id":
            continue
        min_value, max_value = n_values[col]
        span = max_value - min_value
        if col == "hotel_country":
            # Target rows still hold the placeholder there, so scale known rows only.
            df.loc[known, col] = (df.loc[known, col] - min_value) / span
        else:
            df[col] = (df[col] - min_value) / span

    # One 2-D array per trip (rows = reservations, columns = feature_cols),
    # keyed by utrip_id.
    trips = df.groupby("utrip_id").apply(np.array)
    col_pos = {name: pos for pos, name in enumerate(df.columns)}

    cities_sequences = []
    trip_info = []
    lst_indexes = []
    for utrip_id, trip in trips.items():
        if len(trip) == 1:
            print("Not a trip")
            continue
        lst_indexes.append(utrip_id)
        # Every city but the one on the last row forms the input sequence.
        cities_sequences.append(trip[:-1, col_pos["city_id"]])
        # Context features come from the last reservation of the trip.
        trip_info.append([
            trip[-1, col_pos["device_class"]],
            trip[-1, col_pos["booker_country"]],
            trip[-1, col_pos["days_in_hotel"]],
            trip[-1, col_pos["month_reservation"]],
        ])

    return cities_sequences, trip_info, lst_indexes

In [26]:
# Build the model inputs. "Not a trip" is printed once for every trip that is
# skipped because it has a single row left after preprocessing.
X_trip, X_info, index = apply_preprocessing(test_set)

Not a trip
Not a trip
Not a trip
Not a trip
Not a trip
Not a trip
Not a trip
Not a trip
Not a trip


In [27]:
# Inspect the encoded city sequences: one object array per kept trip.
X_trip

[array([25354, 32022, 31250, 819, 10335, 3925], dtype=object),
 array([2925, 35611, 5762], dtype=object),
 array([12000, 38342, 731, 4074], dtype=object),
 array([2656, 13933, 24788], dtype=object),
 array([15545, 26688, 9318, 8040, 19894, 6785], dtype=object),
 array([39134, 15970, 10187, 30264, 8651], dtype=object),
 array([27396, 9304, 27396, 9304, 15633], dtype=object),
 array([3216, 38738, 38122], dtype=object),
 array([9834, 13538, 35570], dtype=object),
 array([32861, 39315, 27979], dtype=object),
 array([6246, 2260, 2493], dtype=object),
 array([25911, 2594, 25359], dtype=object),
 array([30989, 12181, 35629], dtype=object),
 array([28721, 20753, 32, 17350], dtype=object),
 array([38173, 14673, 16499, 17532, 26471, 10073, 4347], dtype=object),
 array([25103, 37024, 3323, 15010, 26045], dtype=object),
 array([35570, 4578, 420, 5216, 33628, 23852], dtype=object),
 array([32660, 34292, 17681], dtype=object),
 array([37180, 29632, 32619, 38342], dtype=object),
 array([39203, 18649,

In [31]:
# Look at the raw reservations of one trip; its last row is the target row ("?" placeholders).
test_set.loc[test_set.utrip_id == "1001542_1"]

Unnamed: 0,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,is_new_trip,is_target
49260,2016-05-27,2016-05-30,4932,desktop,2661,The Devilfire Empire,Borginia,1001542_1,True,False
49261,2016-05-30,2016-06-03,60222,desktop,8225,The Devilfire Empire,Borginia,1001542_1,False,False
49262,2016-06-03,2016-06-04,9680,desktop,8225,The Devilfire Empire,Borginia,1001542_1,False,False
49263,2016-06-04,2016-06-09,?,mobile,359,The Devilfire Empire,?,1001542_1,False,True


In [35]:
# Spot check: encode a single raw city id with the fitted city encoder.
encode_cities.transform([25025])

array([14821])

In [33]:
# Encode the ground-truth city of every row in y_test (no filtering to the kept trips yet).
encode_cities.transform(y_test.city_id)

array([14821, 10040, 10958, ..., 25674, 21693, 34736])

In [40]:
# Select the targets by label, in the exact order of `index`, so that y[i] is
# guaranteed to belong to the same trip as X_trip[i] / X_info[i]. Filtering with
# isin() only lined up because y_test and the groupby keys happened to be sorted
# the same way. A trip id missing from y_test now raises KeyError rather than
# silently producing a shorter y.
y = encode_cities.transform(y_test.loc[index, "city_id"])
assert len(y) == len(index), "expected exactly one target row per kept trip"

In [41]:
from numpy import savez_compressed, load

# Trips have different lengths, so X_trip is a ragged list. Build the object
# array explicitly: the implicit conversion inside savez_compressed is
# deprecated on older NumPy and raises ValueError from NumPy 1.24 on.
# Each file stores its array under the default key "arr_0"; the object-dtype
# file has to be read back with load(..., allow_pickle=True).
savez_compressed("X_test_trip.npz", np.array(X_trip, dtype=object))
savez_compressed("X_test_info.npz", X_info)
savez_compressed("X_test_index.npz", index)
savez_compressed("y_test.npz", y)