In [1]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
from statistics import mode
import matplotlib.pyplot as plt
import pickle
from collections import Counter
from scipy import stats

In [2]:
import time
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides any pandas/NumPy warnings raised by later cells.
warnings.filterwarnings("ignore")

In [3]:
# Read the pickled DataFrame back from disk. Unpickling can execute arbitrary
# code, so only load files you produced yourself.
with open("preprocessed_df.pkl", mode="rb") as pkl_handle:
    df = pickle.load(pkl_handle)

In [4]:
# Peek at the first rows to check which columns were loaded.
df.head()

Unnamed: 0,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,days_in_hotel,checkin_season,month_reservation
0,18386,0,134,2,60,1006220_1,2,1,4
1,23415,0,134,2,60,1006220_1,1,1,4
2,11933,0,134,2,59,1006220_1,4,1,4
3,14304,0,134,2,60,1006220_1,1,1,4
4,3164,1,125,4,36,1010293_1,1,2,7


In [5]:
# Index the bookings by trip id so rows can be selected and grouped per trip.
# Re-binding (instead of inplace=True) together with the guard keeps this cell
# safe to re-run: once "utrip_id" is the index it is no longer a column, and a
# second set_index("utrip_id") call would raise KeyError.
if df.index.name != "utrip_id":
    df = df.set_index("utrip_id")

## TRAIN / VAL split

In [6]:
import random

# Fixed seed so the same validation trips are drawn on every run.
random.seed(10)

n_trips = df.index.nunique()
print(f"There is {n_trips} different trips")

# Hold out 10% of the distinct trip ids (whole trips, not individual rows).
n_samples = int(np.round(n_trips * 0.1))
print(f'We will use 10% in our validation set: {n_samples}')

unique_trip_ids = list(df.index.unique())
val_index = random.sample(unique_trip_ids, n_samples)

There is 206802 different trips
We will use 10% in our validation set: 20680


In [7]:
# Persist the sampled validation trip ids so the exact split can be reloaded.
with open("val_index_split.pkl", mode="wb") as split_file:
    pickle.dump(val_index, split_file)

In [8]:
# Boolean mask over rows: True when the row's trip id was sampled for validation.
is_val_row = df.index.isin(val_index)

# Take explicit copies. Both frames get new column values assigned in the
# normalisation cell, and pandas flags assignments into a selection of another
# frame (SettingWithCopyWarning, currently hidden by the global warnings
# filter). Independent copies make the ownership unambiguous.
val_set = df.loc[is_val_row].copy()
train_set = df.loc[~is_val_row].copy()

#for simplification: from here on `df` refers to the training rows only
df = train_set

In [9]:
#Normalize all columns except city_id
# Min-max scaling: (value - min) / (max - min). The (min, max) pairs are measured
# on the training rows only and re-applied to the validation rows, so validation
# values can fall outside [0, 1] if they exceed the training range.
# NOTE: this cell is not idempotent - re-running it would rescale already scaled
# data and overwrite `normalized_values` with the scaled bounds.
normalized_values = {}

# Select the feature columns by name rather than by position, so the cell does
# not silently depend on city_id being the first column.
feature_columns = [col for col in df.columns if col != "city_id"]

#first, we retrieve the bounds from the train set
for col in feature_columns:
    normalized_values[col] = (df[col].min(), df[col].max())

#then we scale both the train set and the val set with those train bounds
for col, (min_value, max_value) in normalized_values.items():
    # A constant column has a zero range; divide by 1 instead so it maps to 0
    # rather than NaN (0 / 0).
    value_range = (max_value - min_value) if max_value != min_value else 1
    df[col] = (df[col] - min_value) / value_range
    val_set[col] = (val_set[col] - min_value) / value_range

In [10]:
# Display the scaled validation frame (pandas truncates the middle rows).
val_set

Unnamed: 0_level_0,city_id,device_class,affiliate_id,booker_country,hotel_country,days_in_hotel,checkin_season,month_reservation
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1017326_1,1163,0.5,0.924377,0.50,0.407216,0.000000,0.333333,0.272727
1017326_1,2094,0.5,0.924377,0.50,0.407216,0.034483,0.333333,0.363636
1017326_1,32861,0.5,0.924377,0.50,0.407216,0.000000,0.333333,0.363636
1017326_1,3506,0.5,0.924377,0.50,0.407216,0.000000,0.333333,0.363636
106408_3,38342,0.0,0.924377,0.75,0.268041,0.103448,0.666667,0.636364
...,...,...,...,...,...,...,...,...
966493_1,10040,0.0,0.041193,0.50,0.108247,0.172414,0.333333,0.363636
966493_1,8241,0.0,0.785429,0.50,0.108247,0.000000,0.333333,0.363636
966493_1,35611,0.0,0.041193,0.50,0.108247,0.137931,0.333333,0.363636
966493_1,8674,0.0,0.041193,0.50,0.108247,0.034483,0.333333,0.363636


In [11]:
# Store the per-column (min, max) pairs so the same scaling can be re-applied later.
with open("normalized_values.pkl", mode="wb") as bounds_file:
    pickle.dump(normalized_values, bounds_file)

In [12]:
# Show the (min, max) pair recorded for each scaled column.
normalized_values

{'device_class': (0, 2),
 'affiliate_id': (0, 3253),
 'booker_country': (0, 4),
 'hotel_country': (0, 194),
 'days_in_hotel': (1, 30),
 'checkin_season': (0, 3),
 'month_reservation': (1, 12)}

For the train set, we create additional trips

In [13]:
#Retrieve trips info
# One NumPy array per trip id: each row is a booking of that trip and the
# columns follow df's column order (city_id first).
trips = df.groupby("utrip_id").apply(np.array)

In [14]:
# Number of bookings (rows) in each training trip.
lens = list(map(len, trips))

In [15]:
# Shortest and longest training trip, in number of bookings.
np.min(lens), np.max(lens)

(1, 40)

In [16]:
# Report how many distinct trips the training split contains.
print(f"We have currently {len(lens)} trips in our train set")

We have currently 186122 trips in our train set


In [17]:
# Count the trips with more than 3 bookings; shorter trips are skipped when the samples are built.
print(f"We will keep all those greater than size 3: {(np.array(lens) > 3).sum()} trips")

We will keep all those greater than size 3: 161488 trips


But we will also create additional sub-trips, based on the longer trips

In [18]:
# Display the scaled training frame (pandas truncates the middle rows).
df

Unnamed: 0_level_0,city_id,device_class,affiliate_id,booker_country,hotel_country,days_in_hotel,checkin_season,month_reservation
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1006220_1,18386,0.0,0.041193,0.5,0.309278,0.034483,0.333333,0.272727
1006220_1,23415,0.0,0.041193,0.5,0.309278,0.000000,0.333333,0.272727
1006220_1,11933,0.0,0.041193,0.5,0.304124,0.103448,0.333333,0.272727
1006220_1,14304,0.0,0.041193,0.5,0.309278,0.000000,0.333333,0.272727
1010293_1,3164,0.5,0.038426,1.0,0.185567,0.000000,0.666667,0.545455
...,...,...,...,...,...,...,...,...
999261_1,8407,1.0,0.964955,0.5,0.268041,0.034483,0.666667,0.727273
999755_1,2656,0.0,0.249308,1.0,0.309278,0.068966,1.000000,1.000000
999755_1,601,0.0,0.743314,1.0,0.309278,0.000000,1.000000,1.000000
999755_1,38342,0.0,0.743314,1.0,0.268041,0.068966,0.000000,1.000000


For each trip, we decide to keep info on: last device class, last month of reservation, last booker country, num of days in last hotel


In [20]:
# NOTE(review): TimeseriesGenerator is deprecated in Keras and is no longer used
# by this cell; the import is kept only because a later cell may still reference it.
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Build the training samples. For every trip, each contiguous window of at least
# 3 bookings becomes one sample whose target is the booking right after the window:
#   - cities_sequences: city ids of the window (variable length)
#   - trips_info: [device_class, booker_country, days_in_hotel, month_reservation]
#     of the booking to predict (columns 1, 3, 5, 7 of the trip array)
#   - y: city id of the booking to predict (column 0)
cities_sequences = []
trips_info = []
y = []
for trip in trips:
    n_bookings = len(trip)
    #discard trip smaller than 4 (3 bookings of history + 1 booking to predict)
    if n_bookings < 4:
        continue
    # Plain slicing yields the same (window, target) pairs, in the same order, as
    # TimeseriesGenerator(trip, trip, history), without building one generator
    # object per trip and per window length.
    for history in range(3, n_bookings):
        for start in range(n_bookings - history):
            next_booking = trip[start + history]

            #cities within trip (copied so the sample does not alias the trip array)
            cities_sequence = trip[start:start + history, 0].copy()

            #info of the last reservation, i.e. the booking to predict
            trip_information = [
                next_booking[1],  # device_class
                next_booking[3],  # booker_country
                next_booking[5],  # days_in_hotel
                next_booking[7],  # month_reservation
            ]

            #Save all
            cities_sequences.append(cities_sequence)
            trips_info.append(trip_information)
            y.append(next_booking[0])  # target city

In [21]:
print(f'We now have a new train set of {len(y)} trips with at least 3 destinations')

We now have a new train set of 943932 trips with at least 3 destinations


For val_set, we simply retrieve the information

In [22]:
#Retrieve trips info
# One NumPy array per validation trip id: each row is a booking of that trip and
# the columns follow val_set's column order (city_id first).
val_trips = val_set.groupby("utrip_id").apply(np.array)

In [25]:
# Shape of the first validation trip array: (bookings, columns).
# `val_trips` is labelled by trip id (strings), so position 0 must be requested
# with .iloc; plain [0] relies on a positional fallback that pandas has deprecated.
val_trips.iloc[0].shape

(4, 8)

In [26]:
# One validation sample per trip: all cities but the last form the history and
# the last booking supplies both the context features and the target city.
cities_sequences_val = []
trips_info_val = []
y_val = []
for val_trip in val_trips:
    # Skip trips with fewer than 4 bookings (3 of history + 1 to predict).
    if len(val_trip) <= 3:
        continue

    history_cities = val_trip[:-1, 0]
    final_booking = val_trip[-1]

    # Context of the final booking: device_class, booker_country,
    # days_in_hotel, month_reservation (columns 1, 3, 5, 7).
    booking_context = [final_booking[col_pos] for col_pos in (1, 3, 5, 7)]

    cities_sequences_val.append(history_cities)
    trips_info_val.append(booking_context)
    y_val.append(final_booking[0])

We then use those trips, without the final destination, to add to our training data

In [27]:
# Extend the training samples with windows taken from the validation trips,
# after removing each trip's final booking so the validation target never
# appears in the training data. Samples are appended to the existing
# cities_sequences / trips_info / y lists.
for trip in val_trips:
    #We do not consider the last destination of the trip
    trip = trip[:-1]
    n_bookings = len(trip)
    #discard trip smaller than 4 (3 bookings of history + 1 booking to predict)
    if n_bookings < 4:
        continue
    # Plain slicing yields the same (window, target) pairs, in the same order, as
    # TimeseriesGenerator(trip, trip, history), which is deprecated in Keras.
    for history in range(3, n_bookings):
        for start in range(n_bookings - history):
            next_booking = trip[start + history]

            #cities within trip (copied so the sample does not alias the trip array)
            cities_sequence = trip[start:start + history, 0].copy()

            #info of the last reservation, i.e. the booking to predict
            trip_information = [
                next_booking[1],  # device_class
                next_booking[3],  # booker_country
                next_booking[5],  # days_in_hotel
                next_booking[7],  # month_reservation
            ]

            #Save all
            cities_sequences.append(cities_sequence)
            trips_info.append(trip_information)
            y.append(next_booking[0])  # target city

In [28]:
# Longest city history among the training samples.
lens_trips = list(map(len, cities_sequences))
print(f"Max len of trip in train seq: {np.max(lens_trips)}")

# Longest city history among the validation samples.
lens_trips_val = list(map(len, cities_sequences_val))
print(f"Max len of trip in val seq: {np.max(lens_trips_val)}")

Max len of trip in train seq: 39
Max len of trip in val seq: 26


In [29]:
# Training sample count after the samples built from the validation trips were appended.
print(f'We now have a new train set of {len(y)} trips with at least 3 destinations')

We now have a new train set of 1006777 trips with at least 3 destinations


No padding. We will fit our model with data of various input shapes in each batch

In [30]:
# Finalization: bind the names under which the data is saved.
# City histories and info vectors stay plain Python lists; only the targets
# are converted to NumPy arrays.
X_train_cities, X_train_info = cities_sequences, trips_info
y_train = np.array(y)

X_val_cities, X_val_info = cities_sequences_val, trips_info_val
y_val = np.array(y_val)

In [31]:
# Number of training and validation samples.
len(y_train), len(y_val)

(1006777, 17913)

## For each trip_id, we have a variable-length sequence of cities, a size-4 vector of info, and the city target

In [32]:
from numpy import savez_compressed, load

In [33]:
#Save data to compressed files
def _as_object_array(sequences):
    """Pack variable-length sequences into a 1-D object array, one sequence per slot.

    The city histories have different lengths, so they cannot form a rectangular
    array. Letting NumPy convert such a ragged list implicitly raises ValueError
    on NumPy >= 1.24 (older versions only emitted a deprecation warning).
    Filling a pre-allocated object array element by element is explicit and
    never tries to broadcast the sequences.
    """
    packed = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        packed[position] = sequence
    return packed


# Object arrays are pickled inside the .npz file, so the two *_cities files must
# be read back with np.load(path, allow_pickle=True).
#train data
savez_compressed("X_train_cities.npz", _as_object_array(X_train_cities))
savez_compressed("X_train_info.npz", X_train_info)
savez_compressed("y_train.npz", y_train)

#val data
savez_compressed("X_val_cities.npz", _as_object_array(X_val_cities))
savez_compressed("X_val_info.npz", X_val_info)
savez_compressed("y_val.npz", y_val)

______