In [1]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
from statistics import mode
import matplotlib.pyplot as plt
import pickle
from collections import Counter
from tensorflow.keras.preprocessing import sequence
from scipy import stats

In [2]:
import time
import warnings
warnings.filterwarnings("ignore")

In [3]:
with open("test_data.pkl", "rb") as f:
    test_set, y_test = pickle.load(f)

In [4]:
test_set

Unnamed: 0,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,is_new_trip,is_target
0,2016-06-01,2016-06-03,20764,tablet,9452,Gondal,Santa Prisca,1174942_1,True,False
1,2016-06-03,2016-06-04,59001,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
2,2016-06-04,2016-06-07,15186,tablet,9452,Gondal,Santa Prisca,1174942_1,False,False
3,2016-06-07,2016-06-10,?,desktop,384,Gondal,?,1174942_1,False,True
4,2016-05-11,2016-05-13,60237,desktop,5755,Gondal,Kangan,1311136_1,True,False
...,...,...,...,...,...,...,...,...,...,...
58651,2016-08-09,2016-08-11,36170,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58652,2016-08-11,2016-08-13,17990,desktop,9924,Elbonia,Carpathia,97967_1,False,False
58653,2016-08-13,2016-08-15,62185,desktop,9924,Elbonia,Axphain,97967_1,False,False
58654,2016-08-15,2016-08-16,56503,desktop,9924,Elbonia,Axphain,97967_1,False,False


In [5]:
with open("Preprocessing_encoders.pkl", "rb") as f:
    encode_cities, encode_devices, encode_affiliate_id, encode_hotel_country, encode_booker_country = pickle.load(f)

In [6]:
with open("normalized_values.pkl", "rb") as f:
    normalized_values = pickle.load(f)

In [7]:
y_test = y_test.set_index("utrip_id").sort_index()

In [8]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, concatenate, Input, TimeDistributed, Layer, Bidirectional, Softmax, Multiply, Lambda, GRU
from tensorflow.keras.regularizers import L2
from tensorflow.keras import activations
import tensorflow.keras.backend as K

In [9]:
#Parameters
reg = L2(l2=0.0001)
embedding_size = 300
num_cities = 39901
info_features_length = 4
dropout = 0.5

trips_sequences_input = Input(shape=(None,)) 
info_input = Input(shape=(info_features_length))




#First RNN on cities with Embeddings
emb_layer = Embedding(output_dim=embedding_size, input_dim=num_cities, input_length=None,
                      mask_zero=False, name="city_embeddings", embeddings_regularizer=reg)

emb_cities = emb_layer(trips_sequences_input) 
lstm_cities_1 = GRU(100, dropout=dropout, return_sequences=True,
                    kernel_regularizer=reg)(emb_cities)
  

lstm_cities_2 = GRU(100, dropout=dropout, return_sequences=True,
                  kernel_regularizer=reg)(lstm_cities_1)


#Add attention layer
attention=TimeDistributed(Dense(1))(lstm_cities_2)
attention=Softmax(axis=1)(attention)
context=Multiply()([attention,lstm_cities_2])
out=Lambda(lambda x: K.sum(x,axis=1))(context)    



concat = concatenate([out, info_input]) 


#Combine two RNN with features
#pre_output = Dense()
output = Dense(num_cities, activation='softmax')(concat) 


 
model = Model(inputs=[trips_sequences_input, info_input], outputs=[output])


In [10]:
import tensorflow as tf
from tensorflow.keras.metrics import sparse_top_k_categorical_accuracy
learning_rate = 0.0003
batch_size = 512
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipvalue=.25)
model.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy",        #custom_loss(model, embedding_size, 1, batch_size), # Call the loss function with the model
              metrics=['sparse_top_k_categorical_accuracy'])

In [11]:
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K
model.load_weights("improvement-22-0.49.hdf5") #, custom_objects={'loss': K.sparse_categorical_crossentropy})

In [12]:
model.save("best_model.hdf5")

In [12]:
def drop_timesteps(t, len_trip, proportion=3):
    max_id_to_drop = len(t) // 3
    n = int(max_id_to_drop * pow(np.random.rand(), 0.3)) + 1 #random number of indexes to drop
    for i in range(n):
        to_skip = int(len_trip * pow(np.random.rand(), 0.3)) #more likely to skip a step towrds the end to add confusion
        t = np.delete(t, to_skip, axis=0)
        len_trip -= 1 #one time step was removed, so the trip is 1 time step smaller
        
    return t, len_trip

In [13]:
import random
from numpy import savez_compressed, load

def predictions_generator(filename1, filename2, batch_size=512, skip=False):
    #initiate a batch count for each size of trips
    batch_count = {k: 0 for k in range(50)}
    inputs = {k: [] for k in range(50)}
    while True:
        #load files
        trip_sequences = load(filename1, allow_pickle=True)["arr_0"]
        trip_infos = load(filename2, allow_pickle=True)["arr_0"]
        
        for i in range(len(trip_sequences)):
        
            #consider trip i
            trip_sequence = trip_sequences[i]
            trip_info = trip_infos[i]
            len_trip = len(trip_sequence)
            
            if (skip) & (len_trip > 3):
                #randomly skip one or more timesteps:
                random.seed()
                trip_sequence, len_trip = drop_timesteps(trip_sequence, len_trip, 3)
            
            #add info to corresponding  size
            inputs[len_trip].append([trip_sequence, trip_info])
            batch_count[len_trip] += 1
            
            if batch_count[len_trip] == batch_size:
                X_trip = np.array([t[0] for t in inputs[len_trip]], dtype="int32")
                X_info = np.array([t[1] for t in inputs[len_trip]], dtype="float32")
                #yield the data to feed the 
                yield [X_trip, X_info]
                inputs[len_trip] = []
                batch_count[len_trip] = 0
               
            

index = load("X_test_index.npz", allow_pickle=True)["arr_0"]

In [19]:
from numpy import savez_compressed, load
import random
def getData(filename1, filename2, filename3, batch_size=512, skip=0.2, train=True):
    #initiate a batch count for each size of trips
    batch_count = {k: 0 for k in range(50)}
    inputs = {k: [] for k in range(50)}
    targets = {k: [] for k in range(50)}
    while True:
        #load files
        trip_sequences = load(filename1, allow_pickle=True)["arr_0"]
        trip_infos = load(filename2, allow_pickle=True)["arr_0"]
        predictions = load(filename3, allow_pickle=True)["arr_0"]
        
        if train:
          random.seed()
          #Shuffle
          to_shuffle = list(zip(trip_sequences, trip_infos, predictions))
          random.shuffle(to_shuffle)
          trip_sequences, trip_infos, predictions = zip(*to_shuffle)

        #trip_sequences = np.array(trip_sequences)
        #trip_infos = np.array(trip_infos) 
        predictions = np.array(predictions, dtype="int32")
        
        for i in range(len(trip_sequences)):
          
            #consider trip i
            trip_sequence = trip_sequences[i]
            trip_info = trip_infos[i]
            prediction = predictions[i]
            len_trip = len(trip_sequence)
            
            if (train) & (len_trip > 3):
              #randomly skip one time step:
              random.seed()
              n = np.random.randint(100)
              if n / 100 < skip:
                #random step to skip:
                trip_sequence, len_trip = drop_timesteps(trip_sequence, len_trip)
            
            #add info to corresponding  size
            inputs[len_trip].append([trip_sequence, trip_info])
            targets[len_trip].append(prediction)
            batch_count[len_trip] += 1
            
            if batch_count[len_trip] == batch_size:
                X_trip = np.array([t[0] for t in inputs[len_trip]], dtype='float32')
                X_info = np.array([t[1] for t in inputs[len_trip]], dtype='float32')
                y = np.array(targets[len_trip]).reshape(batch_size,)
                #yield the data to feed the 
                yield [X_trip, X_info], y
                inputs[len_trip] = []
                targets[len_trip] = []
                batch_count[len_trip] = 0
               
               

In [30]:
test_generator = getData("X_test_trip.npz", "X_test_info.npz","y_test.npz",
                                        batch_size=256, skip=False, train=False)

In [34]:
model.evaluate(test_generator, steps=len(index) // 256)



[12.425256729125977, 0.4914434552192688]

In [15]:
test_trips = load("X_test_trip.npz", allow_pickle=True)["arr_0"]

In [16]:
with open("consecutive_cities.pkl", "rb") as f:
    consec = pickle.load(f)

In [17]:
def select_highly_consecutive_cities(threshold=0.06):
    selected_cities = [city for city in consec.keys() if consec[city] > threshold]
    s = encode_cities.transform(selected_cities) #encoded
    return s

In [18]:
#modify column 0
def adapt_4th_pred(p, s):
    last_pred = p[4]
    #if not frequently followed by duplicate, do not consider it
    if last_pred not in s:
        #return 4th choice of model
        return p[0]
    #else
    #if last pred already predicted
    if last_pred in p[:4].values:
        return p[0]
    #else
    return p[4]

In [19]:
model = load_model("final_model.hdf5")

In [20]:
def predict_from_generator(generator, topn=4, to_pred=4000, use_last=False):
    
    preds = model.predict(generator, steps=to_pred, batch_size=1)
    sorted_preds = np.array(preds).argsort(axis=1) #sort them along axis
    res_df = pd.DataFrame(sorted_preds[:,-topn:]) #select topn
    
    if use_last:
        s = select_highly_consecutive_cities()
        last_cities = [trip[-1] for trip in test_trips[:to_pred]]
        #we use the last city as a prediction
        res_df[4] = last_cities
        res_df[0] = res_df.apply(lambda x: adapt_4th_pred(x, s), axis=1)
        res_df.drop(4, inplace=True, axis=1)
    
    #inverse transform
    for i in range(4):
        res_df.iloc[:,i] = encode_cities.inverse_transform(res_df.iloc[:,i])
        
    return res_df

In [29]:
batch_size = 1
skip_generator = predictions_generator("X_test_trip.npz", "X_test_info.npz",
                                        batch_size=batch_size, skip=True)

normal_generator = predictions_generator("X_test_trip.npz", "X_test_info.npz",
                                        batch_size=batch_size, skip=False)

In [30]:
n_pred = 2000
res_df = predict_from_generator(normal_generator, use_last=True, to_pred=n_pred)

In [31]:
res_df["true"] = y_test.loc[y_test.index.isin(index[:n_pred])].city_id.values
res_df["good"] = res_df.apply(lambda x: x["true"] in x[[0, 1, 2, 3]].values, axis=1)
print(f"Percentage of good reco: {res_df.good.sum() / res_df.shape[0]}%")

Percentage of good reco: 0.4965%


In [30]:
res_df["top1"] = res_df.apply(lambda x: x["true"] in x[[3]].values, axis=1)
print(f"Percentage of top reco: {res_df.top1.sum() / res_df.shape[0]}%")

Percentage of top reco: 0.2655%


In [56]:
res_df.loc[res_df.top1].true.value_counts()

47499    46
2416     21
62185    16
17013    15
26436    15
         ..
25286     1
25025     1
58819     1
52933     1
40960     1
Name: true, Length: 192, dtype: int64

In [46]:
correct_preds = res_df.loc[res_df.good].true.value_counts().to_dict()

In [47]:
all_preds = res_df.true.value_counts().to_dict()

In [48]:
for k in all_preds.keys():
    if k not in correct_preds.keys():
        #no good predictions
        all_preds[k] = 0.0
        continue
    all_preds[k] = correct_preds[k] / all_preds[k]

In [49]:
all_preds

{47499: 0.9444444444444444,
 36063: 0.6486486486486487,
 17013: 0.9230769230769231,
 2416: 1.0,
 21929: 0.6190476190476191,
 26235: 0.8,
 29770: 0.6111111111111112,
 29319: 0.7058823529411765,
 3763: 0.9411764705882353,
 62185: 0.9411764705882353,
 26436: 1.0,
 4932: 0.875,
 55763: 0.8666666666666667,
 61320: 0.8571428571428571,
 23921: 0.9285714285714286,
 10485: 0.8571428571428571,
 51765: 0.7692307692307693,
 64876: 0.5384615384615384,
 51291: 0.7692307692307693,
 17127: 0.6923076923076923,
 52815: 0.7692307692307693,
 7410: 0.6923076923076923,
 2078: 0.9166666666666666,
 66648: 1.0,
 51259: 0.8,
 52818: 0.6,
 8766: 0.8888888888888888,
 8462: 1.0,
 35160: 0.875,
 46854: 0.875,
 48483: 0.875,
 51517: 0.5,
 47976: 0.75,
 38677: 0.8571428571428571,
 60143: 1.0,
 42356: 0.7142857142857143,
 25025: 0.7142857142857143,
 12308: 0.8571428571428571,
 30520: 1.0,
 19771: 0.7142857142857143,
 382: 0.7142857142857143,
 20345: 0.7142857142857143,
 60222: 1.0,
 65856: 0.7142857142857143,
 22065: 