In [3]:
#imports
import pandas as pd
import numpy as np
from datetime import datetime
from statistics import mode
import matplotlib.pyplot as plt
import pickle

In [4]:
import time
import warnings
warnings.filterwarnings("ignore")

In [5]:
with open("preprocessed_df.pkl", "rb") as f:
    df = pickle.load(f)
    
with open("val_index_split.pkl", 'rb') as f:
    val_index = pickle.load(f)

In [6]:
df.set_index("utrip_id", inplace=True)

In [7]:
df.city_id.value_counts()

28148    8450
14181    8291
32619    6630
17350    6547
21317    6503
         ... 
23164       1
25211       1
18841       1
20890       1
0           1
Name: city_id, Length: 38569, dtype: int64

In [6]:
df.city_id.nunique()

38569

In [7]:
#train val split
val_set = df.loc[df.index.isin(val_index)]
train_set = df.loc[np.invert(df.index.isin(val_index))]

In [8]:
#Retrieve trips 
trips = train_set.groupby("utrip_id").city_id.apply(np.array)
val_trips = val_set.groupby("utrip_id").city_id.apply(np.array)

In [9]:
len(val_trips)

19592

In [10]:
#separate into cities and target city
X_val = [trip[:-1] for trip in val_trips if len(trip) > 3]
y_val = [trip[-1] for trip in val_trips if len(trip) > 3]

In [12]:
lens = [len(trip) for trip in trips]

In [13]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from collections import defaultdict
model = defaultdict(lambda: defaultdict(lambda: 0))

for trip in trips:
    #discard trip smaller than 2
    if len(trip) < 2:
        continue
    for i in range(1, len(trip)):
        for elements in TimeseriesGenerator(trip, trip, i):
            #elements contain the list of i sequences and list of i targets in a tuple
            tpls = elements[0]
            targets = elements[1]
            n = len(tpls)
            for j in range(n):
                #cities within trip
                cities_sequence = tuple(tpls[j])
                #target city
                target = targets[j]
                
                #add to model
                model[cities_sequence][target] +=1

In [14]:
def predict(test_trip, return_top=4):
    predictions = {k: 0 for k in range(39901)}
    i = 0
    while i < len(test_trip):
        trip_to_consider = test_trip[i:]
        preds = dict(model[tuple(trip_to_consider)])
        for k in preds.keys():
            predictions[k] += preds[k] * len(trip_to_consider)
        i += 1
    
    #at this point, if no prediction is made, try it without the last city
    if sum(predictions.values()) == 0:
        return predict(test_trip[:-1])
    
    #Now, we calculate the sum of all values
    total = sum(predictions.values(), 0.0)
    #divide each value by it to get a proportion
    predictions = {k: v / total for k, v in predictions.items()}
    return predictions
    sorted_preds = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
    return np.array(sorted_preds, dtype=int)[:min(return_top, len(preds)), 0]

In [144]:
#tranform model (default dict of default dict in a dict of dict to save it in pickle)
dct = {k: dict(model[k]) for k in model.keys()}

In [146]:
with open("ngram.pkl", "wb") as f:
    pickle.dump(dct, f)

In [39]:
import sys
sys.setrecursionlimit(4000)

In [None]:
start = time.time()
preds = []
for i, trip in enumerate(X_val):
    if i % 1000 == 0:
        present = np.round(time.time() - start, 2)
        print(f"Done for {i} iterations")
        print(f"Seconds elapsed: {present}")
    pred = pd.Series(predict(trip)).values
    preds.append(pred)

Done for 0 iterations
Seconds elapsed: 0.0
Done for 1000 iterations
Seconds elapsed: 35.53
Done for 2000 iterations
Seconds elapsed: 71.04
Done for 3000 iterations
Seconds elapsed: 106.66
Done for 4000 iterations
Seconds elapsed: 142.55


In [156]:
ngram = defaultdict(dict, {k: defaultdict(int, dct[k]) for k in dct.keys()})

In [168]:
ngram[(647, 289, 289)].keys()

dict_keys([])

In [48]:
lst = [max(model[k].values()) for k in model.keys() if len(model[k]) > 0]

In [107]:
sum(predict(X_val[0]).values())

0.9999999999999986

In [180]:
a = predict(X_val[0])

In [185]:
pd.Series(a).shape

(39901,)

In [1]:
len(y_val)

NameError: name 'y_val' is not defined

In [56]:
lst.index(1302)

16

In [174]:
to_test = 1000
preds = [predict(trip, return_top=4) for trip in X_val[500:1000]]

In [175]:
res_df = pd.DataFrame(preds)
res_df["true"] = y_val[500:1000]
res_df["good"] = res_df.apply(lambda x: x["true"] in x[[0, 1, 2, 3]].values, axis=1)
print(f"Percentage of good reco: {res_df.good.sum() / res_df.shape[0]}%")

Percentage of good reco: 0.458%


In [176]:
res_df["top1"] = res_df.apply(lambda x: x["true"] in x[[0]].values, axis=1)
print(f"Percentage of top reco: {res_df.top1.sum() / res_df.shape[0]}%")

Percentage of top reco: 0.254%


In [29]:
len(lst_top3)

193

In [30]:
for i in res_df.loc[res_df.top1].index:
    if i not in lst_top3:
        lst_top3.append(i)

In [32]:
len(lst_top3)

260

In [37]:
a = map(lambda x: predict(x), X_val[:100])

In [38]:
pd.DataFrame(a)

Unnamed: 0,0,1,2,3
0,33628,5216,23852,1127.0
1,10496,2648,26363,30563.0
2,1127,35570,420,5216.0
3,4966,22871,12597,20168.0
4,35611,5762,2925,34077.0
...,...,...,...,...
95,8040,37734,7830,26688.0
96,36981,25176,13306,17066.0
97,10496,10926,13761,18124.0
98,22564,25951,7535,22286.0


In [83]:
model[tuple([610])]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {36905: 1,
             19894: 14,
             6785: 2,
             22785: 2,
             21389: 1,
             26030: 11,
             1803: 5,
             14246: 5,
             8040: 102,
             15863: 24,
             24837: 2,
             26962: 1,
             23137: 3,
             33859: 5,
             9433: 1,
             30841: 26,
             26209: 2,
             6307: 2,
             39669: 23,
             1891: 7,
             31917: 2,
             16295: 8,
             14994: 1,
             2736: 8,
             30330: 22,
             36181: 1,
             10830: 4,
             8965: 3,
             35106: 2,
             6780: 12,
             37719: 1,
             18833: 16,
             25656: 1,
             14627: 5,
             20021: 3,
             39554: 4,
             26818: 1,
             20653: 1,
             21796: 1,
             36577: 1,
             1953

In [88]:
preds

[array([33628,  5216, 23852,  1127]),
 array([10496,  2648, 26363, 30563]),
 array([ 1127, 35570,   420,  5216]),
 array([ 4966, 22871, 12597, 20168]),
 array([35611,  5762,  2925, 34077]),
 array([ 2382, 19316, 12625]),
 array([13761,   136, 29478, 32942]),
 array([16235,  4623, 19408, 18652]),
 array([ 8040, 30841, 15863, 39669]),
 array([ 9462, 19408, 18652, 28721]),
 array([11469, 28633, 27909, 37991]),
 array([13226,  7766, 12314, 14754]),
 array([12000, 36981, 34786, 26368]),
 array([ 1414,  4333, 27962,  9018]),
 array([32660, 27410, 12959,  9752]),
 array([22871, 29077,  4966,  2410]),
 array([ 9048, 31266, 36209, 34600]),
 array([12000, 17066, 25176, 19824]),
 array([27962, 16861,  5483,  4333]),
 array([ 3925, 33577,   903, 37024]),
 array([16235, 18652,  9462, 29002]),
 array([ 4055, 18499, 20653,   396]),
 array([ 5040, 29704, 30288, 17837]),
 array([28148, 32861, 19342,  7690]),
 array([10864, 39379, 14050,  2229]),
 array([ 9068,  9210, 25674, 31898]),
 array([36765, 2398

In [74]:
dict(model[tuple(X_val[167][4:])])

{29890: 4,
 10112: 46,
 2077: 170,
 10729: 3,
 5715: 33,
 30330: 454,
 19681: 1,
 34405: 45,
 15706: 1,
 3357: 21,
 14050: 23,
 601: 28,
 18020: 141,
 25674: 16,
 39379: 161,
 31268: 8,
 15545: 38,
 21317: 155,
 12848: 2,
 7203: 4,
 8040: 22,
 12106: 48,
 26447: 2,
 31586: 3,
 22286: 15,
 4757: 1,
 1126: 5,
 19140: 5,
 37180: 16,
 844: 1,
 4328: 10,
 11772: 2,
 10381: 16,
 24496: 1,
 15328: 23,
 5316: 12,
 12867: 3,
 17583: 18,
 25: 1,
 2805: 8,
 38342: 90,
 32497: 1,
 10864: 20,
 28564: 2,
 24788: 13,
 32619: 10,
 8984: 34,
 8725: 1,
 11933: 4,
 6232: 8,
 26236: 16,
 11137: 2,
 30967: 1,
 31767: 8,
 37638: 6,
 38293: 5,
 11078: 2,
 9752: 10,
 19358: 1,
 25113: 4,
 14195: 4,
 15215: 7,
 15670: 1,
 36676: 13,
 19781: 2,
 37991: 5,
 25813: 5,
 6019: 1,
 21132: 1,
 28082: 4,
 35932: 1,
 24656: 2,
 10175: 1,
 30343: 1,
 2737: 1,
 22563: 1,
 1761: 3,
 22436: 3,
 16684: 1,
 4296: 5,
 7074: 27,
 11883: 9,
 39187: 1,
 11469: 4,
 33386: 1,
 31886: 4,
 38963: 1,
 32172: 1,
 23415: 6,
 20192: 1,


In [72]:
predict(X_val[167])

array([30330,  2077, 39379, 21317])

In [73]:
y_val[167]

21317

In [177]:
with open("Preprocessing_encoders.pkl", "rb") as f:
    encode_cities, encode_devices, encode_affiliate_id, encode_hotel_country, encode_booker_country = pickle.load(f)

In [203]:
v = encode_cities.transform([56430, 11543,  5797])
a = encode_cities.transform([50075, 44768, 29207])
c = encode_cities.transform([47759, 46411, 52933, 35160])
d = encode_cities.transform([57236, 57236,  4932])
predict(v)

array([24864, 27754, 17743, 23570])

In [207]:
encode_cities.inverse_transform(predict(d))

array([60222, 17013, 50957, 67025])

In [107]:
a = map(predict, X_test[:20])

In [108]:
pd.DataFrame(a)

Unnamed: 0,0,1,2,3
0,2845.0,26590.0,11064.0,18124.0
1,31282.0,34280.0,24510.0,3632.0
2,13306.0,36981.0,12000.0,17066.0
3,30604.0,31492.0,6560.0,6870.0
4,2925.0,10040.0,5762.0,30130.0
5,33395.0,4430.0,22405.0,13525.0
6,38173.0,5106.0,23930.0,16499.0
7,,,,
8,39379.0,18020.0,30330.0,15215.0
9,32494.0,9678.0,,


In [197]:
predictions = [predict(trip, return_top=4) for trip in X_test[:100]]

In [198]:
predictions

[{0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  11: 0,
  12: 0,
  13: 0,
  14: 0,
  15: 0,
  16: 0,
  17: 0,
  18: 0,
  19: 0,
  20: 0,
  21: 0,
  22: 0,
  23: 0,
  24: 0,
  25: 0,
  26: 0,
  27: 0,
  28: 0,
  29: 0,
  30: 0,
  31: 0,
  32: 0,
  33: 0,
  34: 0,
  35: 0,
  36: 0,
  37: 0,
  38: 0,
  39: 0,
  40: 0,
  41: 0,
  42: 0,
  43: 0,
  44: 0,
  45: 0,
  49: 0,
  50: 0,
  51: 0,
  52: 0,
  54: 0,
  55: 0,
  56: 0,
  57: 0,
  58: 0,
  59: 0,
  60: 0,
  61: 0,
  62: 0,
  63: 0,
  64: 0,
  65: 0,
  66: 0,
  67: 0,
  69: 0,
  70: 0,
  71: 0,
  72: 0,
  73: 0,
  74: 0,
  75: 0,
  76: 0,
  78: 0,
  79: 0,
  80: 0,
  81: 0,
  82: 0,
  83: 0,
  84: 0,
  85: 0,
  86: 0,
  87: 0,
  88: 0,
  89: 0,
  90: 0,
  91: 0,
  93: 0,
  94: 0,
  95: 0,
  96: 0,
  97: 0,
  98: 0,
  99: 0,
  100: 0,
  101: 0,
  103: 0,
  104: 0,
  106: 0,
  107: 0,
  108: 0,
  110: 0,
  111: 0,
  112: 0,
  113: 0,
  114: 0,
  115: 0,
  116: 0,
  118: 0,
  119: 0,
  120: 0,
  121: 0,
  

In [160]:
res_df = pd.DataFrame(predictions)

In [161]:
res_df["true"] = y_test[1000:5000]

In [162]:
res_df["good"] = res_df.apply(lambda x: x["true"] in x[[0, 1, 2, 3]].values, axis=1)

In [163]:
res_df

Unnamed: 0,0,1,2,3,true,good
0,39466.0,17350.0,14181.0,12994.0,39466,True
1,8646.0,14066.0,1622.0,13047.0,14066,True
2,15545.0,37458.0,12730.0,38878.0,15545,True
3,28148.0,32861.0,19342.0,2487.0,39315,False
4,9752.0,12959.0,32660.0,34292.0,12959,True
...,...,...,...,...,...,...
3995,20032.0,15317.0,14063.0,9891.0,35550,False
3996,25083.0,31492.0,34382.0,730.0,25083,True
3997,1622.0,11411.0,13047.0,27754.0,36344,False
3998,34841.0,39079.0,3269.0,9815.0,16850,False


In [164]:
print(f"Percentage of good reco: {res_df.good.sum() / res_df.shape[0]}%")

Percentage of good reco: 0.62975%


In [165]:
res_df["top1"] = res_df.apply(lambda x: x["true"] in x[[0]].values, axis=1)
print(f"Percentage of top1 reco: {res_df.top1.sum() / res_df.shape[0]}%")

Percentage of top1 reco: 0.42675%


In [166]:
res_df["top2"] = res_df.apply(lambda x: x["true"] in x[[0, 1]].values, axis=1)
print(f"Percentage of top2 reco: {res_df.top2.sum() / res_df.shape[0]}%")

Percentage of top2 reco: 0.52725%


In [167]:
res_df["top3"] = res_df.apply(lambda x: x["true"] in x[[0, 1, 2]].values, axis=1)
print(f"Percentage of top3 reco: {res_df.top3.sum() / res_df.shape[0]}%")

Percentage of top3 reco: 0.5875%


In [604]:
predict([57236, 57236, 4932])

array([60222., 15343.,  4932., 50957.])

In [583]:
[[56430, 11543, 5797],
 [50075, 44768, 29207],
 [47759, 46411, 52933, 35160],
 [57236, 57236, 4932]]

array([16065,  8726, 15564,  6559])

In [440]:
#Build new trips
new_targets = []
new_trips = []
for trip in trips:
    if len(trip) == 1:
        continue
    for i in range(1, len(trip)):
        for elements in TimeseriesGenerator(trip, trip, i):
            #elements contain the list of i cities and list of i targets in a tuple
            tpls = elements[0]
            targets = elements[1]
            n = len(tpls)
            for j in range(n):
                new_targets.append(targets[j])
                new_trips.append(tpls[j])


In [443]:
trips

utrip_id
1000027_1                          [8183, 15626, 60902, 30628]
1000033_1                  [38677, 52089, 21328, 27485, 38677]
1000045_1     [64876, 55128, 9608, 31817, 36170, 58178, 36063]
1000083_1                         [55990, 14705, 35160, 36063]
100008_1                     [11306, 12096, 6761, 6779, 65690]
                                   ...                        
999776_1                          [17775, 66634, 17775, 17775]
999839_1                            [8335, 21328, 8335, 48968]
999842_1                          [51291, 66969, 67169, 24036]
999855_1     [382, 38509, 18930, 38509, 51145, 11179, 61881...
999944_1                            [17944, 47075, 228, 62930]
Name: city_id, Length: 217686, dtype: object

In [441]:
new_targets

[15626,
 60902,
 30628,
 60902,
 30628,
 30628,
 52089,
 21328,
 27485,
 38677,
 21328,
 27485,
 38677,
 27485,
 38677,
 38677,
 55128,
 9608,
 31817,
 36170,
 58178,
 36063,
 9608,
 31817,
 36170,
 58178,
 36063,
 31817,
 36170,
 58178,
 36063,
 36170,
 58178,
 36063,
 58178,
 36063,
 36063,
 14705,
 35160,
 36063,
 35160,
 36063,
 36063,
 12096,
 6761,
 6779,
 65690,
 6761,
 6779,
 65690,
 6779,
 65690,
 65690,
 31088,
 40521,
 55128,
 21033,
 6306,
 6788,
 6788,
 61187,
 42503,
 40521,
 55128,
 21033,
 6306,
 6788,
 6788,
 61187,
 42503,
 55128,
 21033,
 6306,
 6788,
 6788,
 61187,
 42503,
 21033,
 6306,
 6788,
 6788,
 61187,
 42503,
 6306,
 6788,
 6788,
 61187,
 42503,
 6788,
 6788,
 61187,
 42503,
 6788,
 61187,
 42503,
 61187,
 42503,
 42503,
 42482,
 20345,
 33540,
 32627,
 20345,
 33540,
 32627,
 33540,
 32627,
 32627,
 27112,
 17764,
 56651,
 35850,
 17764,
 56651,
 35850,
 56651,
 35850,
 35850,
 57619,
 22065,
 2748,
 46854,
 5797,
 57658,
 22065,
 2748,
 46854,
 5797,
 5765

In [444]:
len(new_trips)

2985431

In [605]:
new_trips

[[8183],
 [15626],
 [60902],
 [8183, 15626],
 [15626, 60902],
 [8183, 15626, 60902],
 [38677],
 [52089],
 [21328],
 [27485],
 [38677, 52089],
 [52089, 21328],
 [21328, 27485],
 [38677, 52089, 21328],
 [52089, 21328, 27485],
 [38677, 52089, 21328, 27485],
 [64876],
 [55128],
 [9608],
 [31817],
 [36170],
 [58178],
 [64876, 55128],
 [55128, 9608],
 [9608, 31817],
 [31817, 36170],
 [36170, 58178],
 [64876, 55128, 9608],
 [55128, 9608, 31817],
 [9608, 31817, 36170],
 [31817, 36170, 58178],
 [64876, 55128, 9608, 31817],
 [55128, 9608, 31817, 36170],
 [9608, 31817, 36170, 58178],
 [64876, 55128, 9608, 31817, 36170],
 [55128, 9608, 31817, 36170, 58178],
 [64876, 55128, 9608, 31817, 36170, 58178],
 [55990],
 [14705],
 [35160],
 [55990, 14705],
 [14705, 35160],
 [55990, 14705, 35160],
 [11306],
 [12096],
 [6761],
 [6779],
 [11306, 12096],
 [12096, 6761],
 [6761, 6779],
 [11306, 12096, 6761],
 [12096, 6761, 6779],
 [11306, 12096, 6761, 6779],
 [17127],
 [31088],
 [40521],
 [55128],
 [21033],
 [63

In [445]:
from sklearn.preprocessing import LabelEncoder

In [446]:
flat_list = [item for sublist in new_trips for item in sublist]

In [451]:
a = np.concatenate([flat_list, new_targets])

In [453]:
num_cities = len(np.unique(a))

In [524]:
new_trips = [list(trip) for trip in new_trips]

In [606]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_trips, new_targets, random_state=0)

In [1]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, LSTM, Bidirectional, Dropout, concatenate, Input, Masking, InputLayer
from tensorflow.keras.regularizers import L1L2

cities_length = len(X_padded[0])
features_length = filtered_features.shape[1],
embedding_size = 300
num_cities = num_cities + 1
reg = L1L2(l1=0.01, l2=0.01)


nlp_cities_input = Input(shape=(None,1)) 

#First RNN on cities
latent = Masking()(nlp_cities_input)
emb_cities = Embedding(output_dim=embedding_size, input_dim=num_cities)(latent) 
lstm_cities_1 = LSTM(64, activation="relu", recurrent_dropout=0.3, return_sequences=False, 
                     recurrent_regularizer=reg, kernel_regularizer=reg, bias_regularizer=reg)(emb_cities) 
#lstm_cities_2 = LSTM(16, activation="relu", recurrent_dropout=0.1, return_sequences=True)(lstm_cities_1)
#nlp_cities_out = LSTM(8, activation="relu", recurrent_dropout=0.1)(lstm_cities_2)

#Combine two RNN with feature

#Add another layer
pre_output = Dense(195, activation="relu",)(lstm_cities_1)
#drop = Dropout(0.5)(pre_output)

#Predict
output = Dense(num_cities, activation='softmax')(pre_output) 
 
model = Model(inputs=[nlp_cities_input], outputs=[output])

SyntaxError: invalid syntax (<ipython-input-1-9d6bded60c79>, line 15)

In [None]:
model.summary()

In [571]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [572]:
def rework_data(X, y, len_trip=4):
    lens = [len(trip) for trip in X]
    mask = (np.array(lens) == len_trip)
    X = np.array(X)[mask]
    y = np.array(y)[mask]
    X = np.array([list(trip) for trip in X])
    
    X = [encode_cities.transform(trip)+1 for trip in X]
    y = encode_cities.transform(y) + 1
    
    print(f"There is {len(y)} individuals in this set")
    return np.array([trip for trip in X]), y

In [573]:
X_train, y_train = rework_data(X_train, y_train, 4)

There is 222891 individuals in this set


In [574]:
X_test, y_test = rework_data(X_test, y_test, 4)

There is 74480 individuals in this set


In [576]:
history = model.fit(x=X_train,
          y=y_train, 
          validation_data=(X_test, y_test), epochs=10, batch_size=512,
                   #callbacks=callbacks_list
                   )

Epoch 1/10
Epoch 2/10
Epoch 3/10
 24/436 [>.............................] - ETA: 2:20 - loss: 7.9624 - accuracy: 0.0160

KeyboardInterrupt: 