In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
import pandas as pd
import math
import datetime
import numpy as np

This code tries to predict the prices of booking hotels after learning embeddings of dates and using historical booking prices. 

In [2]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

Have embeddings for Weekday, Month and Date.  Make a sequential model for price prediction in 2020 Feb.  Use transformers for prediction of future prices. 
Explanation for Weekday:
- Saturday and Sunday are supposed to more pricy (higher demand) than others
- Adjacent days are also supposed to be more pricy 

Explanation for Month:
- Festival might be associated with months (like christmas)
- Months might also relate to weather. Some places might have extreme weathers which will result in lower demand in those weather

Explanation for Date:
- Day of the month might also be associated with festivals
- People might be more willing to travel during the start or end of the month depending on the work pattern. One such examples would be During the tax filings or financial year closing time which is towards end of the month less people are likely to travel from a city which is working hub

Explanation for not including year
- If there are some event which are happening in a particular year (like hosting a world cup) we better avoid incorporating such information in our model
- Exception to this might be repititive events which repeats at a frequency of multiplicity of a year. But this has been ignored in preparation of the model

In [3]:
class TransformerModel(nn.Module):
    def __init__(self, seq_len, d_model, nhead, dim_feedforward, num_weekday, num_months, num_days, num_price_bucket, dropout = 0.2):
        super().__init__()
        self.seq_len = seq_len
        self.d_model = d_model
        self.nhead = nhead
        self.dim_feedforward = dim_feedforward
        self.num_weekday = num_weekday
        self.num_months = num_months
        self.num_days = num_days
        self.num_price_bucket = num_price_bucket
        self.pos_encoder = PositionalEncoding(self.d_model, dropout, self.seq_len)
        transformer_encoder_layer = TransformerEncoderLayer(self.d_model, self.nhead, self.dim_feedforward)
        transformer_decoder_layer = TransformerDecoderLayer(self.d_model, self.nhead, self.dim_feedforward)
        self.encoder = TransformerEncoder(transformer_encoder_layer, 2)
        self.decoder = TransformerDecoder(transformer_decoder_layer, 2)
        self.weekday_embedding = nn.Embedding(num_weekday, self.d_model)
        self.month_embedding = nn.Embedding(num_months, self.d_model)
        self.num_days_embedding = nn.Embedding(num_days, self.d_model)
        self.price_bucket_embedding = nn.Embedding(self.num_price_bucket, self.d_model)
        self.linear = nn.Linear(d_model,1)
        self.mask = torch.triu(torch.ones(self.seq_len, self.seq_len), diagonal=1)
        self.mask[self.mask==1]=float('-inf')
        
    def forward(self, enc_inp, dec_inp):
        enc_inp = self.month_embedding(enc_inp[...,0])+self.num_days_embedding(enc_inp[...,1])+self.weekday_embedding(enc_inp[...,2])
        dec_inp = self.price_bucket_embedding(dec_inp)
        enc_inp = enc_inp.transpose(0,-2)
        dec_inp = dec_inp.transpose(0,-2)
        enc_inp = self.pos_encoder(enc_inp)
        dec_inp = self.pos_encoder(dec_inp)
        enc_out = self.encoder(enc_inp)
        dec_out = self.decoder(tgt=dec_inp, memory=enc_out, tgt_mask=self.mask)
        output = self.linear(dec_out)
        return output

In [4]:
SEQLEN = 7
class priceDataset(Dataset):
    def __init__(self, price_df):
        self.data = price_df
        self.default_price_bucket = self.data['price_bucket'].mode()
    
    def __len__(self):
        return len(self.data)-SEQLEN
    
    def __getitem__(self, index):
        enc_inp = torch.tensor(self.data[['month', 'day', 'weekday']][index:index+SEQLEN].values)
        response = torch.tensor(self.data['Price'][index:index+SEQLEN].values,dtype=torch.float32)
        if index:
            dec_inp = torch.tensor(self.data['price_bucket'][index:index+SEQLEN].values)
        else:
            dec_inp = torch.tensor(np.append(self.default_price_bucket,self.data['price_bucket'][index:index+SEQLEN-1]))
        if dec_inp.shape[0]<SEQLEN:
            dec_inp = F.pad(dec_inp, pad=(0,SEQLEN-dec_inp.shape[0]), mode='constant', value=0)
            enc_inp = F.pad(enc_inp, pad=(0,0,0,SEQLEN-dec_inp.shape[0]), mode='constant', value=0)
            response = F.pad(response, pad=(0,SEQLEN-dec_inp.shape[0]), mode='constant', value=0)
        return enc_inp, dec_inp, response


In [5]:
path = 'price_data.csv'

LRANGE = 80
URANGE = 199
RANGELENGTH = 20

###################################################################
#value of LRANGE, URANGE and RANGELENGTH has been decided from EDA#
###################################################################

def make_price_buckets(price):
    return (min(max(price,LRANGE),URANGE)-LRANGE)//RANGELENGTH

def prepare_data(path):
    '''
    takes path of the csv and prepares data for pytorch dataset function
    '''
    price_df = pd.read_csv(path)
    '''
    format date and extract day, month and weekday from date
    '''
    price_df['Date'] = pd.to_datetime(price_df['Date'],format='%m/%d/%Y')
    price_df['weekday'], price_df['month'], price_df['day'] = price_df['Date'].dt.weekday, price_df['Date'].dt.month, price_df['Date'].dt.day
    '''
    make month and day 0 based index for the ease in using embedding
    '''
    price_df['month'] = price_df['month']-1
    price_df['day'] = price_df['day'] - 1
    '''
    make price bucket for embedding prices into decoder. Bar the prices at 80-200. 
    '''
    price_df['price_bucket'] = price_df['Price'].apply(lambda x: make_price_buckets(x))
    return price_df
price_df = prepare_data(path)

In [6]:
price_df.head(5)

Unnamed: 0,Date,Price,weekday,month,day,price_bucket
0,2012-01-01,99,6,0,0,0
1,2012-01-02,95,0,0,1,0
2,2012-01-03,96,1,0,2,0
3,2012-01-04,95,2,0,3,0
4,2012-01-05,93,3,0,4,0


In [7]:
device = torch.device('cpu')

d_model = 4
nhead = 2
dim_feedforward = 32
num_weekday = 7
num_months = 12
num_days = 31
num_price_bucket = 6
EPOCHS = 128

price_dataset = priceDataset(price_df)

validation_split = 0.05
indices = list(range(len(price_dataset)))
split = int(len(indices)*validation_split)

train_split, val_split = indices[:-split], indices[-split:]

model = TransformerModel(seq_len=SEQLEN, d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, num_weekday=num_weekday, num_months=num_months, num_days=num_days, num_price_bucket=num_price_bucket)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss(reduction='mean')

price_train_dataloader = DataLoader(price_dataset, batch_size=30, num_workers = 12, sampler = train_split)
price_val_dataloader = DataLoader(price_dataset, batch_size=30, num_workers=6, sampler=val_split)
model.to(device)

for epoch in range(EPOCHS):
    epoch_loss = 0
    cnt = 1
    for enc_inp, dec_inp, response in price_train_dataloader:
        enc_inp = enc_inp.to(device)
        dec_inp = dec_inp.to(device)
        response = response.to(device)
        response = response.transpose(0,1)
        output = model(enc_inp, dec_inp)
        output = output.squeeze()
        loss = loss_fn(response, output)
        epoch_loss+=loss
        
        loss.backward()
        optimizer.step()
        
        cnt+=1
    print(f'epoch_number_train:{epoch} epoch_loss_train: {epoch_loss/cnt}\n')
    with torch.no_grad():
        epoch_loss_val = 0
        cnt=0
        for enc_inp, dec_inp, response in price_val_dataloader:
            response = response.transpose(0,1)
            output = model(enc_inp, dec_inp)
            output = output.squeeze()
            loss = loss_fn(response, output)
            epoch_loss_val += loss
            cnt+=1
        
    print(f'epoch_number_val:{epoch} epoch_loss_val: {epoch_loss_val/cnt}\n')


epoch_number_train:0 epoch_loss_train: 12719.77734375

epoch_number_val:0 epoch_loss_val: 15803.1748046875

epoch_number_train:1 epoch_loss_train: 12581.4345703125

epoch_number_val:1 epoch_loss_val: 15649.9580078125

epoch_number_train:2 epoch_loss_train: 12428.41796875

epoch_number_val:2 epoch_loss_val: 15464.6044921875

epoch_number_train:3 epoch_loss_train: 12254.0185546875

epoch_number_val:3 epoch_loss_val: 15246.0830078125

epoch_number_train:4 epoch_loss_train: 12047.9501953125

epoch_number_val:4 epoch_loss_val: 14998.2373046875

epoch_number_train:5 epoch_loss_train: 11819.4189453125

epoch_number_val:5 epoch_loss_val: 14721.0625

epoch_number_train:6 epoch_loss_train: 11564.4052734375

epoch_number_val:6 epoch_loss_val: 14414.3701171875

epoch_number_train:7 epoch_loss_train: 11286.0595703125

epoch_number_val:7 epoch_loss_val: 14083.6826171875

epoch_number_train:8 epoch_loss_train: 10985.4462890625

epoch_number_val:8 epoch_loss_val: 13722.9150390625

epoch_number_train:9

epoch_number_val:73 epoch_loss_val: 400.1477966308594

epoch_number_train:74 epoch_loss_train: 571.1250610351562

epoch_number_val:74 epoch_loss_val: 358.8323974609375

epoch_number_train:75 epoch_loss_train: 557.0470581054688

epoch_number_val:75 epoch_loss_val: 384.0614318847656

epoch_number_train:76 epoch_loss_train: 581.67919921875

epoch_number_val:76 epoch_loss_val: 328.6908874511719

epoch_number_train:77 epoch_loss_train: 596.081787109375

epoch_number_val:77 epoch_loss_val: 311.36163330078125

epoch_number_train:78 epoch_loss_train: 612.5618896484375

epoch_number_val:78 epoch_loss_val: 337.3544006347656

epoch_number_train:79 epoch_loss_train: 606.1331176757812

epoch_number_val:79 epoch_loss_val: 360.4973449707031

epoch_number_train:80 epoch_loss_train: 592.9929809570312

epoch_number_val:80 epoch_loss_val: 370.83056640625

epoch_number_train:81 epoch_loss_train: 585.9598999023438

epoch_number_val:81 epoch_loss_val: 367.748779296875

epoch_number_train:82 epoch_loss_train

Create a test dataframe

In [8]:
test_df = price_df.iloc[-6:].reset_index(drop=True)

In [9]:
test_df.head(6)

Unnamed: 0,Date,Price,weekday,month,day,price_bucket
0,2016-01-11,139,0,0,10,2
1,2016-01-12,147,1,0,11,3
2,2016-01-13,150,2,0,12,3
3,2016-01-14,148,3,0,13,3
4,2016-01-15,149,4,0,14,3
5,2016-01-16,147,5,0,15,3


In [10]:
dates = pd.date_range('2020-02-01','2020-02-29',freq='D')

In [11]:
df_temp = pd.DataFrame({'Date':dates})
test_df = pd.concat([test_df, df_temp], ignore_index=True).reindex(columns=test_df.columns)

In [12]:
test_df['weekday'], test_df['month'], test_df['day'] = test_df['Date'].dt.weekday, test_df['Date'].dt.month, test_df['Date'].dt.day

In [13]:
test_len = test_df.shape[0]
default_price_bucket = price_df['price_bucket'].mode()

In [14]:
def getitem(test_df, index):
    enc_inp = torch.tensor(test_df[['month', 'day', 'weekday']][index:index+SEQLEN].values)
    if index:
        dec_inp = torch.tensor(test_df['price_bucket'][index-1:index+SEQLEN-1].values, dtype = torch.long)
    else:
        dec_inp = torch.tensor(np.append(default_price_bucket,test_df['price_bucket'][index:index+SEQLEN-1]), dtype = torch.long)
    return enc_inp, dec_inp

In [15]:
for i in range(test_len-SEQLEN+1):
    enc_inp, dec_inp = getitem(test_df, i)
    enc_inp = enc_inp.unsqueeze(0)
    dec_inp = dec_inp.unsqueeze(0)
    response = model(enc_inp, dec_inp)
    response = response[6].item()
    test_df.loc[i+SEQLEN-1,'Price'] = response
    test_df.loc[i+SEQLEN-1, 'price_bucket'] = make_price_buckets(response)
    #breakpoint()
    

In [16]:
test_df['Price'] = test_df['Price'].round(2,)

In [17]:
test_df = test_df.loc[6:]

In [18]:
test_df.head(10)

Unnamed: 0,Date,Price,weekday,month,day,price_bucket
6,2020-02-01,106.41,5,2,1,1.0
7,2020-02-02,110.18,6,2,2,1.0
8,2020-02-03,117.59,0,2,3,1.0
9,2020-02-04,109.98,1,2,4,1.0
10,2020-02-05,109.7,2,2,5,1.0
11,2020-02-06,109.65,3,2,6,1.0
12,2020-02-07,111.42,4,2,7,1.0
13,2020-02-08,109.95,5,2,8,1.0
14,2020-02-09,130.78,6,2,9,2.0
15,2020-02-10,112.14,0,2,10,1.0


In [19]:
test_df.to_csv('result.csv', index=False)