In [1]:
import pandas as pd
import torch.utils.data as Data
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn import init
import matplotlib.pyplot as plt
from torch import nn

In [209]:
HIDDEN_LAYERS = 5
B_INIT = -0.2
ACTIVATION = F.relu
class FcNet(torch.nn.Module):
    def __init__(self, num_features, dim_out, batch_normalization=True):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(FcNet, self).__init__()
        self.do_bn = batch_normalization
        self.fcs = []
        self.bns = []
        self.bn_input = nn.BatchNorm1d(num_features, momentum=0.5)   # for input data
        
        for i in range(HIDDEN_LAYERS):
            input_size = num_features if i == 0 else 20 # each hidden layer has 10 neurons
            fc = nn.Linear(input_size, 20)
            setattr(self, 'fc%i' % i, fc) # such that pytorch can distinguish different layers
            self._set_init(fc)
            self.fcs.append(fc)
            if self.do_bn:
                bn = nn.BatchNorm1d(20, momentum=0.5)
                setattr(self, 'bn%i' % i, bn)
                self.bns.append(bn)
            
        self.predict = nn.Linear(20, 1)
        self._set_init(self.predict)
    
    def _set_init(self, layer):
        init.normal(layer.weight, mean=0, std=0.1)
        init.constant(layer.bias, B_INIT)

    def forward(self, x):
        """
        In the forward function we accept a Variable of input data and we must return
        a Variable of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Variables.
        """
        pre_activation = [x]
        if self.do_bn: x = self.bn_input(x)
        layer_input = [x]
        for i in range(HIDDEN_LAYERS):
            x = self.fcs[i](x)
            pre_activation.append(x)
            if self.do_bn:
                x = self.bns[i](x)
            x = ACTIVATION(x)
            layer_input.append(x)
        out = self.predict(x)
        return out, layer_input, pre_activation

In [148]:
# df = pd.read_csv('route16.csv', index_col=0)
# no_dirty_cols = ['TT', 'flow','holiday','hldy_seq','weekday','timeslot',
#                  'occ1', 'occ2', 'occ3', 'occ4',
#                  'Spd1','Spd2','Spd3','Spd4']
# df[no_dirty_cols].to_csv('route16_1603.csv')

In [235]:
df = pd.read_csv('route15.csv', index_col=0)

In [241]:
def condition(s):
    for i in ['SpdAvg', 'SpdTrnd', 'occ', 'Evt', 'Lanes'] + ['W_TT','S_TT']:
        if s.startswith(i):
            return True
    return False
keep_cols = [col for col in df.columns if condition(col)]
keep_cols += ['hldy_seq', 'weekday', 'timeslot', 'TT_mean']
keep_cols

['W_TT',
 'S_TT',
 'SpdAvg0',
 'occ0',
 'SpdAvg1',
 'occ1',
 'SpdAvg2',
 'occ2',
 'SpdAvg3',
 'occ3',
 'SpdAvg4',
 'occ4',
 'SpdAvg5',
 'occ5',
 'SpdTrnd0',
 'SpdTrnd1',
 'SpdTrnd2',
 'SpdTrnd3',
 'SpdTrnd4',
 'SpdTrnd5',
 'Evt0',
 'Evt1',
 'Evt2',
 'Evt3',
 'Evt4',
 'Evt5',
 'Lanes0',
 'Lanes1',
 'Lanes2',
 'Lanes3',
 'Lanes4',
 'Lanes5',
 'hldy_seq',
 'weekday',
 'timeslot',
 'TT_mean']

In [240]:
cols

['SpdAvg0',
 'occ0',
 'SpdAvg1',
 'occ1',
 'SpdAvg2',
 'occ2',
 'SpdAvg3',
 'occ3',
 'SpdAvg4',
 'occ4',
 'SpdAvg5',
 'occ5',
 'SpdTrnd0',
 'SpdTrnd1',
 'SpdTrnd2',
 'SpdTrnd3',
 'SpdTrnd4',
 'SpdTrnd5',
 'Evt0',
 'Evt1',
 'Evt2',
 'Evt3',
 'Evt4',
 'Evt5',
 'Lanes0',
 'Lanes1',
 'Lanes2',
 'Lanes3',
 'Lanes4',
 'Lanes5']

In [236]:
train_df = df.copy()

In [174]:
# feature set 1
train_df['TT_prv2'] = df['TT'].shift(2)
train_df['TT_prv3'] = df['TT'].shift(3)
train_df['flow_prv2'] = df['flow'].shift(3)
train_df['flow_prv3'] = df['flow'].shift(3)
train_df[['occ1_prv2', 'occ2_prv2', 'occ3_prv2', 'occ4_prv2']] = train_df[['occ1', 'occ2', 'occ3', 'occ4']].shift(2)
train_df[['occ1_prv3', 'occ2_prv3', 'occ3_prv3', 'occ4_prv3']] = train_df[['occ1', 'occ2', 'occ3', 'occ4']].shift(3)
train_df[['Spd1_prv2','Spd2_prv2','Spd3_prv2','Spd4_prv2']] = train_df[['Spd1','Spd2','Spd3','Spd4']].shift(2)
train_df[['Spd1_prv3','Spd2_prv3','Spd3_prv3','Spd4_prv3']] = train_df[['Spd1','Spd2','Spd3','Spd4']].shift(3)
drop_cols = ['TT', 'flow',
             'occ1', 'occ2', 'occ3', 'occ4',
             'Spd1','Spd2','Spd3','Spd4']
train_df = train_df.drop(drop_cols, axis=1)

In [237]:
# feature set 2
base_features = ['hldy_seq', 'weekday', 'timeslot', 'TT_mean','W_TT','S_TT']
train_df = df[base_features]

KeyError: "['SpdAvg' 'SpdTrnd' 'occ' 'Evt' 'Lanes'] not in index"

In [121]:
drop_columns = ['Closed2','Closed3','Closed4','Closed5','Ramp0','Ramp1','Ramp2','Ramp3','Ramp4','Ramp5',
                'Evt4','Evt5','Lanes0','Lanes1','Lanes2','Lanes3','Lanes4','Lanes5','Closed0','Closed1',
                'SpdTrnd3','density3','SpdTrnd4','density4','SpdTrnd5','density5','Evt0','Evt1','Evt2','Evt3',
                'FT5','flow5','occ5','lanes5','SpdTrnd0','density0','SpdTrnd1','density1','SpdTrnd2','density2',
                'SpdLast4','JT4','FT4','flow4','occ4','lanes4','Spd5','SpdAvg5','SpdLast5','JT5',
                'Spd3','SpdAvg3','SpdLast3','JT3','FT3','flow3','occ3','lanes3','Spd4','SpdAvg4',
                'occ1','lanes1','Spd2','SpdAvg2','SpdLast2','JT2','FT2','flow2','occ2','lanes2',
                'FT0','flow0','occ0','lanes0','Spd1','SpdAvg1','SpdLast1','JT1','FT1','flow1',
                'Spd0','SpdAvg0','SpdLast0','JT0','TT_nxt3','TT_nxt6','TT_nxt12','W_TT','S_TT','L_TT','TT_chng1',
                'TT_mean','flow_mean','flow_chng1','flow_chng2']

keep_columns = ['holiday','hldy_seq','weekday','timeslot',
                'TT_prv2','TT_prv3',
                'flow_prv2','flow_prv3']

train_df = df[keep_columns]

In [177]:
from datetime import datetime, timedelta

In [178]:
val_start_date, val_end_date = '20170701', '20170731' # include
train_start_date, train_end_date = '20170301', '20170630'

In [222]:
def get_data_loader(mins='30', batch_size=64):
    target = pd.DataFrame(df['TT'])
    target['TT_30min_later'] = target['TT'].shift(periods = -6) # 30min
    target['TT_60min_later'] = target['TT'].shift(periods = -12) # 60min
    train_entries = (datetime.strptime(train_end_date, '%Y%m%d') - datetime.strptime(train_start_date, '%Y%m%d')).days * 288 # days * 5mins
    val_entries = (datetime.strptime(val_end_date, '%Y%m%d') - datetime.strptime(val_start_date, '%Y%m%d')).days * 288
    start_index = df.index.get_loc(datetime.strptime(train_start_date, '%Y%m%d').strftime('%Y-%m-%d %H:%M:%S'))
    
    x_train, x_val, x_test = train_df[start_index: start_index+train_entries].fillna(-99), train_df[start_index+train_entries:start_index+train_entries+val_entries].fillna(-99), train_df[start_index+train_entries+val_entries:].fillna(-99)
    if mins=='30':
        y_train, y_val, y_test = target[start_index: start_index+train_entries]['TT_30min_later'].fillna(-1), target[start_index+train_entries:start_index+train_entries+val_entries]['TT_30min_later'].fillna(-1), target[start_index+train_entries+val_entries:]['TT_30min_later'].fillna(-1)
    elif mins=='60':
        y_train, y_val, y_test = target[start_index: start_index+train_entries]['TT_60min_later'].fillna(-1), target[start_index+train_entries:start_index+train_entries+val_entries]['TT_60min_later'].fillna(-1), target[start_index+train_entries+val_entries:]['TT_60min_later'].fillna(-1)
    x_size = x_train.shape[1]
    y_size = 1 # for regression
    x_train, y_train = torch.from_numpy(x_train.values).float(), torch.from_numpy(y_train.values).float()
    x_val, y_val = Variable(torch.from_numpy(x_val.values).float()).cuda(), Variable(torch.from_numpy(y_val.values).float()).cuda()

    train_dataset = Data.TensorDataset(data_tensor=x_train, target_tensor=y_train)
    train_loader = Data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=2,)
    return train_loader, x_val, y_val

In [223]:
def mape_loss(predict, target, to_numpy=False, model=None, in_=None):
    import numpy
    assert ((type(predict) == type(target) == torch.autograd.variable.Variable) or
            type(predict) == type(target) == numpy.ndarray), 'check the type of inputs, they have to be all torch Variable or numpy ndarray'
    if type(predict) == type(target) == torch.autograd.variable.Variable:
        if to_numpy == True:
            import numpy as np
            loss = (np.sum(np.abs((target.type(torch.FloatTensor).data.numpy().flatten() - 
                                  model(in_)[0].type(torch.FloatTensor).data.numpy().flatten())) 
                          / target.type(torch.FloatTensor).data.numpy().flatten()) 
                    / len(target))
            return loss
        loss = (1 / predict.shape[0] * 
               torch.sum((target - predict).abs() / target))
    elif type(predict) == type(target) == numpy.ndarray:
        loss = (1 / predict.shape[0] * 
               numpy.sum(numpy.abs(target.flatten() - predict.flatten()) / target.flatten()))   
    return loss

In [224]:
epoch = 100

data_loader30, x_val30, y_val30 = get_data_loader('30')
data_loader60, x_val60, y_val60 = get_data_loader('60')
x_size = data_loader30.dataset.data_tensor.shape[1]
y_size = 1
model_30 = FcNet(x_size, y_size)
model_60 = FcNet(x_size, y_size)

model_30.cuda() # using gpu
model_60.cuda()
loss_func = torch.nn.MSELoss()
loss_func = mape_loss
#optimizer = torch.optim.SGD(model.parameters(), lr=0.03)
optimizer30 = torch.optim.Adam(model_30.parameters(), lr=0.03)
optimizer60 = torch.optim.Adam(model_60.parameters(), lr=0.03)

def train(train_loader, x_val, y_val, optimizer, model):
    for t in range(epoch):  
        for step, (b_x, b_y) in enumerate(train_loader):
            b_x, b_y = Variable(b_x).type(torch.FloatTensor).cuda(), Variable(b_y).type(torch.FloatTensor).cuda()
            pred, _, _ = model(b_x)
            loss = loss_func(pred, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        y_val_predict, _, _ = model(x_val)
        print(str(t+1))
        print('training error:', loss.data[0])
        print('validation error:', loss_func(y_val_predict, y_val).data[0])
    

In [225]:
train(data_loader30, x_val30, y_val30, optimizer30, model_30)

1
training error: 1.562144160270691
validation error: 943.390625
2
training error: 1.59318208694458
validation error: 732.0245361328125
3
training error: 1.6245827674865723
validation error: 1073.5233154296875
4
training error: 1.3899320363998413
validation error: 690.969970703125
5
training error: 1.2622833251953125
validation error: 870.482177734375
6
training error: 2.233020544052124
validation error: 734.7381591796875
7
training error: 1.44230055809021
validation error: 742.6342163085938
8
training error: 1.4237971305847168
validation error: 670.364501953125
9
training error: 1.4169710874557495
validation error: 743.3408203125
10
training error: 1.464977741241455
validation error: 717.8665771484375
11
training error: 1.7594354152679443
validation error: 659.8575439453125
12
training error: 1.1132797002792358
validation error: 670.0648803710938
13
training error: 1.2626689672470093
validation error: 687.658203125
14
training error: 1.617522954940796
validation error: 695.70190429687

In [226]:
train(data_loader60, x_val60, y_val60, optimizer60, model_60)

1
training error: 1.6107721328735352
validation error: 697.9923706054688
2
training error: 2.1769866943359375
validation error: 839.1197509765625
3
training error: 1.7473762035369873
validation error: 727.073974609375
4
training error: 1.8863675594329834
validation error: 795.1375732421875
5
training error: 1.7523431777954102
validation error: 771.2346801757812
6
training error: 2.252880096435547
validation error: 706.1643676757812
7
training error: 1.1990128755569458
validation error: 849.4760131835938
8
training error: 1.4467501640319824
validation error: 698.3970947265625
9
training error: 1.3972413539886475
validation error: 814.6390380859375
10
training error: 1.3960238695144653
validation error: 674.3628540039062
11
training error: 1.7760405540466309
validation error: 747.7340698242188
12
training error: 2.107597827911377
validation error: 673.779296875
13
training error: 1.3273688554763794
validation error: 728.6651611328125
14
training error: 1.5338786840438843
validation error

In [202]:
import matplotlib.pyplot as plt
predict30 = model_30(Variable(data_loader30.dataset.data_tensor).type(torch.FloatTensor).cuda())[0].type(torch.FloatTensor).data.numpy()
ground_truth30 = data_loader30.dataset.target_tensor.numpy()
predict60 = model_60(Variable(data_loader30.dataset.data_tensor).type(torch.FloatTensor).cuda())[0].type(torch.FloatTensor).data.numpy()
ground_truth60 = data_loader60.dataset.target_tensor.numpy()

In [48]:
def save_figs(predict, ground_truth, x_val, y_val, mins='30'):
    x = range(288)
    if mins == '30':
        val_predict = model_30(x_val.type(torch.FloatTensor).cuda())[0].type(torch.FloatTensor).data.numpy().flatten()
    elif mins == '60':
        val_predict = model_60(x_val.type(torch.FloatTensor).cuda())[0].type(torch.FloatTensor).data.numpy().flatten()
    else:
        print('??')
        return
    val_ground_truth = y_val.type(torch.FloatTensor).data.numpy().flatten()    
    for i in range(0, len(predict), 288):
        plt.figure(figsize=(8, 6))
        p1, = plt.plot(x, predict[i:i+288], 'r', label='predict')
        p2, = plt.plot(x, ground_truth[i:i+288], 'b', label='target')
        plt.legend(handles=[p1, p2])
        title = df.index[i].split()[0]
        plt.title(title)
        if mins=='30':
            plt.savefig('figs/30/'+title+'.jpg')
        elif mins=='60':
            plt.savefig('figs/60/'+title+'.jpg')
    for i in range(0, len(val_predict), 288):
        plt.figure(figsize=(8, 6))
        p1, = plt.plot(x, val_predict[i:i+288], 'r', label='predict')
        p2, = plt.plot(x, val_ground_truth[i:i+288], 'b', label='target')
        plt.legend(handles=[p1, p2])
        title = df.index[i+len(predict)].split()[0]
        plt.title(title)
        if mins=='30':
            plt.savefig('figs/30/val_'+title+'.jpg')
        elif mins=='60':
            plt.savefig('figs/60/val_'+title+'.jpg')        

In [54]:
save_figs(predict30, ground_truth30, x_val30, y_val30, mins='30')



In [86]:
save_figs(predict60, ground_truth60, x_val60, y_val60, mins='60')



In [203]:
import numpy as np

In [227]:
mape_loss(predict30, ground_truth30)

0.047487866670522696

In [228]:
mape_loss(predict60, ground_truth60)

0.042127369301816098

In [83]:
np.sum((np.abs(ground_truth30.flatten() - predict30.flatten()) / ground_truth30.flatten())) / len(predict30.flatten())

0.045446963417166635

In [229]:
mape_loss(y_val30.type(torch.FloatTensor).data.numpy(), model_30(x_val30)[0].type(torch.FloatTensor).data.numpy())

0.12565896775987415

In [78]:
import numpy as np

In [131]:
np.sum(np.abs((y_val30.type(torch.FloatTensor).data.numpy().flatten() - model_30(x_val30)[0].type(torch.FloatTensor).data.numpy().flatten())) / y_val30.type(torch.FloatTensor).data.numpy().flatten()) / len(y_val30)

0.049658114821822558