In [18]:
# Load packages
import numpy as np
import time
from io import StringIO, BytesIO
from zipfile import ZipFile
import urllib.request
from math import sqrt
import os
# For data processing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# For deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x228c733b130>

In [9]:
def url2pd(link):
    with ZipFile(link) as my_zip_file:
        for contained_file in my_zip_file.namelist():
            fzip=my_zip_file.open(contained_file)
            data=fzip.read()
    s=str(data,'utf-8')
    data = StringIO(s) 
    print('Done loading a dataset!')
    return pd.read_csv(data)

In [5]:
filenames = os.listdir('./Data')

In [13]:
url_1 = './Data/' + filenames[0]
df = url2pd(url_1)
df.info()
df.head(3)

Done loading a dataset!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 18 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Vehicle_ID    1048575 non-null  int64  
 1   Frame_ID      1048575 non-null  int64  
 2   Total_Frames  1048575 non-null  int64  
 3   Global_Time   1048575 non-null  int64  
 4   Local_X       1048575 non-null  float64
 5   Local_Y       1048575 non-null  float64
 6   Global_X      1048575 non-null  float64
 7   Global_Y      1048575 non-null  float64
 8   v_Length      1048575 non-null  float64
 9   v_Width       1048575 non-null  float64
 10  v_Class       1048575 non-null  int64  
 11  v_Vel         1048575 non-null  float64
 12  v_Acc         1048575 non-null  float64
 13  Lane_ID       1048575 non-null  int64  
 14  Preceeding    1048575 non-null  int64  
 15  Following     1048575 non-null  int64  
 16  Space_Hdwy    1048575 non-null  float64
 17  Tim

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_Length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,Preceeding,Following,Space_Hdwy,Time_Hdwy
0,2,13,437,1118846980200,16.467196,35.380427,6451137.641,1873344.962,14.5,4.9,2,40.0,0.0,2,0,0,0.0,0.0
1,2,14,437,1118846980300,16.446594,39.381608,6451140.329,1873342.0,14.5,4.9,2,40.012349,0.123485,2,0,0,0.0,0.0
2,2,15,437,1118846980400,16.425991,43.381541,6451143.018,1873339.038,14.5,4.9,2,39.999855,-0.124939,2,0,0,0.0,0.0


In [14]:
#  keep only columns that are useful for now
kept_cols = ['Vehicle_ID', 'Frame_ID', 'Total_Frames', 'Local_X','Local_Y',
             'v_Length', 'v_Width', 'v_Vel', 'v_Acc', 'Lane_ID']
df = df[kept_cols]

# %%
# Filter time step
print(df.shape)
df = df.iloc[::2,:].copy()
print('After filtering:', df.shape)

vehicle_ids = df.Vehicle_ID.unique()
# Set constant values
HISTORY_LENGTH = 3
FUTURE_LENGTH = 5
n_steps = int(HISTORY_LENGTH/0.2)
n_future = int(FUTURE_LENGTH/0.2)
n_features = len(df)
series_feature_names = ['Local_X','v_Vel']
target_names = ['Local_X','v_Vel']
n_labels = len(target_names)

(1048575, 10)
After filtering: (524288, 10)


In [15]:
print('the number of vehicles is {}'.format(len(vehicle_ids)))

the number of vehicles is 1993


In [None]:
class LSTM(nn.Module):

    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h_0 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size))
        
        c_0 = Variable(torch.zeros(
            self.num_layers, x.size(0), self.hidden_size))
        
        # Propagate input through LSTM
        ula, (h_out, _) = self.lstm(x, (h_0, c_0))
        
        h_out = h_out.view(-1, self.hidden_size)
        
        out = self.fc(h_out)
        
        return out

In [None]:
num_epochs = 2000
learning_rate = 0.01

input_size = 1
hidden_size = 2
num_layers = 1

num_classes = 1

lstm = LSTM(num_classes, input_size, hidden_size, num_layers)

criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(lstm.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    outputs = lstm(trainX)
    optimizer.zero_grad()
    
    # obtain the loss function
    loss = criterion(outputs, trainY)
    
    loss.backward()
    
    optimizer.step()
    if epoch % 100 == 0:
      print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

In [None]:
lstm.eval()
train_predict = lstm(dataX)

data_predict = train_predict.data.numpy()
dataY_plot = dataY.data.numpy()

data_predict = sc.inverse_transform(data_predict)
dataY_plot = sc.inverse_transform(dataY_plot)

plt.axvline(x=train_size, c='r', linestyle='--')

plt.plot(dataY_plot)
plt.plot(data_predict)
plt.suptitle('Time-Series Prediction')
plt.show()

In [16]:
def series2seq(data, n_in, n_out,labels,series_features, show_result=False):
    
    dat = data.copy()
    cols, names = list(), list()
    
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(dat[series_features].shift(i))
        names += ['{}(t-{})'.format(j, i) for j in series_features]
    
    # forecast sequence (t, t+1, ... t+n) for selected labels
    for i in range(0, n_out):
        cols.append(dat[labels].shift(-i))
        names += ['{}(t+{})'.format(j, i) for j in labels]
        
    # put it all together
    agg = pd.concat(cols, axis=1).dropna()
    agg.columns = names
    # concatenate with constant features

    X = agg.iloc[:,:len(series_features)*n_in].copy()
    X = pd.concat([X,dat.drop(columns=series_features)], axis=1).dropna()
    y = agg.iloc[:,len(series_features)*n_in:].copy()
    
    # Show some information on the data sets X and y
    if show_result:
      X.info()
      print(X.head(), X.shape)
      y.info()
      print(y.head(), y.shape)
    return X, y
# Test the function
series2seq(df[df.Vehicle_ID==vehicle_ids[0]], 
                   n_in=2, n_out=1,labels = target_names,
                   series_features=series_feature_names, 
                   show_result=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217 entries, 4 to 436
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Local_X(t-2)  217 non-null    float64
 1   v_Vel(t-2)    217 non-null    float64
 2   Local_X(t-1)  217 non-null    float64
 3   v_Vel(t-1)    217 non-null    float64
 4   Vehicle_ID    217 non-null    int64  
 5   Frame_ID      217 non-null    int64  
 6   Total_Frames  217 non-null    int64  
 7   Local_Y       217 non-null    float64
 8   v_Length      217 non-null    float64
 9   v_Width       217 non-null    float64
 10  v_Acc         217 non-null    float64
 11  Lane_ID       217 non-null    int64  
dtypes: float64(8), int64(4)
memory usage: 22.0 KB
    Local_X(t-2)  v_Vel(t-2)  Local_X(t-1)  v_Vel(t-1)  Vehicle_ID  Frame_ID  \
4      16.467196   40.000000     16.425991   39.999855           2        17   
6      16.425991   39.999855     16.384804   39.991544           2        19   
8 

(     Local_X(t-2)  v_Vel(t-2)  Local_X(t-1)  v_Vel(t-1)  Vehicle_ID  Frame_ID  \
 4       16.467196   40.000000     16.425991   39.999855           2        17   
 6       16.425991   39.999855     16.384804   39.991544           2        19   
 8       16.384804   39.991544     16.342611   40.063334           2        21   
 10      16.342611   40.063334     16.304035   40.121870           2        23   
 12      16.304035   40.121870     16.260427   39.635113           2        25   
 ..            ...         ...           ...         ...         ...       ...   
 428      9.106082   69.778018      8.975105   69.861025           2       441   
 430      8.975105   69.861025      8.922096   69.843788           2       443   
 432      8.922096   69.843788      8.970121   69.995391           2       445   
 434      8.970121   69.995391      8.950828   70.047545           2       447   
 436      8.950828   70.047545      8.785970   70.032039           2       449   
 
      Total_Fr

In [17]:
#%% function that prepare sequence for multi object

def treatment_cars(data, n_in, n_out,labels,series_features, show_result=False):
  veh_ids = data.Vehicle_ID.unique()
  dat_X, dat_y = pd.DataFrame(),pd.DataFrame()

  for id in veh_ids:
    dat = data[data.Vehicle_ID==id].copy()
    X, y = series2seq(dat.drop(columns=['Frame_ID']), n_in=n_in, n_out=n_out,labels = labels,series_features=series_features)
    dat_X = pd.concat([dat_X,X],ignore_index=True)
    dat_y = pd.concat([dat_y,y],ignore_index=True)
  if show_result:
    dat_X.info()
    print(dat_X.head(), dat_X.shape)
    dat_y.info()
    print(dat_y.head(), dat_y.shape)
  return dat_X ,dat_y

In [20]:
cars = 5
np.random.seed(23)
veh_list = np.random.choice(vehicle_ids,cars)
sub_df = df[df.Vehicle_ID.isin(veh_list)].copy()



"""## Data preparation"""

# turn the data set into sequences
X, y = treatment_cars(sub_df, 
                      n_in=n_steps, n_out=n_future,
                      labels = target_names,
                      series_features=series_feature_names)

# Split the data set
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                test_size=0.3, random_state=42)


In [23]:
X_train

tensor([[41.1762, 36.7782, 41.0829,  ...,  5.0000, -0.9403,  4.0000],
        [41.6834, 39.8906, 41.6855,  ...,  5.0000,  5.2758,  4.0000],
        [39.4038, 31.9851, 39.5784,  ...,  5.0000,  3.5552,  4.0000],
        ...,
        [30.7057, 49.9952, 30.7387,  ...,  7.4000, -0.4926,  3.0000],
        [30.0492, 30.0037, 30.0790,  ...,  7.4000,  8.9198,  3.0000],
        [39.1515, 28.3825, 39.2220,  ...,  7.4000,  0.9974,  4.0000]])

In [22]:
X_train = torch.Tensor(X_train.values)
X_train

TypeError: new(): data must be a sequence (got builtin_function_or_method)

In [None]:
def lstm_training(cars=5,standard=False, model_name=LSTM_names[0]):
    np.random.seed(23)
    veh_list = np.random.choice(vehicle_ids,cars)
    sub_df = df[df.Vehicle_ID.isin(veh_list)].copy()
    


    """## Data preparation"""
    
    # turn the data set into sequences
    X, y = treatment_cars(sub_df, 
                          n_in=n_steps, n_out=n_future,
                          labels = target_names,
                          series_features=series_feature_names, show_result=True)

    # Split the data set
    X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=0.3, random_state=42)
    #print(X_train.shape,X_test.shape, y_train.shape, y_test.shape)
    #X_train.describe()

    ### Standardize the data
    if standard:        
        train_mean = X_train.mean()
        train_std = X_train.std()
        
        X_train = (X_train - train_mean) / train_std
        X_test = (X_test - train_mean) / train_std


    """### Reshape data sets to match the selected model"""

    X_train = X_train.values
    X_test = X_test.values
    # reshape into [# samples, # timesteps,# features]
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1],1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1],1))
    
    
    """## Prediction model"""
  
    tf.random.set_seed(24)
    # create model
    #mirrored_strategy = tf.distribute.MirroredStrategy()

    #with mirrored_strategy.scope():
    model = create_model(LSTM_names[0],X_train)
    # Interrupt training if `val_loss` stops improving for over 10 epochs
    stop_learn= tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss')
    #print(model.summary())
    
    # fit model
    start = time.time()
    model.fit(X_train,y_train, epochs=5, 
                        callbacks=[stop_learn],
                        validation_data=(X_test,y_test), verbose=1)
    end = time.time()

    # Evaluation

    yhat = model.predict(X_test, verbose=1)
    rms = sqrt(mean_squared_error(y_test, yhat))
    #print(yhat[:5])
    
    return [rms, end-start]
result = lstm_training()
print("The RMSE is {0:.3f} and the model was trained within {1:.3f} sec".format(result[0],result[1]))