In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import TimeDistributed, Dense, Conv2D, Flatten 
from keras.layers import MaxPooling2D, Dropout, LSTM
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [2]:
df = pd.read_csv('combined-data.txt')

noOfcells = df['NoOfCells'][0]
print(df.shape[0])

4163360


In [3]:
#variable for training
cols = list(df)[2:14]
cols

['VehicleId',
 'VehicleType',
 'SignalGroup',
 'LaneId',
 'ApproachId',
 'LocationOnMap',
 'PhaseStatus',
 'PhaseElapsedTime',
 'Speed',
 'Heading',
 'DistanceToStopBar',
 'CellStatus']

In [4]:
df = df[cols].astype(float)

df.head(5)

Unnamed: 0,VehicleId,VehicleType,SignalGroup,LaneId,ApproachId,LocationOnMap,PhaseStatus,PhaseElapsedTime,Speed,Heading,DistanceToStopBar,CellStatus
0,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,0.0,0.0
1,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,7.62,0.0
2,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,15.24,0.0
3,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,22.86,0.0
4,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,30.48,0.0


In [5]:
def getTrainingSetSize(df, noOfcells):
    trainingDataPointSize = int(df.shape[0]*0.8/noOfcells)
    training_setsize = trainingDataPointSize*noOfcells
    return training_setsize

In [6]:
def getProcessedDataSet(df):
    input_X = []
    input_Y = []
    dataPoint_X = []
    dataPoint_Y = []
    scaler = StandardScaler()


    for index, rows in df.iterrows():
        if index == 0:
            X_row_list = [rows.VehicleId, rows.VehicleType,  rows.PhaseStatus, rows.PhaseElapsedTime, rows.Speed, rows.DistanceToStopBar]
            dataPoint_X.append(X_row_list)
            dataPoint_Y.append(rows.CellStatus)
        
        elif index % noOfcells != 0:
            X_row_list = [rows.VehicleId, rows.VehicleType,  rows.PhaseStatus, rows.PhaseElapsedTime, rows.Speed, rows.DistanceToStopBar]
            
            dataPoint_X.append(X_row_list)
            dataPoint_Y.append(rows.CellStatus)

        elif index % noOfcells == 0 and len(dataPoint_X) > 0:
            scaler = scaler.fit(dataPoint_X)
            dataPoint_X = scaler.transform(dataPoint_X)
            input_X.append(dataPoint_X)
            input_Y.append(dataPoint_Y)
            
            dataPoint_X = []
            dataPoint_Y = []
            X_row_list = [rows.VehicleId, rows.VehicleType,  rows.PhaseStatus, rows.PhaseElapsedTime, rows.Speed, rows.DistanceToStopBar]
            
            dataPoint_X.append(X_row_list)
            dataPoint_Y.append(rows.CellStatus)
        
        if index == len(df) - 1:
            scaler = scaler.fit(dataPoint_X)
            dataPoint_X = scaler.transform(dataPoint_X)
            input_X.append(dataPoint_X)
            input_Y.append(dataPoint_Y)  


#     input_X = np.array(input_X)
#     input_Y = np.array(input_Y)
    
    
    return input_X, input_Y

In [7]:
training_setsize = getTrainingSetSize(df, noOfcells)

df_forTraining = df.iloc[:training_setsize, 0:]

df_forTesting = df.iloc[training_setsize:, 0:]
df_forTesting.reset_index(inplace=True)

In [8]:
df_forTesting.head()

Unnamed: 0,index,VehicleId,VehicleType,SignalGroup,LaneId,ApproachId,LocationOnMap,PhaseStatus,PhaseElapsedTime,Speed,Heading,DistanceToStopBar,CellStatus
0,3330680,56.0,4.0,2.0,14.0,5.0,2.0,1.0,1.55,0.28,89.14,7.0,1.0
1,3330681,57.0,4.0,2.0,14.0,5.0,2.0,1.0,1.55,0.0,89.14,13.0,1.0
2,3330682,58.0,4.0,2.0,14.0,5.0,2.0,1.0,1.55,0.0,89.15,20.0,1.0
3,3330683,59.0,4.0,2.0,14.0,5.0,2.0,1.0,1.55,0.0,89.2,26.0,1.0
4,3330684,60.0,4.0,2.0,14.0,5.0,2.0,1.0,1.55,0.0,89.7,33.0,1.0


In [9]:
inputX,inputY = getProcessedDataSet(df_forTraining)
inputX = np.array(inputX)
inputY = np.array(inputY)
print(inputX.shape)
print(inputY.shape)

(83267, 40, 6)
(83267, 40)


In [10]:
n_future = 1 #Number of days we want to predict into the future
n_past = 100 #Number of past days we want to use to predict the future

In [11]:
n_future = 1 #Number of days we want to predict into the future
n_past = 100 #Number of past days we want to use to predict the future
trainX = []
for i in range (n_past, inputX.shape[0] - n_future + 1):
    trainX.append(inputX[i-n_past:i,])
    
trainX = np.array(trainX)

In [12]:
trainX = trainX.reshape(trainX.shape[0], trainX.shape[1], trainX.shape[2], trainX.shape[3], 1)
print(trainX.shape)

(83167, 100, 40, 6, 1)


In [13]:
# scaler = StandardScaler()
# scaler = scaler.fit(inputY)
# inputY = scaler.transform(inputY)
# print(inputY.shape)
inputY

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 1., 1., ..., 1., 0., 0.],
       [1., 1., 1., ..., 1., 0., 0.],
       [1., 1., 1., ..., 1., 0., 0.]])

In [14]:
trainY=[]
trainY= inputY[n_past-1:inputY.shape[0]-1:]
print(trainY.shape)

(83167, 40)


In [15]:
test_inputX, test_inputY = getProcessedDataSet(df_forTesting)

In [16]:
test_inputX = np.array(test_inputX)
test_inputY = np.array(test_inputY)
print(test_inputX.shape)
print(test_inputY.shape)

(20817, 40, 6)
(20817, 40)


In [17]:
testX = []
for i in range (n_past, test_inputX.shape[0] - n_future + 1):
    
    testX.append(test_inputX[i-n_past:i,])
    
testX = np.array(testX)

In [18]:
testX = testX.reshape(testX.shape[0], testX.shape[1], testX.shape[2], testX.shape[3], 1)
print(testX.shape)

(20717, 100, 40, 6, 1)


In [19]:
scaler = StandardScaler()
scaler = scaler.fit(test_inputY)
test_inputY = scaler.transform(test_inputY)

In [20]:
testY=[]
testY= test_inputY[n_past-1:test_inputY.shape[0]-1:]

In [21]:
# input_shape = (trainX.shape[1], trainX.shape[2], 1)
print(trainX.shape)
print(trainY.shape)

(83167, 100, 40, 6, 1)
(83167, 40)


In [22]:
model = Sequential()#add model layers

model.add(TimeDistributed(Conv2D(40, (3, 3), activation='relu'), input_shape=(100, trainX.shape[2], trainX.shape[3], 1)))
model.add(TimeDistributed(Dropout(0.2)))
model.add(TimeDistributed(Conv2D(20, (3, 3), activation='relu')))
model.add(TimeDistributed(Dropout(0.2)))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(30, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(15))
model.add(Dropout(0.2))
model.add(Dense(40, init='uniform'))
model.compile(optimizer='adam', loss='mse')

# model.add(TimeDistributed(Conv2D(64, (3, 3), activation='relu'), input_shape=(100, trainX.shape[2], trainX.shape[3], 1)))
# model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(1, 1),data_format='channels_last')))

# # model.add(TimeDistributed(Conv2D(128, (4,4), activation='relu')))
# # model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2),data_format='channels_last')))

# # model.add(TimeDistributed(Conv2D(256, (4,4), activation='relu')))
# # model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2), data_format='channels_last')))

# # extract features and dropout 
# model.add(TimeDistributed(Flatten()))
# model.add(Dropout(0.5))
    
# # input to LSTM
# model.add(LSTM(256, return_sequences=False, dropout=0.5))
# model.add(Dense(256, activation='relu'))
# model.add(Dropout(.5))

# # classifier with sigmoid activation for multilabel
# model.add(Dense(2, activation='sigmoid'))

# # compile the model with binary_crossentropy loss for multilabel
# model.compile(optimizer='rmsprop', loss='binary_crossentropy')

# look at the params before training
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_1 (TimeDist (None, 100, 38, 4, 40)    400       
_________________________________________________________________
time_distributed_2 (TimeDist (None, 100, 38, 4, 40)    0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 100, 36, 2, 20)    7220      
_________________________________________________________________
time_distributed_4 (TimeDist (None, 100, 36, 2, 20)    0         
_________________________________________________________________
time_distributed_5 (TimeDist (None, 100, 1440)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 30)           176520    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 30)          

  if sys.path[0] == '':


In [23]:
#train the model
model.fit(trainX, trainY, validation_data=(testX, testY), epochs=20)

Train on 83167 samples, validate on 20717 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x2bc804b5828>

In [24]:
# evaluate the model
score = model.evaluate(testX, testY, verbose=1)



In [25]:
score

0.8362384101032215

In [26]:
# make predictions
testPredict = model.predict(testX)
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform(testY)
print('testY shape == {}.'.format(testY.shape))

testY shape == (20717, 40).


In [27]:
testScore = math.sqrt(mean_squared_error(testY[:,0], testPredict[:,0]))

In [28]:
testScore

0.3888073867675357

In [32]:
sample_df = pd.read_csv('vehicle-status-data-0.20.csv')

In [33]:
#variable for training
cols = list(sample_df)[2:14]
sample_df = sample_df[cols].astype(float)

sample_df.head(5)

Unnamed: 0,VehicleId,VehicleType,SignalGroup,LaneId,ApproachId,LocationOnMap,PhaseStatus,PhaseElapsedTime,Speed,Heading,DistanceToStopBar,CellStatus
0,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,0.0,0.0
1,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,7.62,0.0
2,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,15.24,0.0
3,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,22.86,0.0
4,0.0,0.0,2.0,14.0,5.0,2.0,4.0,0.12,-1.0,-1.0,30.48,0.0


In [51]:
sampleX,sampleY = getProcessedDataSet(sample_df)

In [52]:
sampleX = np.array(sampleX)
sampleY = np.array(sampleY)
print(sampleX.shape)
print(sampleY.shape)

(9837, 40, 6)
(9837, 40)


In [53]:
# scaler = StandardScaler()
# scaler = scaler.fit(sampleY)
# # sampleY = scaler.transform(sampleY)
# print(sampleY.shape)

In [54]:
predictionSampleX=[]
for i in range (n_past, sampleX.shape[0] - n_future + 1):
    predictionSampleX.append(sampleX[i-n_past:i,])    

predictionSampleX = np.array(predictionSampleX)

In [55]:
predictionSampleY=[]
predictionSampleY= sampleY[n_past-1:sampleY.shape[0]-1:]
print('predictionSampleX shape == {}.'.format(predictionSampleX.shape))
print('predictionSampleY shape == {}.'.format(predictionSampleY.shape))

predictionSampleX shape == (9737, 100, 40, 6).
predictionSampleY shape == (9737, 40).


In [56]:
predictionSampleX = predictionSampleX.reshape(predictionSampleX.shape[0], predictionSampleX.shape[1], predictionSampleX.shape[2], predictionSampleX.shape[3], 1)
print('predictionSampleX shape == {}.'.format(predictionSampleX.shape))

predictionSampleX shape == (9737, 100, 40, 6, 1).


In [57]:
predictionSampleY

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

In [58]:
samplePredict = model.predict(predictionSampleX)
print(samplePredict)

[[0.9602231  0.9485737  0.9713126  ... 0.04144568 0.04620755 0.05508861]
 [0.9630804  0.9460144  0.9781457  ... 0.03969185 0.0455633  0.05472898]
 [0.9394448  0.92801714 0.93221676 ... 0.08259769 0.08311874 0.06747714]
 ...
 [0.09635562 0.97541213 0.92279625 ... 0.08764454 0.0640429  0.04833072]
 [0.9617469  0.1345338  0.8912692  ... 0.07143436 0.05434237 0.03554437]
 [0.9491525  0.13955495 0.8950623  ... 0.09249301 0.0807277  0.04605099]]


In [59]:
samplePredict = scaler.inverse_transform(samplePredict)
predictionSampleY = scaler.inverse_transform(predictionSampleY)
print('predictionSampleY shape == {}.'.format(predictionSampleY.shape))

predictionSampleY shape == (9737, 40).


In [60]:
samplePredict

array([[1.0181992 , 0.9523477 , 0.89200014, ..., 0.09211352, 0.09493516,
        0.08343454],
       [1.0196233 , 0.95106924, 0.89536536, ..., 0.09163551, 0.09475816,
        0.08334313],
       [1.0078427 , 0.94207895, 0.872746  , ..., 0.10332968, 0.1050758 ,
        0.08658354],
       ...,
       [0.5876219 , 0.9657545 , 0.8681065 , ..., 0.10470522, 0.09983508,
        0.08171678],
       [1.0189587 , 0.54570425, 0.8525799 , ..., 0.10028707, 0.09717005,
        0.07846665],
       [1.0126812 , 0.54821247, 0.85444796, ..., 0.10602669, 0.1044189 ,
        0.08113729]], dtype=float32)

In [61]:
predictionSampleY

array([[1.03802514, 0.97803706, 0.90612827, ..., 0.08081732, 0.08224052,
        0.06943174],
       [1.03802514, 0.97803706, 0.90612827, ..., 0.08081732, 0.08224052,
        0.06943174],
       [1.03802514, 0.97803706, 0.90612827, ..., 0.08081732, 0.08224052,
        0.06943174],
       ...,
       [0.53959541, 0.97803706, 0.90612827, ..., 0.08081732, 0.08224052,
        0.06943174],
       [1.03802514, 0.47849954, 0.90612827, ..., 0.08081732, 0.08224052,
        0.06943174],
       [1.03802514, 0.47849954, 0.90612827, ..., 0.08081732, 0.08224052,
        0.06943174]])

In [62]:
predictionSampleScore = math.sqrt(mean_squared_error(predictionSampleY[:,0], samplePredict[:,0]))

In [63]:
predictionSampleScore

0.03808314194476248

In [67]:
sample_Df = pd.read_csv('prediction-sample-data.csv')

noOfcells = sample_Df['NoOfCells'][0]
print(sample_Df.shape[0])

4000


In [68]:
cols = list(sample_Df)[2:14]
cols

['VehicleId',
 'VehicleType',
 'SignalGroup',
 'LaneId',
 'ApproachId',
 'LocationOnMap',
 'PhaseStatus',
 'PhaseElapsedTime',
 'Speed',
 'Heading',
 'DistanceToStopBar',
 'CellStatus']

In [69]:
sample_Df = sample_Df[cols].astype(float)

sample_Df.head(5)

Unnamed: 0,VehicleId,VehicleType,SignalGroup,LaneId,ApproachId,LocationOnMap,PhaseStatus,PhaseElapsedTime,Speed,Heading,DistanceToStopBar,CellStatus
0,31.0,4.0,2.0,14.0,5.0,2.0,4.0,18.79,0.67,89.14,1.0,1.0
1,32.0,4.0,2.0,14.0,5.0,2.0,4.0,18.79,4.44,89.14,12.0,1.0
2,0.0,0.0,2.0,14.0,5.0,2.0,4.0,18.79,-1.0,-1.0,0.0,0.0
3,33.0,4.0,2.0,14.0,5.0,2.0,4.0,18.79,7.18,89.18,25.0,1.0
4,0.0,0.0,2.0,14.0,5.0,2.0,4.0,18.79,-1.0,-1.0,0.0,0.0


In [70]:
sample_inputX, sample_inputY = getProcessedDataSet(sample_Df)
sample_inputX = np.array(sample_inputX)
sample_inputY = np.array(sample_inputY)
print(sample_inputX.shape)
print(sample_inputY.shape)

(100, 40, 6)
(100, 40)


In [71]:
n_future = 1 #Number of days we want to predict into the future
n_past = 100 #Number of past days we want to use to predict the future
sampleX = []
for i in range (n_past, sample_inputX.shape[0] - n_future + 1):
    sampleX.append(sample_inputX[i-n_past:i,])
    
sampleX = np.array(sampleX)
print(sampleX)

[]


In [72]:
sampleX = sample_inputX.reshape(1, sample_inputX.shape[0], sample_inputX.shape[1], sample_inputX.shape[2], 1)
print(sampleX.shape)

(1, 100, 40, 6, 1)


In [73]:
samplePrediction = model.predict(sampleX)

In [74]:
samplePrediction

array([[ 9.5997381e-01,  1.0337105e+00,  9.5625204e-01,  9.6662784e-01,
         9.5631260e-01,  9.4086850e-01,  8.6684823e-01,  8.2404900e-01,
         8.3637089e-02,  7.2162665e-02,  6.4147472e-02,  1.2329131e-02,
        -6.8371221e-03,  5.4844521e-02,  4.9463253e-02,  5.7829302e-02,
         9.9703498e-02,  7.6690726e-02,  5.7222210e-02,  9.0006843e-02,
         4.1198902e-02,  3.4353942e-02,  7.9938546e-03,  5.0706111e-02,
        -4.4753775e-03, -7.9254061e-04,  2.0731576e-02,  1.6481154e-02,
         4.8128515e-04, -1.8740557e-02,  3.7844405e-02,  3.1452712e-02,
         1.6696431e-02,  5.6938179e-02,  5.0752200e-02,  8.5204430e-02,
         8.2632586e-02,  1.1954935e-01,  8.3294928e-02,  5.2072488e-02]],
      dtype=float32)

In [75]:
samplePrediction = scaler.inverse_transform(samplePrediction)

In [76]:
samplePrediction

array([[1.0180749 , 0.9948767 , 0.884583  , 0.8419657 , 0.7752515 ,
        0.6657973 , 0.56146663, 0.42514566, 0.15370353, 0.1395399 ,
        0.1084322 , 0.101973  , 0.09052688, 0.09576543, 0.0993242 ,
        0.08958975, 0.11395538, 0.1037638 , 0.08843028, 0.10523334,
        0.0947471 , 0.08128144, 0.08371641, 0.08507782, 0.08020329,
        0.08232739, 0.07743625, 0.08687261, 0.0711819 , 0.07798085,
        0.09220748, 0.07988799, 0.0856808 , 0.09644663, 0.08365647,
        0.10541912, 0.09124674, 0.11340102, 0.10512421, 0.08266788]],
      dtype=float32)

In [81]:
sample_Df.tail(40)

Unnamed: 0,VehicleId,VehicleType,SignalGroup,LaneId,ApproachId,LocationOnMap,PhaseStatus,PhaseElapsedTime,Speed,Heading,DistanceToStopBar,CellStatus
3960,31.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,0.0,89.14,1.0,1.0
3961,33.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,0.0,89.14,14.0,1.0
3962,34.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,0.0,89.15,21.0,1.0
3963,35.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,0.0,89.21,27.0,1.0
3964,36.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,0.0,89.79,33.0,1.0
3965,37.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,0.0,90.13,40.0,1.0
3966,38.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,0.06,90.13,46.0,1.0
3967,39.0,4.0,2.0,14.0,5.0,2.0,4.0,36.16,5.15,90.13,59.0,1.0
3968,0.0,0.0,2.0,14.0,5.0,2.0,4.0,36.16,-1.0,-1.0,0.0,0.0
3969,0.0,0.0,2.0,14.0,5.0,2.0,4.0,36.16,-1.0,-1.0,0.0,0.0
