In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics.classification import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.regression import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from dbn.tensorflow import SupervisedDBNRegression

In [2]:
# Dataset selection: these settings decide which MMDA CSV file is read.
ROAD = "Taft Ave."
YEAR = "2015"
EXT = ".csv"
WINDOWSIZE = 5
TRANSFORMED = True

# Both variants share the same base name; the transformed one only adds a suffix.
FILENAME = "eng_win" + str(WINDOWSIZE) + "_mmda_" + ROAD + "_" + YEAR
if TRANSFORMED:
    FILENAME += "_transformed"


In [3]:
# Load the CSV selected by the configuration and replace missing values with 0.
dataset_path = "data/mmda/" + FILENAME + EXT
original_dataset = pd.read_csv(dataset_path, skipinitialspace=True).fillna(0)
original_dataset.head()

Unnamed: 0,dt,lineName,stationName,statusN,statusS,statusN_Emean,statusN_Emin,statusN_Emax,statusN_Esum,statusN_Rmean (window = 5),statusN_Rmin (window = 5),statusN_Rmax (window = 5),statusS_Emean,statusS_Emin,statusS_Emax,statusS_Esum,statusS_Rmean (window = 5),statusS_Rmin (window = 5),statusS_Rmax (window = 5)
0,01/01/2015 0:00,EDSA,Taft Ave.,0.231394,0.408168,0.231394,0.231394,0.231394,0.231394,0.0,0.0,0.0,0.408168,0.408168,0.408168,0.408168,0.0,0.0,0.0
1,01/01/2015 0:15,EDSA,Taft Ave.,0.237548,0.407201,0.234471,0.231394,0.237548,0.468942,0.0,0.0,0.0,0.407685,0.407201,0.408168,0.815369,0.0,0.0,0.0
2,01/01/2015 0:30,EDSA,Taft Ave.,0.243492,0.406253,0.237478,0.231394,0.243492,0.712434,0.0,0.0,0.0,0.407207,0.406253,0.408168,1.221622,0.0,0.0,0.0
3,01/01/2015 0:45,EDSA,Taft Ave.,0.249231,0.405322,0.240416,0.231394,0.249231,0.961665,0.0,0.0,0.0,0.406736,0.405322,0.408168,1.626944,0.0,0.0,0.0
4,01/01/2015 1:00,EDSA,Taft Ave.,0.254765,0.404408,0.243286,0.231394,0.254765,1.21643,0.046279,0.0,0.231394,0.40627,0.404408,0.408168,2.031352,0.081634,0.0,0.408168


##### Preparing Traffic Dataset

In [4]:
# Reload the file so this cell can be re-run on its own and always starts from
# the complete set of columns.
original_dataset = pd.read_csv("data/mmda/" + FILENAME + EXT, skipinitialspace=True)
original_dataset = original_dataset.fillna(0)

# Column positions, assuming the CSV columns are ordered as in the preview of
# the raw file shown earlier in this notebook:
#   0-2   : dt, lineName, stationName
#   3-4   : statusN, statusS
#   5-8   : statusN_Emean, statusN_Emin, statusN_Emax, statusN_Esum
#   9-11  : statusN_Rmean, statusN_Rmin, statusN_Rmax (window = WINDOWSIZE)
#   12-15 : statusS_Emean, statusS_Emin, statusS_Emax, statusS_Esum
#   16-18 : statusS_Rmean, statusS_Rmin, statusS_Rmax (window = WINDOWSIZE)
# Remove the date time and the other unused columns.
cols_to_remove = [0, 1, 2]

# window 1
statusN = list(range(5, 9))
statusS = list(range(12, 16))

cols_to_remove += statusN + statusS

# window >= 2
statusN2 = list(range(9, 12))
statusS2 = list(range(16, 19))

# Optional switches, kept disabled:
#cols_to_remove += statusN2 + statusS2
#cols_to_remove += [3, 4] #statusN , statusS

# Index the columns with the flat list of positions: wrapping it in another
# list (columns[[cols_to_remove]]) builds a 2-D index, which current
# NumPy/pandas versions reject.
# drop() returns a new frame, so original_dataset keeps all of its columns
# instead of being modified through an alias.
traffic_dataset = original_dataset.drop(columns=original_dataset.columns[cols_to_remove])
traffic_dataset.head()

Unnamed: 0,statusN,statusS,statusN_Rmean (window = 5),statusN_Rmin (window = 5),statusN_Rmax (window = 5),statusS_Rmean (window = 5),statusS_Rmin (window = 5),statusS_Rmax (window = 5)
0,0.231394,0.408168,0.0,0.0,0.0,0.0,0.0,0.0
1,0.237548,0.407201,0.0,0.0,0.0,0.0,0.0,0.0
2,0.243492,0.406253,0.0,0.0,0.0,0.0,0.0,0.0
3,0.249231,0.405322,0.0,0.0,0.0,0.0,0.0,0.0
4,0.254765,0.404408,0.046279,0.0,0.231394,0.081634,0.0,0.408168


In [5]:
# Number of rows by which the target is moved ahead of the features
# (used as Y.shift(-shift) and as the [:-shift] trim in the following cells).
shift = 1

In [6]:
# Target variable: statusS taken `shift` rows ahead of the current row.
# Shifting leaves the last `shift` rows without a value (fillna turns them
# into 0); those rows are trimmed off at the end.
Y = (
    traffic_dataset.statusS
    .shift(-shift)
    .fillna(0)
    .round(5)
    .iloc[:-shift]
)

In [7]:
# Features: all rows except the last `shift`, matching the length of Y.
X = traffic_dataset[:-shift]

# Split without shuffling so the original row order is kept, then convert
# every part to a NumPy array.
X_train, X_test, Y_train, Y_test = (
    np.array(part)
    for part in train_test_split(X, Y, test_size=0.33, shuffle=False)
)

# Fit the min-max scaler on the training features only and scale them.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)

In [8]:
# Training: hyper-parameters are collected in one place, then the DBN
# regressor is built from them and fitted on the scaled training data.
dbn_params = dict(
    hidden_layers_structure=[5, 10, 15],
    learning_rate_rbm=0.01,
    learning_rate=0.01,
    n_epochs_rbm=3,
    n_iter_backprop=5,
    batch_size=16,
    activation_function='relu',
)
regressor = SupervisedDBNRegression(**dbn_params)
regressor.fit(X_train, Y_train)


[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 0.192876
>> Epoch 2 finished 	RBM Reconstruction error 0.161004
>> Epoch 3 finished 	RBM Reconstruction error 0.105368
>> Epoch 1 finished 	RBM Reconstruction error 0.095035
>> Epoch 2 finished 	RBM Reconstruction error 0.079706
>> Epoch 3 finished 	RBM Reconstruction error 0.070547
>> Epoch 1 finished 	RBM Reconstruction error 0.041428
>> Epoch 2 finished 	RBM Reconstruction error 0.040087
>> Epoch 3 finished 	RBM Reconstruction error 0.036001
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.031750
>> Epoch 1 finished 	ANN training loss 0.029570
>> Epoch 2 finished 	ANN training loss 0.026038
>> Epoch 3 finished 	ANN training loss 0.020449
>> Epoch 4 finished 	ANN training loss 0.013441
[END] Fine tuning step


SupervisedDBNRegression(batch_size=16, dropout_p=0, l2_regularization=1.0,
            learning_rate=0.01, n_iter_backprop=5, verbose=True)

In [9]:
# Test
# Scale the held-out features with the already-fitted scaler. The scaled array
# gets its own name so that re-running this cell never scales X_test twice.
X_test_scaled = min_max_scaler.transform(X_test)
Y_pred = regressor.predict(X_test_scaled)

# Report goodness of fit on the held-out data.
r2 = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
print('Done.\nR-squared: %f\nMSE: %f' % (r2, mse))


Done.
R-squared: 0.621650
MSE: 0.013560


In [10]:
print(len(Y_pred))

# Take the first element of each prediction row so the predictions line up
# with Y_test as a flat column.
predicted = [row[0] for row in Y_pred]

df = pd.DataFrame(data={'Predicted': predicted, 'Actual': Y_test})
df.head()

23477


Unnamed: 0,Actual,Predicted
0,0.34164,0.391683
1,0.33944,0.389944
2,0.33737,0.388308
3,0.33543,0.386773
4,0.33362,0.385339


In [11]:
# Show the Predicted/Actual comparison frame as this cell's output.
df

Unnamed: 0,Actual,Predicted
0,0.34164,0.391683
1,0.33944,0.389944
2,0.33737,0.388308
3,0.33543,0.386773
4,0.33362,0.385339
5,0.33193,0.384003
6,0.33037,0.382763
7,0.32892,0.381618
8,0.32758,0.380565
9,0.32636,0.379604


In [12]:
# Persist the trained regressor to disk.
MODEL_PATH = 'models/pm1-witheng.pkl'
regressor.save(MODEL_PATH)

# NOTE(review): the restore example that used to be commented out here loaded a
# SupervisedDBNClassification from 'model.pkl' and scored it with
# accuracy_score, which does not match this regression notebook.
# Presumably the regression class offers the same `load` entry point — confirm
# before relying on it:
# regressor = SupervisedDBNRegression.load('models/pm1-witheng.pkl')


In [13]:
# Write the Predicted/Actual table to a CSV file named after the input dataset.
output_path = "output/pm1_eng_output_" + FILENAME + EXT
df.to_csv(output_path, encoding='utf-8')