In [1]:
import numpy as np
import pandas as pd

np.random.seed(1337)  # for reproducibility
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
# Metrics are imported from the public `sklearn.metrics` namespace: the private
# `sklearn.metrics.classification` / `sklearn.metrics.regression` modules were
# deprecated and later removed from scikit-learn.
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from dbn.tensorflow import SupervisedDBNRegression

In [2]:
# Dataset selection: which road / year of the merged MMDA + WWO data to load.
ROAD = "Taft Ave."
YEAR = "2015"
EXT = ".csv"

# File stem is "merged_mmda_wwo_<ROAD>_<YEAR>"; the extension is appended on use.
FILENAME = "_".join(["merged_mmda_wwo", ROAD, YEAR])

# Load the CSV, skipping spaces that follow each delimiter.
dataset_path = "data/mmda-wwo/" + FILENAME + EXT
original_dataset = pd.read_csv(dataset_path, skipinitialspace=True)

Unnamed: 0,dt,lineName,stationName,statusN,statusS,tempC,tempF,windspeedMiles,windspeedKmph,winddirDegree,...,heatIndexC,heatIndexF,dewPointC,dewPointF,windChillC,windChillF,windGustMiles,windGustKmph,feelsLikeC,feelsLikeF
0,2015-01-01 00:00:00,EDSA,Taft Ave.,0.5,0.5,0.2,0.222222,0.321429,0.295455,0.152975,...,0.206897,0.226415,0.538462,0.56,0.2,0.222222,0.227273,0.236111,0.206897,0.226415
1,2015-01-01 00:15:00,EDSA,Taft Ave.,0.5,0.5,0.1875,0.215278,0.321429,0.295455,0.152266,...,0.206897,0.221698,0.538462,0.55,0.1875,0.215278,0.232955,0.239583,0.206897,0.221698
2,2015-01-01 00:30:00,EDSA,Taft Ave.,0.5,0.5,0.175,0.208333,0.321429,0.295455,0.151558,...,0.206897,0.216981,0.538462,0.54,0.175,0.208333,0.238636,0.243056,0.206897,0.216981
3,2015-01-01 00:45:00,EDSA,Taft Ave.,0.5,0.5,0.1625,0.201389,0.321429,0.295455,0.15085,...,0.206897,0.212264,0.538462,0.53,0.1625,0.201389,0.244318,0.246528,0.206897,0.212264
4,2015-01-01 01:00:00,EDSA,Taft Ave.,0.5,0.5,0.15,0.194444,0.321429,0.295455,0.150142,...,0.206897,0.207547,0.538462,0.52,0.15,0.194444,0.25,0.25,0.206897,0.207547


##### Preparing Traffic Dataset

In [3]:
# Reload the raw data so this cell gives the same result every time it is run.
original_dataset = pd.read_csv("data/mmda-wwo/" + FILENAME + EXT, skipinitialspace=True)

# Report the first and last weather columns (positions 5 .. end).
n_cols = original_dataset.shape[1]
print("Start : " + str(original_dataset.columns[5]))
print("End : " + str(original_dataset.columns[n_cols - 1]))

# Remove date time. Remove unused columns.
# 0-2 = dt + lineName + stationName || 3-4 = statusN + statusS || 5-end = weather variables
cols_to_remove = [0, 1, 2] + list(range(5, n_cols))

# #Remove Redundant Variables
# #Variables = WindspeedKmph, cond, precipMM, humidity, visibility, pressure, cloudcover, dewPointC, windGustKmph,
# redundant_variables = [5, 6, 7, 9, 16, 17, 19, 20, 21, 22, 24, 25]
# cols_to_remove += redundant_variables

# Build a new frame instead of dropping in place on an alias, so that
# `original_dataset` keeps all of its columns. The positions are passed as a
# flat list (a nested list would produce a 2-D index).
traffic_dataset = original_dataset.drop(original_dataset.columns[cols_to_remove], axis=1)
traffic_dataset.head()

Start : tempC
End : feelsLikeF


Unnamed: 0,statusN,statusS
0,0.5,0.5
1,0.5,0.5
2,0.5,0.5
3,0.5,0.5
4,0.5,0.5


In [4]:
# Target variable: statusS, rounded to 5 decimal places.
# (Original author's note, translated: only statusS and dt should be needed here.)
Y = traffic_dataset["statusS"].round(5)

# Feature matrix: every column of traffic_dataset.
# NOTE(review): traffic_dataset still contains statusS, the same column used as
# the target Y above, so the model is given the value it has to predict.
# Confirm whether this is intended.
X = traffic_dataset
#X = dataset
#X.statusS = X.statusS.round(5)

# Chronological split (no shuffling), converted to plain numpy arrays.
# NOTE(review): test_size=0.67 holds out 67% of the rows for testing and trains
# on the remaining 33% - confirm this is not meant to be the other way round.
X_train, X_test, Y_train, Y_test = map(
    np.array,
    train_test_split(X, Y, test_size=0.67, shuffle=False),
)

# Fit the min-max scaler on the training features only, then scale them.
# X_test is left unscaled here.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit(X_train).transform(X_train)

In [5]:
# Hyperparameters for the DBN regressor, gathered in one place.
dbn_params = {
    "hidden_layers_structure": [5, 10, 15],
    "learning_rate_rbm": 0.01,
    "learning_rate": 0.01,
    "n_epochs_rbm": 3,
    "n_iter_backprop": 5,
    "batch_size": 16,
    "activation_function": 'relu',
}

# Build the model and train it on the scaled training split.
regressor = SupervisedDBNRegression(**dbn_params)
regressor.fit(X_train, Y_train)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 0.162663
>> Epoch 2 finished 	RBM Reconstruction error 0.123562
>> Epoch 3 finished 	RBM Reconstruction error 0.083989
>> Epoch 1 finished 	RBM Reconstruction error 0.058073
>> Epoch 2 finished 	RBM Reconstruction error 0.054354
>> Epoch 3 finished 	RBM Reconstruction error 0.048007
>> Epoch 1 finished 	RBM Reconstruction error 0.008872
>> Epoch 2 finished 	RBM Reconstruction error 0.008842
>> Epoch 3 finished 	RBM Reconstruction error 0.008840
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.118417
>> Epoch 1 finished 	ANN training loss 0.091207
>> Epoch 2 finished 	ANN training loss 0.006058
>> Epoch 3 finished 	ANN training loss 0.000520
>> Epoch 4 finished 	ANN training loss 0.000120
[END] Fine tuning step


SupervisedDBNRegression(batch_size=16, dropout_p=0, l2_regularization=1.0,
            learning_rate=0.01, n_iter_backprop=5, verbose=True)

In [6]:
# Test
# Scale the held-out features with the scaler fitted on the training split.
# The result is stored under a new name so X_test itself is never overwritten:
# re-running this cell would otherwise apply the scaling a second time.
X_test_scaled = min_max_scaler.transform(X_test)
Y_pred = regressor.predict(X_test_scaled)

# Report goodness of fit on the held-out rows.
r_squared = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
print('Done.\nR-squared: %f\nMSE: %f' % (r_squared, mse))

Done.
R-squared: 0.998920
MSE: 0.000127


In [7]:
# Number of predictions produced for the test split.
print(len(Y_pred))

# Take element 0 of each prediction row to get one value per test sample.
predicted_values = [row[0] for row in Y_pred]

# Side-by-side table of ground truth and prediction. The column order is given
# explicitly so it does not depend on how the installed pandas orders dict keys.
df = pd.DataFrame(
    data={'Actual': Y_test, 'Predicted': predicted_values},
    columns=['Actual', 'Predicted'],
)
df.head()

23477


Unnamed: 0,Actual,Predicted
0,1.0,0.99191
1,1.0,0.99191
2,1.0,0.99191
3,1.0,0.99191
4,1.0,0.99191


In [8]:
# Show only the first rows rather than dumping the whole results frame.
df.head(10)

Unnamed: 0,Actual,Predicted
0,1.0,0.991910
1,1.0,0.991910
2,1.0,0.991910
3,1.0,0.991910
4,1.0,0.991910
5,0.5,0.493651
6,0.5,0.493651
7,0.5,0.493651
8,0.5,0.493651
9,0.5,0.493651


In [9]:
# Save the trained regressor to disk.
# NOTE(review): presumably the 'models/' directory must already exist - confirm.
regressor.save('models/pm1-noeng.pkl')

# NOTE(review): the commented-out snippet below looks like a leftover from a
# classification example. It refers to SupervisedDBNClassification, 'model.pkl'
# and accuracy_score, none of which match the regressor saved above. Adapt it
# (class, path and metric) before re-enabling.
# # Restore
# classifier = SupervisedDBNClassification.load('model.pkl')

# # Test
# Y_pred = classifier.predict(X_test)
# print('Done.\nAccuracy: %f' % accuracy_score(Y_test, Y_pred))


In [10]:
# Write the Actual/Predicted table to CSV.
# FILENAME and EXT are the constants defined with ROAD / YEAR in the
# data-loading cell. They are reused rather than hard-coded again, so the
# output name always matches the dataset that was actually loaded.
output_path = "output/pm1_noeng_output_" + FILENAME + EXT
df.to_csv(output_path, encoding='utf-8')