In [1]:
import numpy as np
import pandas as pd

np.random.seed(1337)  # for reproducibility
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics.classification import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.regression import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from dbn.tensorflow import SupervisedDBNRegression

In [2]:
# Read the dataset
ROAD = "Taft Ave."
YEAR = "2015"
EXT = ".csv"

WEATHER_WINDOWSIZE = 2
WEATHER_FILENAME = "eng_win" + str(WEATHER_WINDOWSIZE) + "_wwo_" + ROAD + "_" + YEAR
weather_raw_data = pd.read_csv("data/wwo/" + WEATHER_FILENAME + EXT, skipinitialspace=True)
weather_raw_data = weather_raw_data.fillna(0)

TRAFFIC_WINDOWSIZE = 1
TRAFFIC_FILENAME = "eng_win" + str(TRAFFIC_WINDOWSIZE) + "_mmda_" + ROAD + "_" + YEAR
traffic_raw_data = pd.read_csv("data/mmda/" + TRAFFIC_FILENAME + EXT, skipinitialspace=True)
traffic_raw_data = traffic_raw_data.fillna(0)

##### Preparing Traffic Dataset

In [3]:
TRAFFIC_WINDOWSIZE = 1
TRAFFIC_FILENAME = "eng_win" + str(TRAFFIC_WINDOWSIZE) + "_mmda_" + ROAD + "_" + YEAR
traffic_raw_data = pd.read_csv("data/mmda/" + TRAFFIC_FILENAME + EXT, skipinitialspace=True)
traffic_raw_data = traffic_raw_data.fillna(0)
#print("Start : " + str(original_dataset.columns[0:original_dataset.shape[1]][5]))
#print("End : " + str(original_dataset.columns[0:original_dataset.shape[1]][traffic_dataset.shape[1]-1]))

traffic_dataset = traffic_raw_data

# Remove date time. Remove unused columms
#0-2 = dt + lineName + stationName || 3-4 - statusN - statusS || 5-14 - original weather variables
#15-46 - engineered traffic
engineered = list(range(5, traffic_dataset.shape[1]))
cols_to_remove = [0, 1, 2] + engineered

traffic_dataset.drop(traffic_raw_data.columns[[cols_to_remove]], axis=1, inplace=True)
traffic_dataset.head()

Unnamed: 0,statusN,statusS
0,0.5,0.5
1,0.5,0.5
2,0.5,0.5
3,0.5,0.5
4,0.5,0.5


##### Preparing Weather Dataset

In [13]:
WEATHER_WINDOWSIZE = 4
WEATHER_FILENAME = "eng_win" + str(WEATHER_WINDOWSIZE) + "_wwo_" + ROAD + "_" + YEAR
weather_raw_data = pd.read_csv("data/wwo/" + WEATHER_FILENAME + EXT, skipinitialspace=True)
weather_raw_data = weather_raw_data.fillna(0)

weather_dataset = weather_raw_data

# Remove date time. Remove unused columms
#0-2 = dt + lineName + stationName || 3-12 - original weather variables
#12-beyond - engineered
# if windowsize = 1, 4 engineered features per original features
# We have 10 weather variables

cols_to_remove = [0, 1, 2] 

#For Window = 1
# temp = list(range(13, 17))
# windspeedkmph = list(range(17, 21))
# cond = list(range(21, 25))
# precip = list(range(25, 29))
# humid = list(range(29, 33))
# visibility = list(range(33, 37))
# pressure = list(range(37, 41))
# cloudcover = list(range(41, 45))
# dewpoint = list(range(45, 49))
# windgustkmph = list(range(49, 53))

#For Window = >2
temp = list(range(13, 17))
temp2 = list(range(17, 20))
windspeedkmph = list(range(20, 24))
windspeedkmph2 = list(range(24, 27))
cond = list(range(27, 31))
cond2 = list(range(31, 34))
precip = list(range(34, 38))
precip2 = list(range(38, 41))
humid = list(range(41, 45))
humid2 = list(range(45, 48))
visibility = list(range(48, 52))
visibility2 = list(range(52, 55))
pressure = list(range(55, 59))
pressure2 = list(range(59, 62))
cloudcover = list(range(62, 66))
cloudcover2 = list(range(66, 69))
dewpoint = list(range(69, 73))
dewpoint2 = list(range(73, 76))
windgustkmph = list(range(76, 80))
windgustkmph2 = list(range(80, 83))

#Window = 1
cols_to_remove += temp + cond + precip + humid + visibility + pressure +  cloudcover + dewpoint + windgustkmph #Window = 1
#cols_to_remove += temp + windspeedkmph + cond + precip + humid + visibility + pressure +  cloudcover + dewpoint + windgustkmph #Window = 1

#Window = 2
cols_to_remove += temp2 + cond2 + precip2 + humid2 + visibility2 + pressure2 +  cloudcover2 + dewpoint2 + windgustkmph2
#cols_to_remove += temp2 + windspeedkmph2 + cond2 + precip2 + humid2 + visibility2 + pressure2 +  cloudcover2 + dewpoint2 + windgustkmph2


# 3 - 12
cols_to_remove += [7, 8, 9, 10] #Original Weather Variables

weather_dataset = weather_dataset.drop(weather_dataset.columns[[cols_to_remove]], axis=1)
weather_dataset.head()

Unnamed: 0,tempC,windspeedKmph,cond,precipMM,dewPointC,windGustKmph,windspeedKmph_Emean,windspeedKmph_Emin,windspeedKmph_Emax,windspeedKmph_Esum,windspeedKmph_Rmean (window = 4),windspeedKmph_Rmin (window = 4),windspeedKmph_Rmax (window = 4)
0,0.2,0.295455,0.631579,0.0,0.538462,0.236111,0.295455,0.295455,0.295455,0.295455,0.0,0.0,0.0
1,0.1875,0.295455,0.631579,0.0,0.538462,0.239583,0.295455,0.295455,0.295455,0.590909,0.0,0.0,0.0
2,0.175,0.295455,0.631579,0.0,0.538462,0.243056,0.295455,0.295455,0.295455,0.886364,0.0,0.0,0.0
3,0.1625,0.295455,0.631579,0.0,0.538462,0.246528,0.295455,0.295455,0.295455,1.181818,0.073864,0.0,0.295455
4,0.15,0.295455,0.631579,0.0,0.538462,0.25,0.295455,0.295455,0.295455,1.477273,0.147727,0.0,0.590909


##### Preparing Flood Dataset

In [5]:
FLOOD_WINDOWSIZE = 1
FLOOD_FILENAME = "interpolated_flood_pagasa" + "_" + YEAR
flood_raw_data = pd.read_csv("data/flood/" + FLOOD_FILENAME + EXT, skipinitialspace=True)
flood_raw_data = flood_raw_data.fillna(0)

flood_dataset = flood_raw_data
flood_dataset = flood_dataset.drop(flood_dataset.columns[0], axis=1)

weather_dataset['flood_height'] = flood_dataset
weather_dataset.head()

Unnamed: 0,tempC,windspeedKmph,cond,precipMM,dewPointC,windGustKmph,tempC_Emean,tempC_Emin,tempC_Emax,tempC_Esum,...,windspeedKmph_Rmin (window = 4),windspeedKmph_Rmax (window = 4),cond_Emean,cond_Emin,cond_Emax,cond_Esum,cond_Rmean (window = 4),cond_Rmin (window = 4),cond_Rmax (window = 4),flood_height
0,0.2,0.295455,0.631579,0.0,0.538462,0.236111,0.2,0.2,0.2,0.2,...,0.0,0.0,0.631579,0.631579,0.631579,0.631579,0.0,0.0,0.0,0.814856
1,0.1875,0.295455,0.631579,0.0,0.538462,0.239583,0.19375,0.1875,0.2,0.3875,...,0.0,0.0,0.631579,0.631579,0.631579,1.263158,0.0,0.0,0.0,0.814856
2,0.175,0.295455,0.631579,0.0,0.538462,0.243056,0.1875,0.175,0.2,0.5625,...,0.0,0.0,0.631579,0.631579,0.631579,1.894737,0.0,0.0,0.0,0.814856
3,0.1625,0.295455,0.631579,0.0,0.538462,0.246528,0.18125,0.1625,0.2,0.725,...,0.0,0.295455,0.631579,0.631579,0.631579,2.526316,0.157895,0.0,0.631579,0.814856
4,0.15,0.295455,0.631579,0.0,0.538462,0.25,0.175,0.15,0.2,0.875,...,0.0,0.590909,0.631579,0.631579,0.631579,3.157895,0.315789,0.0,1.263158,0.814856


In [6]:
# To-be Predicted variable
# To-be Predicted variable 
Y = traffic_dataset.statusS
Y = Y.shift(-1)
Y = Y.fillna(0)
Y = Y.round(5)

# Other data
X = weather_dataset
#X = dataset
#X.statusS = X.statusS.round(5)

# Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.67, shuffle=False)
X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

# Data scaling
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)

In [7]:
# Training
regressor = SupervisedDBNRegression(hidden_layers_structure=[5, 10, 15],
                                    learning_rate_rbm=0.01,
                                    learning_rate=0.01,
                                    n_epochs_rbm=3,
                                    n_iter_backprop=5,
                                    batch_size=16,
                                    activation_function='relu')
regressor.fit(X_train, Y_train)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 2.235369
>> Epoch 2 finished 	RBM Reconstruction error 2.107842
>> Epoch 3 finished 	RBM Reconstruction error 2.331831
>> Epoch 1 finished 	RBM Reconstruction error 1.505494
>> Epoch 2 finished 	RBM Reconstruction error 1.526258
>> Epoch 3 finished 	RBM Reconstruction error 1.506649
>> Epoch 1 finished 	RBM Reconstruction error 0.111602
>> Epoch 2 finished 	RBM Reconstruction error 0.151469
>> Epoch 3 finished 	RBM Reconstruction error 0.207979
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.107191
>> Epoch 1 finished 	ANN training loss 0.109036
>> Epoch 2 finished 	ANN training loss 0.103598
>> Epoch 3 finished 	ANN training loss 0.101483
>> Epoch 4 finished 	ANN training loss 0.099218
[END] Fine tuning step


SupervisedDBNRegression(batch_size=16, dropout_p=0, l2_regularization=1.0,
            learning_rate=0.01, n_iter_backprop=5, verbose=True)

In [8]:
# Test
X_test = min_max_scaler.transform(X_test)
Y_pred = regressor.predict(X_test)
print('Done.\nR-squared: %f\nMSE: %f' % (r2_score(Y_test, Y_pred), mean_squared_error(Y_test, Y_pred)))


Done.
R-squared: -0.738889
MSE: 0.203874


In [9]:
print(len(Y_pred))
temp = []
for i in range(len(Y_pred)):
    temp.append(Y_pred[i][0])
d = {'Predicted': temp, 'Actual': Y_test}

df = pd.DataFrame(data=d)
df.head()

23477


Unnamed: 0,Actual,Predicted
0,1.0,0.928698
1,1.0,0.933587
2,1.0,0.938146
3,1.0,0.942376
4,0.5,0.945617


In [10]:
df

Unnamed: 0,Actual,Predicted
0,1.0,0.928698
1,1.0,0.933587
2,1.0,0.938146
3,1.0,0.942376
4,0.5,0.945617
5,0.5,0.948858
6,0.5,0.952161
7,0.5,0.955464
8,0.5,0.958766
9,0.5,0.961838


In [11]:
# Save the model
regressor.save('models/pm2-newfeatures(temp-humid).pkl')

# # Restore
# classifier = SupervisedDBNClassification.load('model.pkl')

# # Test
# Y_pred = classifier.predict(X_test)
# print('Done.\nAccuracy: %f' % accuracy_score(Y_test, Y_pred))


In [12]:
df.to_csv("output/pm2_eng_output_" + WEATHER_FILENAME + EXT, encoding='utf-8')