In [2]:
import numpy as np
import pandas as pd

np.random.seed(1337)  # for reproducibility
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics.classification import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.regression import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from dbn.tensorflow import SupervisedDBNRegression

## Loading Datasets

In [3]:
# Selectors shared by every dataset filename built in the cells below.
EXT = ".csv"       # file extension appended to each dataset filename
YEAR = "2015"      # year component of the dataset filenames
ROAD = "Taft Ave." # road component of the traffic/weather filenames

In [4]:
TRAFFIC_WINDOWSIZE = 1
TRAFFIC_FILENAME = "eng_win" + str(TRAFFIC_WINDOWSIZE) + "_mmda_" + ROAD + "_" + YEAR

# Load the engineered traffic CSV and replace missing values with 0.
traffic_raw_data = pd.read_csv("data/mmda/" + TRAFFIC_FILENAME + EXT, skipinitialspace=True)
traffic_raw_data = traffic_raw_data.fillna(0)

# Column layout of the raw file (per the original author's notes):
#   0-2  = dt + lineName + stationName
#   3-4  = statusN, statusS
#   5-14 = original weather variables
#   15+  = engineered traffic features
# Everything except statusN/statusS (positions 3-4) is removed.
engineered = list(range(5, traffic_raw_data.shape[1]))
cols_to_remove = [0, 1, 2] + engineered

# Index the column Index with the flat list of positions. The previous
# `columns[[cols_to_remove]]` was a 2-D index, which pandas >= 2.0 rejects.
# The drop returns a new frame, so `traffic_raw_data` keeps all of its columns
# instead of being trimmed in place through an alias.
traffic_dataset = traffic_raw_data.drop(traffic_raw_data.columns[cols_to_remove], axis=1)
traffic_dataset.head()

Unnamed: 0,statusN,statusS
0,0.5,0.5
1,0.5,0.5
2,0.5,0.5
3,0.5,0.5
4,0.5,0.5


#### Weather Dataset

In [11]:
WEATHER_WINDOWSIZE = 2
WEATHER_FILENAME = "eng_win" + str(WEATHER_WINDOWSIZE) + "_wwo_" + ROAD + "_" + YEAR

# Load the engineered weather CSV and replace missing values with 0.
weather_raw_data = pd.read_csv("data/wwo/" + WEATHER_FILENAME + EXT, skipinitialspace=True)
weather_raw_data = weather_raw_data.fillna(0)

weather_dataset = weather_raw_data

# Column layout of the raw file (per the original author's notes):
#   0-2  = dt + lineName + stationName
#   3-12 = the 10 original weather variables
#   13+  = engineered features, grouped per original variable
# If windowsize = 1, there are 4 engineered features per original feature.
#
# NOTE(review): the position ranges below assume 4 engineered features per
# variable, but with WEATHER_WINDOWSIZE = 2 the displayed header shows 7 per
# variable (Emean, Emin, Emax, Esum, Rmean, Rmin, Rmax). They are only used by
# the commented-out line further down -- re-derive them before enabling it.
temp = list(range(13, 17))
windspeedkmph = list(range(17, 21))
cond = list(range(21, 25))
precip = list(range(25, 29))
humid = list(range(29, 33))
visibility = list(range(33, 37))
pressure = list(range(37, 41))
cloudcover = list(range(41, 45))
dewpoint = list(range(45, 49))
windgustkmph = list(range(49, weather_dataset.shape[1]))

# Only the date/line/station identifier columns are removed by default.
cols_to_remove = [0, 1, 2]
#cols_to_remove += temp + windspeedkmph + cond + precip + humid + visibility + pressure + cloudcover + dewpoint + windgustkmph

# Index the column Index with the flat list of positions. The previous
# `columns[[cols_to_remove]]` was a 2-D index, which pandas >= 2.0 rejects.
weather_dataset = weather_dataset.drop(weather_dataset.columns[cols_to_remove], axis=1)
weather_dataset.head()

Unnamed: 0,tempC,windspeedKmph,cond,precipMM,humidity,visibility,pressure,cloudcover,dewPointC,windGustKmph,...,dewPointC_Rmean (window = 2),dewPointC_Rmin (window = 2),dewPointC_Rmax (window = 2),windGustKmph_Emean,windGustKmph_Emin,windGustKmph_Emax,windGustKmph_Esum,windGustKmph_Rmean (window = 2),windGustKmph_Rmin (window = 2),windGustKmph_Rmax (window = 2)
0,0.2,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.33,0.538462,0.236111,...,0.0,0.0,0.0,0.236111,0.236111,0.236111,0.236111,0.0,0.0,0.0
1,0.1875,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.325,0.538462,0.239583,...,0.269231,0.0,0.538462,0.237847,0.236111,0.239583,0.475694,0.118056,0.0,0.236111
2,0.175,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.32,0.538462,0.243056,...,0.538462,0.538462,1.076923,0.239583,0.236111,0.243056,0.71875,0.237847,0.236111,0.475694
3,0.1625,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.315,0.538462,0.246528,...,0.538462,0.538462,1.076923,0.241319,0.236111,0.246528,0.965278,0.241319,0.239583,0.482639
4,0.15,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.31,0.538462,0.25,...,0.538462,0.538462,1.076923,0.243056,0.236111,0.25,1.215278,0.244792,0.243056,0.489583


In [15]:
# Attach both traffic status columns to the weather frame so they are part of
# the correlation computed later. Assignment aligns rows on the index.
weather_dataset["statusS"] = traffic_dataset["statusS"]
weather_dataset["statusN"] = traffic_dataset["statusN"]
weather_dataset.head()

Unnamed: 0,tempC,windspeedKmph,cond,precipMM,humidity,visibility,pressure,cloudcover,dewPointC,windGustKmph,...,dewPointC_Rmax (window = 2),windGustKmph_Emean,windGustKmph_Emin,windGustKmph_Emax,windGustKmph_Esum,windGustKmph_Rmean (window = 2),windGustKmph_Rmin (window = 2),windGustKmph_Rmax (window = 2),statusS,statusN
0,0.2,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.33,0.538462,0.236111,...,0.0,0.236111,0.236111,0.236111,0.236111,0.0,0.0,0.0,0.5,0.5
1,0.1875,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.325,0.538462,0.239583,...,0.538462,0.237847,0.236111,0.239583,0.475694,0.118056,0.0,0.236111,0.5,0.5
2,0.175,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.32,0.538462,0.243056,...,1.076923,0.239583,0.236111,0.243056,0.71875,0.237847,0.236111,0.475694,0.5,0.5
3,0.1625,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.315,0.538462,0.246528,...,1.076923,0.241319,0.236111,0.246528,0.965278,0.241319,0.239583,0.482639,0.5,0.5
4,0.15,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.31,0.538462,0.25,...,1.076923,0.243056,0.236111,0.25,1.215278,0.244792,0.243056,0.489583,0.5,0.5


#### Flood Dataset

In [16]:
FLOOD_WINDOWSIZE = 1
FLOOD_FILENAME = "interpolated_flood_pagasa" + "_" + YEAR

# Read the interpolated flood CSV and replace missing values with 0.
flood_raw_data = pd.read_csv(
    "data/flood/" + FLOOD_FILENAME + EXT,
    skipinitialspace=True,
).fillna(0)

# Discard the first column of the file; what remains is stored as flood_height.
# NOTE(review): presumably exactly one column is left after the drop -- confirm.
flood_dataset = flood_raw_data.drop(flood_raw_data.columns[0], axis="columns")

weather_dataset["flood_height"] = flood_dataset
weather_dataset.head()

Unnamed: 0,tempC,windspeedKmph,cond,precipMM,humidity,visibility,pressure,cloudcover,dewPointC,windGustKmph,...,windGustKmph_Emean,windGustKmph_Emin,windGustKmph_Emax,windGustKmph_Esum,windGustKmph_Rmean (window = 2),windGustKmph_Rmin (window = 2),windGustKmph_Rmax (window = 2),statusS,statusN,flood_height
0,0.2,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.33,0.538462,0.236111,...,0.236111,0.236111,0.236111,0.236111,0.0,0.0,0.0,0.5,0.5,0.814856
1,0.1875,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.325,0.538462,0.239583,...,0.237847,0.236111,0.239583,0.475694,0.118056,0.0,0.236111,0.5,0.5,0.814856
2,0.175,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.32,0.538462,0.243056,...,0.239583,0.236111,0.243056,0.71875,0.237847,0.236111,0.475694,0.5,0.5,0.814856
3,0.1625,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.315,0.538462,0.246528,...,0.241319,0.236111,0.246528,0.965278,0.241319,0.239583,0.482639,0.5,0.5,0.814856
4,0.15,0.295455,0.631579,0.0,0.84058,1.0,0.758621,0.31,0.538462,0.25,...,0.243056,0.236111,0.25,1.215278,0.244792,0.243056,0.489583,0.5,0.5,0.814856


## Correlate

In [17]:
# Pairwise Pearson correlation (the pandas default) across all columns of the
# combined weather / traffic-status / flood frame.
weather_dataset.corr(method="pearson")

Unnamed: 0,tempC,windspeedKmph,cond,precipMM,humidity,visibility,pressure,cloudcover,dewPointC,windGustKmph,...,windGustKmph_Emean,windGustKmph_Emin,windGustKmph_Emax,windGustKmph_Esum,windGustKmph_Rmean (window = 2),windGustKmph_Rmin (window = 2),windGustKmph_Rmax (window = 2),statusS,statusN,flood_height
tempC,1.000000,-0.007605,0.481600,-0.056429,-0.838525,0.079958,-0.238549,-0.160244,0.284565,-0.142279,...,-0.264365,-0.215951,-0.070997,0.075957,-0.160807,-0.162487,-0.160807,0.323177,0.095285,-0.072133
windspeedKmph,-0.007605,1.000000,0.043842,0.394904,-0.146529,-0.269017,-0.138577,0.213816,-0.239925,0.935749,...,0.192126,0.181496,0.061686,-0.069039,0.930446,0.930077,0.930446,0.101277,0.051301,0.209137
cond,0.481600,0.043842,1.000000,0.030830,-0.447379,-0.090585,-0.136204,0.045433,0.109487,-0.014572,...,-0.055253,-0.036696,-0.019667,0.022100,-0.029200,-0.030078,-0.029200,0.143669,0.047232,-0.099054
precipMM,-0.056429,0.394904,0.030830,1.000000,0.252669,-0.806017,-0.484652,0.500202,0.315964,0.596349,...,-0.160470,-0.143241,0.123256,0.150820,0.592356,0.589274,0.592356,0.017662,0.021161,-0.175910
humidity,-0.838525,-0.146529,-0.447379,0.252669,1.000000,-0.275902,-0.105513,0.295515,0.247395,0.068615,...,-0.029596,-0.007321,0.206154,0.182675,0.088696,0.090029,0.088696,-0.304857,-0.085335,-0.179746
visibility,0.079958,-0.269017,-0.090585,-0.806017,-0.275902,1.000000,0.436105,-0.496318,-0.311956,-0.461063,...,0.106659,0.086230,-0.075762,-0.101986,-0.460159,-0.459345,-0.460159,-0.033461,-0.019734,0.184467
pressure,-0.238549,-0.138577,-0.136204,-0.484652,-0.105513,0.436105,1.000000,-0.407769,-0.586977,-0.274911,...,0.401113,0.297579,-0.053581,-0.225587,-0.271498,-0.270848,-0.271498,-0.006166,0.040651,0.296318
cloudcover,-0.160244,0.213816,0.045433,0.500202,0.295515,-0.496318,-0.407769,1.000000,0.231943,0.371129,...,-0.044864,-0.031906,0.066998,0.075477,0.364391,0.363902,0.364391,-0.057772,0.006598,-0.156995
dewPointC,0.284565,-0.239925,0.109487,0.315964,0.247395,-0.311956,-0.586977,0.231943,1.000000,-0.126956,...,-0.543214,-0.417642,0.279680,0.493823,-0.125503,-0.126323,-0.125503,0.018936,0.017479,-0.451438
windGustKmph,-0.142279,0.935749,-0.014572,0.596349,0.068615,-0.461063,-0.274911,0.371129,-0.126956,1.000000,...,0.108525,0.102795,0.108778,-0.002112,0.997019,0.996730,0.997019,0.039567,0.028615,0.126425


In [19]:
# Write the correlation matrix of the combined frame to a per-year CSV under stats/.
correlation_matrix = weather_dataset.corr()
correlation_matrix.to_csv("stats/cross-correlation2_" + YEAR + EXT, encoding="utf-8")