# Deep Learning (DL) - Practicum 2

In [1]:
import os
import json
import argparse
from time import time

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.layers import TimeDistributed, RepeatVector

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Set TensorFlow's C++ log-level environment variable to '3'
# (presumably to silence info/warning messages of the native backend - confirm).
# NOTE(review): tensorflow has already been imported in the previous cell; this variable
# may need to be set before that import to have any effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

## Helpful functions

In [3]:
def load_config_file(nfile, abspath=False):
    """
    Read the configuration from a json file

    :param nfile: name of the configuration file; '.json' is appended unless the
                  name already contains the substring 'json'
    :param abspath: if True nfile is used as given, otherwise './' is prepended so
                    the file is looked up relative to the current directory
    :return: the parsed content of the file
    """
    ext = '.json' if 'json' not in nfile else ''
    pre = '' if abspath else './'

    # The context manager guarantees the file is closed, also when parsing fails
    with open(pre + nfile + ext, 'r') as fp:
        return json.load(fp)

def lagged_vector(data, lag=1, ahead=0):
    """
    Builds sliding windows over a time series

    Position k (k < lag) along axis 1 of the result holds the series delayed k steps,
    the last position holds the value to predict, taken lag + ahead steps after the
    start of the window

    Because arrays start at 0, ahead starts at 0 but actually means one step ahead

    :param data: array with the time steps along its first axis
    :param lag: number of past steps kept in every window
    :param ahead: extra steps skipped between the window and the value to predict
    :return: array with the lag + 1 shifted copies of data stacked along axis 1
    """
    horizon = lag + ahead
    # Every shifted copy is cut so that all of them have len(data) - horizon elements
    shifted = [data[start: start - horizon] for start in range(lag)]
    shifted.append(data[horizon:])

    return np.stack(shifted, axis=1)


def lagged_matrix(data, lag=1, ahead=0):
    """
    Builds sliding windows over a multivariate time series (2-D array, one row per step)

    Position k (k < lag) along axis 1 of the result holds the rows delayed k steps,
    the last position holds the rows to predict, taken lag + ahead steps after the
    start of the window

    :param data: 2-D array with the time steps along its first axis
    :param lag: number of past steps kept in every window
    :param ahead: extra steps skipped between the window and the rows to predict
    :return: array with the lag + 1 shifted copies of data stacked along axis 1
    """
    horizon = lag + ahead

    # Shifted copies of the series, all trimmed to len(data) - horizon rows,
    # followed by the rows that have to be predicted
    windows = [data[offset: offset - horizon, :] for offset in range(lag)]
    windows += [data[horizon:, :]]
    return np.stack(windows, axis=1)


def _generate_dataset_one_var(data, datasize, testsize, lag=1, ahead=1):
    """
    Generates the train and test matrices when a single variable is used for prediction
    Here ahead starts at 1 (lagged_vector counts it from 0, hence the ahead - 1 below)

    :param data: 2-D array (it is passed to StandardScaler and indexed by row and column)
    :param datasize: number of leading rows used for training
    :param testsize: number of rows, right after the training ones, used for test
    :param lag: number of past steps given as input
    :param ahead: prediction horizon, 1 means the step right after the window
    :return: train_x, train_y, test_x, test_y
    """
    # NOTE(review): the scaler is fitted on the whole series, so the test rows
    # contribute to the mean/variance used to scale the training rows
    scaled = StandardScaler().fit_transform(data)

    def split_windows(series):
        # The first lag steps are the input; the last element of the lagged vector
        # is the one we want to predict (column 0 only)
        lagged = lagged_vector(series, lag=lag, ahead=ahead - 1)
        return lagged[:, :lag], lagged[:, -1:, 0]

    train_x, train_y = split_windows(scaled[:datasize, :])

    test_series = scaled[datasize:datasize + testsize, 0].reshape(-1, 1)
    test_x, test_y = split_windows(test_series)

    return train_x, train_y, test_x, test_y

def _generate_dataset_multiple_var(data, datasize, testsize, lag=1, ahead=1, x_vars=None, y_vars=None):
    """
    Generates the train and test matrices when several variables are used for prediction
    Here ahead starts at 1 (lagged_matrix counts it from 0, hence the ahead - 1 below)

    :param data: 2-D array, one row per time step and one column per variable
    :param datasize: number of leading rows used for training
    :param testsize: number of rows, right after the training ones, used for test
    :param lag: number of past steps given as input
    :param ahead: prediction horizon, 1 means the step right after the window
    :param x_vars: column indices used as input variables
    :param y_vars: column indices of the variables to predict (its length is needed)
    :return: train_x, train_y, test_x, test_y
    """
    # NOTE(review): the scaler is fitted on the whole series, test rows included
    scaled = StandardScaler().fit_transform(data)
    print('DATA Dim =', scaled.shape)

    train_rows = scaled[:datasize, :]
    print('Train Dim =', train_rows.shape)

    def split_windows(rows):
        lagged = lagged_matrix(rows, lag=lag, ahead=ahead - 1)
        inputs = lagged[:, :lag, x_vars]
        # Only the last step of every window is predicted; drop its length-1 time axis
        targets = lagged[:, -1:, y_vars]
        return inputs, targets.reshape((inputs.shape[0], len(y_vars)))

    train_x, train_y = split_windows(train_rows)
    test_x, test_y = split_windows(scaled[datasize:datasize + testsize, :])

    return train_x, train_y, test_x, test_y

def _generate_dataset_multiple_var_multistep(data, datasize, testsize, lag=1, ahead=1, n_steps_out=1, x_vars=None, y_vars=None):
    """
    Generates the train and test matrices when several variables are used to predict
    a sequence of n_steps_out steps
    Here ahead starts at 1 (lagged_matrix counts it from 0, hence the ahead - 1 below)

    :param data: 2-D array, one row per time step and one column per variable
    :param datasize: number of leading rows used for training
    :param testsize: number of rows, right after the training ones, used for test
    :param lag: number of past steps given as input
    :param ahead: prediction horizon passed on to lagged_matrix
    :param n_steps_out: number of steps in every predicted sequence
    :param x_vars: column indices used as input variables
    :param y_vars: column indices of the variables to predict (its length is needed)
    :return: train_x, train_y, test_x, test_y
    """
    # NOTE(review): the scaler is fitted on the whole series, test rows included
    scaled = StandardScaler().fit_transform(data)
    print('DATA Dim =', scaled.shape)

    train_rows = scaled[:datasize, :]
    print('Train Dim =', train_rows.shape)

    def split_windows(rows):
        # Every window has lag + n_steps_out + 1 positions along axis 1
        lagged = lagged_matrix(rows, lag=lag + n_steps_out, ahead=ahead - 1)
        inputs = lagged[:, :lag, x_vars]
        # NOTE(review): position `lag` of the window is used neither as input nor as
        # target (targets start at lag + 1) - confirm this gap is intended
        targets = lagged[:, lag + 1:, y_vars]
        return inputs, targets.reshape((inputs.shape[0], n_steps_out, len(y_vars)))

    train_x, train_y = split_windows(train_rows)
    test_x, test_y = split_windows(scaled[datasize:datasize + testsize, :])

    return train_x, train_y, test_x, test_y


def generate_dataset(config, ahead=1, data_path=None):
    """
    Generates the training and test matrices described by a configuration

    :param config: dictionary with the keys 'dataset' (0: one variable, 1: several
                   variables, 2: several variables and several output steps),
                   'datasize', 'testsize', 'vars' (columns kept from the file, None keeps
                   all of them) and 'lag'; 'x_vars' and 'y_vars' are read for dataset
                   types 1 and 2, 'nstepsout' for dataset type 2
    :param ahead: number of steps ahead for prediction
    :param data_path: path of the numpy file with the data (one row per time step)
    :return: train_x, train_y, test_x, test_y
    :raises NameError: if config['dataset'] is not one of the known dataset types
    """
    dataset = config['dataset']
    datasize = config['datasize']
    testsize = config['testsize']
    columns = config['vars']
    lag = config['lag']

    # Reads the numpy array and keeps only the selected columns
    data = np.load(data_path)
    if columns is not None:
        data = data[:, columns]

    # One variable: only the first selected column is used
    if dataset == 0:
        return _generate_dataset_one_var(data[:, 0].reshape(-1, 1), datasize, testsize,
                                         lag=lag, ahead=ahead)

    # Several input variables, one predicted step for every target variable
    if dataset == 1:
        return _generate_dataset_multiple_var(data, datasize, testsize,
                                              lag=lag, ahead=ahead,
                                              x_vars=config['x_vars'], y_vars=config['y_vars'])

    # Several input variables, a sequence of predicted steps for every target variable
    if dataset == 2:
        return _generate_dataset_multiple_var_multistep(data, datasize, testsize,
                                                        lag=lag, ahead=ahead,
                                                        n_steps_out=config['nstepsout'],
                                                        x_vars=config['x_vars'], y_vars=config['y_vars'])

    # Just add more options to generate other kinds of datasets before this point
    raise NameError('ERROR: No such dataset type')


def architecture(neurons, drop, nlayers, activation, activation_r, rnntype, impl=1, multistep=False,
                 input_shape=None, output_shape=None):
    """
    RNN architecture: a stack of recurrent layers followed by a dense output

    :param neurons: number of units of every recurrent layer
    :param drop: recurrent dropout of every recurrent layer
    :param nlayers: number of recurrent layers (1 gives a single layer, any other value
                    gives a first and a last layer with nlayers - 2 layers in between)
    :param activation: activation of the recurrent layers
    :param activation_r: recurrent activation (LSTM and GRU only, it is not passed to SimpleRNN)
    :param rnntype: 'SimpleRNN' or 'LSTM', any other value selects GRU
    :param impl: value passed as `implementation` to the recurrent layers
    :param multistep: if True the output is a sequence (RepeatVector + TimeDistributed Dense),
                      otherwise a single Dense layer
    :param input_shape: shape of one input example, (steps, variables); when None it is taken
                        from the global train_x, as the original notebook code did
    :param output_shape: shape of one target example, (variables,) or (steps, variables) for
                         multistep; when None it is taken from the global train_y
    :return: the model, not compiled
    """
    # Backward compatible fallback to the notebook globals
    if input_shape is None:
        input_shape = (train_x.shape[1], train_x.shape[2])
    if output_shape is None:
        output_shape = train_y.shape[1:]

    # Arguments shared by all the recurrent layers of the stack
    common = dict(implementation=impl, recurrent_dropout=drop, activation=activation)
    if rnntype == 'SimpleRNN':
        RNN = SimpleRNN
    else:
        RNN = LSTM if rnntype == 'LSTM' else GRU
        common['recurrent_activation'] = activation_r

    model = Sequential()
    if nlayers == 1:
        model.add(RNN(neurons, input_shape=input_shape, **common))
    else:
        # All the layers but the last one return the whole sequence to feed the next layer
        model.add(RNN(neurons, input_shape=input_shape, return_sequences=True, **common))
        for _ in range(1, nlayers - 1):
            model.add(RNN(neurons, return_sequences=True, **common))
        model.add(RNN(neurons, **common))

    if multistep:
        # The encoded input is repeated once per output step and decoded step by step
        model.add(RepeatVector(output_shape[0]))
        model.add(TimeDistributed(Dense(output_shape[1])))
    else:
        model.add(Dense(output_shape[0]))

    return model

# Training the architecture and testing

In [11]:
# Runs the same experiment ten times (only the name changes) to collect statistics
for i in range(10):
    j = i+1
    args = {
        'name':'EXPERIMENT_12_'+str(j),
        'verbose':False,
        'best':True,
        'tboard':True,
        'config':{
            "data": {
                "datanames": ["data"],
                "vars": [0,1,2,3,4,5,6,7,8,9,10,11],
                "datasize": 25300,
                "testsize": 6342,
                "dataset": 1,
                "nstepsout":288,
                "lag": 6,
                "ahead": 1,
                "x_vars": [0,1,2,3,4,5,6,7,8,9,10,11],
                "y_vars": [0,1,2,3,4,5]
            },
            "arch": {
                "neurons": 8,
                # Alternative values for "rnn": "SimpleRNN", "GRU"
                "rnn": "LSTM",
                "drop": 0.0,
                "nlayers": 1,
                "activation": "tanh",
                "activation_r": "hard_sigmoid",
                "multistep": False
            },
            "training": {
                "batch": 500,
                "epochs": 50,
                "optimizer": "adam",
                "lrate": 0.001
            }
        }
    }
    # Shortcuts to the three sections of the configuration
    data_cfg = args['config']['data']
    arch_cfg = args['config']['arch']
    train_cfg = args['config']['training']
    batch = train_cfg['batch']

    print('Experiment: ',args['name'])
    verbose = 1 if args['verbose'] else 0
    impl = 2

    ahead = data_cfg['ahead']

    if verbose:
        print('-----------------------------------------------------------------------------')
        print('Steps Ahead = %d ' % ahead)

    # Data path:
    aq_data_path = './data/data2.npy'

    # Split in training and test
    # (train_x and train_y must stay global names: architecture() reads them)
    train_x, train_y, test_x, test_y = generate_dataset(data_cfg, ahead=ahead, data_path=aq_data_path)


    ############################################
    # Model

    model = architecture(neurons=arch_cfg['neurons'],
                         drop=arch_cfg['drop'],
                         nlayers=arch_cfg['nlayers'],
                         activation=arch_cfg['activation'],
                         activation_r=arch_cfg['activation_r'],
                         rnntype=arch_cfg['rnn'],
                         impl=impl,
                         multistep=arch_cfg['multistep'])
    if verbose:
        model.summary()
        print('lag: ', data_cfg['lag'],
              '/Neurons: ', arch_cfg['neurons'],
              '/Layers: ', arch_cfg['nlayers'],
              '/Activations:', arch_cfg['activation'], arch_cfg['activation_r'])
        print('Tr:', train_x.shape, train_y.shape, 'Ts:', test_x.shape, test_y.shape)
        print()


    ############################################
    # Training

    optimizer = train_cfg['optimizer']

    # NOTE(review): 'lrate' is only applied to rmsprop; any other optimizer is passed to
    # compile() by name and therefore uses its default settings
    if optimizer == 'rmsprop':
        # NOTE(review): newer Keras versions name this argument `learning_rate` - confirm
        # against the installed version before changing it
        optimizer = RMSprop(lr=train_cfg.get('lrate', 0.001))

    model.compile(loss='mean_squared_error', optimizer=optimizer)

    cbacks = []

    if args['tboard']:
        tensorboard = TensorBoard(log_dir="logs/{}".format(args['name']))
        cbacks.append(tensorboard)

    if args['best']:
        modfile = './model{}.h5'.format(args['name'])
        mcheck = ModelCheckpoint(filepath=modfile, monitor='val_loss', verbose=0, save_best_only=True,
                                 save_weights_only=False, mode='auto', period=1)
        cbacks.append(mcheck)

    # NOTE(review): the test set is used as validation data, so the checkpoint kept by
    # ModelCheckpoint is selected on the same data the model is evaluated on below
    model.fit(train_x, train_y, batch_size=batch,
              epochs=train_cfg['epochs'],
              validation_data=(test_x, test_y),
              verbose=verbose, callbacks=cbacks)


    ############################################
    # Results

    if args['best']:
        model = load_model(modfile)

    score = model.evaluate(test_x, test_y, batch_size=batch, verbose=0)

    print()
    print('MSE test= ', score)
    test_yp = model.predict(test_x, batch_size=batch, verbose=0)
    if arch_cfg['multistep']:
        # Flatten the (steps, variables) axes so every test window becomes one row
        nrows = data_cfg['testsize'] - data_cfg['nstepsout'] - data_cfg['lag']
        r2test = r2_score(test_y.reshape((nrows, -1)), test_yp.reshape((nrows, -1)))
    else:
        # Persistence baseline: the prediction is the value observed `ahead` rows before
        mse_pers = mean_squared_error(test_y[ahead:], test_y[0:-ahead])
        print('MSE test persistence =', mse_pers)
        # NOTE(review): the recorded runs report R2 values around -1e31 while the MSE is
        # close to the persistence one; possibly a (nearly) constant target column in the
        # test rows - confirm
        r2test = r2_score(test_y, test_yp)
        r2pers = r2_score(test_y[ahead:, :], test_y[0:-ahead, :])
        print('R2 test persistence =', r2pers)
        # The context manager closes the results file even if formatting or writing fails
        with open('result.txt', 'a') as resfile:
            resfile.write('NAME = %s, DATAS= %d, LAG= %d, AHEAD= %d, RNN= %s, NLAY= %d, NNEUR= %d, DROP= %3.2f, ACT= %s, RACT= %s, '
                          'OPT= %s, MSE= %3.5f, R2Test = %3.5f, R2pers = %3.5f\n' %
                          (args['name'],
                           data_cfg['dataset'],
                           data_cfg['lag'],
                           data_cfg['ahead'],
                           arch_cfg['rnn'],
                           arch_cfg['nlayers'],
                           arch_cfg['neurons'],
                           arch_cfg['drop'],
                           arch_cfg['activation'],
                           arch_cfg['activation_r'],
                           train_cfg['optimizer'],
                           mse_pers,
                           r2test, r2pers
                           ))
    print('R2 test= ', r2test)
    print()
    print()

Experiment:  EXPERIMENT_12_1
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0.06587199534084048
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -3.562509483973706e+31


Experiment:  EXPERIMENT_12_2
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0.06547007664939332
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -4.1708051431953305e+31


Experiment:  EXPERIMENT_12_3
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0.06675434149531272
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -2.6401092904615446e+31


Experiment:  EXPERIMENT_12_4
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0.06771351812693122
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -4.7624091893755085e+31


Experiment:  EXPERIMENT_12_5
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0


MSE test=  0.0666892597921583
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -4.274581947331881e+31


Experiment:  EXPERIMENT_12_8
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0.06705848632966559
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -6.031653593965686e+31


Experiment:  EXPERIMENT_12_9
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0.06648191899508638
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -4.239714409961112e+31


Experiment:  EXPERIMENT_12_10
DATA Dim = (31642, 12)
Train Dim = (25300, 12)

MSE test=  0.06602469147853271
MSE test persistence = 0.06522573412805915
R2 test persistence = 0.6967526438927013
R2 test=  -7.229691117279218e+31




# Results experiments 3, 4 and 5 (comparing recurrent units):

In [9]:
from scipy import stats
import numpy as np

In [48]:
# Values recorded for the ten runs of every experiment
ex3 = [0.0702885, 0.0698503, 0.068926648, 0.0689567899, 0.06882927, 0.06800899, 0.0698788, 0.069624, 0.068284, 0.0706218]
ex4 = [0.065104033759,0.065086519879,0.065374186293,0.06606917133,0.06425994090,0.065160629259,0.064823167516,0.066148472510,0.065208579683,0.064284252642]
ex5 = [0.064711208771,0.064268316499,0.064549140001,0.06560491689,0.065434561352,0.065677783245,0.064509176975,0.063491246301,0.06503563564,0.064476142090]

# Mean and standard deviation of every experiment
summaries = [
    ('Experiment 3 (SimpleRNN): mean={}, std={}', ex3),
    ('Experiment 4 (LSTM): mean={}, std={}', ex4),
    ('Experiment 5 (GRU): mean={}, std={}', ex5),
]
for template, values in summaries:
    print(template.format(np.mean(values), np.std(values)))
print()

# Pairwise t-tests that do not assume equal variances (equal_var=False)
print('Statistical tests:')
comparisons = [
    ('P-value between 3 and 4: ', ex3, ex4),
    ('P-value between 3 and 5: ', ex3, ex5),
    ('P-value between 4 and 5: ', ex4, ex5),
]
for caption, first, second in comparisons:
    statistic, p = stats.ttest_ind(first, second, equal_var = False)
    print(caption, p)

Experiment 3 (SimpleRNN): mean=0.06932690979, std=0.0008147144342447596
Experiment 4 (LSTM): mean=0.0651518953771, std=0.0005958420260438794
Experiment 5 (GRU): mean=0.0647758127764, std=0.0006423812622199252

Statistical tests:
P-value between 3 and 4:  8.782134765659256e-10
P-value between 3 and 5:  2.293063116128389e-10
P-value between 4 and 5:  0.2142519527375881


In [3]:
# Presumably the duration of every run in seconds, written as minutes * 60 + seconds - confirm
ex3 = [60+39,60+40,60+41,60+39,60+39,120+29,60+40,60+40,60+41,60+41]
ex4 = [60+40,60+43,120+50,60+47,180+9,60+43,60+45,60+45,60+49,60+43]
ex5 = [60+46,120+8,60+56,120+5,60+55,60+56,120+13,60+51,60+57,120+6]

# Mean and standard deviation of every experiment
for template, values in (('Experiment 3 (SimpleRNN): mean={}, std={}', ex3),
                         ('Experiment 4 (LSTM): mean={}, std={}', ex4),
                         ('Experiment 5 (GRU): mean={}, std={}', ex5)):
    print(template.format(np.mean(values), np.std(values)))

Experiment 3 (SimpleRNN): mean=104.9, std=14.720394016465727
Experiment 4 (LSTM): mean=119.4, std=30.437476899375216
Experiment 5 (GRU): mean=119.3, std=7.950471684120383


# Results experiments 6 and 7 (comparing lag):

In [None]:
# Values recorded for the ten runs of experiment 6
ex6 = [0.064544521164, 0.064573173834, 0.0671816517, 0.06518608620, 0.06616969068, 0.0681723778, 0.0659906549, 0.06724898523, 0.0680107645, 0.064508417852]
# NOTE(review): no values are stored for experiment 7 - presumably still to be filled in
ex7 = []

# Results experiments 8 and 9 (comparing layer depth):

In [17]:
# Values of experiments 4 and 5, repeated here so the cell can be run on its own
ex4 = [0.065104033759,0.065086519879,0.065374186293,0.06606917133,0.06425994090,0.065160629259,0.064823167516,0.066148472510,0.065208579683,0.064284252642]
ex5 = [0.064711208771,0.064268316499,0.064549140001,0.06560491689,0.065434561352,0.065677783245,0.064509176975,0.063491246301,0.06503563564,0.064476142090]

# Values recorded for the ten runs of experiments 8 and 9
ex8 = [0.06633847372,0.06867757954,0.065555703599,0.064632198796,0.065394043569,0.065534148121,0.064773644439,0.065256058188,0.064208210938,0.06799024366]
ex9 = [0.06638802220,0.06776956759,0.06857582754,0.06551856710,0.06760998399,0.06606791560,0.06658691154,0.06690377829,0.06639571663,0.06669685807]

# Mean and standard deviation of the new experiments
for template, values in (('Experiment 8 (LSTM): mean={}, std={}', ex8),
                         ('Experiment 9 (GRU): mean={}, std={}', ex9)):
    print(template.format(np.mean(values), np.std(values)))
print()

# Pairwise t-tests that do not assume equal variances (equal_var=False)
print('Statistical tests:')
comparisons = [
    ('P-value between 4 and 8: ', ex4, ex8),
    ('P-value between 4 and 9: ', ex4, ex9),
    ('P-value between 5 and 8: ', ex5, ex8),
    ('P-value between 5 and 9: ', ex5, ex9),
    ('P-value between 8 and 9: ', ex8, ex9),
]
for caption, first, second in comparisons:
    statistic, p = stats.ttest_ind(first, second, equal_var = False)
    print(caption, p)

Experiment 8 (LSTM): mean=0.065836030457, std=0.0013746487692277989
Experiment 9 (GRU): mean=0.066851314855, std=0.0008544805535459683

Statistical tests:
P-value between 4 and 8:  0.19527534962903093
P-value between 4 and 9:  0.00015990364013512986
P-value between 5 and 8:  0.05660285835285115
P-value between 5 and 9:  2.174367904458802e-05
P-value between 8 and 9:  0.0793448655083408


In [14]:
# Presumably the duration of every run in seconds, written as minutes * 60 + seconds - confirm
ex8 = [120+15,120+29,120+13,120+15,120+19,120+19,120+16,120+19,120+17,120+22]
ex9 = [120+16,120+25,120+18,120+32,120+21,120+21,180+8,180+8,120+44,120+37]

# Mean and standard deviation of every experiment
for template, values in (('Experiment 8 (LSTM): mean={}, std={}', ex8),
                         ('Experiment 9 (GRU): mean={}, std={}', ex9)):
    print(template.format(np.mean(values), np.std(values)))

Experiment 8 (LSTM): mean=138.4, std=4.317406628984581
Experiment 9 (GRU): mean=155.0, std=18.477012745571184


# Results experiments 10, 11 and 12 (comparing number of units):

In [12]:
# Values of experiment 4, repeated here so the cell can be run on its own
ex4 = [0.065104033759,0.065086519879,0.065374186293,0.06606917133,0.06425994090,0.065160629259,0.064823167516,0.066148472510,0.065208579683,0.064284252642]

# Values recorded for the ten runs of experiments 10, 11 and 12
ex10 = [0.06539966677,0.06451291433,0.0650349539,0.06342062253,0.06702626501,0.0662864150,0.06439023295,0.0664226764,0.06590704824,0.06563936058]
ex11 = [0.06759049661,0.06507482560,0.06526857264,0.0650890127,0.06547051935,0.0652739467,0.06535641515,0.06604867044,0.06622054329,0.06466293701]
ex12 = [0.06587199534,0.06547007664,0.06675434149,0.06771351812,0.06970491373,0.06728385470,0.0666892597,0.067058486,0.06648191899,0.06602469147]

# Mean and standard deviation of the new experiments
# NOTE(review): the training cell named EXPERIMENT_12 in this notebook is configured with
# "rnn": "LSTM" and 8 neurons, so the 'GRU' in the labels below may be a typo - confirm
summaries = [
    ('Experiment 10 (16-LSTM-1): mean={}, std={}', ex10),
    ('Experiment 11 (64-GRU-1): mean={}, std={}', ex11),
    ('Experiment 12 (8-GRU-1): mean={}, std={}', ex12),
]
for template, values in summaries:
    print(template.format(np.mean(values), np.std(values)))

# Pairwise t-tests that do not assume equal variances (equal_var=False)
print('Statistical tests:')
comparisons = [
    ('P-value between 4 and 10: ', ex4, ex10),
    ('P-value between 4 and 11: ', ex4, ex11),
    ('P-value between 4 and 12: ', ex4, ex12),
    ('P-value between 10 and 11: ', ex10, ex11),
    ('P-value between 10 and 12: ', ex10, ex12),
    ('P-value between 11 and 12: ', ex11, ex12),
]
for caption, first, second in comparisons:
    statistic, p = stats.ttest_ind(first, second, equal_var = False)
    print(caption, p)

Experiment 10 (16-LSTM-1): mean=0.065404015571, std=0.0010319065377750473
Experiment 11 (64-GRU-1): mean=0.06560559394900001, std=0.0007909108581842161
Experiment 12 (8-GRU-1): mean=0.066905305618, std=0.001131794704751865
Statistical tests:
P-value between 4 and 10:  0.5355430906233059
P-value between 4 and 11:  0.18741783636653073
P-value between 4 and 12:  0.001112961073813509
P-value between 10 and 11:  0.6477891587975152
P-value between 10 and 12:  0.00879908489675361
P-value between 11 and 12:  0.012168877619894145


# DATA

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
from datetime import datetime


def dateparse(dates):
    """
    Parse one raw value of the 'Date' column

    The expected format is the date in the representation of the current locale ('%x')
    followed by hours and minutes

    :param dates: text of the date
    :return: the corresponding datetime
    """
    # pd.datetime was an alias of the standard datetime class; it was deprecated in
    # pandas 1.0 and removed in pandas 2.0, so the standard class is used directly
    return datetime.strptime(dates, '%x %H:%M')


# NOTE(review): date_parser is itself deprecated in recent pandas versions - confirm against
# the installed version before replacing it
data = pd.read_csv('./data/AtchisonUV_20150801_to_20151119.csv', parse_dates=['Date'], index_col='Date',date_parser=dateparse)

In [3]:
# Display the first rows of the table as read from the file (not sorted by date yet)
data.head()

Unnamed: 0_level_0,Benzene,CS2,Ozone,SO2,Toluene,Xylene,Wind Direction,Wind Speed,Wind Origin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-10-10 03:15:00,2.5,2.5,2.5,2.5,2.5,220.61,162.0,6.0,SSE
2015-10-10 02:00:00,2.5,2.5,2.5,2.5,2.5,184.78,158.0,7.0,SSE
2015-10-10 04:30:00,2.5,2.5,2.5,2.5,573.36,144.61,166.0,5.0,SSE
2015-10-10 01:50:00,2.5,2.5,2.5,2.5,537.12,125.71,154.0,8.0,SSE
2015-10-10 04:20:00,2.5,2.5,2.5,2.5,424.89,105.5,166.0,5.0,SSE


In [5]:
# Sort the rows by their index (the 'Date' column, set as index when the file was read)
data = data.sort_index()

In [6]:
# Display the last rows of the table after sorting it by date
data.tail()

Unnamed: 0_level_0,Benzene,CS2,Ozone,SO2,Toluene,Xylene,Wind Direction,Wind Speed,Wind Origin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-11-18 23:40:00,2.5,2.5,20.87,2.5,2.5,2.5,154.0,2.0,SSE
2015-11-18 23:45:00,2.5,2.5,20.67,2.5,2.5,2.5,113.0,1.0,ESE
2015-11-18 23:50:00,2.5,2.5,20.61,2.5,2.5,2.5,107.0,2.0,ESE
2015-11-18 23:55:00,2.5,2.5,20.59,2.5,2.5,2.5,100.0,2.0,E
2015-11-19 00:00:00,2.5,2.5,20.47,2.5,2.5,2.5,86.0,2.0,E


In [30]:
def wind_parser(wind):
    """
    Encodes a compass point as four weights

    Judging from the values, the positions correspond to N, S, E and W in that order:
    a cardinal point gets weight 1 in its own position, an intermediate point shares
    the weight between its two neighbouring cardinal points

    :param wind: name of the compass point ('N', 'NNE', 'NE', ..., 'NNW')
    :return: new list with the four weights, all zeros if the name is not recognized
    """
    weights_by_point = {
        'N': (1, 0, 0, 0),
        'NNE': (.75, 0, .25, 0),
        'NE': (.5, 0, .5, 0),
        'ENE': (.25, 0, .75, 0),
        'E': (0, 0, 1, 0),
        'ESE': (0, .25, .75, 0),
        'SE': (0, .5, .5, 0),
        'SSE': (0, .75, .25, 0),
        'S': (0, 1, 0, 0),
        'SSW': (0, .75, 0, .25),
        'SW': (0, .5, 0, .5),
        'WSW': (0, .25, 0, .75),
        'W': (0, 0, 0, 1),
        'WNW': (.25, 0, 0, .75),
        'NW': (.5, 0, 0, .5),
        'NNW': (.75, 0, 0, .25),
    }
    # Anything that is not a string (missing values, for instance) is unknown as well;
    # checking the type first also keeps unhashable values away from the dictionary
    weights = weights_by_point.get(wind) if isinstance(wind, str) else None
    if weights is None:
        return [0, 0, 0, 0]
    return list(weights)

In [None]:
# NOTE(review): the path is an empty string - looks like an unfinished placeholder cell;
# fill in the file to load before running it
a = np.load('')