# Chapter 12 - Strategies to Build Superior Models

_pp. 169-184_

## UK Unemployment Rate Data

In [2]:
# From Chapter 8, pp. 105-112
import numpy as np
import pandas as pd
import urllib

# `urlretrieve` lives in different places on Python 2 and Python 3.
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

# UPDATED
# The workbook used to live at
#   http://www.bankofengland.co.uk/research/Documents/onebank/threecenturies_v2.3.xlsx
# It has since been updated to version 3 and renamed, so `url` points at the
# new location. `url` must be defined in this cell: leaving it commented out
# makes the download below fail with a NameError on a fresh kernel.
# NOTE(review): confirm this address still serves the spreadsheet; an HTML
# response here is what makes pd.ExcelFile fail with
# "Expected BOF record; found '...<!DOC'".
url = "http://www.bankofengland.co.uk/research/Documents/datasets/millenniumofdata_v3_final.xlsx"

# ATTN: you need to redefine these lines according to your system
loc = "/home/dpindk/irlab/big-data-summer-school-2017/UK_Economic.xls"
# Save the remote workbook to the local path `loc`.
urlretrieve(url, loc)

# Open the downloaded workbook.
Ex = pd.ExcelFile(loc)

XLRDError: Unsupported format, or corrupt file: Expected BOF record; found '\xef\xbb\xbf<!DOC'

## Extraction of Spreadsheet Data

In [None]:
# Open the workbook that holds the stock index series.
loc = "/home/dpindk/irlab/big-data-summer-school-2017/stockindexes.xls"
Excel_file = pd.ExcelFile(loc)

# List the available sheets, one name per line.
for name in Excel_file.sheet_names:
    print(name)

In [None]:
# Read the two sheets of interest into data frames (FTSE first, then Dow Jones).
ftse_data, dj_data = (
    Excel_file.parse(sheet) for sheet in ("FTSE100", "Dow Jones Industrial")
)

In [None]:
# First rows of the raw FTSE100 sheet.
print(ftse_data.head())

In [None]:
# First rows of the raw Dow Jones sheet.
print(dj_data.head())

In [None]:
# Keep rows 4..1356 (positional) of the second column of each sheet.
# NOTE(review): the row bounds are hard-coded for this particular workbook.
ftse100, dj = (sheet.iloc[4:1357, 1] for sheet in (ftse_data, dj_data))

## Check Data Values

In [None]:
# First values of the extracted FTSE100 series.
print(ftse100.head())

In [None]:
# Last values of the extracted FTSE100 series.
print(ftse100.tail())

In [None]:
# First values of the extracted Dow Jones series.
print(dj.head())

In [None]:
# Last values of the extracted Dow Jones series.
print(dj.tail())

## How to Work with Multiple Targets

In [None]:
# Put both index series side by side, one column per index.
yt = pd.concat(objs=[ftse100, dj], axis=1)
print(yt.head())

In [None]:
# Replace the inherited row labels with 0..n-1, then name the two columns.
yt = yt.reset_index(drop=True)
yt.columns = ['ftse100', 'dj']

print(yt.head())

In [None]:
# Number of observations in each rolling window.
win = 30

# Row-over-row percentage change of both columns.
# NOTE(review): `yt` is overwritten in place, so running this cell twice
# differences the series twice.
yt = yt.pct_change(periods=1)

# Rolling standard deviation of those changes over `win` observations.
# NOTE(review): center=True labels each window at its midpoint, so the value
# at row t also depends on rows after t -- confirm this look-ahead is intended.
vol_t = yt.rolling(win, center=True).std()

In [None]:
# TODO: add plot here

## Creation of Hand Crafted Features

In [None]:
def _log_vol_feature(vol, lag):
    """Return log((vol[t-1] / vol[t-lag]) * vol[t-1]) for each column of `vol`.

    `vol` is shifted by 1 and by `lag` rows, so the leading rows of the
    result are NaN.
    """
    return np.log((vol.shift(1) / vol.shift(lag)) * vol.shift(1))


# Five features per index, built from lags 2 through 6 of the volatility.
x1, x2, x3, x4, x5 = (_log_vol_feature(vol_t, lag) for lag in range(2, 7))

In [None]:
# Targets (current volatility) followed by the five feature frames.
data = pd.concat([vol_t, x1, x2, x3, x4, x5], axis=1)

# Two target columns, then one ftse/dj pair per feature frame. The `t-k`
# suffix labels feature frame x_k, not a raw lag of the volatility.
data.columns = ['ftse_t', 'dj_t'] + list(
    '{}_t-{}'.format(market, k)
    for k in range(1, 6)
    for market in ('ftse', 'dj')
)

# Discard every row with a missing value (shifts and the rolling window
# leave NaNs at the edges).
data = data.dropna()

## Target and Features in One Place

In [None]:
# Column names of the two targets and of the ten features.
cols_y = ['ftse_t', 'dj_t']
cols = list(
    '{}_t-{}'.format(market, k)
    for k in range(1, 6)
    for market in ('ftse', 'dj')
)

# Split the frame into targets (y) and features (x).
y = data[cols_y]
x = data[cols]

## Scaling Data

In [None]:
from sklearn import preprocessing

# Number of feature columns and of target columns.
num_attrib = 10
num_response = 2

# NOTE(review): both scalers are fitted on every row, before the train/test
# split in a later cell, so the test rows influence the scaling ranges.

# Features are rescaled column-wise to [-1, 1].
scaler_x = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x = scaler_x.fit_transform(np.reshape(np.array(x), (len(x), num_attrib)))

# Targets are rescaled column-wise to [0, 1].
scaler_y = preprocessing.MinMaxScaler(feature_range=(0, 1))
y = scaler_y.fit_transform(np.reshape(np.array(y), (len(y), num_response)))

## Train and Test Sets

In [None]:
# Ordered split: the first `train_end` rows train the model and every
# remaining row is held out for testing.
train_end = 1131
data_end = len(y)

x_train = x[0:train_end, ]
y_train = y[0:train_end]

# The test slices start at `train_end` itself. Starting at `train_end + 1`
# left row `train_end` out of both the training and the test set.
x_test = x[train_end:data_end, ]
y_test = y[train_end:data_end]

# Every row must end up in exactly one of the two sets.
assert len(x_train) + len(x_test) == len(x), "rows lost in the x split"
assert len(y_train) + len(y_test) == len(y), "rows lost in the y split"

# Insert a length-1 middle axis: (rows, columns) -> (rows, 1, columns).
x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1]))
x_test = np.reshape(x_test, (x_test.shape[0], 1, x_test.shape[1]))

In [None]:
# Report the dimensions of the training inputs.
print("Shape of x_train is  " + str(x_train.shape))

In [None]:
# Report the dimensions of the test inputs.
print("Shape of x_test is  " + str(x_test.shape))

## Model Specification and Fit

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import SimpleRNN
from keras.optimizers import SGD

In [None]:
seed = 2016
num_epochs = 20
np.random.seed(seed)

# A 10-unit sigmoid SimpleRNN that reads inputs of shape (1, num_attrib),
# followed by a linear Dense layer with one unit per target.
# UPDATED: both layers are sized with `units=`; earlier versions of this
# notebook passed `output_dim=` instead.
fit1 = Sequential([
    SimpleRNN(units=10, activation='sigmoid', input_shape=(1, num_attrib)),
    Dense(units=num_response, activation='linear'),
])

# Stochastic gradient descent with Nesterov momentum, minimising the MSE.
sgd = SGD(lr=0.01, momentum=0.90, nesterov=True)
fit1.compile(optimizer=sgd, loss='mean_squared_error')

# One sample per gradient update.
fit1.fit(x_train, y_train, epochs=num_epochs, batch_size=1)

In [None]:
# Loss (mean squared error, as compiled) on the training and the test rows.
score_train = fit1.evaluate(x_train, y_train, batch_size=1)
score_test = fit1.evaluate(x_test, y_test, batch_size=1)

# Blank line, then both scores rounded to five decimals.
print("")
print("in train  MSE =  " + str(round(score_train, 5)))
print("in test   MSE =  " + str(round(score_test, 5)))

In [None]:
# Model output for the test inputs, still on the scaled target range.
pred1 = fit1.predict(x_test)

# Undo the target scaling. The column count comes from `num_response`
# (the same constant that sized `y` and the output layer) rather than a
# hard-coded 2.
pred1 = scaler_y.inverse_transform(
    np.array(pred1).reshape((len(pred1), num_response))
)

In [None]:
# TODO: add plot here