In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM

Using TensorFlow backend.


In [3]:
import pandas_datareader as pdr
import pandas as pd
from datetime import datetime
import bs4 as bs
import requests
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [93]:
class setup_data:
    """Download, cache, scale and window S&P 500 price data for sequence models.

    Attributes:
        raw_df: DataFrame indexed by calendar date with one ``Close<ticker>``
            and one ``Volume<ticker>`` column per ticker (interleaved), or
            ``None`` until ``pull_raw_data`` / ``read_raw_data`` has run.
        tickers: ticker symbols scraped by ``get_sp500_tickers``; after
            ``read_raw_data`` this holds the frame's column labels instead.
        scaler: ``MinMaxScaler`` fitted by ``preprocess_stocks`` and reused by
            ``transform_X`` / ``transform_y`` to undo the scaling.
        date_to_int: ``'YYYY-MM-DD'`` -> row position in ``raw_df``; filled in
            by ``make_train_test``.
        stock_to_int: column label -> column position in ``raw_df``; filled in
            by ``make_train_test``.
    """

    def __init__(self):
        self.raw_df = None
        self.tickers = None
        self.scaler = MinMaxScaler()
        self.date_to_int = None
        self.stock_to_int = None

    def get_sp500_tickers(self):
        """Scrape the S&P 500 constituent tickers from Wikipedia.

        Stores the list on ``self.tickers`` and returns ``self`` for chaining.
        """
        resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
        soup = bs.BeautifulSoup(resp.text, 'lxml')
        table = soup.find('table', {'class': 'wikitable sortable'})
        tickers = []
        # The first <tr> is the header row.
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if not cells:
                # Rows made only of <th> cells carry no ticker.
                continue
            # Strip so a trailing newline in the cell does not end up in the symbol.
            tickers.append(cells[0].text.strip())
        self.tickers = tickers
        return self

    def get_data_from_yahoo(self, tkr, start, end=None):
        """Fetch the price history for one ticker through pandas-datareader.

        NOTE(review): despite the method name, the data source requested is
        'google'.

        Args:
            tkr: ticker symbol understood by the data source.
            start: first date to fetch.
            end: last date to fetch; ``None`` leaves the reader's default.

        Returns:
            Whatever ``pdr.DataReader`` returns for the ticker.
        """
        return pdr.DataReader(tkr, 'google', start, end)

    def pull_raw_data(self, start, end = None, write_path=None):
        """Download Close/Volume for every ticker into ``self.raw_df``.

        Args:
            start: first date of the range.
            end: last date of the range; defaults to today (``YYYY-MM-DD``).
            write_path: optional directory; when given, the frame is also
                written there as ``sp_500_<start>_<end>.csv``.

        Returns:
            ``self`` for chaining.
        """
        if self.tickers is None:
            self.get_sp500_tickers()

        if end is None:
            end = datetime.now().strftime("%Y-%m-%d")

        # Symbols that have to be requested with an exchange prefix.
        bad_tickers = {"LMT":"NYSE:LMT", 'NWL':'NYSE:NWL', 'NBL':'NYSE:NBL'}

        # One row per calendar day, newest first; days without a quote stay NaN
        # after the left merges below.
        self.raw_df = pd.DataFrame(index=pd.date_range(start, end)[::-1])

        for t in self.tickers:
            t = bad_tickers.get(t, t)
            history = self.get_data_from_yahoo(t, start, end)
            # rename() builds a new frame, avoiding assignment to the columns of
            # a slice of the downloaded frame.
            one_ticker = history[['Close', 'Volume']].rename(columns=lambda col: col + t)
            self.raw_df = self.raw_df.merge(one_ticker,
                                            left_index=True, right_index=True, how='left')

        if write_path is not None:
            if not os.path.exists(write_path):
                os.makedirs(write_path)
            fname = 'sp_500_{0}_{1}.csv'.format(start, end)
            self.raw_df.to_csv(os.path.join(write_path, fname))
        return self

    def read_raw_data(self, filename):
        """Load a CSV previously written by ``pull_raw_data``.

        The first CSV column becomes a datetime index.  ``self.tickers`` is set
        to the frame's column labels.  Returns ``self`` for chaining.
        """
        self.raw_df = pd.read_csv(filename, index_col=0)
        self.tickers = self.raw_df.columns
        self.raw_df.index = pd.to_datetime(self.raw_df.index)
        return self

    def preprocess_stocks(self, df):
        """Fill missing values with -1 and scale every column with ``self.scaler``.

        Fits ``self.scaler`` as a side effect.  The returned frame keeps the
        column labels but has a fresh positional index.
        """
        # encode non-trading days as -1
        df = df.fillna(-1)
        df = pd.DataFrame(self.scaler.fit_transform(df), columns=df.columns)

        return df

    def transform_y(self, arr):
        """Map scaled targets back to original units.

        ``arr`` holds every other column of the scaled data (the targets are
        sliced with ``::2`` in ``make_train_test``), while the scaler was
        fitted on all columns.  The array is therefore widened with zero
        filler columns, inverse-transformed, and narrowed again.
        """
        c = np.zeros((arr.shape[0], arr.shape[1]*2), dtype=arr.dtype)
        c[:,::2] = arr
        return self.scaler.inverse_transform(c)[:,::2]

    def transform_X(self, arr):
        """Map a scaled 2-D array covering all columns back to original units."""
        return self.scaler.inverse_transform(arr)

    def make_modeling_data(self, window=30, step=1):
        """Build sliding-window samples from the scaled raw data.

        Each sample spans ``window + 1`` consecutive rows: the first row is the
        target and the remaining ``window`` rows are the inputs.  With the
        newest-first ordering produced by ``pull_raw_data`` the target is the
        day after the input window.

        Args:
            window: number of rows in each input sequence.
            step: accepted for interface compatibility; currently unused (the
                window always advances one row at a time).

        Returns:
            ``(X, y)`` as float32 arrays shaped ``(n, window, n_columns)`` and
            ``(n, n_columns)`` with ``n = len(raw_df) - window - 1``.

        Raises:
            Exception: if no raw data has been loaded.
            ValueError: if there are fewer than ``window + 1`` rows.
        """
        if self.raw_df is None:
            raise Exception("Raw data needs to be loaded first by using pull_raw_data() or read_raw_data()")

        df = self.preprocess_stocks(self.raw_df)

        # based on the previous window days, what are tomorrow's s&p 500 closing prices?
        window_ = window + 1
        num_samples = len(df)-window_
        if num_samples < 0:
            raise ValueError(
                "Need at least {0} rows of data for window={1}, got {2}".format(
                    window_, window, len(df)))
        nstocks = len(df.columns)
        M = np.zeros((num_samples, window_, nstocks), dtype=np.float32)
        for i in range(0, num_samples):
            # .values rather than DataFrame.to_dense(), which newer pandas removed.
            M[i, :, :] = df.iloc[i:(i +window_)].values
        X = M[:, 1:window_, :]
        y = M[:, 0, :]
        return X, y

    def make_train_test(self, train_end_date, **kwargs):
        """Split the windowed data into train and test sets at a date.

        Samples positioned before ``train_end_date`` in ``raw_df`` form the
        test set and the rest form the training set.  Targets keep every other
        column (``::2``), i.e. the ``Close`` columns of the interleaved
        Close/Volume layout.

        Args:
            train_end_date: ``'YYYY-MM-DD'`` string present in the raw index.
            **kwargs: forwarded to ``make_modeling_data``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.

        Raises:
            KeyError: if ``train_end_date`` is not in the raw index.
        """
        X, y = self.make_modeling_data(**kwargs)
        # Use this instance's data rather than a notebook-level global.
        self.date_to_int = {v:k for k, v in enumerate(self.raw_df.index.strftime('%Y-%m-%d'))}
        self.stock_to_int = {v:k for k, v in enumerate(self.raw_df.columns)}

        if train_end_date not in self.date_to_int:
            raise KeyError("train_end_date {0!r} is not in the raw data index".format(train_end_date))
        ix_test = self.date_to_int[train_end_date]

        X_test = X[:ix_test, :, :]
        X_train = X[ix_test:, :, :]
        y_test = y[:ix_test, ::2]
        y_train = y[ix_test:, ::2]

        return X_train, X_test, y_train, y_test

    # NOTE(review): the two stubs below take no ``self``; called on an instance
    # the instance would be bound to ``keras_model``.  Signatures are left
    # untouched until they are implemented.
    def make_predictions(keras_model, test_examples):
        """Placeholder -- not implemented yet."""
        pass

    def predict_date(keras_model, date):
        """Placeholder -- not implemented yet."""
        pass
        

In [94]:
# Work from the cached download; to refresh it, run instead:
# setup.pull_raw_data(start = '2006-01-01', write_path = '/home/andrew/data/stocks')
setup = setup_data().read_raw_data(
    '/home/andrew/data/stocks/sp_500_2006-01-01_2017-07-13.csv'
)
# Samples positioned before the cut-off date in the index become the test set.
X_train, X_test, y_train, y_test = setup.make_train_test(
    window=30,
    step=1,
    train_end_date='2017-05-23',
)

In [115]:
# build the LSTM model
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), dropout=0.2))
model.add(Dense(y_train.shape[1]))
model.add(Activation('linear'))
model.compile(loss='mean_squared_error', optimizer='adam')

In [116]:
# Train, reporting loss on the held-out samples after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=24,
    batch_size=64,
    verbose=1,
)

Train on 4130 samples, validate on 51 samples
Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


<keras.callbacks.History at 0x7f785d7d7c50>

In [120]:
y_pred = model.predict(X_test)
# Undo the scaling for both actual and predicted targets and label them with
# the target columns (every other column) and the leading dates of the index.
trues, preds = (
    pd.DataFrame(
        setup.transform_y(scaled),
        columns=setup.raw_df.columns[::2],
        index=setup.raw_df.index[:y_test.shape[0]],
    )
    for scaled in (y_test, y_pred)
)

In [123]:
# Actual vs. predicted close for a single ticker (MMM), side by side.
pd.DataFrame(dict(true=trues['CloseMMM'], pred=preds['CloseMMM']))

Unnamed: 0,pred,true
2017-07-13,205.524841,211.089996
2017-07-12,202.038071,211.299988
2017-07-11,184.829025,209.660004
2017-07-10,166.340363,210.48999
2017-07-09,-4.551937,-1.0
2017-07-08,9.921835,-1.0
2017-07-07,198.232712,209.589996
2017-07-06,200.662094,208.020004
2017-07-05,200.955505,209.759995
2017-07-04,184.707413,-1.0


In [124]:
# Signed error (true minus predicted), averaged per column and then across
# columns; rows for non-trading days (encoded as -1) are included.
trues.sub(preds).mean().mean()

1.6828903

In [None]:
# Ideas: drop non-trading days, increase regularization, predict direction (+/-) instead of price.