In [14]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: show the top-level entries of the input directory before loading anything.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Test-driving an RNN model with PyTorch

This notebook uses a very simple RNN model to predict an S&P 500 index proxy from the prices of a handful of constituent stocks, much as a regression would be used for this kind of job.

The goal is to set things up so that an RNN model can be used, and to become familiar with PyTorch along the way.

Yes, the model is naive, but it is also easy to tweak things so that it actually produces something more meaningful.

# Not trying to split into training set and test set

This notebook exists purely to set things up and get to know PyTorch, so the model is fitted and plotted on the same data.

In [15]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset


class StockDataset(Dataset):
    """Dataset of (features, target) rows aligned on the 'Date' column.

    The two input frames are inner-joined on 'Date'. Columns are then split
    by position: everything after the first column of the merged frame up to
    the width of the feature frame becomes ``x_data``; the remaining columns
    become ``y_data``. This assumes 'Date' is the first column of the feature
    frame, so that it lands at position 0 of the merged frame.

    Attributes:
        data: merged frame as a numpy array (still contains the date column).
        x_data: float32 tensor of features, each column divided by its first row.
        y_data: float32 tensor of targets (not rebased).
        len: number of merged rows.
    """

    def __init__(self, x_data_to_process, y_data_to_process):
        """Build the dataset from two pandas DataFrames sharing a 'Date' column.

        Args:
            x_data_to_process: feature frame; first column is expected to be 'Date'.
            y_data_to_process: target frame containing 'Date' plus target columns.
        """
        merged = pd.merge(x_data_to_process, y_data_to_process, on='Date')
        self.data = merged.values    # from pd to np

        print('The shape of the data is {}'.format(self.data.shape))

        # Column 0 is the date; the feature frame contributes the next
        # (width - 1) columns and the target frame the rest.
        n_x_columns = x_data_to_process.shape[1]
        self.x_data = self.data[:, 1:n_x_columns].astype(np.float32)
        self.y_data = self.data[:, n_x_columns:].astype(np.float32)

        # Normalize the features so every column starts at 1.0.
        self.rebase_to_one()

        # Convert to torch tensors.
        self.x_data = torch.from_numpy(self.x_data)
        self.y_data = torch.from_numpy(self.y_data)

        self.len = self.data.shape[0]

    def __getitem__(self, index):
        """Return the (features, target) pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in the merged data."""
        return self.len

    def rebase_to_one(self):
        """Divide every column of ``self.x_data`` by that column's first value.

        Implemented as a single broadcast division instead of a Python loop
        over every element. A column whose first value is zero yields inf/nan,
        exactly as element-wise division would. An empty array is left
        untouched because there is no first row to rebase against.
        """
        if self.x_data.shape[0] == 0:
            return
        # Not in-place: the divisor is row 0 of the same array.
        self.x_data = self.x_data / self.x_data[0]



def get_dictionary_of_data():
    """Load the per-ticker CSV files for a fixed list of large-cap stocks.

    Each ticker is lower-cased and read from
    '../input/Data/Stocks/<ticker>.us.txt'. Tickers whose file is missing are
    reported on stdout and left out of the result.

    Returns:
        dict mapping lower-cased ticker -> DataFrame as returned by pd.read_csv.
    """
    # Use these tickers if possible to predict S&P's value
    tickers = np.array(['MMM', 'AXP', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO',
                        'KO', 'DWDP', 'XOM', 'GE', 'GS', 'HD', 'IBM', 'INTC',
                        'JNJ', 'JPM', 'MCD', 'MRK', 'MSFT', 'NKE', 'PFE', 'PG',
                        'TRV', 'UNH', 'UTX', 'VZ', 'V', 'WMT', 'DIS'])

    loaded_frames = {}
    for raw_ticker in tickers:
        symbol = str(raw_ticker).lower()
        csv_path = '../input/Data/Stocks/{}.us.txt'.format(symbol)
        try:
            frame = pd.read_csv(csv_path)
        except FileNotFoundError:
            print("File of {} not found".format(symbol))
        else:
            loaded_frames[symbol] = frame

    return loaded_frames


def closing_prices_in_pd(dict_of_stocks, min_rows=3000):
    """Inner-join the 'Close' and 'Volume' columns of several frames on 'Date'.

    The first frame in ``dict_of_stocks`` is always used as the base of the
    join; every later frame is included only if it has more than ``min_rows``
    rows (short histories would shrink the inner join).

    When more than one frame is joined, each frame's 'Close' and 'Volume'
    columns are renamed to 'Close_<key>' and 'Volume_<key>' first. Without
    this, every frame contributes identically named columns, and the default
    '_x'/'_y' merge suffixes produce duplicate column names, which recent
    pandas versions reject. Column order and count are unchanged, so
    positional consumers are unaffected. When only one frame is selected, its
    columns keep their original names ('Date', 'Close', 'Volume').

    Args:
        dict_of_stocks: mapping of key -> DataFrame with at least the columns
            'Date', 'Close' and 'Volume'.
        min_rows: frames after the first need strictly more rows than this
            to be merged. Defaults to 3000.

    Returns:
        The merged DataFrame, or None if ``dict_of_stocks`` is empty.
    """
    selected = []
    for position, (key, frame) in enumerate(dict_of_stocks.items()):
        # The first entry is the merge base and is exempt from the row filter.
        if position == 0 or frame.shape[0] > min_rows:
            selected.append((key, frame[['Date', 'Close', 'Volume']]))

    if not selected:
        return None
    if len(selected) == 1:
        return selected[0][1]

    merged = None
    for key, frame in selected:
        unique_names = {'Close': 'Close_{}'.format(key),
                        'Volume': 'Volume_{}'.format(key)}
        frame = frame.rename(columns=unique_names)
        merged = frame if merged is None else pd.merge(merged, frame, on='Date')

    return merged


In [16]:
from torch.autograd import Variable


class Model(torch.nn.Module):
    """Stacked ReLU RNN followed by a linear read-out, with a carried hidden state.

    ``forward`` adds a leading dimension of size 1 to its input. Because the
    RNN is built with ``batch_first=True`` and the hidden state has a batch
    dimension of 1, the rows of the input are processed as consecutive time
    steps of a single sequence. The final hidden state of each call is kept
    on ``self.h_0`` and used as the initial state of the next call.
    """

    def __init__(self, input_size, rnn_hidden_size, output_size, num_layers=2):
        """Create the RNN, its initial hidden state and the output layer.

        Args:
            input_size: number of features per time step.
            rnn_hidden_size: width of each RNN layer.
            output_size: number of values predicted per time step.
            num_layers: number of stacked RNN layers. Defaults to 2.
        """
        super(Model, self).__init__()

        self.num_layers = num_layers
        self.rnn = torch.nn.RNN(input_size, rnn_hidden_size,
                                num_layers=num_layers, nonlinearity='relu',
                                batch_first=True)
        self.h_0 = self.initialize_hidden(rnn_hidden_size)

        self.linear = torch.nn.Linear(rnn_hidden_size, output_size)

    def forward(self, x):
        """Run one chunk of the sequence through the network.

        Args:
            x: tensor whose last dimension is ``input_size``; a leading
                dimension of size 1 is added before it is fed to the RNN.

        Returns:
            Tensor of shape ``(1, *x.shape[:-1], output_size)``.
        """
        x = x.unsqueeze(0)
        self.rnn.flatten_parameters()

        # h_0 is a plain attribute, so Module.to() does not move it; follow the input.
        out, h_n = self.rnn(x, self.h_0.to(x.device))

        # Carry the state forward but cut it from this call's graph. Otherwise
        # every backward pass would run through all earlier batches, which
        # needs retain_graph=True, grows memory without bound, and fails once
        # the optimizer has modified the weights in place.
        self.h_0 = h_n.detach()

        return self.linear(out)

    def initialize_hidden(self, rnn_hidden_size):
        """Return a random initial hidden state.

        Shape is (n_layers * n_directions, batch_size, rnn_hidden_size) with a
        unidirectional RNN and batch_size 1.
        """
        # getattr keeps this usable on instances unpickled from before
        # ``num_layers`` was stored on the model.
        n_layers = getattr(self, 'num_layers', 2)
        return torch.randn(n_layers, 1, rnn_hidden_size, requires_grad=True)

In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader

RNN_HIDDEN_SIZE = 32


def train(input_size, hidden_size, output_size, train_loader,
          epochs=1, file_path='my_model.model', learning_rate=0.001,
          max_grad_norm=0.5):
    """Train (or continue training) the RNN and plot predictions against targets.

    A previously saved model is loaded from ``file_path`` if possible;
    otherwise a new ``Model`` is created. After every epoch the predictions
    (red) and the targets (blue) are drawn on figure 1. The model is saved
    back to ``file_path`` at the end.

    Args:
        input_size: number of input features, used when a new model is built.
        hidden_size: RNN hidden width, used when a new model is built.
        output_size: number of predicted values, used when a new model is built.
        train_loader: iterable yielding ``(xs, ys)`` tensor batches.
        epochs: number of passes over ``train_loader``. Defaults to 1.
        file_path: where the model is loaded from and saved to.
        learning_rate: Adam learning rate. Defaults to 0.001.
        max_grad_norm: gradient-norm clipping threshold. Defaults to 0.5.

    Returns:
        The trained model.
    """
    def stacking_for_charting(given_list):
        """Flatten a list of arrays into one 1-D float64 array for plotting."""
        if not given_list:
            return np.array([])
        flat = np.concatenate([np.ravel(chunk) for chunk in given_list])
        return flat.astype(np.float64)

    plt.figure(1, figsize=(12, 5))

    try:
        # NOTE(review): torch.load unpickles arbitrary objects; only load
        # files that this notebook wrote itself.
        model = torch.load(file_path)
    except FileNotFoundError:
        # First run: nothing saved yet.
        model = Model(input_size, hidden_size, output_size)
    except Exception as exc:
        # Best effort, but say why the saved model was ignored.
        print('Could not load {} ({}); training a new model'.format(file_path, exc))
        model = Model(input_size, hidden_size, output_size)

    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        predictions = []
        correct_values = []

        for xs, ys in train_loader:
            y_pred = model(xs)
            # Give the target the prediction's shape explicitly instead of
            # relying on implicit broadcasting inside the loss.
            loss = criterion(y_pred, ys.reshape(y_pred.shape))
            optimizer.zero_grad()
            # retain_graph=True is needed if the model's carried hidden state
            # still references the previous batch's graph; harmless otherwise.
            loss.backward(retain_graph=True)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            predictions.append(y_pred.detach().cpu().numpy().ravel())
            correct_values.append(ys.detach().cpu().numpy().ravel())

        predictions_for_chart = stacking_for_charting(predictions)
        correct_values_for_chart = stacking_for_charting(correct_values)

        print(predictions_for_chart)

        # One integer step per sample; consecutive epochs continue the axis
        # without sharing an endpoint.
        n_points = predictions_for_chart.shape[0]
        steps = np.arange(epoch * n_points, (epoch + 1) * n_points)
        first_epoch = epoch == 0
        plt.plot(steps, predictions_for_chart, 'r-',
                 label='prediction' if first_epoch else None)
        plt.plot(steps, correct_values_for_chart, 'b-',
                 label='target' if first_epoch else None)
        if first_epoch:
            plt.xlabel('training step')
            plt.ylabel('value')
            plt.legend()
        plt.draw()
        plt.pause(0.05)

    torch.save(model, file_path)
    plt.show()
    return model

In [18]:
def main():
    """Load the stock and ETF data, build the dataset and train the RNN on it."""
    # GETTING THE DICTIONARY OF STOCK DATA
    stock_frames = get_dictionary_of_data()

    # THE ETF DATA TO BE USED AS Y
    etf_frames = {'spy': pd.read_csv('../input/Data/ETFs/spy.us.txt')}

    features = closing_prices_in_pd(stock_frames)
    # Only the ETF closing price is predicted, so its volume column is dropped.
    target = closing_prices_in_pd(etf_frames).drop(['Volume'], axis=1)

    loader = DataLoader(dataset=StockDataset(features, target),
                        batch_size=32, shuffle=False, num_workers=1)

    # Both frames carry a 'Date' column that is not a model input or output.
    n_inputs = features.shape[1] - 1
    n_outputs = target.shape[1] - 1

    train(n_inputs, RNN_HIDDEN_SIZE, n_outputs, loader)

In [19]:
# Run the whole pipeline: load the data, build the dataset and train the model.
main()