In [1]:
import os
import sys
import time
import pandas as pd
import datetime
#import pandas.io.data as web
from pandas_datareader import data
import matplotlib.pyplot as plt
from matplotlib import style
import glob
import numpy as np

# Build train and test datasets

In [2]:
# Load csvs and create train, test datasets
def get_ticker_idx_maps(dfs):
    '''
    Build the two lookup tables between ticker symbols and integer positions.

    Input:
    - dfs: dict keyed by ticker symbol (only the keys are used)
    Returns:
    - ticker_idx: dict mapping each ticker to its position in sorted key order
    - idx_ticker: the inverse dict, position -> ticker
    '''
    # Number the tickers 0..n-1 in sorted order, then invert that numbering.
    idx_ticker = dict(enumerate(sorted(dfs.keys())))
    ticker_idx = {symbol: position for position, symbol in idx_ticker.items()}
    return ticker_idx, idx_ticker
    
def dict_2d_to_3d(dfs, ticker_idx, colname_idx, shift_value):
    '''
    Turn a dict of per-ticker DataFrames into aligned feature/target arrays.

    For every ticker the target at row t is the 'Adj Close' value found
    shift_value rows later, so the last shift_value rows (which have no
    target) are dropped from both the features and the targets.

    Input:
    - dfs: dict mapping ticker -> DataFrame
    - ticker_idx: dict mapping ticker -> integer position in the output
    - colname_idx: dict mapping column name -> column position; must
      contain 'Adj Close'
    - shift_value: number of rows between a feature row and its target;
      must be >= 0
    Returns:
    - (X, y): two 1-D object arrays with one entry per ticker, ordered by
      ticker_idx. X[i] is the 2-D values array of that ticker without its
      last shift_value rows; y[i] is the 1-D 'Adj Close' series without its
      first shift_value rows. A ticker with no more than shift_value rows
      yields empty arrays.
    Raises:
    - ValueError if shift_value is negative
    '''
    if shift_value < 0:
        raise ValueError("shift_value must be >= 0")

    # Position i in the output belongs to the ticker whose index sorts i-th.
    ordered_tickers = sorted(dfs.keys(), key=lambda ticker: ticker_idx[ticker])
    adj_close_col = colname_idx['Adj Close']

    # Allocate object arrays explicitly: letting np.array() infer the layout
    # from a list of arrays only gives an array-of-arrays when the per-ticker
    # lengths differ (and raises on recent NumPy without dtype=object), while
    # equal lengths would produce a regular array that cannot hold the
    # shortened rows.
    X = np.empty(len(ordered_tickers), dtype=object)
    y = np.empty(len(ordered_tickers), dtype=object)
    for i, ticker in enumerate(ordered_tickers):
        values = dfs[ticker].values
        # Clamp at zero so a frame shorter than the horizon becomes empty
        # instead of being sliced with a negative bound.
        n_usable = max(len(values) - shift_value, 0)
        X[i] = values[:n_usable].copy()
        y[i] = values[shift_value:shift_value + n_usable, adj_close_col].copy()

    return X, y


def load_csv_data(source_dir="../data", days_for_prediction=30):
    '''
    Input:
    - source_dir: directory where the stock price CSVs are located
      (one file per ticker, named <ticker>.csv)
    - days_for_prediction: number of days for the prediction prices. Must be at least 30 days
    Description:
    Read csv files in source_dir, load into dataframes and split each one
    80/20 by row position into train and test parts (rows are assumed to be
    in chronological order; this is not checked here). Each part is then
    converted with dict_2d_to_3d, which pairs every row with the
    'Adj Close' value days_for_prediction rows later.
    Returns:
    - ticker_idx, idx_ticker: ticker -> position map and its inverse
    - train_X, train_y, test_X, test_y: per-ticker arrays as returned by
      dict_2d_to_3d for the train and test parts
    Raises:
    - AssertionError if days_for_prediction < 30
    - FileNotFoundError if source_dir contains no CSV files
    '''
    assert (days_for_prediction >= 30), "days_for_prediction must be >= 30"

    # Sort so that the file used for the column layout does not depend on
    # the arbitrary order glob returns.
    csv_files = sorted(glob.glob(os.path.join(source_dir, "*.csv")))
    if not csv_files:
        raise FileNotFoundError("No CSV files found in " + repr(source_dir))

    dfs1 = {}
    dfs2 = {}
    colname_idx = None
    for filename in csv_files:
        # Strip the directory and only the final extension, so this works
        # with any path separator and keeps dotted tickers such as 'BRK.B'.
        ticker = os.path.splitext(os.path.basename(filename))[0]

        #  Date, Open, High , Low , Close, Adj Close, Volume
        df = pd.read_csv(filename, parse_dates=[0])
        if colname_idx is None:
            # Column positions are taken from the first file; all CSVs are
            # assumed to share the same column layout.
            colname_idx = {colname: i for i, colname in enumerate(df.columns)}

        # Split 80/20
        split_len = int(len(df) * .8)
        dfs1[ticker] = df.iloc[:split_len, :]
        dfs2[ticker] = df.iloc[split_len:, :]

    ticker_idx1, idx_ticker1 = get_ticker_idx_maps(dfs1)
    ticker_idx2, idx_ticker2 = get_ticker_idx_maps(dfs2)

    # Validate that the 2 mapping sets are the same for train and test data
    assert ticker_idx1 == ticker_idx2, "train and test ticker maps differ"

    # Convert dict of 2d arrays to 3d arrays
    train_X, train_y = dict_2d_to_3d(dfs1, ticker_idx1, colname_idx, days_for_prediction)
    test_X, test_y = dict_2d_to_3d(dfs2, ticker_idx2, colname_idx, days_for_prediction)
    return ticker_idx1, idx_ticker1, train_X, train_y, test_X, test_y

# Load every CSV under ../data/sp500 and unpack the ticker<->index maps plus the
# per-ticker train/test arrays; days_for_prediction is left at its default of 30.
ticker_idx, idx_ticker, train_X, train_y, test_X, test_y = load_csv_data("../data/sp500")
# Shapes of the outer arrays returned by load_csv_data.
print(train_X.shape, test_X.shape)

(501,) (501,)


In [17]:
# Show the target series stored at index 0 of train_y.
print(train_y[0])

[52.183887 51.626209 61.823978 ..., 38.55048 38.267483 38.515968]


In [16]:
# Report, ticker by ticker, the shapes of the train and test feature/target arrays
for i in range(len(train_X)):
    print(i, idx_ticker[i],
          'train:', *(part[i].shape for part in (train_X, train_y)),
          'test:', *(part[i].shape for part in (test_X, test_y)))

0 A train: (3542, 7) (3542,) test: (863, 7) (863,)
1 AAL train: (2389, 7) (2389,) test: (575, 7) (575,)
2 AAP train: (3159, 7) (3159,) test: (768, 7) (768,)
3 AAPL train: (3542, 7) (3542,) test: (863, 7) (863,)
4 ABBV train: (926, 7) (926,) test: (210, 7) (210,)
5 ABC train: (3542, 7) (3542,) test: (863, 7) (863,)
6 ABT train: (3542, 7) (3542,) test: (863, 7) (863,)
7 ACN train: (3230, 7) (3230,) test: (786, 7) (786,)
8 ADBE train: (3542, 7) (3542,) test: (863, 7) (863,)
9 ADI train: (3542, 7) (3542,) test: (863, 7) (863,)
10 ADM train: (3542, 7) (3542,) test: (863, 7) (863,)
11 ADP train: (3542, 7) (3542,) test: (863, 7) (863,)
12 ADS train: (3249, 7) (3249,) test: (790, 7) (790,)
13 ADSK train: (3542, 7) (3542,) test: (863, 7) (863,)
14 AEE train: (3542, 7) (3542,) test: (863, 7) (863,)
15 AEP train: (3542, 7) (3542,) test: (863, 7) (863,)
16 AES train: (3542, 7) (3542,) test: (863, 7) (863,)
17 AET train: (3542, 7) (3542,) test: (863, 7) (863,)
18 AFL train: (3542, 7) (3542,) test: 

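### Note: Not all stocks have the same number of rows!
So `train_X` is a ragged array of shape `(501,)` whose elements are per-ticker 2-D arrays, not a true 3-D array. Can we train on it as is?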


In [4]:
# Sanity-check the target shift on the arrays at index 0.
print(len(train_X[0]), len(train_y[0]))
print('train_X[0][30:40,5]', train_X[0][30:40,5])
# With the default 30-row horizon, the first 10 targets must equal column 5 of
# feature rows 30-39 (column 5 is 'Adj Close' when the CSV columns are
# Date, Open, High, Low, Close, Adj Close, Volume — confirm against the data).
print('train_y[0][:10]', train_y[0][:10])
assert(np.array_equal(train_X[0][30:40,5], train_y[0][:10]))


print('train_X[0][-40:,5]', train_X[0][-40:,5])
# Not empty: dict_2d_to_3d drops the trailing rows that have no target instead
# of blanking them, so these are real values (the 'Adj Close' entries of the
# last 10 rows of the train split, which were removed from train_X).
print('train_y[0][-10:]', train_y[0][-10:])
print(train_y[0])

3542 3542
train_X[0][30:40,5] [52.183887 51.626209 61.823978 59.752552 58.318485 63.098698 68.038239
 68.914604 64.373398 66.205826]
train_y[0][:10] [52.183887 51.626209 61.823978 59.752552 58.318485 63.098698 68.038239
 68.914604 64.373398 66.205826]
train_X[0][-40:,5] [36.898647 36.843559 37.759476 37.917862 38.110703 37.766369 38.055607
 37.986744 38.041836 38.820019 39.866795 39.515579 39.226326 39.625763
 39.41227 39.570667 39.461605 39.565147 39.47541 38.798965 39.289051
 39.095768 39.654881 40.303711 40.317516 40.676445 40.676445 41.332191
 41.6497 41.760143 41.905094 42.001732 42.056946 41.049179 39.944782
 40.241585 40.10353900000001 39.792931 41.076797 40.138054]
train_y[0][-10:] [39.882652 39.896469 40.56601 40.324417 40.138054 39.247623 39.385674
 38.55048 38.267483 38.515968]
[52.183887 51.626209 61.823978 ..., 38.55048 38.267483 38.515968]


# Now try a baseline model