In [1]:
import os
import sys
import time
import pandas as pd
import datetime
#import pandas.io.data as web
from pandas_datareader import data
import matplotlib.pyplot as plt
from matplotlib import style
import glob
import numpy as np

# Build train and test datasets

In [2]:
X = np.zeros((2,3,4))
print(X)


[[[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]


In [4]:
# Load csvs and create train, test datasets
def get_ticker_idx_maps(dfs):
    sorted_keys = sorted(dfs.keys())
    ticker_idx = {ticker: i for i,ticker in enumerate(sorted_keys)}
    idx_ticker = {i: ticker for i,ticker in enumerate(sorted_keys)}
    return ticker_idx, idx_ticker


#Create 3d array from dict entry
def create_3d(max_row, map):
    a = []
    for k,v in map.items():
        col_num = len(v[0])
        # Pad the smaller array with nan values
        padded_rows = max_rows - len(v)
        if (padded_rows == 0):
            a += [v]
        else:
            pad = [[np.nan]* col_num]*padded_rows #np.zeros((]padded_rows,col_num))
            a += [pad + v]
    return np.array(a)

def pad_rows(max_rows, col_num, v):
    padded_rows = max_rows - len(v)
    #print('pad_rows: max_rows', max_rows,'col_num =', col_num, 'padded_rows =', padded_rows)
#    print(v)
    a = []
    if (padded_rows == 0):
        a = v
    else:
        # Concate the 2 matrices vertically
        if (col_num == 1):
            pad = [np.nan]*padded_rows
            a = pad + list(v)
        else:
            pad = np.array([[np.nan]* col_num]*padded_rows)
            #print(pad.shape, v.shape)
            #a = np.column_stack((pad,v))
            a = np.concatenate((pad,v),axis=0)

    return np.array(a)


def dict_2d_to_3d(dfs, ticker_idx, colname_idx, shift_value):
    sorted_keys = sorted(dfs.keys())
    # dfs with tickers in keys -> dfs with keys as index(tickers)
    dfs_int_keys = {ticker_idx[ticker]: dfs[ticker].values for ticker in sorted_keys}
    # then convert it to 3d array
    X = np.array(list(np.array(pd.DataFrame.from_dict(dfs_int_keys[key])) for key in sorted(dfs_int_keys.keys())))
    y = np.array(list(dfs_int_keys[key][:,colname_idx['Adj Close']] for key in sorted(dfs_int_keys.keys())))

    #print("before shifting: y[0][:", shift_value, '] =', y[0][:shift_value])
    #print("before shifting: y[0][-", shift_value, ':] =', y[0][-shift_value:])
    print("dict_2d_to_3d: X.shape =", X.shape, "y.shape=", y.shape)

    max_row_count = 0
    X2 = None
    y2 = None
    for i in range(len(y)):
        y[i] = np.roll(y[i], -shift_value, axis=0)
        # The last shift_value rows have no data
        #y[i][-shift_value:] = None
        
        # Remove the last shift_value elements of the arrays
        #y[i] = y[i][:len(y[i])-shift_value].copy()
        #X[i] = X[i][:len(X[i])-shift_value].copy()
        yi = y[i][:len(y[i])-shift_value].copy()
        Xi = X[i][:len(X[i])-shift_value].copy()
        rowcount = len(Xi)
        if (rowcount > max_row_count):
            max_row_count = rowcount
        #print(max_row_count)
        
        yi_res = pad_rows(max_row_count, 1, yi)
        Xi_res = pad_rows(max_row_count, X[0].shape[1], Xi)

        if (X2 is None):
            y2 = np.array([yi_res]).reshape(len(yi_res),1)
            X2 = np.array([Xi_res]).reshape(Xi_res.shape[0], Xi_res.shape[1])
        else:
            y2 = np.concatenate((y2, np.array([yi_res]).reshape(len(yi_res),1)), axis=0)
            X2 = np.concatenate((X2, Xi_res), axis=0)

    # 2d -> 3d
    X2 = X2.reshape(len(y), Xi_res.shape[0], Xi_res.shape[1])
    y2 = y2.reshape(len(y), Xi_res.shape[0], 1)    
    return X2, y2


def dfs_to_3d(dfs, ticker_idx, colname_idx, shift_value):
    sorted_keys = sorted(dfs.keys())
    # dfs with tickers in keys -> dfs with keys as index(tickers)
    dfs_int_keys = {ticker_idx[ticker]: dfs[ticker].values for ticker in sorted_keys}
    # then convert it to 3d array
    X = np.array(list(np.array(pd.DataFrame.from_dict(dfs_int_keys[key])) for key in sorted(dfs_int_keys.keys())))
    y = np.array(list(dfs_int_keys[key][:,colname_idx['Adj Close']] for key in sorted(dfs_int_keys.keys())))
    print("dfs_to_3d: X.shape =", X.shape, "y.shape=", y.shape)

    for i in range(len(y)):
        y[i] = np.roll(y[i], -shift_value, axis=0)
        # Remove the last shift_value elements of the arrays
        y[i] = np.array(y[i][:len(y[i])-shift_value]) #.copy()
        X[i] = np.array(X[i][:len(X[i])-shift_value]) #.copy()
        
    return X, y #np.dstack(temp)


def load_csv_data(source_dir="../data", days_for_prediction=30):
    '''
    Input:
    - source_dir: directory where the stock price CSVs are located
    - days_for_prediction: number of days for the prediction prices. Must be at least 30 days
    Description:
    Read csv files in source_dir, load into dataframes and split into
    X_train, Y_train, X_test, Y_test
    '''
    assert (days_for_prediction >= 30), "days_for_prediction must be >= 30"

    csv_file_pattern = os.path.join(source_dir, "*.csv")
    csv_files = glob.glob(csv_file_pattern)
    dfs1 = {}
    dfs2 = {}
    for filename in csv_files:
        arr = filename.split('/')
        ticker = arr[-1].split('.')[0]
        print(filename, ticker)
        #print(ticker, df.head())        
        #  Date, Open, High , Low , Close, Adj Close, Volume
        #df = pd.read_csv(filename, parse_dates=[0]) #index_col='Date')       
        #  Open, High , Low , Close, Adj Close, Volume
        df = pd.read_csv(filename, index_col='Date')

        # Split 80/20
        split_len = int(len(df) * .8)
        #print(split_len)
        df1 = df.iloc[:split_len,:]
        df2 = df.iloc[split_len:,:]

        #print("df1.tail", df1.tail())
        #print("df2.head", df2.head())
        dfs1[ticker] = df1
        dfs2[ticker] = df2

        #print(len(df), len(df1)/len(df), len(df2)/len(df))
    #print(max_row_count1, max_row_count2)
    
    colname_idx = {colname: i for i, colname in enumerate(list(dfs1[ticker]))}
    ticker_idx1, idx_ticker1 = get_ticker_idx_maps(dfs1)
    ticker_idx2, idx_ticker2 = get_ticker_idx_maps(dfs2)

    # Validate that the 2 mapping sets are the same for train and test data
    shared_items = set(ticker_idx1.items()) & set(ticker_idx2.items())
    assert(len(shared_items) == len(ticker_idx1) and len(shared_items) == len(ticker_idx2))
    
    # Convert dict of 2d arrays to 3d arrays
    #train_X, train_y = dict_2d_to_3d(dfs1, ticker_idx1, colname_idx, days_for_prediction)
    #test_X, test_y = dict_2d_to_3d(dfs2, ticker_idx2, colname_idx, days_for_prediction)
    train_X, train_y = dfs_to_3d(dfs1, ticker_idx1, colname_idx, days_for_prediction)
    test_X, test_y = dfs_to_3d(dfs2, ticker_idx2, colname_idx, days_for_prediction)


    return colname_idx, ticker_idx1, idx_ticker1, train_X, train_y, test_X, test_y

colname_idx, ticker_idx, idx_ticker, train_X, train_y, test_X, test_y = load_csv_data("../data") #/sp500") #, days_for_prediction=30)
#print(train_X, test_X)
#print(len(colname_idx), train_max_rowcount, test_max_rowcount)
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
#print(train_X[0], test_X[0])
print(train_X[0].shape, train_X[1].shape)

../data/AAPL.csv AAPL
../data/all_tweets.csv all_tweets


CParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [None]:
print(len(train_y[0]), train_y[0])
print(len(train_y[1]), train_y[1])
print(colname_idx['Adj Close'])

In [None]:
print((train_X[0]).shape)

In [None]:
# check shape for these 501 stocks
# Shape is [number_of_stocks, number_price_entries, number_of_features]
for i in range(len(train_X)):
    print(i, idx_ticker[i], 'train:', train_X[i].shape, train_y[i].shape,'test:', test_X[i].shape, test_y[i].shape) #round(train_X[i].shape[0]/(train_X[i].shape[0]+test_X[i].shape[0]),2))

In [None]:
#print('train_X[0][29:39]', train_X[0][29:39])
import math

adj_close_idx = colname_idx['Adj Close']

'''
Checking to see if y price is matching with X data + 30 days
'''
def check(train_X, train_y, shift_value=30):
    print('Checking X, y data ... ')
    for k in range(len(train_X)):
        train_yi = train_y[k]
        train_Xi = train_X[k]
        for i in range(len(train_yi) - shift_value):
            x_val = train_Xi[i + shift_value, adj_close_idx]
            #print(x_val, train_yi[i])
            #if (not math.isnan(train_yi[i][0]) and train_yi[i][0] != x_val):
            if (not math.isnan(train_yi[i]) and train_yi[i] != x_val):
                print("error: ", k, i, train_yi[i], x_val)

    print("Everything looks good for X and y!")
    
'''
def check2(train_Xi, train_yi):
    print(len(train_Xi), len(train_yi))
    print('train_Xi[30:40,adj_close_idx]', train_Xi[30:40,adj_close_idx])
    # Should be same as above
    y_1st_10 = train_yi[:10].reshape(10)
    print('train_yi[:10]', y_1st_10)
    assert(np.array_equal(train_Xi[30:40,adj_close_idx], y_1st_10))


    print('train_Xi[-40:,adj_close_idx]', train_Xi[-40:,adj_close_idx])
    print('train_yi[-10:]', train_yi[-10:])
    #print(train_yi)
'''

check(train_X, train_y)
check(test_X, test_y)

# Now try with baseline model

### 1 model w 3-d matrix

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Activation, LSTM
from keras.utils import plot_model

max_features=len(colname_idx)

window_size=30
model = Sequential()
# model.add(LSTM(hidden, input_shape=(examples, features)))
model.add(LSTM(units=window_size, input_shape=(None,max_features), return_sequences=True, dropout=0.2))
model.add(LSTM(window_size*2, return_sequences=False, dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
png_file='lstm.png'
plot_model(model, to_file=png_file, show_shapes=True, show_layer_names=True)
print(model.summary())

In [None]:
from IPython.display import Image
Image(filename=png_file)

In [None]:
# Train
print(train_X.shape, train_y.shape)
model.fit(x=train_X, y=train_y, batch_size=1, epochs=10, shuffle=True)
score = model.evaluate(test_X, test_Y, batch_size=16)

## 1 model per stock

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Activation, LSTM
from keras.utils import plot_model

max_features=len(colname_idx)
models=[]

def plot_results(predicted_data, true_data):
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    plt.plot(predicted_data, label='Prediction')
    plt.legend()
    plt.show()


def normalize_windows(window_data):
    normalised_data = []
    for window in window_data:
        normalised_window = [((p / window[0]) - 1) for p in window]
        normalised_data.append(normalised_window)
    return normalised_data


window_size = 30
for i in range(len(train_X)):    
    model = Sequential()
    # model.add(LSTM(hidden, input_shape=(examples, features)))
    #model.add(LSTM(max_features, input_shape=(None, train_X[i].shape[1]), dropout=0.2))
    print(i, type(train_X[i])) #.shape)
    model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))


    #model.add(LSTM(1, input_shape=(max_features,1), return_sequences=True, dropout=0.2))
    #model.add(LSTM(max_features, return_sequences=False, dropout=0.2))
    
    model.add(Dense(1, activation='sigmoid'))
    #model.compile(loss='mse', optimizer='rmsprop')
    model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
    print('model.output_shape', model.output_shape)

    plot_model(model, to_file=idx_ticker[i] + '.png', show_shapes=True, show_layer_names=True)
    print(idx_ticker[i], model.summary())
    print('train_X[i].shape', train_X[i].shape, 'train_y[i].shape', train_y[i].shape)
    
    # Normalize data
    '''
    print("Normalizing data....")
    train_X[i] = normalize_windows(train_X[i])
    train_y[i]= normalize_windows(train_y[i])
    test_X[i] = normalize_windows(test_X[i])
    test_y[i]= normalize_windows(test_y[i])
    '''

    
    # Reshape to match with input from 1st LSTM
    x = np.reshape(train_X[i], (train_X[i].shape[0], max_features, 1))
    y = np.reshape(train_y[i], (train_y[i].shape[0], 1)) 
    print(x.shape, y.shape)

    model.fit(x, y, batch_size=100, epochs=5, shuffle=True)
    # Test
    x2 = np.reshape(test_X[i], (test_X[i].shape[0], max_features,1))
    y2 =np.reshape(test_y[i], (test_y[i].shape[0], 1))
    score = model.evaluate(x2, y2, batch_size=16)
    print("score = ", score)
    predicted = model.predict(x2)
    print("predicted.shape", predicted.shape)
    plot_results(predicted, y2)

    models.append(model)



In [None]:
from IPython.display import Image
Image(filename=idx_ticker[0] + '.png') 

In [None]:
map = {0:[[2,3],[4,5],[6,6]], 1: [[1,1],[2,2]], 2:[[0,0],[3,4]], 3:[[0,0],[1,1]]}
print(map)

In [None]:
a = np.array([v for k,v in map.items()])
print(a)
print(a.shape)
a2 = np.zeros((4,3,2))
print(a2.shape)