In [1]:
"""author: GK"""
import pandas as pd
import collections
import numpy as np
from os import mkdir

In [2]:
""" Import data """

# Read the feature table, remember the distinct tickers, then parse the dates
# and index every row by (Name, date).
all_features_df = pd.read_csv('All_features_df.csv')
tickers = all_features_df.Name.unique()
all_features_df = (
    all_features_df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    .set_index(['Name', 'date'])  # custom created features
)
# O,H,L,C,Volume only
orig_features_df = all_features_df.loc[:, ['open', 'high', 'low', 'close', 'volume']]

In [None]:
""" Target industries """
# Take the first table found on the page, keep a local CSV copy, and preview it.
weblink = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
wiki_tables = pd.read_html(weblink)
industries_df = wiki_tables[0]
industries_df.to_csv('wiki_sp500_industries.csv')
print(industries_df.head())

In [None]:
"""for prediction"""
# Display the open/high/low/close/volume frame.
orig_features_df

In [None]:
# Display the complete feature frame, indexed by (Name, date).
all_features_df

# RNN

**Goal**: use about 80% of past years' data to predict 20% of future years' data. <br>  <br>
Split roughly<br>
Train: 4 years (Feb 2013 to Feb 2017)<br>
Test: 1 year (Feb 2017 to Feb 2018)<br>

Alternative approaches: 
1. XGBoost with technical indicators (https://medium.com/@hsahu/stock-prediction-with-xgboost-a-technical-indicators-approach-5f7e5940e9e3)
2. Multinomial regression
3. ARIMA

In [None]:
# Inspect the date index that remains after selecting a single ticker (AAPL).
orig_features_df.loc['AAPL'].index

In [None]:
# https://www.tensorflow.org/guide/keras/rnn

import tensorflow as tf
from tensorflow.keras import layers,callbacks
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

from sklearn.metrics import mean_squared_error,accuracy_score,make_scorer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
# Fix NumPy's global random state.
# NOTE(review): only NumPy is seeded here; no TensorFlow seed is set in this
# notebook, so model training is presumably still non-deterministic — confirm.
np.random.seed(42)

In [None]:
# Trial run on a single ticker: AAPL.
aapl_raw = orig_features_df.loc['AAPL'].to_numpy()
n_features = aapl_raw.shape[1]
# LSTMs are scale dependent, so squash every feature column into [0, 1].
# NOTE(review): the scaler is fitted on the whole series, i.e. before the
# train/test split performed further down — confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
dataset = scaler.fit_transform(aapl_raw.reshape(-1, n_features))

In [None]:
# Peek at the scaled array produced by the MinMaxScaler.
dataset

In [None]:
# Number of consecutive past rows handed to the LSTM for each sample.
lstm_period = 32

def look_back(data,period = 7):
    """Slice a 2-D series into overlapping windows and their next-row targets.

    For every row index ``i`` from ``period`` up to ``len(data) - 1`` the
    window is ``data[i-period:i, :]`` and the target is ``data[i, :]``.

    Parameters
    ----------
    data : 2-D array indexed as ``data[row, column]``.
    period : window length in rows (default 7).

    Returns
    -------
    (X, y) : tuple of ``np.ndarray`` — the stacked windows and the stacked
        target rows. Both are empty 1-D arrays when ``len(data) <= period``.
    """
    target_rows = range(period, len(data))
    windows = [data[row - period:row, :] for row in target_rows]
    targets = [data[row, :] for row in target_rows]
    return np.array(windows), np.array(targets)
# Build the (window, next-row) pairs from the scaled series.
X, y = look_back(dataset, period=lstm_period)

In [None]:
# Ordered 80 : 20 split: the leading windows train, the trailing windows test.
train_ratio = 0.8
train_size = int(train_ratio * len(X))
test_size = len(X) - train_size
X_train, y_train = X[:train_size, :], y[:train_size]
X_test, y_test = X[train_size:, :], y[train_size:]
print(len(X_train), len(X_test),len(y_train), len(y_test))

In [2]:
def createLSTMModel(n1_cells = 128, n2_cells = 16,dropout_rate=0, optimizer='Adam',
                    activation='tanh', loss='mean_squared_error',
                    epochs = 100,batch_size = 16):
    """Build and compile the two-layer stacked LSTM regressor (no training).

    This is meant to be used as the ``build_fn`` of the scikit-learn Keras
    wrapper. It deliberately does NOT call ``model.fit``: the previous version
    trained here on the notebook-global ``X_train``/``y_train``, so during
    cross-validation every fold's model had already seen its own validation
    rows, and the wrapper then trained the same model a second time.

    Parameters
    ----------
    n1_cells : units in the first LSTM layer (returns the full sequence).
    n2_cells : units in the second LSTM layer (returns the last step only).
    dropout_rate : rate of the Dropout layer placed after each LSTM layer.
    optimizer : optimizer identifier or instance passed to ``compile``.
    activation : activation used by both LSTM layers.
    loss : loss identifier passed to ``compile``.
    epochs, batch_size : kept for signature compatibility; not used while
        building. The wrapper is expected to forward the values it was
        configured with to its own ``fit`` call.

    Returns
    -------
    A compiled, untrained ``tf.keras.Sequential`` model.

    Notes
    -----
    Reads the notebook-level names ``lstm_period`` (time steps per sample) and
    ``n_features`` (columns per step, also the size of the output layer).
    """
    model = tf.keras.Sequential()
    # Input samples are shaped (timesteps, features) = (lstm_period, n_features).
    model.add(layers.LSTM(n1_cells, activation=activation, return_sequences=True, use_bias=True,
                          input_shape=(lstm_period, n_features)))  # recurrent_activation = 'sigmoid'
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.LSTM(n2_cells, activation=activation, use_bias=True))
    model.add(layers.Dropout(dropout_rate))
    # One output per feature: the model predicts the whole next row.
    model.add(layers.Dense(n_features))
    # 'accuracy' is a classification metric and carries no meaning for
    # continuous targets; track mean absolute error instead.
    model.compile(loss=loss, optimizer=optimizer, metrics=['mean_absolute_error'])
    return model



In [None]:
## Grid Search CV

# Hyper-parameter grid. The commented-out entries are further axes that can be
# switched on; every combination is trained once per CV fold.
grid_param_LSTM = {
    'n1_cells' : [16],
    'n2_cells' : [4],
    'batch_size': [1],
    'epochs': [100],
    'optimizer': ['Adam', 'RMSProp'],
#     'loss': ['logcosh', 'mse'],
#     'activation': ['relu', 'linear','sigmoid', 'tanh'],
#     'dropout_rate':[0,0.5]
}

model_LSTM=KerasRegressor(build_fn=createLSTMModel)

# The samples are time-ordered, so plain K-fold (what cv=3 gives for a
# regressor) would train on later rows to score earlier ones. TimeSeriesSplit
# only ever validates on rows that come after the training rows.
# `scoring` is given as a list: that is a documented container for
# multi-metric scoring, whereas a set is not.
GridLSTM = GridSearchCV(estimator=model_LSTM,
                     param_grid=grid_param_LSTM,
                     scoring=['neg_mean_squared_error'],
                     refit = 'neg_mean_squared_error',
                     cv=TimeSeriesSplit(n_splits=3), n_jobs = -1)

# Make the (samples, timesteps, features) layout explicit for the LSTM input.
X_train = X_train.reshape((X_train.shape[0],X_train.shape[1],n_features))
X_test = X_test.reshape((X_test.shape[0],X_test.shape[1],n_features))

GridLSTM.fit(X_train, y_train)
# Estimator refit on all of X_train with the best parameter combination.
best_model = GridLSTM.best_estimator_



In [None]:
# Persist the tuned network and a summary of the hyper-parameter search.
try:
    mkdir('Results')
except FileExistsError:
    # Folder left over from an earlier run; any other OSError should surface.
    pass

# best_model is the scikit-learn wrapper returned by the grid search, which has
# no .save(); the underlying Keras network it trained is exposed as `.model`.
best_model.model.save('Results/lstm_model.h5')

# sklearn.externals.joblib has been removed from scikit-learn; import the
# joblib package directly instead.
import joblib

# GridSearchCV has no .save() either. Dump its plain-data summary rather than
# the whole object, which would drag the fitted Keras model into the pickle.
joblib.dump({'best_params': GridLSTM.best_params_,
             'best_score': GridLSTM.best_score_,
             'cv_results': GridLSTM.cv_results_},
            'Results/grid_lstm_search.joblib')

In [None]:
# In-sample predictions from the refit best estimator, with a shape sanity check
# against the training targets.
y_pred_seen = best_model.predict(X_train)
# y_pred_unseen = best_model.predict(X_test) # This is not completely correct as it is peeking into the future
for checked_array in (y_pred_seen, y_train):
    print(np.shape(checked_array))

In [None]:
import matplotlib.pyplot as plt

# In-sample check: predictions vs. targets on the training windows, mapped back
# to original units. Column 0 is the first feature column ('open').
plt.figure(figsize = (16,10))
plt.plot(scaler.inverse_transform(y_pred_seen)[:,0],color = 'b',label = 'Predicted_SEEN')
plt.plot(scaler.inverse_transform(y_train)[:,0],color = 'k',label = 'Actual_SEEN_data')
plt.xlabel('#(day)')
plt.ylabel('Price')
plt.title('Prediction in SEEN AAPL data')
plt.legend()
plt.show()


# Note: the commented block below peeks into future data while predicting,
# because every test window is built from actual (not predicted) values.
# (This note used to be a bare string literal, which made it the cell's last
# expression and echoed it as cell output.)
# pred_unseen_to_plot = len(y_train)
# plt.figure(figsize = (16,10))
# plt.plot(scaler.inverse_transform(y_pred_unseen)[:,0],color = 'r',label = 'Predicted_UNSEEN')
# plt.plot(scaler.inverse_transform(y_test)[:,0],color = 'k',label = 'Actual_UNSEEN_data')
# plt.xlabel('#(day)')
# plt.ylabel('Price')
# plt.title('Prediction in UNSEEN AAPL data')
# plt.legend()
# plt.show()

In [None]:
# Recursive multi-step forecast: predict one row, slide the window forward by
# appending that prediction, and repeat. Only the first window holds real data.
n_forecast_steps = 250
# start = X_train[-1:]
start = X_test[0:1]
start = start.reshape((start.shape[0],start.shape[1],n_features))

forecast_rows = []
for _ in range(n_forecast_steps):
    # The estimator may hand back the single prediction as (1, n_features) or
    # as a squeezed (n_features,) row; normalise it to one time step.
    next_step = np.asarray(best_model.predict(start)).reshape((1, 1, n_features))
    forecast_rows.append(next_step[0, 0])
    # Drop the oldest step and append the prediction so the window length stays fixed.
    start = np.concatenate((start[:, 1:, :], next_step), axis = 1)

# Shape (n_forecast_steps, n_features). There is no leading NaN placeholder row:
# the first prediction is the target of X_test[0], so preds[i] lines up with y_test[i].
preds = np.array(forecast_rows)

    

In [None]:
# Out-of-sample view: recursive forecast against the held-out targets, both
# mapped back to original units (first feature column only).
pred_unseen_to_plot = len(y_train)
fig_unseen, ax_unseen = plt.subplots(figsize=(16, 10))
unseen_curves = (
    (preds, 'r', 'Predicted_UNSEEN'),
    (y_test, 'k', 'Actual_UNSEEN_data'),
)
for scaled_values, curve_colour, curve_label in unseen_curves:
    ax_unseen.plot(scaler.inverse_transform(scaled_values)[:, 0],
                   color=curve_colour, label=curve_label)
ax_unseen.set_xlabel('#(day)')
ax_unseen.set_ylabel('Price')
ax_unseen.set_title('Prediction in UNSEEN AAPL data')
ax_unseen.legend()
plt.show()