# Setup

In [None]:
pip install keras-adabound

Collecting keras-adabound
  Downloading https://files.pythonhosted.org/packages/bf/74/85de8379eba8e0f819ef9b62ff32d24a3f624758800e12bd9572e3afb546/keras-adabound-0.6.0.tar.gz
Building wheels for collected packages: keras-adabound
  Building wheel for keras-adabound (setup.py) ... [?25l[?25hdone
  Created wheel for keras-adabound: filename=keras_adabound-0.6.0-cp36-none-any.whl size=6608 sha256=5e42fb69e4287aaedf9f10ff9938d4e867e1dc55acf49f009e251ba8fa4dca85
  Stored in directory: /root/.cache/pip/wheels/f1/81/9c/04af926d62bddd280c97af1704a9baaef511664b56865958e8
Successfully built keras-adabound
Installing collected packages: keras-adabound
Successfully installed keras-adabound-0.6.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
#from keras_adabound import AdaBound
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.layers import Dense, LSTM, Dropout, GRU
from sklearn.metrics import mean_squared_error as calc_mse
import time

In [None]:
# Load the combined MSFT dataset straight from GitHub.
# `date_parser=True` was dropped: it is not a valid parser (a callable is expected),
# it has no effect without `parse_dates`, and it is deprecated in newer pandas.
# The 'date' column therefore stays as plain strings, which is what the later
# string comparison in split_data relies on.
csv = pd.read_csv('https://raw.githubusercontent.com/clement880101/MLStocks/master/Combined_Stock_Data_MSFT.csv')
csv.columns

Index(['date', 'Msft_open', 'Msft_high', 'Msft_low', 'Msft_close',
       'Msft_adjusted_close', 'Msft_volume', 'Msft_dividend',
       'Msft_split_coefficent', 'Msft_Real Middle Band',
       ...
       'Amzn_ADX', 'Amzn_SMA', 'SPY_open', 'SPY_high', 'SPY_low', 'SPY_close',
       'SPY_adjusted_close', 'SPY_volume', 'SPY_dividend',
       'SPY_split_coefficent'],
      dtype='object', length=114)

# Recurrent LSTM Neural Network

### Helper functions

In [None]:
#helper functions
def to_dataframe(csv):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    csv : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. No date parsing is requested, so date-like columns
        are left exactly as pandas infers them (typically strings).
    """
    # The former `date_parser=True` argument was removed: it expects a callable,
    # did nothing without `parse_dates`, and is deprecated in newer pandas.
    df = pd.read_csv(csv)
    return df

def reverse_order(df):
    """Return a copy of ``df`` with its rows in reverse order.

    The index is rebuilt as 0..n-1, so the row that used to be last becomes
    row 0. The input frame is not modified.
    """
    flipped = df.iloc[::-1]
    return flipped.reset_index(drop=True)

def remove_dates(df):
  """Split the 'date' column off a DataFrame.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing a 'date' column.

  Returns
  -------
  dates : dict
      Maps each date value to its positional row number in ``df``.
      If a date occurs more than once, the last position wins.
  df : pandas.DataFrame
      A new frame without the 'date' column; the input is not modified.
  """
  # Iterate the column directly instead of building a full row with
  # df.iloc[i] for every lookup.
  dates = {date: position for position, date in enumerate(df['date'])}

  df = df.drop(['date'], axis=1)

  return dates, df

def scale_data(df, target_column):
    """Fit a MinMaxScaler on ``df`` and return the scaled data.

    Returns a 3-tuple ``(scaled_data, scaler, upscale_value)`` where
    ``upscale_value`` is the reciprocal of the fitted scaler's per-feature
    scale factor for ``target_column`` (a positional column index). Callers
    keep it to convert scaled predictions back toward nominal values.
    """
    fitted_scaler = MinMaxScaler()
    transformed = fitted_scaler.fit_transform(df)

    # reciprocal of the target column's scale factor
    target_scale = fitted_scaler.scale_[target_column]
    return transformed, fitted_scaler, 1 / target_scale

def split_data(df, date_value):
  """Split ``df`` on its 'date' column.

  Returns ``(df_before, df_after)``: independent copies of the rows whose
  date is strictly less than ``date_value`` and of the rows whose date is
  greater than or equal to it.
  """
  is_before = df['date'] < date_value
  is_after = df['date'] >= date_value
  return df[is_before].copy(), df[is_after].copy()

def create_xy(data, scope, target_column):
  """Build sliding-window inputs and next-row targets from a 2-D array.

  For every row index ``i`` from ``scope`` up to the last row:
    * the input sample is the ``scope`` rows immediately before row ``i``
    * the target is the value in ``target_column`` of row ``i``

  Returns ``(x, y)`` as numpy arrays holding ``data.shape[0] - scope``
  samples each.
  """
  row_count = data.shape[0]
  window_ends = range(scope, row_count)

  windows = [data[end - scope:end] for end in window_ends]
  targets = [data[end, target_column] for end in window_ends]

  return np.array(windows), np.array(targets)
#recurrent neural network

class Rnn:
    """Thin wrapper around a Keras ``Sequential`` stack of LSTM layers.

    Typical use: construct, call ``structure`` to build the layers, call
    ``train`` to compile and fit, then ``predict``.

    Attributes
    ----------
    rows, columns : int
        Input shape passed to the first LSTM layer as ``(rows, columns)``.
    model : Sequential or None
        The Keras model; ``None`` until ``structure`` has been called.
    logs : CSVLogger or None
        The CSVLogger callback used by the most recent ``train`` call.
    """

    def __init__(self, rows_size, columns_size):
        """Store the input dimensions; the model itself is built by ``structure``."""
        self.rows = rows_size
        self.columns = columns_size
        self.model = None
        self.logs = None

    def structure(self, layers, units_for_layers, dropouts_for_layers):
        """Build the network: ``layers`` LSTM+Dropout pairs followed by ``Dense(1)``.

        Parameters
        ----------
        layers : int
            Number of LSTM layers. Values below 1 are treated as 1, since the
            model always contains at least the input LSTM layer.
        units_for_layers : sequence of int
            Units for each LSTM layer; needs at least ``layers`` entries.
        dropouts_for_layers : sequence of float
            Dropout rate applied after each LSTM layer; needs at least
            ``layers`` entries.

        Notes
        -----
        Previously the extra layers were only added when ``layers > 2``, so a
        request for 1 or 2 layers produced a single LSTM that still returned
        full sequences into the Dense layer. Every layer count now yields
        exactly that many LSTM layers, and only the last one collapses the
        sequence.
        """
        # initialize Sequential rnn
        self.model = Sequential()

        layer_count = max(1, layers)
        for i in range(layer_count):
            # only the last LSTM stops returning the full sequence, so the
            # Dense head receives a single vector per sample
            lstm_options = {'activation': 'relu', 'return_sequences': i < layer_count - 1}
            if i == 0:
                # the first layer defines the input shape
                lstm_options['input_shape'] = (self.rows, self.columns)

            self.model.add(LSTM(units_for_layers[i], **lstm_options))
            self.model.add(Dropout(dropouts_for_layers[i]))

        # final endpoint for rnn layers
        self.model.add(Dense(units = 1))
        return None

    def summary(self):
        """Return the result of the Keras model's ``summary()`` call."""
        return self.model.summary()

    def history(self):
        """Return the model's ``history`` attribute."""
        return self.model.history

    def train(self, xTrain, yTrain, epochs, batch_size, optimizer, file_name, ada_low_lr = None, ada_high_lr = None):
        """Compile the model with MSE loss and fit it, logging epochs to CSV.

        Parameters
        ----------
        xTrain, yTrain
            Training inputs and targets handed to ``model.fit``.
        epochs, batch_size : int
            Forwarded to ``model.fit``.
        optimizer
            Anything ``model.compile`` accepts, or the string ``'adaboost'``
            (historic spelling) / ``'adabound'`` to use the AdaBound optimizer
            from the ``keras_adabound`` package.
        file_name : str
            Epoch statistics are written to ``file_name + '.csv'``.
        ada_low_lr, ada_high_lr : float, optional
            Passed to AdaBound as ``lr`` and ``final_lr``. When left as
            ``None`` the argument is omitted so the library default applies.
        """
        if isinstance(optimizer, str) and optimizer in ('adaboost', 'adabound'):
            # Imported lazily: the top-level import is commented out in the
            # notebook, so referencing AdaBound directly raised NameError.
            # NOTE(review): keras-adabound may need to be configured to target
            # tf.keras rather than standalone keras - confirm before relying on it.
            from keras_adabound import AdaBound

            ada_options = {}
            if ada_low_lr is not None:
                ada_options['lr'] = ada_low_lr
            if ada_high_lr is not None:
                ada_options['final_lr'] = ada_high_lr
            optimizer = AdaBound(**ada_options)

        # compiles model that was created
        self.model.compile(optimizer=optimizer, loss='mean_squared_error')

        # create log for getting stats
        CSV_logger = CSVLogger(file_name + '.csv', separator=',', append=False)
        # fit model to data
        self.model.fit(xTrain, yTrain, epochs=epochs, batch_size=batch_size, callbacks=[CSV_logger])
        self.logs = CSV_logger
        return None

    def predict(self, input_data):
        """Return the model's predictions for ``input_data``."""
        y_hat = self.model.predict(input_data)
        return y_hat

In [None]:
def create_file_name(array):
  """Build a file name stem: 'Stats' followed by every item, joined by '_'.

  Items are converted with ``str``; an empty ``array`` yields just 'Stats'.
  """
  parts = ['Stats']
  parts.extend(str(item) for item in array)
  return '_'.join(parts)

### Main

In [None]:
# Flip the raw table with reverse_order (its own comment says this puts the
# earliest day at row 0), then split it into training rows (date before
# 2018-01-01) and test rows (2018-01-01 onwards).
df = reverse_order(csv)
training_data, test_data = split_data(df, '2018-01-01')

# Keep a date -> row-position lookup for each split and drop the 'date'
# column so only the remaining feature columns are left.
dates_train, training_data = remove_dates(training_data)
dates_test, test_data = remove_dates(test_data)

target_column = 5 # position of 'Msft_adjusted_close' while 'date' is still column 0
# Fit the scaler on the training split only, then apply the same fitted scaler
# to the test split. remove_dates dropped column 0, so the target now sits at
# target_column-1.
training_data, scaler, upscale_value = scale_data(training_data, target_column-1)
test_data = scaler.transform(test_data)


In [None]:
# Build the model inputs and targets for both splits.
# Each x sample is a window of the 5 preceding rows (scope=5); the matching y
# is the target column's value in the row right after that window.
# The target index is target_column-1 because the 'date' column was removed.
xTrain, yTrain = create_xy(training_data, 5, target_column-1)
xTest, yTest = create_xy(test_data, 5, target_column-1)

### Parameter grid

In [None]:
# Dimensions of a single input sample: xTrain is (samples, rows, columns),
# where rows is the window length and columns the number of features.
row_size = xTrain.shape[1]
column_size = xTrain.shape[2]
# Hyper-parameter values to try; every combination of layers x epochs x batches
# becomes one training run.
layers = [3,5]
epochs = [10, 25, 50]
batches = [100, 200]
# Optimizer used for every run.
optimizer = 'adam'

In [None]:
# One dict per (layers, epochs, batches) combination, in nested-loop order.
params = []
csv_name = 'MSFT_CSD'

for n_layers in layers:
  # every LSTM layer gets as many units as there are input features
  units = [column_size for _ in range(n_layers)]
  # lighter dropout after the first layer, heavier after all later ones
  dropouts = [0.2 if position == 0 else 0.5 for position in range(n_layers)]

  for n_epochs in epochs:
    for batch in batches:
      file_name = create_file_name([csv_name, n_layers, n_epochs, batch])
      params.append(dict(
          layers=n_layers,
          epochs=n_epochs,
          batches=batch,
          units=units,
          dropouts=dropouts,
          optimizer=optimizer,
          file_name=file_name,
      ))

In [None]:
# Sanity check: show the file name stem generated for the first parameter set.
params[0]['file_name']

'Stats_MSFT_CSD_3_10_100'

### Create training loop

In [None]:
# Train one network per parameter set and record how long each one took.
# time_training[i] and networks[i] both correspond to params[i].
time_training = []
networks = []
for param in params:
  # time.clock() was removed in Python 3.8; perf_counter() is the
  # replacement for measuring elapsed time.
  t0 = time.perf_counter()
  #Steps for rnn:
  #1. initialize, #2. structure, #train, #summary, #predict
  print('Starting this param')
  print(param)
  nnet = Rnn(row_size,column_size)
  nnet.structure(param['layers'], param['units'], param['dropouts'])
  nnet.train(xTrain, yTrain, param['epochs'],param['batches'],param['optimizer'],param['file_name'])
  networks.append(nnet)
  t1 = time.perf_counter()
  time_training.append(t1-t0)


Starting this param
{'layers': 3, 'epochs': 10, 'batches': 100, 'units': [113, 113, 113], 'dropouts': [0.2, 0.5, 0.5], 'optimizer': 'adam', 'file_name': 'Stats_MSFT_CSD_3_10_100'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Starting this param
{'layers': 3, 'epochs': 10, 'batches': 200, 'units': [113, 113, 113], 'dropouts': [0.2, 0.5, 0.5], 'optimizer': 'adam', 'file_name': 'Stats_MSFT_CSD_3_10_200'}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Starting this param
{'layers': 3, 'epochs': 25, 'batches': 100, 'units': [113, 113, 113], 'dropouts': [0.2, 0.5, 0.5], 'optimizer': 'adam', 'file_name': 'Stats_MSFT_CSD_3_25_100'}
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 2

### Testing

Testing Loop

In [None]:
def save_test_image(yTest, y_pred, file_name):
  """Plot actual vs. predicted test values and save the figure.

  The image is written to ``file_name + '_test_img.jpg'``. A dedicated figure
  is created and closed here, so the plot never picks up lines from (or
  closes) some other figure that happens to be current.
  """
  fig, ax = plt.subplots()
  ax.plot(yTest, color='black', label='Actual')
  ax.plot(y_pred, color='red', label='Predict')
  ax.set_title('Testing')
  ax.set_xlabel('time [days]')
  ax.set_ylabel('price')
  ax.legend(loc='best')
  fig.savefig(file_name + '_test_img.jpg')
  # release the figure so a long evaluation loop does not accumulate open figures
  plt.close(fig)
  return None

In [None]:
def save_train_image(yTrain, y_pred, file_name):
  """Plot actual vs. predicted training values and save the figure.

  The image is written to ``file_name + '_train_img.jpg'``. A dedicated figure
  is created and closed here, so the plot never picks up lines from (or
  closes) some other figure that happens to be current.
  """
  fig, ax = plt.subplots()
  ax.plot(yTrain, color='black', label='Actual')
  ax.plot(y_pred, color='red', label='Predict')
  ax.set_title('Training')
  ax.set_xlabel('time [days]')
  ax.set_ylabel('price')
  ax.legend(loc='best')
  fig.savefig(file_name + '_train_img.jpg')
  # release the figure so a long evaluation loop does not accumulate open figures
  plt.close(fig)
  return None

In [None]:
# Training durations recorded by the training loop, one entry per parameter set.
time_training

[14.649014999999999,
 12.533528,
 31.189546999999997,
 26.369709999999998,
 57.88372199999999,
 48.36545100000001,
 24.421193999999986,
 21.057907999999998,
 51.88860199999999,
 44.69563400000004,
 98.30523099999999,
 80.91186799999997]

In [None]:
# Evaluate every trained network on the test split, save test/train plots and
# collect one result row per network. Index i lines up networks, params and
# time_training.
time_evaluating = []  # evaluation duration per network
header = ['file_name: layers, epoch, batch', 'mse_scaled', 'mse_unscaled', 'overall_time']
results = []

# Convert MinMax-scaled targets back to nominal prices:
#   price = scaled / scale_ + data_min_
# upscale_value is 1 / scale_ for the target column; without the data_min_
# offset the plotted "price" axis is shifted by the training minimum.
# (The offset cancels out in the MSE, so the error figures are unaffected.)
price_offset = scaler.data_min_[target_column - 1]
yTest_scaled = yTest * upscale_value + price_offset
yTrain_scaled = yTrain * upscale_value + price_offset

for i, nnet in enumerate(networks):
  # time.clock() was removed in Python 3.8; use perf_counter() instead
  t0 = time.perf_counter()
  file_name = params[i]['file_name']

  #test images and result
  yPred_test = nnet.predict(xTest)
  yPred_test_scaled = yPred_test * upscale_value + price_offset
  save_test_image(yTest_scaled, yPred_test_scaled, file_name)

  # "unscaled" = error on the normalised data, "scaled" = error in price units
  mse_unscaled = calc_mse(yTest, yPred_test)
  mse_scaled = calc_mse(yTest_scaled, yPred_test_scaled)

  #training images
  yPred_train = nnet.predict(xTrain)
  yPred_train_scaled = yPred_train * upscale_value + price_offset
  # save_train_image writes '<file_name>_train_img.jpg'; calling
  # save_test_image here overwrote the test plot saved just above
  save_train_image(yTrain_scaled, yPred_train_scaled, file_name)
  t1 = time.perf_counter()

  # keep the per-network durations instead of replacing the list with a float
  time_evaluating.append(t1 - t0)
  overall_time = time_training[i] + time_evaluating[-1]
  stats = [file_name, mse_scaled, mse_unscaled, overall_time]
  results.append(stats)

In [None]:
# Turn the collected result rows into a table (one row per network, columns
# named by `header`) and write it to '<csv_name>_results.csv'.
# NOTE(review): the recorded output of this cell is a ValueError; the cause is
# not visible here - confirm the cell succeeds on a fresh Restart & Run All.
results_df = pd.DataFrame(data = results, columns= header)
results_df.to_csv(csv_name+'_results.csv')

ValueError: ignored

In [None]:
# Bundle everything under /content/ into a single archive
# (shell command run through IPython's `!` syntax).
!zip -r /content/file.zip /content/

# Hand the archive to google.colab's files.download helper; the google.colab
# module means this cell is presumably only usable inside Google Colab.
from google.colab import files
files.download("/content/file.zip")

### Scrap code

In [None]:
# plt.figure(figsize=(14,5))
# plt.plot(yTest, color = 'red', label = 'Actual MSFT Adj. Stock Price')
# plt.plot(y_pred, color = 'blue', label = 'Predicted MSFT Adj. Stock Price')
# plt.title('MSFT Stock Price Prediction')
# plt.xlabel('Days')
# plt.ylabel('MSFT Adj. Stock Price')
# plt.legend()
# plt.show()

In [None]:
# date = '2020-01-01' 
# obj = datetime.datetime.strptime(date,"%Y-%m-%d") + timedelta(days=1)
# datetime.datetime.strftime(obj, "%Y-%m-%d")

In [None]:
# def get_valid_date(value, dates, up_or_down):
#   if value in dates:
#     print('date was valid')
#     return dates.get(value)

#   if value not in dates:
#     valid = False
#     nearest_value = value
#     while valid == False:
#       if up_or_down == 'up':
#         #increase days
#         new_datetime = datetime.datetime.strptime(nearest_value,"%Y-%m-%d") + timedelta(days=1)
#       else: 
#         new_datetime = datetime.datetime.strptime(nearest_value,"%Y-%m-%d") - timedelta(days=1)

#       nearest_value = datetime.datetime.strftime(new_datetime, "%Y-%m-%d")
#       if value in dates == True:
#         valid == true
#         print('The nearest valid date was' + value)
#     return dates.get(nearest_value)

In [None]:
# y_pred =networks[0].predict(xTest)
# y_pred = y_pred * upscale_value
# yTest = yTest * upscale_value
# print('Our mse error on the testing data is: ')
# print(calc_mse(yTest, y_pred))

# plt.plot(yTest, color='black', label='Actual')
# plt.plot(y_pred, color='red', label='Predict')
# plt.title('Testing')
# plt.xlabel('time [days]')
# plt.ylabel('price')
# plt.legend(loc='best')
# plt.savefig(param['file_name']+'_test_img')

# y_pred = networks[0].predict(xTrain)
# plt.plot(yTrain, color='black', label='Actual')
# plt.plot(y_pred, color='red', label='Predict')
# plt.title('Training')
# plt.xlabel('time [days]')
# plt.ylabel('price')
# plt.legend(loc='best')
# plt.savefig(param['file_name']+'_train_img')

In [None]:
# #predict xTest and upscale y values
# y_pred = nnet.predict(xTest)
# y_pred_scaled = y_pred * upscale_value

# yTest
# print('Our mse error on the testing data is: ')
# error = calc_mse(y_pred, yTest)
# print(round(error,2))

# print('Our mse error on the testing data after rescale is: ')
# error = calc_mse(y_pred_scaled, yTest_scaled)
# print(round(error,2))

In [None]:
# #batch dimensions
# rows = xTrain.shape[1]
# columns = xTrain.shape[2]

# #Steps for rnn:
# #1. initialize, #2. structure, #train, #summary, #predict
# nnet = Rnn(rows,columns)

# units = [50, 60, 80, 120] #nodes for each layer
# dropouts = [0.2,0.3,0.4,0.5] #strength of dropouts
# nnet.structure(4, units, dropouts)
# nnet.summary()

# print('')

# #train model
# nnet.train(xTrain, yTrain, 25, 50, 'adam', 'test')

In [None]:
# y_pred =networks[0].predict(xTest)
# y_pred = y_pred * upscale_value
# yTest = yTest * upscale_value
# print('Our mse error on the testing data is: ')
# print(calc_mse(yTest, y_pred))

# plt.plot(yTest, color='black', label='Actual')
# plt.plot(y_pred, color='red', label='Predict')
# plt.title('Testing')
# plt.xlabel('time [days]')
# plt.ylabel('price')
# plt.legend(loc='best')
# plt.savefig(param['file_name']+'_test_img')

# y_pred = networks[0].predict(xTrain)
# plt.plot(yTrain, color='black', label='Actual')
# plt.plot(y_pred, color='red', label='Predict')
# plt.title('Training')
# plt.xlabel('time [days]')
# plt.ylabel('price')
# plt.legend(loc='best')
# plt.savefig(param['file_name']+'_train_img')