In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas_datareader import data
import yfinance as yf
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, accuracy_score, confusion_matrix, precision_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import matplotlib.dates as mdates

from google.colab import drive
# Mount Google Drive under /content/gdrive so the project files below can be
# reached through ordinary filesystem paths.
drive.mount('/content/gdrive')

# Project root on the mounted drive. Later cells build file names by plain string
# concatenation onto this value, which already ends with '/'.
path = '/content/gdrive/MyDrive/Colab_Notebooks/FYP/'

In [None]:
# Load the cleaned training table from the project folder.
# index_col=False keeps every CSV column as data, so the frame gets a default
# 0..n-1 row index.
training_csv = path+'/data/full_training_data_cleaned.csv'
full_training_data = pd.read_csv(training_csv, index_col=False)

In [None]:
#Min-Max scaling
min_max_column = ['Open', 'High', 'Low', 'Volume',
                  'wsentiments', 'HSI_overnight_ret',
                  'HSI_intraday_overnight_ret_diff', 'CSI300_overnight_ret',
                  'SSE50_overnight_ret', 'HSI_stoch_k', 'HSI_rsi', 'HSI_daily_ret',
                  'HSI_intraday_ret', 'HSI_ret_range']

# Fit the scalers on the training rows only, then apply them to every row.
# Fitting on the whole table would let the min/max of the validation and test
# periods leak into the training inputs and inflate the reported scores.
# Values outside the training range simply fall outside [0, 1] after scaling.
TRAIN_FRACTION = 0.7  # keep in sync with the 70% chronological split in train()
scaler_fit_rows = full_training_data.iloc[:int(len(full_training_data) * TRAIN_FRACTION)]

mms_X = MinMaxScaler()
mms_X.fit(scaler_fit_rows[min_max_column])
full_training_data[min_max_column] = mms_X.transform(full_training_data[min_max_column])

# Separate scaler for the price column so it can be inverted independently.
mms_y = MinMaxScaler()
mms_y.fit(scaler_fit_rows[['Adj Close']])
full_training_data['Adj Close'] = mms_y.transform(full_training_data[['Adj Close']]).ravel()

In [None]:
# Data Transformation for buy/sell/neutral label
# Encode the text signal as an integer class: buy -> 1, sell -> -1, neutral -> 0.
# NOTE: Series.map leaves NaN for any value that is not a key of the dict, so
# running this cell a second time (when the column already holds integers)
# would wipe the labels. Reload the CSV before re-running.
signal_column = 'HSI_OO_ter_0.005'
signal_codes = {'buy': 1, 'sell': -1, 'neutral': 0}
full_training_data[signal_column] = full_training_data[signal_column].map(signal_codes)

In [None]:
# Feature selection
# Both input sets share the same base columns; the NLP variant additionally
# includes 'wsentiments'. The label column is always placed last because
# create_dataset() reads the final column as the target.

label_column = 'HSI_OO_ter_0.005'

base_feature_column = [
    'Open', 'High', 'Low', 'Volume',
    'HSI_overnight_ret',
    'HSI_intraday_overnight_ret_diff',
    'CSI300_overnight_ret',
    'SSE50_overnight_ret',
    'HSI_stoch_k',
    'HSI_rsi',
    'HSI_daily_ret',
    'HSI_disc_macd_1',
    'HSI_intraday_ret',
    'HSI_ret_range',
]

classification_feature_column = base_feature_column + [label_column]
classification_nlp_column = base_feature_column + ['wsentiments', label_column]

classification_feature_df = full_training_data[classification_feature_column]
classification_nlp_df = full_training_data[classification_nlp_column]

In [None]:
# Create dataset for sequential input
def create_dataset(dataset, look_back):
    """Slice a 2-D array into sliding input windows and next-row targets.

    Args:
        dataset: 2-D NumPy array whose last column is the target and whose
            remaining columns are the input features.
        look_back: Number of consecutive rows in each input window.

    Returns:
        A pair ``(X, y)`` of NumPy arrays. ``X[k]`` holds rows ``k`` to
        ``k + look_back - 1`` without the last column, and ``y[k]`` is the last
        column of row ``k + look_back``.

    Note:
        Windows start at rows ``0 .. len(dataset) - look_back - 2``, so the final
        row of ``dataset`` is never used as a target. Callers that need it must
        pass one extra trailing row.
    """
    target_col = dataset.shape[1] - 1
    window_starts = range(len(dataset) - look_back - 1)
    windows = [dataset[start:start + look_back, :-1] for start in window_starts]
    targets = [dataset[start + look_back, target_col] for start in window_starts]
    return np.array(windows), np.array(targets)

In [None]:
# Training
# Seed both NumPy and TensorFlow: Keras draws its weight initialisation from
# TensorFlow's random generator, so seeding NumPy alone does not make runs
# repeatable. (Some GPU kernels may still be non-deterministic.)
np.random.seed(7)
tf.random.set_seed(7)

# Plain NumPy views of the two feature sets; the label is the last column.
classification_feature_df_values = classification_feature_df.values
classification_nlp_df_values = classification_nlp_df.values

# Number of consecutive rows fed to the LSTM for each prediction.
look_back = 10

def train(regression_df_values, version):
  """Fit the single-layer LSTM on one feature matrix and report classification metrics.

  Rows are split chronologically into 70% train / 15% validation / remainder
  test, windowed with create_dataset() using the module-level `look_back`, and
  used to train an LSTM(1024) -> Dense(1) network with MSE loss. The continuous
  output is bucketed into the labels -1 / 0 / 1 at the thresholds -0.5 and 0.5.

  Args:
    regression_df_values: 2-D array; every column except the last is a feature
      and the last column is the -1/0/1 target. Its rows are assumed to line up
      one-to-one with `full_training_data`, whose 'Date' column dates the
      predictions.
    version: Tag inserted into the output CSV file name.

  Returns:
    A 12-tuple: the raw (un-windowed) train, validation and test row splits;
    trainY, valY, testY; the bucketed train, validation and test predictions
    (lists of ints); and train_size, val_size, test_size.

  Side effects:
    Prints the model summary and the metrics, and writes the dated predictions
    of all three splits to
    path + 'lstm_stock_prediction_classification_<version>.csv'.
  """
  n_rows = len(regression_df_values)
  train_size = int(n_rows * 0.7)
  val_size = int(n_rows * 0.15)
  test_size = n_rows - train_size - val_size
  val_start = train_size
  test_start = train_size + val_size

  # create_dataset() never uses the last row it is given as a target, so the
  # train and validation splits borrow one extra trailing row to compensate.
  train_split = regression_df_values[0:val_start + 1, :]
  val_split = regression_df_values[val_start:test_start + 1, :]
  test_split = regression_df_values[test_start:, :]
  num = train_split.shape[1] - 1  # feature count (all columns but the target)

  trainX, trainY = create_dataset(train_split, look_back)
  valX, valY = create_dataset(val_split, look_back)
  testX, testY = create_dataset(test_split, look_back)
  # Force the (samples, timesteps, features) layout even when a split is empty.
  trainX = np.reshape(trainX, (trainX.shape[0], look_back, num))
  valX = np.reshape(valX, (valX.shape[0], look_back, num))
  testX = np.reshape(testX, (testX.shape[0], look_back, num))

  model = Sequential()
  model.add(LSTM(1024, input_shape=(look_back, num)))
  model.add(Dense(1))
  # `learning_rate` is the supported argument name; the old `lr` alias is
  # deprecated and rejected by newer Keras releases.
  optimizer = Adam(learning_rate=0.001)
  model.compile(loss='mean_squared_error', optimizer=optimizer)
  model.summary()

  model.fit(trainX, trainY, validation_data=(valX, valY), epochs=20, batch_size=64, verbose=1)

  def to_labels(raw_output):
    # model.predict returns one value per sample with shape (n, 1); flatten it
    # and bucket each value into sell (-1), neutral (0) or buy (1).
    return [-1 if x < -0.5 else (0 if x < 0.5 else 1) for x in np.ravel(raw_output)]

  train_predict = to_labels(model.predict(trainX))
  val_predict = to_labels(model.predict(valX))
  test_predict = to_labels(model.predict(testX))

  def dated(split_start, labels):
    # The k-th target of a split is row split_start + look_back + k of the full
    # table, so the dates are taken from exactly those rows.
    first_target = split_start + look_back
    dates = full_training_data['Date'].iloc[first_target:first_target + len(labels)]
    return pd.DataFrame({'Date': dates.to_numpy(), 'Predicted': labels})

  result = pd.concat([dated(0, train_predict),
                      dated(val_start, val_predict),
                      dated(test_start, test_predict)])
  result = result.sort_values(by='Date').reset_index(drop=True)

  # output result
  result.to_csv(path+f'lstm_stock_prediction_classification_{version}.csv', index=False)

  print(f"Accuracy of training: {accuracy_score(trainY, train_predict)}")
  print(f"Accuracy of validation: {accuracy_score(valY, val_predict)}")
  print(f"Accuracy of testing: {accuracy_score(testY, test_predict)}")

  # Calculating the precision score of classifier (one value per class)
  print(f"Precision Score of training: {precision_score(trainY, train_predict, average=None)}")
  print(f"Precision Score of validation: {precision_score(valY, val_predict, average=None)}")
  print(f"Precision Score of testing: {precision_score(testY, test_predict, average=None)}")

  # confusion matrix function a matrix containing the summary of predictions
  print(f"Confusion matrix of training: {confusion_matrix(trainY, train_predict)}")
  print(f"Confusion matrix of validation: {confusion_matrix(valY, val_predict)}")
  print(f"Confusion matrix of testing: {confusion_matrix(testY, test_predict)}")

  print(f"Classification report of training: {classification_report(trainY, train_predict, digits=3)}")
  print(f"Classification report of validation: {classification_report(valY, val_predict, digits=3)}")
  print(f"Classification report of testing: {classification_report(testY, test_predict, digits=3)}")

  return train_split, val_split, test_split, trainY, valY, testY, train_predict, val_predict, test_predict, train_size, val_size, test_size

**LSTM Vanilla + Feature**

In [None]:
# Run the feature-only model. The array defined in the training cell is
# `classification_feature_df_values`; no `regression_*` array exists in this notebook.
# Note: the first three returned values are the raw row splits, not windowed inputs.
trainX, valX, testX, trainY, valY, testY, train_predict, val_predict, test_predict, train_size, val_size, test_size = train(classification_feature_df_values, 'feature')



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1024)              4255744   
                                                                 
 dense (Dense)               (None, 1)                 1025      
                                                                 
Total params: 4,256,769
Trainable params: 4,256,769
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy of training: 0.531496062992126
Accuracy of validation: 0.42574257425742573
Accuracy of testing: 0.44
Precision Score of training: [0.86792453 0.39339339 0.76229508]
Precision Score of training: [1.     

**LSTM Vanilla + Feature + NLP**

In [None]:
# Run the feature + sentiment model. The array defined in the training cell is
# `classification_nlp_df_values`; no `regression_*` array exists in this notebook.
# Note: the first three returned values are the raw row splits, not windowed inputs.
trainX, valX, testX, trainY, valY, testY, train_predict, val_predict, test_predict, train_size, val_size, test_size = train(classification_nlp_df_values,'nlp')



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 1024)              4259840   
                                                                 
 dense_1 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 4,260,865
Trainable params: 4,260,865
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy of training: 0.5118110236220472
Accuracy of validation: 0.38613861386138615
Accuracy of testing: 0.44
Precision Score of training: [0.95       0.38692098 0.79207921]
Precision Score of training: [1.  