In [214]:
# Data Acquisition

Install the following necessary packages

#pip install pandas
#pip install numpy
#pip install scipy
#pip install yahoo_finance_api2
#pip install pandas-datareader
#pip install ta
#pip install seaborn
#pip install scikit-learn

import pandas as pd
import numpy as np

# Import the ticker/symbol column from the scraped stock data.
# The CSV has no header row, so its single column is named explicitly.

df = pd.read_csv('/home/nielit/Desktop/datasample.csv', header=None)
df.columns = ["ticker"]
df.head()

len(df.ticker)

len(df.ticker.unique())

# Check for duplicate values

df_dup = df[df.duplicated()]
df_dup

# Drop the duplicate values.
# drop_duplicates() removes exactly the rows flagged by duplicated() above
# (every repeat after the first occurrence) instead of relying on a hard-coded
# row position (300) that is only valid for one particular input file.

df = df.drop_duplicates()

tic = list(df.ticker)

# Import necessary packages

import pandas_datareader as pdr
from datetime import datetime

from tqdm import tqdm

# Collect daily price data from the Yahoo Finance API for every ticker in
# `tic`, covering 2018-01-02 to 2019-12-31, and stack it into one long frame.

start_date = datetime(2018, 1, 2)
end_date = datetime(2019, 12, 31)

data = list()
failed_tickers = []
for symbol in tqdm(tic):
    try:
        prices = pdr.get_data_yahoo(symbols=symbol, start=start_date, end=end_date)
    except Exception as exc:
        # Best effort: a ticker that cannot be downloaded is skipped, but it is
        # recorded and reported instead of being swallowed silently.
        failed_tickers.append(symbol)
        print("Skipping {}: {!r}".format(symbol, exc))
        continue

    # Label the rows here, with the ticker that was actually downloaded.
    # Labelling afterwards via data[i] <-> tic[i] mislabels every frame that
    # follows a failed download, because `data` is then shorter than `tic`.
    prices['Symbol'] = symbol
    data.append(prices)

# One concatenation instead of growing the frame with DataFrame.append in a
# loop (quadratic, and removed in pandas 2.0).
df = pd.concat(data)

df.isnull().sum()

Our required dataframe df looks something like this..

df.head()

# Persist the combined frame to the local disk

stock_csv_path = "/home/nielit/Desktop/StockData2.csv"
df.to_csv(stock_csv_path)

# Read the same file back in; the former index comes back as an ordinary column

df_stock = pd.read_csv(stock_csv_path)

# The reloaded frame holds one record per row, the layout used for the database

df_stock.info()

# Using MongoDB as my database

from pymongo import MongoClient 
from pymongo.errors import ConnectionFailure
from random import randint

# MongoClient() returns without contacting the server, so constructing it
# proves nothing about connectivity. A cheap 'ping' command is what actually
# tells us whether a server is reachable.
client_mongo = MongoClient()
try:
    client_mongo.admin.command('ping')
except ConnectionFailure:
    print("Could not connect to MongoDB")
    # Re-raise: everything below needs a working connection.
    raise
else:
    print("Connected successfully!!!")

# Creating a database db and collection stock

db = client_mongo.db_6
collection = db.stock

### Importing the dataset into database using MongoClient

# Field names in the positional order of the df_stock columns (the same
# position -> name mapping the row-by-row insert relied on).
# NOTE(review): assumes df_stock's columns are in exactly this order - confirm.
stock_fields = ["Date", "High", "Low", "Open", "Close", "Volume", "Adj Close", "Symbol"]
stock_documents = [dict(zip(stock_fields, row)) for row in df_stock.values]

# One bulk insert instead of one network round trip per row.
# insert_many rejects an empty list, hence the guard.
if stock_documents:
    collection.insert_many(stock_documents)

The stock data is now stored in MongoDB

### Exporting the dataset from MongoClient back here

stored_documents = list(db.stock.find())
df = pd.DataFrame(stored_documents)
df.head()

# Strip MongoDB's '_id' field, restore the column order and index the rows by date

column_order = ['Date','High','Low','Open','Close','Volume','Adj Close','Symbol']
df = df.drop(columns=['_id'])[column_order].set_index('Date')
df.head()

import pandas as pd
import numpy as np
# Load the saved stock data, order the rows by date, parse the dates and use
# them as the index.
df = (
    pd.read_csv(r"C:\Users\JAGANNATH\Desktop\Project\Project\StockData2.csv")
    .sort_values(by='Date')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index(['Date'])
)
df.head()

df.info()

symbol_count = df.Symbol.unique().size
day_count = df.index.unique().size
print("There are {} Symbols with obervations over {} days.".format(symbol_count, day_count))

# Check for null values

df.isna().sum()

Converting the End Of Day data into five separate time series data frames: Open, High, Low, Close and Volume (OHLCV).

# One wide frame per OHLCV field: the index is kept as rows, symbols become columns.
open_val, high_val, low_val, close_val, volume = (
    df[["Symbol", field]].pivot(columns="Symbol", values=field)
    for field in ("Open", "High", "Low", "Close", "Volume")
)

# Open value dataframe

open_val.head()

# Close value dataframe

close_val.head()

close_val.info()

We will create a dataframe containing the future close returns at time t, since we are predicting the close returns of next day value.

# Relative change from each row's close to the following row's close
next_val = close_val.shift(-1).div(close_val).sub(1)

# The last day return looks something like this

next_val.tail()

We will create a data frame containing close returns of the current day, which is calculated with respect to the close price of the previous day.

# Relative change of each row's close with respect to the previous row's close
ret_val = close_val.div(close_val.shift(1)).sub(1)

# The current day return looks something like this

ret_val.head()

# Dataframe with ratio of high/low for each symbols

hl_val = high_val.div(low_val)
hl_val.head()

## Data Pre-processing

Missing values in the close_val dataframe occur since not all the Symbols contain information from the start date. These missing values may also occur if no trade happens on a given day.
Let's check the non-missing values.

# Histogram of the number of non-missing close values per symbol

import matplotlib.pyplot as plt
non_missing = close_val.notna().sum()
hist = non_missing.hist(bins=10)
plt.xlabel("Observations")
plt.ylabel("Symbols")
plt.title("Non-missing values in Symbols")
# symbols whose non-missing count equals the number of rows
complete_symbols = close_val.columns[(non_missing == close_val.shape[0]).values]
print("There are {} symbols with full data available.".format(complete_symbols.shape[0])) 
plt.show()

close_val.notna().sum()

# Keep only the symbols that have at least 400 non-missing close values

enough_history = (close_val.notna().sum() >= 400).values
valid_sym = close_val.columns[enough_history]

# restrict every frame to those symbols
(open_val, high_val, low_val, close_val,
 volume, next_val, ret_val, hl_val) = [
    frame[valid_sym]
    for frame in (open_val, high_val, low_val, close_val,
                  volume, next_val, ret_val, hl_val)
]

# The valid symbols looks something like this..

valid_sym

Let's look at the average correlation to next day return values and rank the symbols accordingly.

# For every symbol: the mean absolute correlation between its next-day return
# and each of its other seven series.
corr_records = []

for Symbol in valid_sym:
    # A dedicated name is used so the main `df` is not overwritten here.
    symbol_frame = pd.concat(
        [open_val[Symbol], high_val[Symbol], low_val[Symbol], close_val[Symbol],
         volume[Symbol], next_val[Symbol], ret_val[Symbol], hl_val[Symbol]],
        axis=1,
        keys=["open", "high", "low", "close", "volume", "next_val", "ret_val", "hl_val"])
    # row 'next_val' of the correlation matrix, without its self-correlation
    next_day_corr = symbol_frame.corr().drop("next_val", axis=1).loc['next_val']
    corr_records.append({"symbol": Symbol, "avgcorr": next_day_corr.abs().mean()})

# Build the frame once from the collected rows: DataFrame.append in a loop is
# quadratic and was removed in pandas 2.0. Naming the columns explicitly keeps
# `corr_val.avgcorr` / `corr_val.symbol` usable even when there are no symbols.
corr_val = pd.DataFrame(corr_records, columns=["symbol", "avgcorr"])

# Histogram of values of average correlation to next day return
corr_val.avgcorr.hist();
plt.xlabel("Average correlation")
plt.ylabel("Frequency")
plt.title("Average correlation to next day return")
plt.show()

Based on the histogram above, we can see that there are very few symbols with good correlation to next day values. For high profitability, we are taking the symbols that possess good predictive power.
Let's consider the tickers whose average correlation is above the 90th percentile (i.e. the top 10% of symbols).

# Symbols whose average correlation lies strictly above the 0.90 quantile
top_decile_cutoff = corr_val.avgcorr.quantile(0.90)
sym90 = corr_val[corr_val.avgcorr > top_decile_cutoff].symbol.values

# restrict every frame to those symbols
(open_val, high_val, low_val, close_val,
 volume, next_val, ret_val, hl_val) = [
    frame[sym90]
    for frame in (open_val, high_val, low_val, close_val,
                  volume, next_val, ret_val, hl_val)
]

# Modify original dataframes

# close: carry the last known price forward, then back-fill the leading gap
# (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling)
close_val = close_val.ffill().bfill()
# volume: a missing value is treated as zero volume
volume = volume.fillna(0)
# open/high/low: fall back to the (already filled) close of the same row
open_val = open_val.fillna(close_val)
high_val = high_val.fillna(close_val)
low_val = low_val.fillna(close_val)

# calculate other dataframes
next_val = (close_val.shift(-1) / close_val) - 1 # future return
# The first row has no previous close, so shift(1) leaves it NaN. ret_val is
# later used as a model feature, and that NaN would end up in the first input
# sequence of every symbol, so it is set to 0 (no change) here.
ret_val = ((close_val / close_val.shift(1)) - 1).fillna(0) # return
hl_val = high_val / low_val # high/low

## Technical Analysis

We want to create features that possess some level of predictive power which could indicate the future direction of the market. Statistically, these features should have good correlation  power to the market movement.

There are many such transformations in Statistics. 

According to work by Borovkova, some of the key technical indicators that can be used are categorised into four groups, Momentum, Trend, Volume and Volatility. Some of the commonly used indicators are:

#### Momentum:-

    1. Money flow index 
    2. Relative strength index
    3. Stochastic oscillator
    4. Williams %R
    
#### Trend:-

    1. Exponential moving average
    2. Moving average convergence-divergence
    3. Commodity channel index
    4. Ichimoku Indicator
    
#### Volume:-

    1. Accumulation/distribution index
    
#### Volatility:-

    1. Bollinger bands 

# Technical Analysis package

import ta

##### 1. Money flow index

import ta

def _indicator_by_symbol(indicator, **inputs):
    """Evaluate a `ta` indicator function once per symbol.

    `inputs` maps the indicator's keyword names (e.g. ``high``, ``close``) to
    the wide frames the per-symbol series are taken from; every call also
    receives ``fillna=True``. Returns a frame with one column per symbol,
    carrying the same column index as ``close_val``.
    """
    # Collect all series first and build the frame once; concatenating onto a
    # growing frame inside the loop copies it on every iteration.
    per_symbol = {
        Symbol: indicator(fillna=True,
                          **{name: frame[Symbol] for name, frame in inputs.items()})
        for Symbol in close_val.columns
    }
    result = pd.DataFrame(per_symbol)
    # renaming the columns
    result.columns = close_val.columns
    return result


# NOTE(review): the function names below are kept exactly as they were. Newer
# `ta` releases appear to expose the money flow index under `ta.volume` and
# Williams %R as `williams_r` - confirm against the installed version.

mfi = _indicator_by_symbol(ta.momentum.money_flow_index,
                           high=high_val, low=low_val, close=close_val, volume=volume)

##### 2. Relative Strength Index

rsi = close_val.apply(ta.momentum.rsi, fillna=True)

##### 3. Stochastic oscillator

stoch_k = _indicator_by_symbol(ta.momentum.stoch,
                               high=high_val, low=low_val, close=close_val)
stoch_d = _indicator_by_symbol(ta.momentum.stoch_signal,
                               high=high_val, low=low_val, close=close_val)

##### 4. Williams %R

will_r = _indicator_by_symbol(ta.momentum.wr,
                              high=high_val, low=low_val, close=close_val)

##### 5. Exponential moving average

ema = close_val.apply(ta.trend.ema_indicator, fillna=True)

##### 6. Moving average convergence-divergence

macd = close_val.apply(ta.trend.macd_diff, fillna=True)

##### 7. Commodity channel index

cci = _indicator_by_symbol(ta.trend.cci,
                           high=high_val, low=low_val, close=close_val)

##### 8. Ichimoku Indicator

ichi_a = _indicator_by_symbol(ta.trend.ichimoku_a, high=high_val, low=low_val)
ichi_b = _indicator_by_symbol(ta.trend.ichimoku_b, high=high_val, low=low_val)

##### 9. Accumulation/distribution index

ad = _indicator_by_symbol(ta.volume.acc_dist_index,
                          high=high_val, low=low_val, close=close_val, volume=volume)

##### 10. Bollinger bands

bb_up = close_val.apply(ta.volatility.bollinger_hband, fillna=True)
bb_down = close_val.apply(ta.volatility.bollinger_lband, fillna=True)

## Data Analysis

# Every frame of interest paired with its display label (target frame last)

labelled_frames = [
    ('open', open_val), ('high', high_val), ('low', low_val), ('close', close_val),
    ('volume', volume), ('rtn', ret_val), ('hl', hl_val),
    ('mfi', mfi), ('ema', ema), ('rsi', rsi), ('stoch_k', stoch_k), ('stoch_d', stoch_d),
    ('macd', macd), ('will_r', will_r), ('cci', cci),
    ('ichi_a', ichi_a), ('ichi_b', ichi_b), ('ad', ad),
    ('bb_up', bb_up), ('bb_down', bb_down), ('next_val', next_val),
]
dfois = [frame for _, frame in labelled_frames]
dfois_str = [label for label, _ in labelled_frames]

import seaborn as sns

# Entry (i, j): column-wise correlation between frame i and frame j, averaged
# over the columns

corr_val = np.array([[left.corrwith(right).mean() for right in dfois] for left in dfois])

fig, ax = plt.subplots(figsize=(15,15))
df_corr = pd.DataFrame(corr_val, columns=dfois_str, index=dfois_str)

# Heatmap using seaborn

palette = sns.diverging_palette(10, 240, n=500)
sns.heatmap(df_corr, vmin=-1, vmax=1, center=0, cmap=palette, ax=ax)

ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

ax.set_title('Avg. colleration matrix heatmap');

In addition to the correlation matrix heatmap, let's have a look at the histograms of the data to look for outliers. We will normalise the data to the range 0 to 1 for every ticker so that they can be visualised and benchmarked appropriately.

from sklearn import preprocessing

# Min-max scale each frame column by column and plot the pooled values, one
# subplot per frame

fig, axs = plt.subplots(10,2, figsize=(20, 30))   
for i, ax in enumerate(axs.flatten()):
    # only the first 20 frames are drawn
    if i >= 20:
        continue
    nomaliser = preprocessing.MinMaxScaler(feature_range=(0,1))
    df_val = nomaliser.fit_transform(dfois[i])
    ax.hist(df_val.reshape(-1), bins=50)
    ax.set_title(dfois_str[i])
    ax.set_ylabel('Frequency')
    ax.grid()

Analysing the correlation heatmap and histograms above.

Open, high, low and close values are highly correlated to each other. This is expected because the values are within close proximity to each other. As the technical indicators are calculated using these values and intrinsically retain their information, open, high, and low values can be removed from the feature list.

Most of the distributions above are either normal or uniform in shape, except for hl and volume. This suggests that there will be a large number of outliers in those two features. Although the other features with normal distribution may have outliers in the tail section, they can be considered negligible. In addition, observing the correlation matrix, hl and volume also appear to have low correlation to future return, which is the metric we will be predicting. Therefore, it is justified to leave hl and volume out of the feature list.
Similarly, based on the correlation matrix heatmap, some values appear to not have much correlation to next day value. They are: mfi and ad. To keep the prediction model simple, these features can also be regarded as not useful.

Finally, it also appears that stoch_k is highly correlated to will_r. This is expected because the mathematical expressions of the two indicators are similar. Here, will_r will be removed from the feature list.

Based on the discussion above, the final feature list would be: close, rtn, ema, rsi, stoch_k, stoch_d, macd, cci, ichi_a, ichi_b, bb_up, bb_down.

Our strategy is to predict the direction (UP or DOWN) of a stock's next-day movement. Based on experience, this is a better strategy than predicting the value of the future return, as a performance metric based on the latter can be misleading. For example, a model that minimises the mean squared error can still get the direction of the movement wrong. If the actual movement is 0.1%, then investing based on a prediction of 0.5% is better than investing based on a prediction of -0.1%, although the latter has the smaller error.

Now, we first need to create a target dataframe that categorises into 1 and 0: 1 for UP and 0 for DOWN. Then, let us look at the distribution of the values.

# Target frame: 1 where the next-day return is strictly positive, 0 otherwise
df_target = next_val.gt(0).astype(int)

# Count how often each class occurs over all dates and symbols
values, counts = np.unique(df_target.values.reshape(-1), return_counts=True)
plt.bar(values,counts,tick_label=['DOWN','UP'])
plt.title('Number of UPs and DOWNs')
plt.ylabel('Total number')
plt.xlabel('Categories')
plt.show()

# Final feature frames paired with their labels; the target frame comes last
selected_frames = [
    ('close', close_val), ('rtn', ret_val), ('ema', ema), ('rsi', rsi),
    ('stoch_k', stoch_k), ('stoch_d', stoch_d), ('macd', macd), ('cci', cci),
    ('ichi_a', ichi_a), ('ichi_b', ichi_b), ('bb_up', bb_up), ('bb_down', bb_down),
    ('target', df_target),
]
dfois = [frame for _, frame in selected_frames]
dfois_str = [label for label, _ in selected_frames]

Based on the analysis above, it appears that there are more DOWNs than UPs. Therefore, some balancing is required before sending the data for machine learning. Before going into forecasting section, the final list of dataframes should be defined based on the features list finalised earlier.

## Prediction

##### Pre-processing the data

Based on the Data Analysis done above, this section builds an LSTM neural network model for the prediction of the next day stock price movement, i.e. whether it is going upward or downward.

First, let's split the data into training, validation, and test sets. The training and validation sets will be used during the LSTM network training, while the test set will be used for trading strategy implementation and additional testing of the final model.

# Chronological split boundaries, compared against each frame's date index
train_end = '2019-03-01'
test_start = '2019-08-01'

# Train set: rows dated before 2019-03-01
dfois_train = [frame.iloc[frame.index < train_end] for frame in dfois]

# Test set: rows dated 2019-08-01 or later
dfois_test = [frame.iloc[frame.index >= test_start] for frame in dfois]

# Validation set: the rows in between
dfois_eval = [
    frame.iloc[(frame.index >= train_end) & (frame.index < test_start)]
    for frame in dfois
]

Next, let us preprocess the data. Let us normalise all the data to be between 0 and 1. Note that the maximum and minimum is with respect to each ticker in each train dataframe

# One fitted MinMaxScaler per feature frame; the last frame (the target) is
# left untouched
nomalisers = []

for i in range(len(dfois) - 1):
    train_frame, eval_frame, test_frame = dfois_train[i], dfois_eval[i], dfois_test[i]
    columns = train_frame.columns

    # fit on the training split only
    scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
    scaler.fit(train_frame)
    nomalisers.append(scaler)

    # transform all three splits with the training-split fit and put the
    # column/index labels back on the resulting arrays
    dfois_train[i] = pd.DataFrame(scaler.transform(train_frame),
                                  columns=columns, index=train_frame.index)
    dfois_eval[i] = pd.DataFrame(scaler.transform(eval_frame),
                                 columns=columns, index=eval_frame.index)
    dfois_test[i] = pd.DataFrame(scaler.transform(test_frame),
                                 columns=columns, index=test_frame.index)

Now, let us scale the data to have mean 0 and unit variance. This scaling is done by training the scaler using data of entire normalised training dataframe. This will be used to transform the training, validation, and test sets.

# One fitted StandardScaler per feature frame; the last frame (the target) is
# left untouched
scalers = []

for i in range(len(dfois) - 1):
    train_frame, eval_frame, test_frame = dfois_train[i], dfois_eval[i], dfois_test[i]
    columns = train_frame.columns
    n_rows, n_cols = train_frame.shape

    # Fit on the pooled training values: all values are stacked into a single
    # column which is then repeated once per ticker, so every column of the
    # scaler is fitted on the same data.
    flat_arr = train_frame.values.reshape(n_rows * n_cols, 1)
    scaler = preprocessing.StandardScaler()
    scaler.fit(np.tile(flat_arr, n_cols))
    scalers.append(scaler)

    # transform all three splits and put the column/index labels back
    dfois_train[i] = pd.DataFrame(scaler.transform(train_frame),
                                  columns=columns, index=train_frame.index)
    dfois_eval[i] = pd.DataFrame(scaler.transform(eval_frame),
                                 columns=columns, index=eval_frame.index)
    dfois_test[i] = pd.DataFrame(scaler.transform(test_frame),
                                 columns=columns, index=test_frame.index)

##### 60 Time Steps

Let's sequence the data. Since we are going to use the past 60 days of data for the next-day prediction, we need to collect each window of 60 consecutive days into an array. We first write a function and then execute it on all three sets of data.

from collections import deque
import random

# Look at the past 60 days: number of consecutive rows that make up one input
# sequence (used as the window length in sequence_data)
SEQ_LEN = 60

def sequence_data(df_list, shuffle=True):
    """Build fixed-length input windows and their targets, symbol by symbol.

    Args:
        df_list: list of wide frames with one column per symbol. The last
            frame holds the target; all the others are features.
        shuffle: when true, the finished list is shuffled in place. Pass
            False to keep the symbol-by-symbol, row-by-row order.

    Returns:
        List of ``[window, target]`` pairs. ``window`` is an array with
        SEQ_LEN rows and one column per feature frame; ``target`` is the
        target value on the window's last row.
    """
    # list containing the data
    sequential_data = []

    # nothing to sequence without any frame
    if not df_list:
        return sequential_data

    for Symbol in close_val.columns:
        # one column per frame in df_list (target last), built in one concat
        df_ticker = pd.concat([df[Symbol] for df in df_list], axis=1)

        # sliding window over the most recent SEQ_LEN feature rows
        prev_days = deque(maxlen=SEQ_LEN)
        for row in df_ticker.values:
            # features only - the last value is the target
            prev_days.append(list(row[:-1]))
            # emit a sample once the window is full
            if len(prev_days) == SEQ_LEN:
                sequential_data.append([np.array(prev_days), row[-1]])

    # Shuffle once, after all symbols have been collected. Reshuffling the
    # whole list after every symbol adds nothing to the randomness and only
    # costs time. The test set is not shuffled.
    if shuffle:
        random.shuffle(sequential_data)

    return sequential_data

# Build the window/target lists; only the test windows keep their original order
sequential_data_train = sequence_data(dfois_train, shuffle=True)
sequential_data_eval = sequence_data(dfois_eval, shuffle=True)
sequential_data_test = sequence_data(dfois_test, shuffle=False)

# Print the length
for message, split_data in (
    ('Training data length: {}', sequential_data_train),
    ('Validation data length: {}', sequential_data_eval),
    ('Testing data length: {}', sequential_data_test),
):
    print(message.format(len(split_data)))

# balance train and evaluation data
def balance_data(sequential_data):
    """Return a shuffled list with equally many target-0 and target-1 samples.

    Samples whose target is neither 0 nor 1 are dropped. The larger class is
    cut down, after shuffling, to the size of the smaller one.
    """
    rising, falling = [], []

    # sort every sample into its class bucket
    for window, label in sequential_data:
        if label == 0:
            falling.append([window, label])
        elif label == 1:
            rising.append([window, label])

    # randomise which samples survive the truncation
    random.shuffle(rising)
    random.shuffle(falling)

    # both classes keep as many samples as the smaller class has
    keep = min(len(rising), len(falling))
    balanced = rising[:keep] + falling[:keep]

    # mix the two classes
    random.shuffle(balanced)
    return balanced

# separate train and target data
def separate_data(sequential_data):
    """Split ``[sequence, target]`` pairs into inputs and targets.

    Returns a tuple ``(X, y)``: ``X`` is a numpy array built from the
    sequences, ``y`` is a plain list of the targets in the same order.
    """
    # materialise once so the input is traversed a single time
    pairs = [(seq, target) for seq, target in sequential_data]
    features = [seq for seq, _ in pairs]
    labels = [target for _, target in pairs]
    return np.array(features), labels

# perform balancing by calling the function
train_x, train_y = separate_data(balance_data(sequential_data_train))
validation_x, validation_y = separate_data(balance_data(sequential_data_eval))

# separate_data returns the labels as plain Python lists. Keras expects the
# targets as arrays/tensors (a list is not a supported target container), so
# convert them once here.
train_y = np.asarray(train_y)
validation_y = np.asarray(validation_y)

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint, ModelCheckpoint

# Batch size and epochs
BATCH_SIZE = 512
EPOCHS = 3

# Build LSTM prediction model: three stacked LSTM layers, each followed by
# dropout and batch normalisation, then a small dense head
model = Sequential()

# the first two LSTM layers return the full sequence so the next LSTM layer
# receives one vector per time step
model.add(LSTM(128, input_shape=(train_x.shape[1:]), activation='tanh', return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, activation='tanh'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# two output units with softmax: one probability per class
model.add(Dense(2, activation='softmax'))

# Compile model
# `learning_rate` is the current name of the deprecated `lr` keyword.
# NOTE(review): `decay` is kept as it was; newer Keras optimizers may reject
# it - confirm against the installed TensorFlow version.
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6),
              metrics=['accuracy'])

# Model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

# Train on the training data and evaluate on the validation data
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(validation_x, validation_y),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

# save the model
model_path = "lstm.l1"
model.save(model_path)

# load the model
model = load_model(model_path)

Let's look at the accuracy on the test dataset as well as the classification report, based on the data from 2019-08-01 onwards, which the model has never seen during training.

from sklearn.metrics import classification_report

# test performance using test set
test_x, test_y = separate_data(sequential_data_test)
# Keras expects array targets, not a Python list
test_y = np.asarray(test_y)

# get prediction probability (single forward pass, reused for the labels below)
pred_proba = model.predict(test_x)

# get the prediction: index of the most probable class.
# Sequential.predict_classes no longer exists in current Keras; argmax over
# the softmax output gives the same labels.
pred = np.argmax(pred_proba, axis=-1)

# accuracy using test
score = model.evaluate(test_x, test_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('')

# classification report
print('Classification report:')
print(classification_report(test_y, pred))

## Trading strategy

Let us define a simple trading strategy. For each of the 188 tickers used, open a position of \$1 based on the movement predicted by the LSTM network, and close the position the next day. This means that every day an investment of \$188 is made, and every position must be closed the next day.

Note that the test set starts on 2019-08-01 and the LSTM network requires 60 days of past data, so the first simulated trade takes place on the 60th trading day of the test set. The cumulative profit and loss (PnL) will be plotted.

The accuracy score based on the test set shows that the results are in line with research whereby the accuracy on test set is lower than that of the training and validation set. The prediction of upward movement has low precision compared to that of the downward trend, which means that there are more false positives when predicting the upward movement. Often, the recall score is inversely proportional to the precision. Therefore, the best measure is the f1-score, which is acceptable for developing a trading strategy.

# build index list: the first prediction belongs to the SEQ_LEN-th test row,
# because a full window of SEQ_LEN rows is needed before anything is predicted.
# SEQ_LEN is used instead of a literal 60 so this stays in step with the
# reshape below.
pred_index = dfois_test[0].index[SEQ_LEN - 1:]

# build prediction dataframe - 1 is BUY and 0 is SELL
# the test sequences are ordered symbol by symbol, so the flat prediction
# vector is filled column by column (order='F')
df_pred = pd.DataFrame(pred.reshape(dfois_test[0].shape[0]-SEQ_LEN+1,dfois_test[0].shape[1], order='F'),
                       index = pred_index,
                       columns = dfois_test[0].columns)

# dataframe for right prediction - 1 is CORRECT and 0 is WRONG
df_right = (df_pred.astype(bool) == df_target.loc[pred_index].astype(bool)).astype(int)

# dataframe for wrong prediction - 1 is WRONG and 0 is CORRECT
df_wrong = (~df_right.astype(bool)).astype(int)

# dataframe for profit: absolute next-day return wherever the call was right
df_profit = df_right*next_val.loc[pred_index].abs()

# dataframe for loss: absolute next-day return wherever the call was wrong
df_loss = df_wrong*next_val.loc[pred_index].abs()

# cumulative sum of the daily profit minus loss over all tickers
plt.subplots(figsize=(15,5))
plt.plot((df_profit.sum(axis=1) - df_loss.sum(axis=1)).cumsum())
plt.grid()
plt.xlabel('Trading dates')
plt.ylabel('Cumulative PnL (AUD)')
plt.title('Graph of cumulative profit and loss (PnL)')
plt.show()



from datetime import timedelta

# pnl of lstm
pnl_lstm = (df_profit.sum(axis=1) - df_loss.sum(axis=1))

# One unit is staked on every ticker each day, so the daily capital equals the
# number of traded tickers. Deriving it from df_pred replaces a hard-coded
# 150 that disagreed with the 188 used elsewhere.
daily_investment = df_pred.shape[1]
roi_lstm = pnl_lstm / daily_investment * 100

# plotting the roi per day
plt.subplots(figsize=(16, 24))
plt.barh(roi_lstm.index, roi_lstm.values, label = "LSTM", align='edge', height=0.25)
plt.xlabel('ROI per day (%)')
plt.ylabel('Trading dates')
plt.title('Graph of return on investment (%) per day for different strategies')
plt.legend()
plt.grid()
plt.show()

Following the simple strategy, based on 40 days of trading and \$188 investment per day, the total profit made is about \$6. Considering the numbers and the linear trend of the cumulative PnL, I would say that this is a successful strategy.

# Daily ROI in percent. One unit is staked on every ticker each day, so the
# daily capital is the number of traded tickers rather than a hard-coded 188.
roi_lstm = pnl_lstm / df_pred.shape[1] * 100

fig, (ax1) = plt.subplots(1,1, figsize=(15, 5), sharey=True)   

ax1.hist(roi_lstm.values, bins=10)
ax1.set_title("LSTM")
ax1.set_ylabel('Frequency')
ax1.set_xlabel('ROI (%)')
ax1.grid()

from scipy import stats

# calculating the confidence: percentage of trading days with a ROI above 0.
# Measuring up to 100% instead of up to the percentile of a 2% ROI keeps days
# with a ROI above 2% in the count.
conf_lstm = 100 - stats.percentileofscore(roi_lstm.values, 0)

print('Confidence of employed approaches in making positive ROI based on test set:')
print('LSTM: {}%'.format(conf_lstm))