## 0. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.layers import LSTM
from keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.layers import Dense
import yfinance as yf

In [3]:
# Load the cleaned ratio dataset; later cells expect a 'Date' column plus one column per ratio
df = pd.read_csv('../../long_short_local/raw_data/cleaned_data_2y.csv')

## 1. Splitting Data in Train and Validation

In [4]:
# R
def split_train_val(df, split_ratio=0.7):
    """Split a frame chronologically into a train part and a validation part.

    Only the first two columns of ``df`` are kept; the column named 'Date'
    is converted to datetime in both parts. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two columns include a 'Date' column.
    split_ratio : float, default 0.7
        Fraction of rows (taken from the top) that goes to the train part;
        the remaining rows form the validation part.

    Returns
    -------
    tuple
        ``(train_data, validation_data, length_train, length_validation)``.
    """
    length_data = len(df)
    length_train = round(length_data * split_ratio)
    length_validation = length_data - length_train

    # .copy() gives each part its own data, so the 'Date' assignments below
    # never write into a slice of the caller's frame.
    train_data = df.iloc[:length_train, :2].copy()
    train_data['Date'] = pd.to_datetime(train_data['Date'])

    validation_data = df.iloc[length_train:, :2].copy()
    validation_data['Date'] = pd.to_datetime(validation_data['Date'])

    return train_data, validation_data, length_train, length_validation

## 2. Creating Train Dataset from Train split

In [5]:
# R
# create dataset from train split
def train_split(train_data):
    """Return the second column of ``train_data`` as a 2-D array of shape (n, 1).

    No scaling is applied; the values are returned as they are.
    """
    ratio_values = train_data.iloc[:, 1].values
    # (n,) -> (n, 1): one column of observations
    return np.reshape(ratio_values, (-1, 1))

## 3. Creating X_train and y_train from Train data 

We have train data composed of ratios over days

So it has 1184 ratios, corresponding to 1184 days.

My aim is to predict the ratio (closing) of the next day.

For illustration, assume a time step of 2 days (the code below uses `time_step = 20`).

I take the first 2 ratios (days 0 and 1); these form the first sample in
X_train.

The target for that sample is the ratio of the 3rd day, so the 3rd ratio goes into y_train.

Next, I take the ratios of days 1 and 2; these form the second sample in X_train.

Its target is the following day's ratio, so the 4th ratio goes into y_train.



In [6]:
def create_x_y_train(df, length_train, time_step=20, train_fraction=0.95):
    """Build sliding-window samples and split them into train and validation sets.

    Each sample is ``time_step`` consecutive values of the first column of
    ``df``; its target is the value that immediately follows the window.
    The samples are then split chronologically: the first ``train_fraction``
    go to the train set and the remaining ones to the validation set, so the
    two sets never share a sample.

    Parameters
    ----------
    df : array-like
        Series to window, as an (n, 1) array (a 1-D input is reshaped to
        one column). Only column 0 is used.
    length_train : int
        Exclusive upper bound of the target positions (normally ``len(df)``).
    time_step : int, default 20
        Number of past values in each input window.
    train_fraction : float, default 0.95
        Share of the samples kept for training.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val, time_step)`` with X arrays of
        shape (samples, time_step, 1) and y arrays of shape (samples, 1).

    Raises
    ------
    ValueError
        If ``length_train`` is not larger than ``time_step`` (no window fits).
    """
    series = np.asarray(df)
    if series.ndim == 1:
        series = series.reshape(-1, 1)

    target_positions = range(time_step, length_train)
    if len(target_positions) == 0:
        raise ValueError(
            f"length_train ({length_train}) must exceed time_step ({time_step}) "
            "to build at least one sample"
        )

    # Window the array passed in as `df`, not a notebook-level global.
    X_all = np.array([series[i - time_step:i, 0:1] for i in target_positions])
    y_all = np.array([series[i, 0:1] for i in target_positions])

    X_all = np.reshape(X_all, (X_all.shape[0], X_all.shape[1], 1))
    y_all = np.reshape(y_all, (y_all.shape[0], 1))

    # Compute the cut once on the full sample set, then slice both sides from
    # it; slicing the validation part from the already truncated train arrays
    # would make the two sets overlap.
    split_at = int(X_all.shape[0] * train_fraction)
    X_train, X_val = X_all[:split_at], X_all[split_at:]
    y_train, y_val = y_all[:split_at], y_all[split_at:]

    return X_train, X_val, y_train, y_val, time_step

## 4. Creating Test Dataset from Validation Data

In [7]:
# R
#Converting array and scaling
def create_x_y_test(validation_data, length_validation, time_step):
    """Build sliding-window test samples from the validation split.

    The second column of ``validation_data`` is used as-is (no scaling).
    Every window holds ``time_step`` consecutive values and its target is
    the value right after the window.

    Returns
    -------
    tuple
        ``(X_test, y_test)`` with shapes (samples, time_step, 1) and
        (samples, 1).
    """
    # Second column as an (n, 1) array
    ratio_column = np.reshape(validation_data.iloc[:, 1].values, (-1, 1))

    target_positions = range(time_step, length_validation)
    windows = [ratio_column[end - time_step:end, 0] for end in target_positions]
    targets = [ratio_column[end, 0] for end in target_positions]

    X_test = np.array(windows)
    y_test = np.array(targets)
    # Add the trailing feature axis for X and make y a single column
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    y_test = np.reshape(y_test, (-1, 1))

    return X_test, y_test

## 5. Creating LSTM Model

In [8]:
def lstm_model(X_train, y_train, X_val, y_val, X_test, y_test):
    """Train a small LSTM regressor and return its MAPE on the test set.

    The network is one LSTM layer with 20 units followed by Dense(32) and
    Dense(1). It is trained on MAPE loss with rmsprop for up to 400 epochs;
    early stopping (patience 20) restores the best weights seen during
    training.

    Returns
    -------
    float
        The "mape" metric reported by ``evaluate`` on ``(X_test, y_test)``.
    """
    stopper = EarlyStopping(patience=20, restore_best_weights=True)

    window_length = X_train.shape[1]
    network = Sequential([
        LSTM(20, return_sequences=False, input_shape=(window_length, 1)),  # 20 LSTM units
        Dense(32),
        Dense(1),
    ])

    network.compile(loss="mape", optimizer="rmsprop", metrics=["mae", "mape"])
    network.fit(
        X_train,
        y_train,
        epochs=400,
        batch_size=64,
        validation_data=(X_val, y_val),
        callbacks=[stopper],
    )

    # evaluate() yields the loss followed by the compiled metrics in order
    # (mae, mape), so position 2 is the MAPE.
    test_scores = network.evaluate(X_test, y_test)
    return test_scores[2]

## 6. Implementation refactored

In [12]:
# R
# Test-set MAPE per ratio column, filled by the loop below
mape_dict = {}

# Train and evaluate one model per ratio; 'Date' is the time axis, not a ratio
for ratio in df:
    if ratio != 'Date':
        # two-column frame: Date + the current ratio
        one_ratio_df = pd.DataFrame(df[['Date', ratio]])
        train_data, validation_data, length_train, length_validation = split_train_val(one_ratio_df)

        # training values as an (n, 1) array
        dataset_train_scaled = train_split(train_data)

        # windowed train / validation samples
        X_train, X_val, y_train, y_val, time_step = create_x_y_train(dataset_train_scaled, length_train)

        # windowed test samples from the validation split
        X_test, y_test = create_x_y_test(validation_data, length_validation, time_step)

        # fit the LSTM and record its test MAPE
        mape = lstm_model(X_train, y_train, X_val, y_val, X_test, y_test)
        mape_dict[ratio] = round(mape, 3)

2022-09-09 17:35:31.945882: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-09 17:35:32.467768: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoc

In [67]:
# Show the test MAPE recorded for each ratio
mape_dict

{'ALB_ZBRA': 2.646,
 'NRG_CPRT': 2.562,
 'DE_BXP': 2.751,
 'NRG_STE': 2.698,
 'NRG_XRAY': 3.351,
 'SO_NLOK': 3.693,
 'PEG_O': 3.438}

In [14]:
# Mean of the per-ratio MAPE values; raises ZeroDivisionError if mape_dict is empty
print(f'The average MAPE of all {len(mape_dict)} ratios is: {round(sum(mape_dict.values()) / len(mape_dict), 2)}%')

The average MAPE of all 7 ratios is: 5.66%


## 8. Recommend best 5 ratios

In [15]:
# Rank the ratios by ascending test MAPE (lowest error first) as a list of (ratio, mape) tuples
bst_list = sorted(mape_dict.items(), key=lambda item: item[1])

In [16]:
# bst_list is already ordered best-first, so its first five entries are the selection
best_five = dict(bst_list[:5])
for k in best_five:
    print(k)

NRG_XRAY
PEG_O
DE_BXP
NRG_CPRT
SO_NLOK


## 9. Compute average true range (ATR)

In [71]:
# Once the ratios with the lowest MAPE are selected, open/high/low prices can be downloaded for them.
# Names of the selected ratios
best_list = list(best_five.keys())
# Index the original dataset by date. The guard keeps this cell re-runnable:
# after the first run 'Date' is the index, and set_index would raise KeyError.
if 'Date' in df.columns:
    df.set_index('Date', inplace=True)
# Keep only the selected ratio columns, in the dataset's column order
best_atr_df = df[[c for c in df.columns if c in best_list]]

In [74]:
def _history_with_atr(ticker, period="1mo", window=14):
    """Download recent prices for ``ticker`` and append its average true range.

    The returned frame is the yfinance price history (index reset to
    columns) plus one extra, last column named after the ticker that holds
    the ``window``-row simple moving average of the true range.
    """
    prices = yf.Ticker(ticker).history(period=period).reset_index()
    previous_close = prices['Close'].shift()
    # True range: the largest of high-low, |high - previous close| and
    # |low - previous close| for each row.
    ranges = pd.concat(
        [
            prices['High'] - prices['Low'],
            np.abs(prices['High'] - previous_close),
            np.abs(prices['Low'] - previous_close),
        ],
        axis=1,
    )
    true_range = ranges.max(axis=1)
    # Rows that do not yet have a full window behind them are NaN
    prices[ticker] = true_range.rolling(window).sum() / window
    return prices


long_list = []
short_list = []

for column in best_atr_df.columns:
    # Ratio columns are named '<numerator>_<denominator>'
    ratio = column.split('_')

    # long leg: the numerator ticker
    numerator = ratio[0]
    long = _history_with_atr(numerator)
    long_list.append(long)

    # short leg: the denominator ticker
    denominator = ratio[1]
    short = _history_with_atr(denominator)
    short_list.append(short)

In [75]:
# Keep only the last column of every price frame (the per-ticker ATR column
# appended when the frame was built), each as a one-column DataFrame
long_df = [pd.DataFrame(frame[frame.columns[-1]]) for frame in long_list]
short_df = [pd.DataFrame(frame[frame.columns[-1]]) for frame in short_list]

In [76]:
# Pair each long ATR frame with the short ATR frame at the same position:
# [(long_df[0], short_df[0]), (long_df[1], short_df[1]), ...]
long_short = list(zip(long_df, short_df))

In [77]:
# Lay the ATR columns side by side in (long, short) order:
# long_0, short_0, long_1, short_1, ...
# All frames are concatenated in one call instead of growing the result
# inside a loop, which would re-copy the accumulated frame on every pass.
atr_frames = [pd.DataFrame(frame) for pair in long_short for frame in pair]
# pd.concat rejects an empty list, so fall back to an empty frame
final_conc_df = pd.concat(atr_frames, axis=1) if atr_frames else pd.DataFrame()

In [78]:
# Add the 'Date' column back, taken from the first short ticker's price frame.
# NOTE(review): assumes every downloaded ticker has the same rows/dates as short_list[0] - confirm.
# Re-running this cell appends another 'Date' column.
final_conc_df = pd.concat([final_conc_df, short_list[0]['Date']], axis=1)

In [79]:
# Discard every row that contains a NaN, then use 'Date' as the index
final_conc_df = final_conc_df.dropna(how='any', axis=0).set_index('Date')

In [80]:
# Columns come in (long, short) pairs; divide each long ATR by the short ATR
# next to it and name the result '<long>_<short>'
atr_ratio_columns = {}
for i in range(0, final_conc_df.shape[1], 2):
    long_name = final_conc_df.columns[i]
    short_name = final_conc_df.columns[i + 1]
    atr_ratio_columns[f'{long_name}_{short_name}'] = (
        final_conc_df.iloc[:, i] / final_conc_df.iloc[:, i + 1]
    )
final_atr_df = pd.DataFrame(atr_ratio_columns)

In [81]:
# Mean long/short ATR ratio per pair, rounded to 3 decimals
atr_dict = {col: round(final_atr_df[col].mean(), 3) for col in final_atr_df}

In [82]:
# Show the mean long/short ATR ratio per pair
atr_dict

{'ALB_ZBRA': 1.008,
 'NRG_CPRT': 0.355,
 'DE_BXP': 5.756,
 'NRG_STE': 0.224,
 'NRG_XRAY': 1.025}

## 10. Empirical information about the best ratios

In [83]:
# Ratios that have an ATR entry, restricted to the most recent 20 and 60 rows
best_ratios = list(atr_dict)
lst_mth_df = df[best_ratios].tail(20)
lst_three_mth_df = df[best_ratios].tail(60)

In [84]:
pl_one_dict = {}
pl_three_dict = {}
ptf_value = 10000

# Portfolio value after the 20-row window: ptf_value scaled by last/first ratio.
# .iloc is used because the frames are indexed by date; integer keys such as
# [19] on a label-indexed Series rely on a deprecated positional fallback.
for col in lst_mth_df.columns:
    delta_one = lst_mth_df[col].iloc[-1] / lst_mth_df[col].iloc[0]
    pl_one_dict[col] = round(ptf_value * delta_one, 2)

# Same calculation over the 60-row window, iterating that window's own columns
for col in lst_three_mth_df.columns:
    delta_three = lst_three_mth_df[col].iloc[-1] / lst_three_mth_df[col].iloc[0]
    pl_three_dict[col] = round(ptf_value * delta_three, 2)

In [85]:
# Show the value of ptf_value after the 20-row window, per ratio
pl_one_dict

{'ALB_ZBRA': 11755.52,
 'NRG_CPRT': 11364.72,
 'DE_BXP': 11818.48,
 'NRG_STE': 12529.68,
 'NRG_XRAY': 12049.57}