In [1]:
# This file creates the features used in the trend RNN.
# Features revolve around high-frequency (intra-day), mid-frequency (daily to weekly) and long-term (monthly) trends.
# For the intraday factor, should we try minutely data?
import pandas as pd
import numpy as np
import datetime 
from scipy.stats import norm
import math
from matplotlib import pyplot as plt
# Set up the data that we want to create a DT on.
# Import the FX, econ and value data for EURUSD, then create the features
# (on trend and econ data), standardise them and run a DT on the x_train sample.
# Open question: what is the target? 1 day ahead or many days ahead? Trade on binary data.
# NOTE(review): the FX path below is an absolute, machine-specific location and the
# ValueData / EconData entries are still empty placeholders -- make these configurable
# before sharing the notebook.
csv_file = {"FXData" : r"C:\Users\edgil\Documents\Masters\dissertation\data\CurrencyData.csv",
            "ValueData" : r"",
            "EconData" : r"",
            }
fxdata = pd.read_csv(csv_file["FXData"])
# The Date column is parsed as day-first with minute resolution (dd/mm/YYYY HH:MM).
fxdata['Date'] = pd.to_datetime(fxdata['Date'], format= '%d/%m/%Y %H:%M')
# Separate out the EURUSD factor. Take an explicit copy so that feature columns
# added to `eurusd` later are written to its own data rather than to a slice of
# `fxdata` (which triggers SettingWithCopyWarning on older pandas).
eurusd = fxdata[["Date", "EURUSD"]].copy()

In [18]:
# Very important step: truncate the data so that the final period of observations
# (2018-01-01 onwards) is never seen while building features.
# Q. Should we use a rolling-window type of model, or always aggregate the data from the start?
# How long is testing? Train and test sizes should make sense for the type of model used going forward.
# .copy() gives `eurusd` its own data, so the column assignments below are not made
# on a slice of another frame (avoids SettingWithCopyWarning).
eurusd = eurusd.loc[eurusd['Date'] < "2018-01-01 00:00"].copy()
# Train size should be at least 5 years?
# NOTE(review): train ends before 2003-01-01 while test starts after 2010-02-01, so the
# rows in between belong to neither sample; both samples are also cut before any of the
# feature columns below are added. Confirm this is intended.
eurusd_train = eurusd.loc[eurusd['Date'] < "2003-01-01 00:00"]
eurusd_test = eurusd.loc[eurusd['Date'] > "2010-02-01 00:00"]
# Create a target vector to train on.
# Must think deeply about what this will look like.
# Build out the feature set on price; this may need to be created using a functional process.
# One-period log return; the first row is NaN because it has no predecessor.
eurusd["logret"] = np.log(eurusd["EURUSD"]) - np.log(eurusd["EURUSD"].shift(1))
# Open questions:
# - Standardise the daily returns and accumulate the standardised returns, or sum the
#   % returns and standardise by their own history?
# - Is the difference between accumulated-return horizons the same as the MACD?
# - Should we standardise by the 1 year forward vol?
targetlkbk = 5
short = 5
medium = 15
long = 55
longest = 100
# Medium frequency factors (days to weeks): the multiplier scales up the lookbacks as needed.
medium_multiplyer = 24
# Long term factors (weeks to months).
long_multiplyer = 120 # each period is now one business week, 24*5


def _add_ema_band(frame, band, multiplier, lookbacks, price_col="EURUSD"):
    """Add one frequency band of EMA features to ``frame`` in place.

    For every ``(horizon, lookback)`` pair in ``lookbacks`` two columns are written:

    * ``<band>_<horizon>``: exponentially weighted mean of ``price_col`` with
      centre of mass ``lookback * multiplier``;
    * ``spot_v_<band>_<horizon>``: ``price_col`` minus that mean.

    All EMA columns of the band are created before its spot-difference columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding ``price_col``; modified in place.
    band : str
        Column-name prefix for the band (e.g. ``"HF"``).
    multiplier : int
        Factor applied to every lookback in ``lookbacks``.
    lookbacks : sequence of (str, int)
        Horizon name and base lookback, in the order columns should be created.
    price_col : str, default "EURUSD"
        Name of the price column the averages are computed on.
    """
    for horizon, lookback in lookbacks:
        # `com=` is spelled out: the first positional argument of ewm() is the
        # centre of mass, not the span.
        frame[band + "_" + horizon] = frame[price_col].ewm(com=lookback * multiplier).mean()
    for horizon, _lookback in lookbacks:
        frame["spot_v_" + band + "_" + horizon] = frame[price_col] - frame[band + "_" + horizon]


ema_lookbacks = [("short", short), ("medium", medium), ("long", long), ("longest", longest)]
# HF uses the raw lookbacks; MF and LF stretch them by their own multipliers.
# FIX: the LF band previously reused `medium_multiplyer`, which made every LF_* column
# an exact duplicate of the matching MF_* column and left `long_multiplyer` unused.
band_multipliers = [("HF", 1), ("MF", medium_multiplyer), ("LF", long_multiplyer)]
for band, multiplier in band_multipliers:
    _add_ema_band(eurusd, band, multiplier, ema_lookbacks)

horizon_names = [name for name, _lookback in ema_lookbacks]

# Average of the spot distances: simple mean of the spot-vs-EMA divergences in each band.
for band, multiplier in band_multipliers:
    eurusd["spot_v_" + band] = (
        sum(eurusd["spot_v_" + band + "_" + name] for name in horizon_names) / len(horizon_names)
    )

# Differences to each EMA: simple mean of the short EMA's divergence from every slower
# EMA in the same band.
# FIX: the previous expression (short - medium - long - long - longest) subtracted the
# long EMA twice and subtracted raw EMA levels instead of averaging divergences.
# NOTE(review): the averaged-divergence form is inferred from the comments in this
# cell -- confirm it matches the intended feature definition.
slower_horizons = [name for name in horizon_names if name != "short"]
for band, multiplier in band_multipliers:
    eurusd[band + "_ema_short_diff"] = (
        sum(eurusd[band + "_short"] - eurusd[band + "_" + name] for name in slower_horizons)
        / len(slower_horizons)
    )


In [21]:
# Summary statistics for every numeric column of `eurusd` (price, log return and all
# engineered features), printed as plain text.
feature_summary = eurusd.describe()
print(feature_summary)

              EURUSD         logret       HF_short      HF_medium  \
count  112680.000000  112679.000000  112680.000000  112680.000000   
mean        1.216049       0.000002       1.216041       1.216025   
std         0.173805       0.001292       0.173791       0.173766   
min         0.824150      -0.022859       0.825967       0.826673   
25%         1.102550      -0.000451       1.102554       1.102666   
50%         1.249500       0.000000       1.249549       1.250031   
75%         1.341450       0.000459       1.341490       1.341078   
max         1.601750       0.022458       1.598156       1.595781   

             HF_long     HF_longest  spot_v_HF_short  spot_v_HF_medium  \
count  112680.000000  112680.000000    112680.000000     112680.000000   
mean        1.215966       1.215901         0.000008          0.000024   
std         0.173664       0.173547         0.002361          0.004213   
min         0.832665       0.838575        -0.026121         -0.032880   
25%     