In [None]:
'''
This Code is to be run in chronological order and it includes the full lifecycle of the code in the following stages
1. Get raw data and process 
2. Get specified features
3. Run the decision tree model and save results
4. Run the SVM model and save results.
5. Run LSTM (must be done on a 64-bit machine which has Keras installed)
6. Run the Trend estimation 
7. Save the results.
'''

In [None]:
# this file creates the features used in the trend RNN.
# features will revolve around high frequency (intra-day) mid-freq (daily to weekly), long term ( monthly trends)
# for the intraday factor, should we try for minutely data?
import pandas as pd
import jupyter
import numpy as np
import datetime 
from scipy.stats import norm
import math
from matplotlib import pyplot as plt
# Set up the data that we want to create a DT on.
# import the fx data , econ and value data for EURUSD.
# then create the features (on trend and econ data) standardise and run a DT on the x_train sample.
# what is  target? 1 day ahead or long days ahead? trade on binary data.
# Locations of the raw input files. Only the FX entry is populated here; the
# value and econ entries are empty placeholders.
csv_file = {"FXData" : r"CurrencyData.csv",
            "ValueData" : r"",
            "EconData" : r"",
            }
fxdata = pd.read_csv(csv_file["FXData"])
# Dates in the file are day-first with a minute-resolution time component.
fxdata['Date'] = pd.to_datetime(fxdata['Date'], format= '%d/%m/%Y %H:%M')
# Separate out the EURUSD factor.
# An explicit copy is taken because new columns are added to `eurusd` below;
# assigning into a bare column slice of `fxdata` raises SettingWithCopyWarning.
eurusd = fxdata[["Date", "EURUSD"]].copy()
def get_time(row):
    '''
    Return the time-of-day part of a single datetime-like value.

    `row` is one element of the Date column (anything exposing a ``time()``
    method); the result is whatever that method returns.
    '''
    time_of_day = row.time()
    return time_of_day
# Tag every observation with its time of day (used further down to build the
# trading-session flags).
eurusd['timestamp'] = eurusd['Date'].map(get_time)
df = eurusd
# Very important step is to truncate the data so that we do not see the last 1 year of data.
# Q. Should we have a rolling-window type of model, or always aggregate the data from the start?
# How long is testing? Train and test sizes should make sense for the type of model used going forward.
# The truncation below is commented out, so the full sample is used.
#eurusd = eurusd.loc[eurusd['Date'] < "2018-01-01 00:00"]
# Create a target vector to train on.
# What is the target? Binary return? Sharpe ratio? This can then help us to position size the trade.
# Build out the feature set on price; this may need to be created using a functional process.
# One-period log return: log(P_t) - log(P_{t-1}). The first row is NaN.
log_spot = np.log(eurusd["EURUSD"])
eurusd["logret"] = log_spot - log_spot.shift(1)
# Standardising the daily rets and accumulating the standardised returns, or should we sum the % ret and standardise by its own history
# is difference between different accumulated retusn horizons the same as the macd?
# should we standardise by the 1 year forward vol?
# Use uncommented features if you want to use the short term features
def add_trend_features(frame, short, medium, long, longest,
                       medium_multiplyer=24, long_multiplyer=120, targetlkbk=24):
    '''
    Build the trend, session and target columns for one set of lookbacks.

    Parameters
    ----------
    frame : DataFrame holding at least the columns "EURUSD", "timestamp"
        (datetime.time values) and "logret". It is not modified.
    short, medium, long, longest : the four base lookbacks. Each one is passed
        as the first positional argument of ``Series.ewm``, which pandas
        interprets as the centre of mass (``com``), not the span.
    medium_multiplyer : scale applied to the lookbacks for the MF_* columns.
    long_multiplyer : scale applied to the lookbacks for the LF_* columns.
    targetlkbk : number of log returns summed into the forward-looking target.

    Returns
    -------
    A copy of `frame` with the feature columns added (or overwritten if they
    already exist), in the same column order the original inline code produced.
    '''
    out = frame.copy()
    spot = out["EURUSD"]
    horizons = (("short", short), ("medium", medium), ("long", long), ("longest", longest))
    # HF uses the raw lookbacks; MF and LF scale them up by their multiplier.
    frequencies = (("HF", 1), ("MF", medium_multiplyer), ("LF", long_multiplyer))

    # TODO: Should this be an EMA or simple average? Using EWMA now as we
    # overweight recent history.
    for prefix, multiplier in frequencies:
        for label, lookback in horizons:
            out["%s_%s" % (prefix, label)] = spot.ewm(lookback * multiplier).mean()
        # Distance of spot from each average, to measure relative momentum.
        for label, _ in horizons:
            out["spot_v_%s_%s" % (prefix, label)] = spot - out["%s_%s" % (prefix, label)]

    # Simple average of the four spot-vs-average divergences at each frequency.
    for prefix, _ in frequencies:
        gaps = [out["spot_v_%s_%s" % (prefix, label)] for label, _ in horizons]
        out["spot_v_%s" % prefix] = (gaps[0] + gaps[1] + gaps[2] + gaps[3]) / 4

    # Sum of the gaps between neighbouring averages. Algebraically this
    # telescopes to (short - longest); the original expression is kept so the
    # floating point results are unchanged.
    for prefix, _ in frequencies:
        emas = [out["%s_%s" % (prefix, label)] for label, _ in horizons]
        out["%s_ema_diff" % prefix] = (emas[0] - emas[1]) + (emas[1] - emas[2]) + (emas[2] - emas[3])

    # Trading-session flags based on the time of day of each observation.
    # LDN and NY are created as floats because they take the value 0.5 during
    # the 13:00-17:00 overlap; Asia only ever holds 0 or 1 and stays integer.
    out["LDN"] = 0.0
    out["NY"] = 0.0
    out["Asia"] = 0
    stamp = out["timestamp"]
    at = datetime.time
    overlap = (stamp >= at(13, 0)) & (stamp <= at(17, 0))
    # Write through frame.loc[mask, column]: the chained form
    # frame[column].loc[mask] = value may assign into a temporary object and is
    # a no-op under pandas Copy-on-Write.
    out.loc[(stamp >= at(7, 0)) & (stamp <= at(12, 0)), "LDN"] = 1
    out.loc[overlap, "LDN"] = 0.5
    out.loc[overlap, "NY"] = 0.5
    out.loc[(stamp >= at(18, 0)) & (stamp <= at(22, 0)), "NY"] = 1
    out.loc[(stamp >= at(23, 0)) | (stamp <= at(6, 0)), "Asia"] = 1

    # Forward-looking target: reverse the series, shift by 2, take a rolling
    # sum of `targetlkbk` rows and reverse back. For row t this sums
    # logret[t+2] .. logret[t+targetlkbk+1].
    # NOTE(review): logret[t+1] (the move from t to t+1) is therefore excluded;
    # confirm this one-bar gap is intended.
    out["target"] = out["logret"].iloc[::-1].shift(2).rolling(targetlkbk).sum().values[::-1]
    out["target_binary"] = out["target"].apply(np.sign)
    out["CCY"] = out["EURUSD"]
    return out


# medium frequency factors: the multiplier scales the lookback from hours to days
medium_multiplyer = 24
# long term factors: each period is now one business week, 24*5
long_multiplyer = 120
# number of forward log returns summed into the target
targetlkbk = 24

# ---------------- Long term features ----------------
short = 21
medium = 55
long = 100
longest = 200
eurusd = add_trend_features(eurusd, short, medium, long, longest,
                            medium_multiplyer, long_multiplyer, targetlkbk)
eurusd.to_csv(r"ccyDataFullSampleLongTerm.csv"
                           , index = False)

# ---------------- Create for Short term Features as well ----------------
# The same columns are rebuilt with shorter lookbacks and overwrite the
# long-term values, so `eurusd` ends up holding the short-term feature set.
short = 5
medium = 15
long = 55
longest =  100
eurusd = add_trend_features(eurusd, short, medium, long, longest,
                            medium_multiplyer, long_multiplyer, targetlkbk)
eurusd.to_csv(r"ccyData.csv", index = False)
# Keep the `df` alias pointing at the final frame, as it did when the features
# were added in place.
df = eurusd
'''
---------------- This code plots the EURUSD and Moving Averages -------
'''
# This code plots the chart of EURUSD and its moving averages
from matplotlib import pyplot as plt
# Calendar-date version of the timestamp (added to the frame, not used by the plot).
eurusd['DDMMYY'] = eurusd['Date'].dt.date
fig = plt.figure(1, figsize=(10, 8))
ax = fig.add_subplot(111)
plot_dates = pd.to_datetime(eurusd['Date'])
# Spot in blue, medium-frequency average in green, long-frequency average in red.
for column_name, colour in (("EURUSD", "b"), ("MF_medium", "g"), ("LF_medium", "r")):
    ax.plot(plot_dates, eurusd[column_name], c=colour)
ax.set_xlabel("Date")
ax.set_ylabel("1 EUR in USD Terms")
ax.legend(['EURUSD', 'Medium Term Moving Average', "Long Term Moving Average"])
# `fig` is the figure that owns `ax`, so this saves the chart drawn above.
fig.savefig("EURUSDMovingAvg.png")


In [None]:
'''
2. The decision tree code
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Decision trees
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
#import graphviz
import os

'''
----- This code runs the decision tree model, trained either on the randomly generated data or on the historical price data
'''
##### FOR PCA features  && train on random data!####
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from model_functions import *
from run_decision_tree import *
import datetime
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import metrics

# if you want to test on the withheld data then use file name BLIND_STFEATURES.csv

# Feature file written by the feature-engineering cell.
file_location = r"ccyData.csv"
# One independent, initially empty list per reported metric.
performance_store = {metric: [] for metric in ("data_size", "Accuracy_Score", "ntree",
                                               "Info_Ratio", "run_time",
                                               "train_date_st", "test_date_st")}
params_dict = set_params_random_forests()
use_random_train_data = params_dict['use_random_train_data']

########################### Set Model Parameters #############################
# To loop over several tree counts, make this a list such as
# [i for i in range(25,301,25)] or [21, 66].
ntrees = params_dict['ntrees']
# if running pca, max features can only be the same or less than the total number of features
max_features = params_dict['max_features']
max_depth = params_dict['max_depth']

# ---- data windowing ----
test_buffer = params_dict['test_buffer']
data_size = params_dict['data_size'] # number of training points
# if the number is > 1, then the code takes that as the number of test points you want to use
test_split = params_dict['test_split'] # roughly one month test ahead, which is a one month retrain period
# I.e. append the data into one df over each training window, but also use all available up until that point
concat_results = params_dict['concat_results']
use_separated_chunk = params_dict['use_separated_chunk']
total_data_needed = get_total_data_needed(test_split, data_size, test_buffer)
# standardisation window
window = params_dict['window']
# signal threshold, when using classifier
thold = params_dict['thold']

###### Set Targets ##############
trade_horizon = params_dict['trade_horizon'] # in hours
use_risk_adjusted = params_dict['use_risk_adjusted'] # if True: training on the sharpe return else raw
use_binary = params_dict['use_binary'] # set to true if you are using the risk adjusted and want it binary for classification score
use_classifier = params_dict['use_classifier']
use_pca = params_dict['use_pca'] # if = 0 then implies do not use pca in the model

################### Standardise Entire Dataset using rolling lookback windows ###############
data_normed, model_features, features_to_standardise = initialise_process(
    file_location, trade_horizon, window, use_risk_adjusted, use_pca, use_random_train_data)
#data_normed = data_normed.replace(np.nan, 0)
start_row = data_size
 # Use a rolling window to train and test
################ Loop through the full dataset in terms of the training and testing.
if use_random_train_data:
    # Train on the randomly generated trend data, test on the real feature file.
    random_data_location = r"CcyRandomTrend.csv"
    train, model_features, features_to_standardise = initialise_process(random_data_location, trade_horizon,
                                                 window, use_risk_adjusted, use_pca, use_random_train_data)
    test, model_features, features_to_standardise  = initialise_process(file_location, trade_horizon,
                                                 window, use_risk_adjusted, use_pca,
                                                use_random_train_data = False)
    if use_pca > 0:
        train, test, var_explained = get_pca_features(train, test, features_to_standardise, use_pca)
    train_sample = train[model_features]
    test_sample = test[model_features]
    for ntree in ntrees:
        results, acc_score = decision_tree(train_sample, test_sample,use_classifier,
                                               use_risk_adjusted,ntree, max_features, max_depth)
        # Run the backtest once and pick the three outputs from the single
        # result (it used to be re-run for every element).
        backtest = backtester(results, test, trade_horizon)
        test_results = backtest[0]
        strat_return = backtest[1]
        information_ratio = backtest[2]
        train_date = 1 # train['Date'].iloc[0]
        test_date = test['Date'].iloc[0]
        run_time = "random"
        performance_df = update_performance(data_size, ntree, acc_score , information_ratio,
                                            run_time, train_date, test_date, performance_store)
        # Save inside the loop: the file name carries `ntree`, so every tree
        # count gets its own file. Saving after the loop kept only the last
        # run and failed with a NameError when `ntrees` was empty.
        save_test_df = r"randomData%s_st_row%s_use_risk%s_use_SepChunk%s_concat%s.csv" % (ntree,start_row,use_risk_adjusted,use_separated_chunk,concat_results)
        test_results.to_csv(save_test_df, index = False)
else:
    # The code that trained on non-random (historical) data was removed as it
    # did not work, as discussed in the project. Say so instead of silently
    # doing nothing.
    print("use_random_train_data is False: training on historical data has been removed, nothing was run.")





In [None]:
'''
This code is to train the SVM model on randomly generated data.
'''
##### FOR PCA features  && train on random data!####
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from model_functions import *
from run_decision_tree import *
import datetime
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import metrics
# if you want to test on the withheld data then use the following file BLIND_STFEATURES.csv
# Feature file written by the feature-engineering cell.
file_location = r"ccyData.csv"
# One independent, initially empty list per reported metric.
performance_store = {metric: [] for metric in ("data_size", "Accuracy_Score", "ntree",
                                               "Info_Ratio", "run_time",
                                               "train_date_st", "test_date_st")}
params_dict = set_params_random_forests()
svm_dict = set_params_svm()
use_random_train_data = params_dict['use_random_train_data']

########################### Set Model Parameters #############################
kernel = svm_dict['kernel']
# To loop through several costs, make this a list such as
# [i for i in range(25,301,25)] or [21, 66].
costs = svm_dict['cost']

# ---- data windowing ----
test_buffer = params_dict['test_buffer']
data_size = params_dict['data_size'] # number of training points
# if the number is > 1, then the code takes that as the number of test points you want to use
test_split = params_dict['test_split'] # roughly one month test ahead, which is a one month retrain period
# I.e. append the data into one df over each training window, but also use all available up until that point
concat_results = params_dict['concat_results']
use_separated_chunk = params_dict['use_separated_chunk']
total_data_needed = get_total_data_needed(test_split, data_size, test_buffer)
# standardisation window
window = params_dict['window']
# signal threshold, when using classifier
thold = params_dict['thold']

###### Set Targets ##############
trade_horizon = params_dict['trade_horizon'] # in hours
use_risk_adjusted = params_dict['use_risk_adjusted'] # if True: training on the sharpe return else raw
use_binary = params_dict['use_binary'] # set to true if you are using the risk adjusted and want it binary for classification score
use_classifier = params_dict['use_classifier']
use_pca = params_dict['use_pca'] # if = 0 then implies do not use pca in the model

################### Standardise Entire Dataset using rolling lookback windows ###############
data_normed, model_features, features_to_standardise = initialise_process(
    file_location, trade_horizon, window, use_risk_adjusted, use_pca, use_random_train_data)
#data_normed = data_normed.replace(np.nan, 0)
start_row = data_size
 # Use a rolling window to train and test
################ Loop through the full dataset in terms of the training and testing.
if use_random_train_data:
    # Train on the randomly generated trend data, test on the real feature file.
    random_data_location = r"CcyRandomTrend.csv"
    train, model_features, features_to_standardise = initialise_process(random_data_location, trade_horizon,
                                                 window, use_risk_adjusted, use_pca, use_random_train_data)
    test, model_features, features_to_standardise  = initialise_process(file_location, trade_horizon,
                                                 window, use_risk_adjusted, use_pca,
                                                use_random_train_data= False)
    if use_pca > 0:
        train, test, var_explained = get_pca_features(train,test, features_to_standardise, use_pca)
    train_sample = train[model_features]
    test_sample = test[model_features]
    for cost in costs:
        results, acc_score = run_svm_model(train_sample, test_sample,use_classifier, use_risk_adjusted,kernel,cost)
        # Run the backtest once and pick the three outputs from the single
        # result (it used to be re-run for every element).
        backtest = backtester(results, test, trade_horizon)
        test_results = backtest[0]
        strat_return = backtest[1]
        information_ratio = backtest[2]
        train_date = 1 # train['Date'].iloc[0]
        test_date = test['Date'].iloc[0]
        run_time = "random"
        # The cost is passed in the position the random-forest cell uses for the tree count.
        performance_df = update_performance(data_size, cost, acc_score , information_ratio,
                                            run_time, train_date, test_date, performance_store)
        # Save inside the loop: the file name carries `cost`, so every cost
        # gets its own file. Saving after the loop kept only the last run and
        # failed with a NameError when `costs` was empty.
        save_test_df = r"randomData%s_COST%s_use_risk%s_use_SepChunk%s_concat%s_TH%s.csv" % (kernel,
                                        cost,use_risk_adjusted,use_separated_chunk,concat_results, trade_horizon)
        test_results.to_csv(save_test_df, index = False)
else:
    # Training on real world data is not needed due to poor performance, as
    # stated previously. Say so instead of silently doing nothing.
    print("use_random_train_data is False: training on historical data has been removed, nothing was run.")



In [None]:
'''
Recurrent neural Network Code
NB. This must be run on 64-bit Python and with Keras/Tensorflow installed
'''
##### FOR PCA features  && train on random data!####
# This cell is a simple implementation of the LSTM model.
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from ml_trading_module import *
from model_functions import *
import datetime
import sys

'''
---- If running the withheld data then use this name ccyDataBLIND_LongTermFEATURES
'''
# Feature file written by the feature-engineering cell.
file_location = r"ccyData.csv"
# One independent, initially empty list per reported metric.
performance_store = {"data_size" : [], "Accuracy_Score" : [], "epochs": [],
                     "Info_Ratio" : [], "run_time" : [], "train_date_st": [], "test_date_st": []}
params_dict = set_params_random_forests()
lstm_dict = set_params_LSTM()
use_random_train_data = params_dict['use_random_train_data']
########################### Set Model Parameters #############################
EPOCH = lstm_dict['EPOCH']
first_layer = lstm_dict['first_layer']
second_layer = lstm_dict['second_layer']
# this looks back over a set period as the memory for the LSTM
look_back = lstm_dict['look_back']

test_buffer = params_dict['test_buffer']
data_size = params_dict['data_size'] # number of training points
# I.e. append the data into one df over each training window, but also use all available up until that point
concat_results = params_dict['concat_results']
# if the number is > 1, then the code takes that as the number of test points you want to use
test_split = params_dict['test_split'] # roughly one month test ahead, which is a one month retrain period
# signal threshold, when using classifier
thold = params_dict['thold']
total_data_needed = get_total_data_needed(test_split,data_size,test_buffer)
# standardisation window
window = params_dict['window']
###### Set Targets ##############
trade_horizon = params_dict['trade_horizon'] # in hours
use_risk_adjusted = params_dict['use_risk_adjusted'] # if True: training on the sharpe return else raw
use_binary = params_dict['use_binary'] # set to true if you are using the risk adjusted and want it binary for classification score
use_classifier = params_dict['use_classifier']
use_pca = params_dict['use_pca'] # if = 0 then implies do not use pca in the model
use_separated_chunk = params_dict['use_separated_chunk']
################### Standardise Entire Dataset using rolling lookback windows ###############
# The two flags used to be passed here as (use_random_train_data, use_binary),
# the reverse of the other two initialise_process calls in this cell, which
# pass use_binary first and use_random_train_data last / by keyword. The order
# is aligned with those calls so the flags are not swapped.
# NOTE(review): the signature of initialise_process is not visible from this
# notebook - confirm the parameter order against its definition.
data_normed, model_features, features_to_standardise = initialise_process(file_location, trade_horizon,
                                                 window, use_risk_adjusted, use_pca,
                                                 use_binary, use_random_train_data = use_random_train_data)
#data_normed = data_normed.replace(np.nan, 0)
start_row = data_size
 # Use a rolling window to train and test
################ Loop through the full dataset in terms of the training and testing.
def _prepare_lstm_arrays(train_sample, test_sample, test, look_back):
    '''
    Window the train/test samples with create_dataset and reshape them for Keras.

    Inputs become (n_windows, look_back, n_features) and targets become
    (n_windows, 1). Returns
    (train_data, train_target, test_data, test_target, target_dates), where
    target_dates is the third value create_dataset returns for the test sample.
    '''
    train_data, train_target, _unused_dates = create_dataset(train_sample, False, look_back, test)
    test_data, test_target, target_dates = create_dataset(test_sample, True, look_back, test)
    train_data = train_data.reshape(train_data.shape[0], look_back, train_data.shape[2])
    train_target = train_target.reshape(train_target.shape[0], 1)
    test_data = test_data.reshape(test_data.shape[0], look_back, test_data.shape[2])
    test_target = test_target.reshape(test_target.shape[0], 1)
    return train_data, train_target, test_data, test_target, target_dates


def _backtest_and_save(test, target_dates, predicted, acc_score, run_time, epochs, start_row):
    '''
    Turn predictions into strategy returns, record the run and write both CSVs.

    Reads the notebook-level settings trade_horizon, data_size, look_back,
    thold and appends to the notebook-level performance_store.
    Returns (test_results, strat_return, information_ratio, performance_df).
    '''
    predictions = pd.DataFrame({"Date" : target_dates, "Predictions": predicted})
    # Rows of `test` without a prediction get a signal of 0 via fillna.
    test_results = pd.merge(test, predictions, how="left", on="Date").fillna(0)
    # Average the last `trade_horizon` predictions, lagged by two rows.
    test_results["scaled_signal"] = test_results['Predictions'].shift(2).rolling(trade_horizon).sum()/trade_horizon
    # no further shift needed as the signal has already been lagged above
    test_results['strat_returns'] = test_results['logret']*test_results['scaled_signal']
    test_results['strat_returns_sum'] = test_results['strat_returns'].cumsum()
    strat_return = test_results['strat_returns'].sum()
    # NOTE(review): 260 is used as the annualisation factor although
    # trade_horizon is described as being in hours - confirm.
    information_ratio = (test_results['strat_returns'].mean()*260)/(test_results['strat_returns'].std()*np.sqrt(260))

    # Store the data as needed
    performance_store['data_size'].append(data_size)
    performance_store['epochs'].append(epochs)
    performance_store['Accuracy_Score'].append(acc_score)
    performance_store['Info_Ratio'].append(information_ratio)
    performance_store['run_time'].append(run_time)
    # NOTE(review): both dates come from test_results (first and last row), so
    # 'train_date_st' actually holds the first test date - confirm intent.
    performance_store['train_date_st'].append(test_results['Date'].iloc[0])
    performance_store['test_date_st'].append(test_results['Date'].iloc[-1])
    performance_df = pd.DataFrame(performance_store)
    save_results = r"/storage/test_result_start_row%s_lkbk%s_Epochs%s_thold%s.csv" % (start_row, look_back, epochs, thold)
    test_results.to_csv(save_results,index = False)
    performance_df.to_csv(r"/storage/performance_df_start_row%s_lkbk%s_Epochs%s_thold%s.csv" % (start_row, look_back, epochs, thold))
    return test_results, strat_return, information_ratio, performance_df


if use_random_train_data:
    print("random starting, EPOCH %s" % str(EPOCH))
    random_data_location = r"/storage/CcyRandomTrendLSTM.csv"
    train, model_features, features_to_standardise = initialise_process(random_data_location, trade_horizon,
                                                 window, use_risk_adjusted, use_pca, use_binary,use_random_train_data)
    test, model_features, features_to_standardise  = initialise_process(file_location, trade_horizon,
                                                 window, use_risk_adjusted, use_pca,
                                                 use_binary ,use_random_train_data = False)
    if test.shape[0] <= (look_back+test_buffer+trade_horizon):
        # Same SystemExit as before, but now it says why the cell stopped.
        sys.exit("Not enough test rows (%s) for look_back + test_buffer + trade_horizon (%s)"
                 % (test.shape[0], look_back+test_buffer+trade_horizon))
    if use_pca > 0:
        train, test, var_explained = get_pca_features(train,test, features_to_standardise, use_pca)
    train_sample = train[model_features].values
    # The last 39 rows of the test set are dropped ("had to do this to make it work").
    # NOTE(review): the origin of 39 is not visible here - confirm.
    test_sample = test[model_features].head(test.shape[0]-39).values

    # Parse the values into the LSTM format
    train_data, train_target, test_data, test_target, target_dates = _prepare_lstm_arrays(
        train_sample, test_sample, test, look_back)

    #### Set up model parameters
    BATCH_SIZE = None # previous was 32 batch and 33 lookback
    no_features = train_data.shape[2]
    model = Sequential()
    model.add(LSTM(first_layer, batch_input_shape = (BATCH_SIZE,look_back,no_features), return_sequences = True))
    # NOTE(review): softmax as the activation of a recurrent layer is unusual - confirm it is intended.
    model.add(LSTM(second_layer, return_sequences = False, activation="softmax"))
    model.add(Dense(8, activation = "tanh"))
    model.add(Dense(1, activation = "linear"))
    model.compile(loss = "mse", optimizer="adam", metrics = ['accuracy'])
    # train the model
    # verbose = 1 gives the output of the training.
    start_time = datetime.datetime.now()
    lstm_engine = model.fit(train_data,train_target,epochs = EPOCH,validation_data=(test_data,test_target), verbose= 1)
    # Take the run time straight after training so that it does not include
    # the time the loss plot below stays on screen.
    run_time = datetime.datetime.now() - start_time
    import matplotlib.pyplot as plt
    fig , ax = plt.subplots(1,1,figsize=(10,8))
    plt.plot(lstm_engine.history['loss'])
    plt.show()
    # predict on the test data
    results = model.predict(test_data, batch_size = BATCH_SIZE, verbose = 1)
    # The raw (unthresholded) model output is used as the signal.
    predicted = [i[0] for i in results] # [np.sign(i) for i in results] #[signal(i, thold) for i in results]
    acc_score = get_accuracy([np.sign(i) for i in results], test_target)
    test_results, strat_return, information_ratio, performance_df = _backtest_and_save(
        test, target_dates, predicted, acc_score, run_time, EPOCH, start_row)
else:
    # if not using random data then move to the normal method.
    ################ Loop through the full dataset in terms of the training and testing.
    while start_row < data_normed.shape[0]:
        # NOTE(review): the windowing logic below is kept exactly as it was.
        # The final if/else always assigns trunc_data, so the two assignments
        # before it are overwritten, and start_row is advanced twice when
        # use_separated_chunk is set. An if/elif chain may have been intended - confirm.
        # first check if there is enough data left
        if (start_row + total_data_needed) > data_normed.shape[0]:
            # if we are about to go over the limit, then just return the last data_size + test size proportion of data
            trunc_data = data_normed.iloc[-total_data_needed:,:]
        # we need to increment over the data size
        if use_separated_chunk:
            # this means we jump across the full previous train and test data
            trunc_data = data_normed.loc[start_row:,:]
            start_row += total_data_needed
        if concat_results:
            # in this instance, we add to the start row first before chunking the data
            start_row += test_split
            # we are training on all data available up until that point, and testing x timeperiods ahead
            trunc_data = data_normed.loc[:start_row,:]
        else:
            # this rolls the data so that the new training will overlap on the old test set and create a new separated test set
            trunc_data = data_normed.loc[start_row:,:]
            start_row += data_size
        #################### Set up training and testing ########################
        train , test = create_train_test_file(trunc_data, data_size, test_split, test_buffer)
        if test.shape[0] <= (look_back+test_buffer+trade_horizon):
            break
        if use_pca > 0:
            train, test, var_explained = get_pca_features(train,test, features_to_standardise, use_pca)
        train_sample = train[model_features].values
        test_sample = test[model_features].values

        # Parse the values into the LSTM format
        train_data, train_target, test_data, test_target, target_dates = _prepare_lstm_arrays(
            train_sample, test_sample, test, look_back)

        #### Set up model parameters
        BATCH_SIZE = 6
        no_features = train_data.shape[2]
        model = Sequential()
        model.add(LSTM(120,batch_input_shape = (None,look_back,no_features), return_sequences = True))
        # The output layer has a single unit. Softmax over one unit is always
        # 1, which pinned the output to (0, 1) so np.sign() below could never
        # produce a short signal. tanh gives an output in (-1, 1).
        model.add(LSTM(1, return_sequences = False, activation="tanh"))
        model.compile(loss = "mean_absolute_error", optimizer="adam", metrics = ['accuracy'])

        # This branch overrides the configured number of epochs.
        EPOCH = 350
        # train the model
        # verbose = 1 gives the output of the training.
        start_time = datetime.datetime.now()
        lstm_engine = model.fit(train_data,train_target,epochs = EPOCH,validation_data=(test_data,test_target), verbose= 1)
        run_time = datetime.datetime.now() - start_time
        # predict on the test data
        results = model.predict(test_data)
        # get_accuracy receives the per-row sign arrays, as before.
        acc_score = get_accuracy([np.sign(i) for i in results], test_target)
        # The Predictions column needs scalars: one-element arrays give an
        # object column that the rolling sum in the backtest cannot aggregate.
        predicted = [np.sign(i[0]) for i in results] # [signal(i, thold) for i in results]
        test_results, strat_return, information_ratio, performance_df = _backtest_and_save(
            test, target_dates, predicted, acc_score, run_time, EPOCH, start_row)

'''
Code for plotting the accuracy metrics
'''
# Quick visual check of the LSTM fit: plot the recorded 'loss' series so we can
# see whether the error update follows a smooth learning curve.
# NOTE(review): lstm_engine is defined earlier in the notebook; presumably it
# is the training history returned by the model fit - confirm.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(10, 8))
ax.plot(lstm_engine.history['loss'])
plt.show()


In [None]:
'''
This code performs the model 2 trend estimation, where we take in the return of the trend model in each case of the SVM, RF and LSTM

1. We get the features of the macro economic factors
'''
import numpy as np
import pandas as pd
from create_model_features import trends_features
from copy import deepcopy
import datetime
################## INSERT CCY You want to RUN
ccy = "EURUSD"
ccy1, ccy2 = ccy[:3], ccy[3:]
edi_location = r"EDI.csv"
esi_location = r"ESI.csv"

# Column names of the three series tracked per index family: the pair spread
# plus the G10 and Global aggregates.
pair = ccy1 + ccy2
edi_cols = ["EDI_" + pair + "_spread", "EDI_G10_normalised", "EDI_Global_normalised"]
esi_cols = ["ESI_" + pair + "_spread", "ESI_G10_normalised", "ESI_Global_normalised"]
features = edi_cols + esi_cols

# Load both index files; the Date column is parsed as day/month/year and then
# reduced to plain date objects.
EDI = pd.read_csv(edi_location)
ESI = pd.read_csv(esi_location)
EDI['Date'] = pd.to_datetime(EDI['Date'], format='%d/%m/%Y').dt.date
ESI['Date'] = pd.to_datetime(ESI['Date'], format='%d/%m/%Y').dt.date

# Pair spread = normalised index of the first currency minus that of the second.
EDI[edi_cols[0]] = EDI["EDI_" + ccy1 + "_normalised"] - EDI["EDI_" + ccy2 + "_normalised"]
ESI[esi_cols[0]] = ESI["ESI_" + ccy1 + "_normalised"] - ESI["ESI_" + ccy2 + "_normalised"]

# Accumulate the series into running totals and re-attach the dates.
EDI_tradable = EDI[edi_cols].cumsum()
ESI_tradable = ESI[esi_cols].cumsum()
EDI_tradable['Date'] = EDI['Date']
ESI_tradable['Date'] = ESI['Date']

# Window lengths handed to trends_features (as all in days).
short, medium, long, longest = 21, 55, 100, 200
medium_multiplier, long_multplier = 1, 1

# For every feature, run trends_features on a deep copy of the frame that owns
# the column and keep only the two derived columns we need.
for col in features:
    for tradable in (EDI_tradable, ESI_tradable):
        if col not in tradable.columns:
            continue
        data = deepcopy(tradable)
        data = trends_features(data, col, short, medium, long, longest,
                               medium_multiplier, long_multplier)
        tradable[col + "_spotvma"] = data['spot_v_HF']
        tradable[col + "_madiff"] = data['HF_ema_diff']
print(EDI_tradable.columns)
'''
------ Save file name as Signals_FullsamplewBlind.csv if running on the withheld data ----
'''
signal_location = r"Signals_Fullsample.csv"
signals = pd.read_csv(signal_location)
signals['Date'] = pd.to_datetime(signals['Date'], format= '%d/%m/%Y %H:%M')
# Calendar day of each timestamp; used as the grouping key below.
signals['DDMMYY'] = signals['Date'].dt.date
# Collapse the intraday rows to one row per day by summing.
# numeric_only=True restricts the sum to numeric columns: the datetime 'Date'
# column cannot be summed and raises a TypeError on pandas >= 2.0 (older
# versions silently dropped it, which is the behaviour kept here).
daily_groups = signals.groupby('DDMMYY')
trend_estimation = daily_groups.sum(numeric_only=True).reset_index(drop = False)
# The currency level is averaged over the day instead of summed. Both
# aggregations come from the same grouping, so the rows are in the same order.
trend_estimation['CCY'] = daily_groups['CCY'].mean().values
# overwriting the date arg as it is easier to understand
trend_estimation['Date'] = trend_estimation['DDMMYY']
# 60-row rolling standard deviation of the daily log returns, scaled by sqrt(260).
trend_estimation[ccy1+ccy2 +"_vol"] = trend_estimation['logret'].rolling(60).std()*np.sqrt(260)
# Attach the macro features by date; days without a match are left as NaN here
# and written out as 0 below.
trend_estimation = pd.merge(trend_estimation,EDI_tradable,how="left" ,on= "Date")
trend_estimation = pd.merge(trend_estimation,ESI_tradable,how="left" ,on= "Date")
'''
----- Save the file as trendestimationwBlind.csv if running with withheld data
'''
trend_location = r"trendestimation.csv"
trend_estimation.replace(np.nan,0).to_csv(trend_location)

from model_functions import standardise_data
import numpy as np
import pandas as pd
from create_model_features import trends_features
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from model_functions import create_train_test_file
from run_decision_tree import set_params_trend_estimate, set_params_random_forests
# Pull the run configuration from the two parameter helpers.
rf_dict = set_params_random_forests()
trend_estimate_dict = set_params_trend_estimate()
trade_horizon = trend_estimate_dict['trade_horizon'] # roughly 1 month ahead.
std_window = trend_estimate_dict['std_window'] # 1 year
train_size = trend_estimate_dict['train_size']# no. data points in the train
test_split = trend_estimate_dict['test_split']
test_buffer = trend_estimate_dict['test_buffer']
concat_results = False
# open trend estimation file from here
trend_location = r"trendestimation.csv"
trend_estimation = pd.read_csv(trend_location)
# list of available strats
trend_strats = ["Linear","SVM_erf","RF_erf","LSTM1_erf","LSTM2_erf"]
trend_strats_blind = ["SVM","RF", "LSTM"]
# trend features
T_features = ["EURUSD_vol","ESI_EURUSD_spread_spotvma", "ESI_EURUSD_spread_madiff",       "ESI_G10_normalised_spotvma",
                   "ESI_G10_normalised_madiff", "ESI_Global_normalised_spotvma", "ESI_Global_normalised_madiff",
                   "EDI_EURUSD_spread_spotvma", "EDI_EURUSD_spread_madiff",       "EDI_G10_normalised_spotvma",
                   "EDI_G10_normalised_madiff", "EDI_Global_normalised_spotvma",  "EDI_Global_normalised_madiff"]
dynamic_T_features = ["ESI_Global_normalised_madiff", "EDI_EURUSD_spread_spotvma", "EDI_G10_normalised_spotvma", 
                      "EDI_Global_normalised_madiff"]
feats_to_use = T_features  # can place t feats here too if needed
'''
------ Change the below to any one of these ["Linear","SVM_erf","RF_erf","LSTM1_erf","LSTM2_erf"] models
'''
trend_model = "RF_erf"
# Columns needed downstream: the chosen features plus the date and the column
# of the selected trend model. Built as a new list so feats_to_use is untouched.
feats = list(feats_to_use) + ["Date", trend_model]
# now kick off the learning approach
# Take an explicit copy: adding the target column to a bare column selection
# triggers pandas' SettingWithCopyWarning.
df = trend_estimation[feats].copy()
# Forward-looking target: reverse the series, lag by 2 rows, take the rolling
# sum over trade_horizon rows, then reverse back so each row holds the sum of
# the model's values over a window that lies ahead of it.
df['target'] = df[trend_model].iloc[::-1].shift(2).rolling(trade_horizon).sum().values[::-1]
# maybe best to standardise on the sklearn methods
# df = standardise_data(df, feats, T_features , std_window)
data_size = int(df.shape[0]*train_size) # data size implies the size of the training set
train, test = create_train_test_file(df, data_size, test_split, test_buffer, concat_results)

'''
---- Now run the random forest engine on this code
'''
from sklearn import tree
from run_decision_tree import decision_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# Standardise the features: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
use_RF = rf_dict['use_RF']
train_sample = pd.DataFrame(scaler.fit_transform(train[feats_to_use]), columns = list(feats_to_use))
test_sample = pd.DataFrame(scaler.transform(test[feats_to_use]), columns = list(feats_to_use))
use_classifier = rf_dict['use_classifier']
use_risk_adjusted = rf_dict['use_risk_adjusted']
ntree = rf_dict['ntrees'][0]
max_features = rf_dict['max_features']
max_depth = rf_dict['max_depth']
# Attach the target by position (.values). The scaled frames carry a fresh
# 0..n-1 index, so assigning the Series directly would align on index labels
# and fill the target with NaN wherever train/test keep their original index.
# It is set unconditionally so the fit at the bottom always has a target.
train_sample['target'] = train['target'].values
test_sample['target'] = test['target'].values
# if you want to run a random forest.
# NOTE(review): `results` and `acc_score` only exist when use_RF is True; the
# back-test further down reads `results`.
if use_RF:
    results, acc_score = decision_tree(train_sample, test_sample,use_classifier, use_risk_adjusted,
                                       ntree, max_features, max_depth)
if use_classifier:
    # Classification works on the sign of the target only.
    train_sample['target'] = np.sign(train['target'].values)
    test_sample['target'] = np.sign(test['target'].values)
    clf = tree.DecisionTreeClassifier(max_leaf_nodes = 6, max_depth = 12)
    RF = RandomForestClassifier(n_estimators=ntree, max_features= max_features,max_depth = max_depth, verbose=0)
else:
    clf = tree.DecisionTreeRegressor(max_leaf_nodes = 6, max_depth = 12)
    RF = RandomForestRegressor(n_estimators=ntree, max_features= max_features, max_depth = max_depth,verbose=0)
#clf_model = clf.fit(train_sample[["EDI_EURUSD_spread_spotvma","EDI_G10_normalised_spotvma"]],train_sample['target'])
# Fit on the named feature columns rather than "everything but the last column".
rf_model = RF.fit(train_sample[list(feats_to_use)], train_sample['target'])

'''
-----Create the variable Importance plot
'''
from matplotlib import pyplot as plt
# Horizontal bar chart of the fitted forest's feature importances, saved to a
# PNG named after the selected trend model.
importances = rf_model.feature_importances_
indices = np.argsort(importances)  # ascending rank of the scores; not used by the plot below
plt.figure(figsize=(30,14))
plt.title('Relative Feature Importances', fontsize=20)
plt.barh(range(len(importances)), importances, color='b', align='center')
plt.xlabel('Relative Importance Score', fontsize=20)
# Label the bars with the features the model was actually fitted on. Using the
# full T_features list mislabels (or fails) when feats_to_use is a different list.
plt.yticks(range(len(importances)), list(feats_to_use), fontsize=18)
varimp_location = r"Varimp_%s.png" % (trend_model)
plt.savefig(varimp_location)
#plt.show()
'''
---- Now calculate the model returns
'''
def get_trade(row):
    '''
    Turn a signal value into a long/flat flag.

    :param row: a single numeric signal value
    :return: 1 when the value is strictly positive, otherwise 0
             (zero, negative values and NaN all give 0)
    '''
    return 1 if row > 0 else 0

def trend_estimate_backtester(test, results, trend_model, horizon=None):
    '''
    Build a long/flat filter from the predictions and apply it to the column
    of the chosen trend model.

    :param test: DataFrame with a "Date" column and a column named by trend_model
    :param results: predictions, one per row of test
    :param trend_model: name of the column in test that the filter is applied to
    :param horizon: length of the rolling window used to smooth the predictions;
                    when omitted, the module-level trade_horizon is used
    :return: test merged with the predictions plus the columns scaled_signal,
             trend_buy_sell, strat_returns, strat_returns_sum and nonfiltered_ret
    '''
    # Fall back to the notebook-level setting so existing three-argument calls
    # keep working, while callers can now pass the window explicitly.
    if horizon is None:
        horizon = trade_horizon
    predictions = pd.DataFrame({"Date": test['Date'], "Predictions": results})
    test_results = pd.merge(test, predictions, how="left", on="Date").fillna(0)
    # Lag the predictions by two rows and average them over the window.
    test_results["scaled_signal"] = test_results['Predictions'].shift(2).rolling(horizon).sum() / horizon
    # 1 where the smoothed signal is strictly positive, else 0. The NaN rows at
    # the start of the window compare False and therefore give 0.
    test_results['trend_buy_sell'] = (test_results['scaled_signal'] > 0).astype('int64')
    test_results['strat_returns'] = test_results[trend_model] * test_results['trend_buy_sell']
    test_results['strat_returns_sum'] = test_results['strat_returns'].cumsum()
    # Cumulative sum of the unfiltered column, for comparison.
    test_results['nonfiltered_ret'] = test_results[trend_model].cumsum()
    return test_results
# Run the back-test over the test window for the selected trend model.
test_results = trend_estimate_backtester(test, results, trend_model)
# The output file name records the run configuration.
# if saving the withheld data then save file as BLIND_csv
run_tag = (trend_model, trade_horizon, ntree, use_classifier, use_risk_adjusted)
test_location = r"model%s_TH%s_ntree%s_classifier%s_riskadj%s.csv" % run_tag
test_results.to_csv(test_location, index=False)





In [None]:
'''
This code creates the process for the random data generation
'''

'''
This file generates the trending price series on which the model will train.
'''
from matplotlib import pyplot as plt
from generate_trend import get_trendy_data
from create_model_features import trends_features 
import numpy as np
# Where the generated series is written.
file_location = r"CcyRandomTrend.csv"

# Generation settings.
n_samples = 5000 # for 50000 if training the SVM and RF
trend_strength = 0.068 # alpha a value between -0.5 to 0.15
pct_stdev = 0.008 # stdev value between 0.001 and 0.01
CCY_COL = "trend"

# Window lengths and multipliers passed through to get_trendy_data.
short, medium, long, longest = 5, 21, 55, 100
medium_multiplier = 24 # one day in hours
long_multplier = 120 # week in hours
window_args = (short, medium, long, longest, medium_multiplier, long_multplier)

# Generate the series, save it, then plot two of its columns on the same axes.
ccy_data = get_trendy_data(n_samples, trend_strength, pct_stdev, CCY_COL, *window_args)
ccy_data.to_csv(file_location)
for plotted_col in ('CCY', 'LF_short'):
    ccy_data[plotted_col].plot()
plt.show()