In [2]:
import multiprocessing
import time
import os
import random
import multiprocessing
import warnings
warnings.filterwarnings('ignore')
import datetime
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt

import kaleido
import pandas as pd
import numpy as np

from copy import deepcopy
from scipy.linalg import pinv
import random
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import matplotlib.dates as mdates
from joblib import Parallel, delayed
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
from AdaptiveBenignOverfitting import *
from forecast_utils import *
from backtesting_utils import *
from features.feature_module import *

In [4]:
# Seed every global RNG this notebook has in scope so stochastic steps are repeatable.
# NOTE(review): the helpers pulled in by the star imports presumably draw from
# numpy's global RNG (random features, bag sampling) — confirm. Worker processes
# started by joblib may still need their own seeding.
SEED = 12
random.seed(SEED)
np.random.seed(SEED)

In [5]:
# Read the daily GBPUSD workbook, index it by the 'Date' column and derive the
# bid/ask spread as ask minus bid.
df = pd.read_excel('~/Dropbox/FX/GBPUSD_df_daily.xlsx').set_index('Date')
df['GBPUSD_SPREAD'] = df['GBPUSD_PX_ASK'].sub(df['GBPUSD_PX_BID'])

In [6]:
def get_AR_MA_features(df_old, columns, window_sizes):
    """
    Builds return-based moving-average and lag features for the specified columns.

    For every column the percentage change is computed; for every window size
    four series are derived from those returns: the rolling mean (SMA), the
    exponentially weighted mean (EMA, ``span=window_size``, ``adjust=False``),
    the returns shifted by ``window_size`` rows, and the rolling standard deviation.

    Parameters:
    df_old (DataFrame): The source DataFrame. It is only read, never modified.
    columns (list): List of column names to calculate features for.
    window_sizes (list): List of window sizes for calculating SMA, EMA, lag and rolling std.

    Returns:
    DataFrame: A NEW DataFrame on the same index as ``df_old`` that contains only
    the generated columns, named ``{col}_returns``, ``{col}_sma_{w}``,
    ``{col}_ema_{w}``, ``{col}_AR_{w}`` and ``{col}_rolling_std_{w}``.
    """
    # Collect the series first and build the frame once: inserting hundreds of
    # columns one by one fragments the DataFrame.
    features = {}
    for col in columns:
        # Calculate returns
        returns = df_old[col].pct_change()
        features[f'{col}_returns'] = returns

        for window_size in window_sizes:
            rolling_returns = returns.rolling(window=window_size)

            # SMA of returns
            features[f'{col}_sma_{window_size}'] = rolling_returns.mean()

            # EMA of returns
            features[f'{col}_ema_{window_size}'] = returns.ewm(span=window_size, adjust=False).mean()

            # Lagged Returns
            features[f'{col}_AR_{window_size}'] = returns.shift(window_size)

            # Rolling Standard Deviation
            features[f'{col}_rolling_std_{window_size}'] = rolling_returns.std()

    # Passing the index explicitly keeps it even when no column was requested.
    return pd.DataFrame(features, index=df_old.index)

In [7]:
def calc_surprise(df):
    """
    Adds a standardised surprise column for every economic release found in df.

    A release is recognised by a column whose name contains '_ACTUAL_RELEASE';
    the text before that marker is the prefix. For each prefix the surprise is

        (ACTUAL_RELEASE - SURVEY_AVERAGE) / FORECAST_STANDARD_DEVIATION

    and, where the forecast standard deviation is exactly 0, a quarter of the
    survey range ((SURVEY_HIGH - SURVEY_LOW) / 4) is used as the divisor instead.
    Infinite and missing results are set to 0.

    Parameters:
    df (DataFrame): Frame holding, for every prefix, the columns
        '{prefix}_ACTUAL_RELEASE', '{prefix}_SURVEY_AVERAGE',
        '{prefix}_FORECAST_STANDARD_DEVIATION', '{prefix}_SURVEY_HIGH' and
        '{prefix}_SURVEY_LOW'. A missing column raises KeyError.

    Returns:
    DataFrame: The same object that was passed in, with one '{prefix}_SURP'
    column added (or overwritten) per release. The input is modified in place.
    """
    prefixes = [col.split('_ACTUAL_RELEASE')[0] for col in df.columns if '_ACTUAL_RELEASE' in col]

    for prefix in prefixes:
        act_col = f"{prefix}_ACTUAL_RELEASE"
        est_col = f"{prefix}_SURVEY_AVERAGE"
        std_col = f"{prefix}_FORECAST_STANDARD_DEVIATION"
        high_col = f"{prefix}_SURVEY_HIGH"
        low_col = f"{prefix}_SURVEY_LOW"
        surp_col = f"{prefix}_SURP"

        # Zero dispersion would divide by zero, so fall back to range / 4.
        scale = np.where(df[std_col] == 0, (df[high_col] - df[low_col]) / 4, df[std_col])
        surprise = (df[act_col] - df[est_col]) / scale

        # Clean on the Series and assign once. `df[col].replace(..., inplace=True)`
        # is a chained in-place call that does not write back to df under
        # pandas Copy-on-Write.
        df[surp_col] = surprise.replace([np.inf, -np.inf], 0).fillna(0)
    return df

In [8]:
def get_technicals(df, window_sizes):
    """
    Builds RSI, stochastic-oscillator and ADX columns for GBPUSD.

    Parameters:
    df (DataFrame): Frame with the columns 'GBPUSD_PX_LAST' (close),
        'GBPUSD_PX_HIGH' and 'GBPUSD_PX_LOW'.
    window_sizes (list): Look-back windows; one column per indicator per window.

    Returns:
    DataFrame: A new DataFrame on df's index with the columns 'RSI_{w}' for
    every window, then 'oscillator_{w}' for every window, then 'adx_{w}' for
    every window.

    NOTE(review): RSIIndicator and ADXIndicator are not imported in the visible
    cells; they presumably arrive through one of the star imports (the call
    signatures match the `ta` package) — confirm.
    """
    close_prices = df['GBPUSD_PX_LAST']
    high_prices = df['GBPUSD_PX_HIGH']
    low_prices = df['GBPUSD_PX_LOW']

    # Collected in insertion order, which fixes the output column order.
    technicals = {}

    for i in window_sizes:
        technicals['RSI_' + str(i)] = RSIIndicator(close=close_prices, window=i).rsi()

    for i in window_sizes:
        lowest_low = low_prices.rolling(window=i).min()
        # The range is highest high minus lowest low. The previous code took the
        # rolling MIN of the highs, which understated the range and pushed the
        # oscillator outside its intended 0-100 band.
        highest_high = high_prices.rolling(window=i).max()
        # A flat window (highest high == lowest low) divides by zero and yields
        # inf/NaN; that is left for the caller to clean up.
        technicals['oscillator_' + str(i)] = 100 * ((close_prices - lowest_low) / (highest_high - lowest_low))

    for i in window_sizes:
        adxI = ADXIndicator(high=high_prices, low=low_prices, close=close_prices, window=i)
        technicals['adx_' + str(i)] = adxI.adx()

    return pd.DataFrame(technicals, index=df.index)



In [9]:
# Source columns whose returns are turned into AR/MA features further down.
columns = [
    'SPX_PX_MID',
    'UKX_PX_MID',
    'GBPUSD_PX_LAST',
    'GBPUSD_PX_LOW',
    'GBPUSD_PX_HIGH',
    'GBPUSD_SPREAD',
    'GBPUSD_BASIS_1W',
    'GBPUSD_BASIS_1M',
    'GBPUSD_FRD_1W',
    'GBPUSD_FRD_1M',
    'USD_BOND_3M',
    'GBP_BOND_1Y',
]

In [10]:
# Look-back windows (in rows) shared by the AR/MA and the technical features.
window_sizes = [5, 6, 8, 10, 12, 15, 16, 20, 30, 40]

In [11]:
# Return, SMA, EMA, lag and rolling-std features for every selected column.
df_ARMA = get_AR_MA_features(df_old=df, columns=columns, window_sizes=window_sizes)

In [12]:
# RSI, oscillator and ADX columns for the same set of windows.
df_technicals = get_technicals(df=df, window_sizes=window_sizes)

In [13]:
# Inner-join the technical and the AR/MA features on their shared index.
df_feature = df_technicals.merge(df_ARMA, left_index=True, right_index=True)

In [14]:
# Every economic release is identified by its '_ACTUAL_RELEASE' column; the text
# in front of that marker is the release prefix.
eco_index_cols = []
prefixes = []
for col in df.columns:
    if '_ACTUAL_RELEASE' in col:
        eco_index_cols.append(col)
        prefixes.append(col.split('_ACTUAL_RELEASE')[0])

In [15]:
# Keep every column whose name contains one of the release prefixes.
# NOTE(review): this is a substring match, so a short prefix also matches
# unrelated columns that merely contain it — confirm that is intended.
macro_columns = [
    name
    for name in df.columns
    if any(prefix in name for prefix in prefixes)
]

In [16]:
# Independent copy of the macro columns. Selecting from df keeps df's index even
# when macro_columns is empty. Assigning the columns into an empty DataFrame left
# an empty index in that case, which made the later inner merge drop every row.
# The copy also lets calc_surprise add columns without touching df.
df_macro = df[macro_columns].copy()

In [17]:
# Append one '<prefix>_SURP' surprise column per economic release.
df_macro = calc_surprise(df=df_macro)

In [18]:
# Inner-join the macro data (including the surprise columns) onto the feature set.
df_feature = df_feature.merge(df_macro, left_index=True, right_index=True)

In [19]:
# Carry the last GBPUSD price along as 'close'; the forecasting cell reads this column.
df_feature = df_feature.assign(close=df['GBPUSD_PX_LAST'])

In [20]:
# Volatility, skew and kurtosis columns (1W and 1M) copied from the raw frame,
# aligned on the index.
deriv_cols = [
    'GBPUSD_VOLA_1W', 'GBPUSD_VOLA_1M',
    'GBPUSD_SKEW_1W', 'GBPUSD_SKEW_1M',
    'GBPUSD_KURT_1W', 'GBPUSD_KURT_1M',
]
df_feature[deriv_cols] = df.loc[:, deriv_cols]

In [21]:
# Forward-fill gaps with the last observed value in each column.
df_feature = df_feature.ffill()

In [22]:
# Turn +/- infinity into NaN so the dropna in the next cell removes those rows.
df_feature = df_feature.replace([np.inf, -np.inf], np.nan)

In [23]:
# Drop every row that still has a missing value in any column.
df_feature = df_feature.dropna()

In [None]:
# Element-wise product of the 'binary' and 'target' columns of df_perf.
# NOTE(review): df_perf is not defined in the visible cells — confirm where it
# comes from before a Restart & Run All.
backtest = df_perf['binary'].mul(df_perf['target'])

In [None]:
# Running (cumulative) sum of the per-row backtest values.
array_2 = backtest.cumsum()

In [None]:
#array_eps = (df_perf['signal_large']*df_perf['target']).cumsum()

In [None]:
# Plot (1 + cumulative series) * 100000 for both runs on one chart.
# NOTE(review): `array` is not defined in the visible cells — presumably the
# cumulative series of an earlier run; confirm before re-running.
for cumulative_series, curve_label in (
    (array, 'Expanding window + ff'),
    (array_2, 'Rolling window'),
):
    plt.plot((1 + cumulative_series) * 100000, label=curve_label)
plt.xticks(rotation=45)
plt.legend()
plt.show()

In [None]:
# Plot the 'Close' column of df_perf with the x tick labels rotated by 45 degrees.
# NOTE(review): the feature frame built above names its price column 'close'
# (lower case) — confirm df_perf really has a 'Close' column.
plt.plot(df_perf['Close'])
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Display df_perf as the cell output.
df_perf

In [None]:
# Walk-forward forecast. One model per feature bag is initialised on the most
# recent roll_size observations; the models are then updated one row of df_future
# at a time, and the mean bag prediction is stored next to the realised return.
# NOTE(review): df_past, df_future, roll_size, D, sigma, n_bags, feature_num, ff,
# l and exp_window are not defined in the visible cells. prepare_data,
# GaussianRFF, sample_features, process_initial_bag and process_updated_bag
# presumably come from the star imports — confirm before a Restart & Run All.

MAX_UPDATE_STEPS = 300  # upper bound on the number of one-step updates

last_index_df_past = df_past.index[-1:]
indices_df_future = df_future.index[:-1]
combined_indices = last_index_df_past.append(indices_df_future)
results_df = pd.DataFrame(index=combined_indices)
results_df['actual'] = np.nan
results_df['mean'] = np.nan
# Column positions for single-step positional writes. The chained form
# results_df[col].iloc[k] = value does not write through under pandas Copy-on-Write.
actual_pos = results_df.columns.get_loc('actual')
mean_pos = results_df.columns.get_loc('mean')

# Initialize the necessary lists
models = []
bags = []
all_bags_array = []
betas_array = []
# Select the most recent data from the available dataframe
df_model = df_past[-(roll_size+1):] # size is roll_size + 1, because we need 1 more point to make prediction
                                    # for that point we don't know the target variable yet

# calculate targets and scale the data
Y, X, scaler_Y, scaler_X = prepare_data(df_model)

# perform RFF transformation
lags = X.shape[0]
rff = GaussianRFF(lags, D, sigma)
X_trans = rff.transform(X.reshape(lags, roll_size+1)).T

# Sampling features in each bag
features_array = sample_features(D, n_bags, feature_num)

for p in range(n_bags):
    bags.append(X_trans[:, features_array[p]])

# Parallel execution of the first loop. Model initialization
results = Parallel(n_jobs=-1)(
    delayed(process_initial_bag)(p, bags, Y, scaler_Y, ff, l, feature_num, roll_size, exp_window)
    for p in tqdm(range(0, n_bags))
)
all_bags_preds = np.array([result[0] for result in results])
models = [result[1] for result in results]
betas = np.array([result[2] for result in results])
betas_array.append(betas)
all_bags_array.append(np.array(all_bags_preds).T)
# Add results in a results dataframe for comparison.
# .iloc is explicit positional access: `series[0]` / `series[-1]` on a
# label-indexed Series is deprecated and stops meaning "by position".
results_df.iloc[0, actual_pos] = df_future['close'].iloc[0] / df_past['close'].iloc[-1] - 1  # actual target
results_df.iloc[0, mean_pos] = np.mean(all_bags_preds)

# Continue performing forecasts by updating QR_RLS model
df_temp = df_model

# we need the last row of RFF dataset to append it to train set on next iteration
X_old = X_trans[-1, :].T

# Each step reads df_future row i+1 and writes results_df row i+1, so at most
# len(df_future) - 1 steps are possible. Capping here avoids an IndexError at the
# end of the run when fewer than MAX_UPDATE_STEPS + 1 future rows exist.
n_steps = min(MAX_UPDATE_STEPS, len(df_future) - 1)

for i in tqdm(range(n_steps)):

    # Delete old data and append data that just became available.
    # DataFrame.append was removed in pandas 2.0; concat with a one-row frame
    # (double brackets keep it a DataFrame) is the replacement.
    df_temp = pd.concat([df_temp.iloc[1:], df_future.iloc[[i]]])

    Y, X, scaler_Y, scaler_X = prepare_data(df_temp)

    ## We need to perform RFF expansion on the new observation row. For which we don't have target
    ## And which will be used for forecasting
    X_new = rff.transform(X[:, -1:].reshape(lags, 1))

    # Parallel execution of the second loop
    results = Parallel(n_jobs=-1)(
        delayed(process_updated_bag)(p, X_old, X_new, models, scaler_Y, Y, features_array, feature_num)
        for p in range(0, n_bags)
    )
    all_bags_preds = np.array([result[0] for result in results])
    betas = [result[1] for result in results]
    betas_array.append(betas)
    all_bags_array.append(np.array(all_bags_preds).T)
    # new observation will be appended to train set in the next iteration
    X_old = X_new

    # record results
    results_df.iloc[i + 1, actual_pos] = df_future['close'].iloc[i + 1] / df_temp['close'].iloc[-1] - 1  # actual target
    results_df.iloc[i + 1, mean_pos] = np.mean(all_bags_preds)

    ## TODO: CHANGE-POINT test here

    # Progress report every 10th step.
    # NOTE(review): rows 0..i+1 are filled at this point but only the first i rows
    # are scored — confirm that excluding the two newest rows is intended.
    if i % 10 == 0 and i > 0:
        mean_so_far = results_df['mean'].iloc[:i]
        actual_so_far = results_df['actual'].iloc[:i]

        same_sign_count = ((mean_so_far > 0) & (actual_so_far > 0)).sum() + ((mean_so_far < 0) & (actual_so_far < 0)).sum()

        # Calculate the percentage
        total_rows = len(mean_so_far)
        percentage_same_sign = (same_sign_count / total_rows) * 100

        # p: sample correlation between the mean forecast and the realised return
        p = mean_so_far.corr(actual_so_far)

        # Calculate rolling Sharpe ratio. The accuracy is reported once, on the
        # same line (it used to be printed a second time on its own).
        if not np.isnan(p):  # Check if p is not NaN
            rolling_sharpe_ratio = (p / np.sqrt(p**2 + 1)) * np.sqrt(252)
            print(f"Accuracy on iteration {i}: {percentage_same_sign:.2f}%, Rolling Sharpe Ratio: {rolling_sharpe_ratio:.2f}")
        else:
            print(f"Accuracy on iteration {i}: {percentage_same_sign:.2f}%, Rolling Sharpe Ratio: Cannot be calculated (NaN)")