In [11]:
import pandas as pd
import numpy as np
import yfinance as yf

from ta.momentum import RSIIndicator, StochasticOscillator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.trend import MACD
from ta.trend import adx
from ta.volume import OnBalanceVolumeIndicator

#### data coll and preprocess

In [None]:
# # currency pairs
# major_pairs = [
#     "EUR/USD", "USD/JPY", "GBP/USD", "USD/CHF",
#     "AUD/USD", "USD/CAD", "NZD/USD"
# ]
# minor_pairs = [
#     "EUR/GBP", "EUR/CHF", "EUR/AUD", "EUR/CAD", "EUR/NZD", "EUR/JPY",
#     "GBP/JPY", "GBP/AUD", "GBP/CAD", "GBP/NZD", "GBP/CHF",
#     "AUD/JPY", "AUD/CAD", "AUD/CHF", "AUD/NZD",
#     "NZD/JPY", "NZD/CAD", "NZD/CHF",
#     "CAD/JPY", "CAD/CHF",
#     "CHF/JPY",
#     "USD/KRW", "EUR/KRW", "GBP/KRW", "AUD/KRW", "CAD/KRW", 
#     "CHF/KRW", "NZD/KRW", "JPY/KRW"
# ]
# exotic_pairs = [
#     "USD/HKD", "USD/ZAR", "USD/THB",
#     "USD/MXN", "USD/DKK", "USD/NOK", "USD/SEK", "USD/PLN", "USD/CZK",
#     "EUR/ZAR", "EUR/NOK", "EUR/SEK", "EUR/DKK", "EUR/HUF", "EUR/PLN",
#     "GBP/ZAR",
#     "AUD/ZAR",
#     "CHF/ZAR"
# ]

In [2]:
currency_pairs = [
    "EUR/USD", "USD/JPY", "GBP/USD", "USD/CHF",
    "AUD/USD", "USD/CAD", "NZD/USD"
]



# Format func for yf
def format_pairs(pair_list):
    """
    'EUR/USD' --> 'EURUSD=X'.
    """
    return [pair.replace('/', '') + '=X' for pair in pair_list]

# Format pairs
formatted_pairs = format_pairs(currency_pairs)
# Def date range
start_date = '2010-01-01'
end_date = '2020-12-31'

# Fetching func
def fetch_data(pairs, start_date, end_date):
    """
    daily OHLCV
    """
    forex_data = {}
    for pair in pairs:
        ticker = yf.Ticker(pair)
        # Fetch daily data
        data = ticker.history(start=start_date, end=end_date, interval='1d')
        if not data.empty:
            forex_data[pair] = data
    return forex_data

# Fetch - returns dict of dataframes
forex_data = fetch_data(formatted_pairs, start_date, end_date)

In [3]:
forex_data['EURUSD=X'].head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-01 00:00:00+00:00,1.432706,1.440196,1.432706,1.438994,0,0.0,0.0
2010-01-04 00:00:00+00:00,1.431004,1.445191,1.426208,1.442398,0,0.0,0.0
2010-01-05 00:00:00+00:00,1.44271,1.44831,1.435194,1.436596,0,0.0,0.0
2010-01-06 00:00:00+00:00,1.436596,1.44346,1.429123,1.440403,0,0.0,0.0
2010-01-07 00:00:00+00:00,1.4403,1.444481,1.430206,1.431803,0,0.0,0.0


#### feature engieering

In [4]:
# Def lookback periods for different features
lookback_periods = {
    'momentum': 10,      # [t0,t1] for momentum indicators
    'volatility': 14,    # [t0,t1] for volatility indicators
    'correlation': 30    # [t0,t1] for correlation
}

def compute_features_for_pair(data, pair_name):
    df = data.copy()
    
    # Ensure df sorted by date
    df.sort_index(inplace=True)
    
    # Shift 1 to exclude current date price aka close is now yesterday's close
    df['Close_Shifted'] = df['Close'].shift(1)
    df['High_Shifted'] = df['High'].shift(1)
    df['Low_Shifted'] = df['Low'].shift(1)
    df['Volume_Shifted'] = df['Volume'].shift(1) if 'Volume' in df.columns else np.nan
    
    # --- Feature Calculations using shifted data ---
    
    # Return over the past momentum days (exclu current date)
    # Simple % change for past 10 days
    df['Momentum'] = df['Close_Shifted'].pct_change(periods=lookback_periods['momentum'])
    
    # Rolling std of daily returns (exclu current date) of past 14 days
    df['Returns'] = df['Close_Shifted'].pct_change()
    df['Volatility'] = df['Returns'].rolling(window=lookback_periods['volatility']).std()
    
    # RSI 14 days - use function from ta
    rsi_indicator = RSIIndicator(close=df['Close_Shifted'], window=14)
    df['RSI'] = rsi_indicator.rsi()
    
    # STOCH 14 days (fast), MA 3 days (slow)
    if 'High_Shifted' in df.columns and 'Low_Shifted' in df.columns:
        stoch_osc = StochasticOscillator(
            high=df['High_Shifted'], 
            low=df['Low_Shifted'], 
            close=df['Close_Shifted'], 
            window=14, smooth_window=3)
        df['Stoch_K'] = stoch_osc.stoch()
        df['Stoch_D'] = stoch_osc.stoch_signal()
    else:
        df['Stoch_K'] = np.nan
        df['Stoch_D'] = np.nan
    
    # BB 20 days
    # Calculating by adding or subtracting a std dev drom 20-day MA
    bollinger = BollingerBands(close=df['Close_Shifted'], window=20)
    df['BB_High'] = bollinger.bollinger_hband()
    df['BB_Low'] = bollinger.bollinger_lband()
    
    # ATR 14 days - measure of market volatility
    if 'High_Shifted' in df.columns and 'Low_Shifted' in df.columns:
        atr_indicator = AverageTrueRange(
            high=df['High_Shifted'], 
            low=df['Low_Shifted'], 
            close=df['Close_Shifted'], 
            window=14)
        df['ATR'] = atr_indicator.average_true_range()
    else:
        df['ATR'] = np.nan
    
    # MACD - diff bw 12-period EMA and 26-period EMA of shifted prices
    # Difference between MACD and Signal line "MACD_Diff" used to generate signals
    macd_indicator = MACD(close=df['Close_Shifted'])
    df['MACD'] = macd_indicator.macd()
    df['MACD_Signal'] = macd_indicator.macd_signal() # 9-period EMA of MACD
    df['MACD_Diff'] = macd_indicator.macd_diff()

    # ADX - measures trend strength
    # 14 period window suggested by Wilder(1978) but given strategy is to hold for 5 days, using a 5 or 7 day window may be more appropriate
    adx_indicator = adx(high=df['High_Shifted'], low=df['Low_Shifted'], close=df['Close_Shifted'], window=14)
    df['ADX'] = adx_indicator.adx()
    
    # --- Prep df ---
    
    # Features - returned df only includes the technical features
    feature_columns = ['Momentum', 'Volatility', 'RSI', 'Stoch_K', 'Stoch_D',
                       'BB_High', 'BB_Low', 'ATR', 'MACD', 'MACD_Signal', 'MACD_Diff']
    df_features = df[feature_columns].copy()
    df_features['CurrencyPair'] = pair_name
    df_features.index = df.index
    
    return df_features

# Features for all pairs
# forex_data: dict of dataframes
feature_dfs = []
for pair, data in forex_data.items():
    if not data.empty and 'Close' in data.columns:
        df_features = compute_features_for_pair(data, pair)
        feature_dfs.append(df_features)

# Concat 
features_all_pairs = pd.concat(feature_dfs)

# Reset index so treated as column
features_all_pairs.reset_index(inplace=True)
features_all_pairs.rename(columns={'index': 'Date'}, inplace=True)

#### Compute Rolling Correlations

In [5]:
# Prepare the returns DataFrame for correlation computation
prices = pd.DataFrame()
for pair, data in forex_data.items():
    if not data.empty:
        prices[pair] = data['Close']
prices.dropna(inplace=True)

# Shift 1 to exclude current date price as used the original forex_data
prices_shifted = prices.shift(1)
returns = prices_shifted.pct_change()

# Rolling average correlation for each pair
# Window is defined in first cell (currently 30 days)
def compute_rolling_correlations(returns, window):
    avg_corr = returns.rolling(window).corr()
    avg_corr_df = pd.DataFrame(index=returns.index, columns=returns.columns)
    
    # Avg corr for each currency pair at a given date
    for date in returns.index[window - 1:]:
        corr_matrix = avg_corr.loc[date] # Corr matrix for that date
        for pair in returns.columns:
            # Exclude self-corr, calculate avg correlations with other pairs for the past 
            correlations = corr_matrix[pair].drop(pair, errors='ignore')
            avg_corr_df.at[date, pair] = correlations.mean()
    
    return avg_corr_df

avg_correlations = compute_rolling_correlations(returns, lookback_periods['correlation'])
avg_corr_long = avg_correlations.stack().reset_index()
avg_corr_long.columns = ['Date', 'CurrencyPair', 'AvgCorrelation']

# Merge
features_all_pairs = pd.merge(features_all_pairs, avg_corr_long, on=['Date', 'CurrencyPair'], how='left')

In [6]:
avg_corr_long

Unnamed: 0,Date,CurrencyPair,AvgCorrelation
0,2010-02-15 00:00:00+00:00,EURUSD=X,0.099875
1,2010-02-15 00:00:00+00:00,USDJPY=X,0.085146
2,2010-02-15 00:00:00+00:00,GBPUSD=X,0.165885
3,2010-02-15 00:00:00+00:00,USDCHF=X,-0.391381
4,2010-02-15 00:00:00+00:00,AUDUSD=X,0.150194
...,...,...,...
19805,2020-12-30 00:00:00+00:00,GBPUSD=X,0.024355
19806,2020-12-30 00:00:00+00:00,USDCHF=X,-0.159271
19807,2020-12-30 00:00:00+00:00,AUDUSD=X,0.064384
19808,2020-12-30 00:00:00+00:00,USDCAD=X,-0.236388


#### Label Generation

In [7]:
# Def holding period - number of days to hold each position
holding_period = 5  # e.g., 5 days

# Future returns over the holding period
def compute_future_returns(prices, holding_period):
    """
    Include the current date in the holding period
    """
    # Negative shifts bring future returns to the current date, +1 to account for wanting todays close as part of future calculation so no data leakage
    # pct_change looks back by the value specified by periods = . So here it is looking back 5 days where the current day has been shifted back to fro, the shift
    future_returns = prices.shift(-holding_period + 1).pct_change(periods=holding_period)
    return future_returns

# Future returns (inclu current date)
# prices is close from forex_data
future_returns = compute_future_returns(prices, holding_period)
future_returns_long = future_returns.stack().reset_index() # Convert to long format
future_returns_long.columns = ['Date', 'CurrencyPair', 'FutureReturn']

# Merge
features_all_pairs = pd.merge(features_all_pairs, future_returns_long, on=['Date', 'CurrencyPair'], how='left')

# Assign labels based on quantiles of future returns
def label_data(df):
    """
    Assigns labels to each currency pair on each date based on the quantiles of future returns.
    Labels: -1 (Short), 0 (Neutral), 1 (Long)
    """
    # rm nans
    df = df.dropna(subset=['FutureReturn'])
    
    # Group by date to compute daily quantiles
    def assign_labels(group):
        # 30th and 70th percentiles
        lower_quantile = group['FutureReturn'].quantile(0.3)
        upper_quantile = group['FutureReturn'].quantile(0.7)
        
        # Labels based on quantiles
        conditions = [
            group['FutureReturn'] <= lower_quantile,  # Bottom 30% - Short
            group['FutureReturn'] >= upper_quantile   # Top 30% - Long
        ]
        choices = [-1, 1]  # Corresponding labels
        group['Label'] = np.select(conditions, choices, default=0)  # Neutral for others
        return group
    
    # Label assignment to each group, group by date so that the quantiles are computed for each date
    labeled_df = df.groupby('Date').apply(assign_labels)
    return labeled_df

features_all_pairs = label_data(features_all_pairs)

  future_returns = prices.shift(-holding_period + 1).pct_change(periods=holding_period)
  labeled_df = df.groupby('Date').apply(assign_labels)


#### data prep for modeling

In [8]:
# Features to be used
feature_columns = ['Momentum', 'Volatility', 'RSI', 'Stoch_K', 'Stoch_D',
                   'BB_High', 'BB_Low', 'ATR', 'MACD', 'MACD_Signal', 'MACD_Diff',
                   'AvgCorrelation']

features_all_pairs.dropna(subset=feature_columns + ['Label'], inplace=True)
features_all_pairs.reset_index(drop=True, inplace=True)
print("Final dataset shape:", features_all_pairs.shape)

features_all_pairs.iloc[30:40]

Final dataset shape: (19789, 16)


Unnamed: 0,Date,Momentum,Volatility,RSI,Stoch_K,Stoch_D,BB_High,BB_Low,ATR,MACD,MACD_Signal,MACD_Diff,CurrencyPair,AvgCorrelation,FutureReturn,Label
30,2010-02-24 00:00:00+00:00,-0.015801,0.006915,33.149932,14.891122,15.548676,1.617789,1.530609,0.015289,-0.015876,-0.013984,-0.001893,GBPUSD=X,0.164372,-0.027051,-1
31,2010-02-24 00:00:00+00:00,0.01596,0.006886,63.81763,76.489185,68.687663,1.08834,1.047768,0.010869,0.010131,0.010128,4e-06,USDCHF=X,-0.389418,-0.007485,0
32,2010-02-24 00:00:00+00:00,0.017409,0.009781,49.224429,70.354297,84.520738,0.908769,0.865483,0.01181,-0.001209,-0.003901,0.002692,AUDUSD=X,0.170691,0.013665,1
33,2010-02-24 00:00:00+00:00,-0.011432,0.006918,52.103632,44.282018,19.388041,1.078754,1.036564,0.010713,-0.00128,0.000773,-0.002052,USDCAD=X,-0.433665,-0.019422,-1
34,2010-02-24 00:00:00+00:00,-0.000278,0.009274,41.952071,51.421813,55.734624,0.713218,0.684911,0.00945,-0.005184,-0.006553,0.001369,NZDUSD=X,0.175284,0.003138,0
35,2010-02-25 00:00:00+00:00,-0.015277,0.007374,36.17948,23.868255,22.59645,1.400901,1.342103,0.014085,-0.015979,-0.016457,0.000478,EURUSD=X,0.083685,0.011701,1
36,2010-02-25 00:00:00+00:00,0.003881,0.00559,46.580262,44.212069,54.113891,91.750074,88.841826,0.933938,-0.06612,-0.200046,0.133926,USDJPY=X,0.061121,-0.01893,0
37,2010-02-25 00:00:00+00:00,-0.012488,0.006589,32.218332,11.877338,14.746085,1.611295,1.529493,0.014783,-0.016277,-0.014442,-0.001834,GBPUSD=X,0.164117,-0.020374,-1
38,2010-02-25 00:00:00+00:00,0.013503,0.006738,62.568329,69.204433,70.034889,1.088765,1.050453,0.010791,0.009959,0.010094,-0.000135,USDCHF=X,-0.386311,-0.012306,0
39,2010-02-25 00:00:00+00:00,0.020034,0.00864,50.291823,73.619515,78.585186,0.908525,0.865534,0.011641,-0.000974,-0.003315,0.002341,AUDUSD=X,0.168517,0.01213,1


In [10]:
features_all_pairs.head(5)

Unnamed: 0,Date,Momentum,Volatility,RSI,Stoch_K,Stoch_D,BB_High,BB_Low,ATR,MACD,MACD_Signal,MACD_Diff,CurrencyPair,AvgCorrelation,FutureReturn,Label
0,2010-02-18 00:00:00+00:00,-0.021159,0.006979,33.281113,15.039357,24.244319,1.421431,1.349064,0.013711,-0.016374,-0.016506,0.000132,EURUSD=X,0.096412,-0.004686,0
1,2010-02-18 00:00:00+00:00,0.002312,0.005963,55.496274,88.063125,67.793392,91.033683,89.058916,0.921151,-0.312531,-0.499791,0.18726,USDJPY=X,0.072729,-0.008468,0
2,2010-02-18 00:00:00+00:00,-0.014945,0.005976,38.952843,22.484508,24.714181,1.632753,1.54457,0.015316,-0.01229,-0.012133,-0.000157,GBPUSD=X,0.161232,-0.016897,-1
3,2010-02-18 00:00:00+00:00,0.01841,0.005997,66.548409,85.406906,74.118437,1.084414,1.036504,0.010643,0.010017,0.009841,0.000176,USDCHF=X,-0.388175,0.002988,1
4,2010-02-18 00:00:00+00:00,0.018693,0.009571,53.836993,88.584587,82.532503,0.912704,0.864378,0.011601,-0.003575,-0.006899,0.003324,AUDUSD=X,0.17486,-0.00483,0
