In [109]:
# features
# - TEMA
# - EMA4, 9, 18 crossover
# - False Breakout Finder (*new intraday high, close lower than previous high, new intraday low, close higher than previous low
# - ATR
# - Relative Strength Analysis
# - market breadth (indicator of overall market performance)
# - skdj

import investpy
import pandas as pd
import numpy as np
import yfinance as yf
from itertools import combinations

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [124]:
# def download_data(ticker: str):
#     df = yf.download(ticker, period="10y", interval="1d", multi_level_index=False)
#     df.dropna(inplace=True)
#     return df

def calculate_indicators(df, close, high, low, vol):
    df['RSI'] = ta.momentum.RSIIndicator(df[close]).rsi()

    macd = ta.trend.MACD(df[close])
    df['MACD'] = macd.macd()
    df['MACD_Signal'] = macd.macd_signal()
    df['MACD_Hist'] = macd.macd_diff()

    df['ATR'] = ta.volatility.AverageTrueRange(df[high], df[low], df[close]).average_true_range()

    df['TEMA'] = ta.trend.EMAIndicator(df[close], window=30).ema_indicator()
    df['EMA4'] = ta.trend.EMAIndicator(df[close], window=4).ema_indicator()
    df['EMA9'] = ta.trend.EMAIndicator(df[close], window=9).ema_indicator()
    df['EMA18'] = ta.trend.EMAIndicator(df[close], window=18).ema_indicator()

    df['OBV'] = ta.volume.OnBalanceVolumeIndicator(df[close], df[vol], fillna=True).on_balance_volume()
    df['VWAP'] = ta.volume.VolumeWeightedAveragePrice(df[high], df[low], df[close], df[vol], fillna=True).volume_weighted_average_price()
    # df['EMA4_Crossover_up'] = (df['EMA4'] > df['EMA9']) & (df['EMA9'] > df['EMA18'])
    # df['EMA4_Crossover_down'] = (df['EMA4'] < df['EMA9']) & (df['EMA9'] < df['EMA18'])

    stoch = ta.momentum.StochasticOscillator(df[high], df[low], df[close])
    df['SKDJ_K'] = stoch.stoch()
    df['SKDJ_D'] = stoch.stoch_signal()

    return df

def calculate_flags(df, high, low, close):
    df['Max_High_3d'] = df[high].rolling(window=3).max().shift(-3)
    df['Min_Low_3d'] = df[low].rolling(window=3).min().shift(-3)
    df['Flag'] = df.apply(lambda row: 1 if row['Max_High_3d'] > 1.007*row[close] else (-1 if row['Min_Low_3d'] < 0.993*row[close] else 0), axis=1)
    return df

def get_all_ratios(df, exclusion_list):
    feature_list = [col for col in df.columns.tolist() if col not in exclusion_list]
    new_columns = []
    for feat1, feat2 in combinations(feature_list, 2):
        column_name = f"{feat1}/{feat2}"
        new_columns.append((column_name, df[feat1] / df[feat2]))

    new_columns_df = pd.DataFrame(dict(new_columns))
    df = pd.concat([df, new_columns_df], axis=1)
    return df

def get_past_prices(df, close):
    past_prices = []
    for i in range(1,31):
        column_name = f"Close_{i}"
        past_prices.append((column_name, df[close].shift(i)))
    past_prices_df = pd.DataFrame(dict(past_prices))
    df = pd.concat([df, past_prices_df], axis=1)
    return df


# def calculate_relative_strength(df, market):
#     df_market = yf.download(market, period="10y", interval="1d", multi_level_index=False)['Close'].rename(market)
#     df['market_close'] = df.merge(df_market, how='left', left_index=True, right_index=True)[market]
#     df['relative_strength'] = df['Close']/df['market_close']
#     df['relative_strength_EMA_4'] = ta.trend.EMAIndicator(df['relative_strength'], window=4).ema_indicator()
#     df['relative_strength_EMA_9'] = ta.trend.EMAIndicator(df['relative_strength'], window=9).ema_indicator()
#     return df

# def data_prep(ticker, market):
#     df = download_data(ticker)
#     df = calculate_indicators(df)
#     df = calculate_flags(df)
#     df = calculate_relative_strength(df, market)
#     df.drop(columns=['Max_High_2w', 'Min_Low_2w'], inplace=True)  # Clean up unnecessary columns
#     df.dropna(inplace=True)

    # return df
    # df.to_csv(f'{ticker}_processed.csv')
    # print(f'Data saved to {ticker}_processed.csv')

In [134]:
df =  pd.read_csv('Data/AUDNZD/AUDNZD_Daily_201401010000_202503280000.csv', delimiter='\t',engine='python')
df_iron = pd.read_csv('Data/AUDNZD/PIORECRUSDM.csv')
df['Date'] = pd.to_datetime(df['<DATE>'], format='%Y.%m.%d')
df = calculate_indicators(df, '<CLOSE>', '<HIGH>', '<LOW>', '<TICKVOL>')
df = get_past_prices(df, '<CLOSE>')
df = get_all_ratios(df, ['<DATE>','Date','<VOL>'])
df = calculate_flags(df,'<HIGH>','<LOW>','<CLOSE>')

In [135]:
df.set_index('Date', inplace=True)

In [136]:
df['Flag'].value_counts()

Flag
 0    1414
 1     807
-1     702
Name: count, dtype: int64

In [137]:
df

Unnamed: 0_level_0,<DATE>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<TICKVOL>,<VOL>,<SPREAD>,RSI,MACD,...,Close_26/Close_30,Close_27/Close_28,Close_27/Close_29,Close_27/Close_30,Close_28/Close_29,Close_28/Close_30,Close_29/Close_30,Max_High_3d,Min_Low_3d,Flag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,2014.01.01,1.08372,1.08465,1.08228,1.08307,1245,0,26,,,...,,,,,,,,1.09048,1.07962,0
2014-01-02,2014.01.02,1.08302,1.08956,1.08151,1.08750,47039,0,1,,,...,,,,,,,,1.09048,1.07540,-1
2014-01-03,2014.01.03,1.08750,1.09048,1.08009,1.08059,42596,0,1,,,...,,,,,,,,1.08599,1.07506,0
2014-01-06,2014.01.06,1.08267,1.08599,1.07962,1.08129,39268,0,1,,,...,,,,,,,,1.08271,1.07349,-1
2014-01-07,2014.01.07,1.08132,1.08271,1.07540,1.07600,41255,0,1,,,...,,,,,,,,1.08477,1.07349,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-24,2025.03.24,1.09227,1.09879,1.09227,1.09713,59571,0,11,39.977994,-0.003364,...,0.995982,0.999955,0.999730,1.000117,0.999775,1.000162,1.000386,1.10029,1.09547,0
2025-03-25,2025.03.25,1.09715,1.10029,1.09547,1.09930,55032,0,11,45.392101,-0.003012,...,0.995427,0.995866,0.995821,0.995597,0.999955,0.999730,0.999775,1.10026,1.09672,0
2025-03-26,2025.03.26,1.09890,1.10026,1.09738,1.09923,66584,0,11,45.250306,-0.002707,...,1.000656,0.999829,0.995695,0.995650,0.995866,0.995821,0.999955,,,0
2025-03-27,2025.03.27,1.09923,1.09996,1.09703,1.09821,71168,0,11,43.135817,-0.002519,...,0.999119,1.005027,1.004855,1.000701,0.999829,0.995695,0.995866,,,0


In [None]:
# Sort data by Date
df = df.sort_index()

# Split the data into train and test (20% test data)
# Define train-test split with a gap of 20 data points
train_size = int(0.8 * len(df))
train_data = df.iloc[:train_size].replace([np.inf, -np.inf], np.nan).dropna()
test_data = df.iloc[train_size + 20:].replace([np.inf, -np.inf], np.nan).dropna()  # Skipping 20 rows

# Features (excluding 'Date' and 'Flag' columns)
exclusion_list = ['<DATE>','<VOL>','Flag']
features = [col for col in df.columns.tolist() if col not in exclusion_list]

X_train = train_data[features]
y_train = train_data['Flag']!=0

X_test = test_data[features]
y_test = test_data['Flag']!=0

# 2) Logistic Regression with SequentialFeatureSelector and Cross-validation

# Initialize logistic regression model
# logreg = LogisticRegression(max_iter=10000)
# Initialize RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Initialize SequentialFeatureSelector for backward selection
selector = SequentialFeatureSelector(rf, n_features_to_select='auto', direction='forward', scoring='roc_auc', cv=5, n_jobs=-1)

# Fit the feature selector on the training data
selector = selector.fit(X_train, y_train)

# Get the selected features
selected_features = selector.get_feature_names_out()
# selected_features = features

# 3) Build Logistic Regression using selected features and evaluate AUROC with cross-validation
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Initialize the Logistic Regression model
rf.fit(X_train_selected, y_train)

# Perform cross-validation with AUROC as scoring metric
cv_scores = cross_val_score(rf, X_train_selected, y_train, cv=StratifiedKFold(5), scoring='roc_auc')

# Print the cross-validation AUROC scores
print(f"Cross-validation AUROC scores: {cv_scores}")
print(f"Mean AUROC: {cv_scores.mean()}")

# Evaluate AUROC on test data
y_pred_proba = rf.predict_proba(X_test_selected)[:, 1]
test_auroc = roc_auc_score(y_test, y_pred_proba)

print(f"AUROC on test data: {test_auroc}")

In [111]:
print(selected_features)


['<OPEN>' '<HIGH>' '<LOW>' 'RSI' 'MACD' 'MACD_Signal' 'EMA9' 'SKDJ_D'
 '<OPEN>/<LOW>' '<OPEN>/<CLOSE>' '<OPEN>/MACD' '<OPEN>/MACD_Signal'
 '<OPEN>/ATR' '<OPEN>/TEMA' '<OPEN>/EMA4' '<OPEN>/EMA18' '<OPEN>/VWAP'
 '<HIGH>/MACD' '<HIGH>/MACD_Signal' '<HIGH>/EMA4' '<HIGH>/EMA9'
 '<HIGH>/EMA18' '<HIGH>/VWAP' '<HIGH>/SKDJ_D' '<LOW>/RSI'
 '<LOW>/MACD_Signal' '<LOW>/ATR' '<LOW>/TEMA' '<LOW>/VWAP' '<LOW>/SKDJ_K'
 '<LOW>/SKDJ_D' '<CLOSE>/<TICKVOL>' '<CLOSE>/RSI' '<CLOSE>/MACD_Signal'
 '<CLOSE>/MACD_Hist' '<CLOSE>/TEMA' '<CLOSE>/EMA9' '<CLOSE>/EMA18'
 '<CLOSE>/VWAP' '<CLOSE>/SKDJ_K' '<CLOSE>/SKDJ_D' '<TICKVOL>/MACD'
 '<TICKVOL>/MACD_Signal' '<TICKVOL>/TEMA' '<TICKVOL>/EMA9'
 '<TICKVOL>/EMA18' '<TICKVOL>/SKDJ_K' 'RSI/MACD' 'RSI/MACD_Signal'
 'RSI/ATR' 'RSI/TEMA' 'RSI/EMA4' 'RSI/EMA9' 'RSI/EMA18' 'RSI/VWAP'
 'RSI/SKDJ_D' 'MACD/MACD_Signal' 'MACD/MACD_Hist' 'MACD/ATR' 'MACD/TEMA'
 'MACD/EMA4' 'MACD/EMA9' 'MACD/EMA18' 'MACD/VWAP' 'MACD/SKDJ_D'
 'MACD_Signal/MACD_Hist' 'MACD_Signal/TEMA' 'MACD_Signal/EM

In [86]:
X_test = X_test.copy()
X_test['predicted'] = logreg.predict_proba(X_test_selected)[:, 1]
X_test['actual'] = y_test

In [87]:
X_test.to_csv('Data/model_prelim_results.csv')