In [91]:
# features
# - TEMA
# - EMA4, 9, 18 crossover
# - False Breakout Finder (*new intraday high, close lower than previous high, new intraday low, close higher than previous low
# - ATR
# - Relative Strength Analysis
# - market breadth (indicator of overall market performance)
# - skdj

import yfinance as yf
import pandas as pd
import ta

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [48]:
def download_data(ticker: str):
    df = yf.download(ticker, period="5y", interval="1d", multi_level_index=False)
    df.dropna(inplace=True)
    return df

def calculate_indicators(df):
    df['RSI'] = ta.momentum.RSIIndicator(df['Close']).rsi()

    macd = ta.trend.MACD(df['Close'])
    df['MACD'] = macd.macd()
    df['MACD_Signal'] = macd.macd_signal()
    df['MACD_Hist'] = macd.macd_diff()

    df['ATR'] = ta.volatility.AverageTrueRange(df['High'], df['Low'], df['Close']).average_true_range()

    df['TEMA'] = ta.trend.EMAIndicator(df['Close'], window=30).ema_indicator()
    df['EMA4'] = ta.trend.EMAIndicator(df['Close'], window=4).ema_indicator()
    df['EMA9'] = ta.trend.EMAIndicator(df['Close'], window=9).ema_indicator()
    df['EMA18'] = ta.trend.EMAIndicator(df['Close'], window=18).ema_indicator()

    df['EMA4_Crossover_up'] = (df['EMA4'] > df['EMA9']) & (df['EMA9'] > df['EMA18'])
    df['EMA4_Crossover_down'] = (df['EMA4'] < df['EMA9']) & (df['EMA9'] < df['EMA18'])

    stoch = ta.momentum.StochasticOscillator(df['High'], df['Low'], df['Close'])
    df['SKDJ_K'] = stoch.stoch()
    df['SKDJ_D'] = stoch.stoch_signal()

    return df

def calculate_flags(df):
    df['Max_High_2w'] = df['High'].rolling(window=14).max().shift(-14)
    df['Min_Low_2w'] = df['Low'].rolling(window=14).min().shift(-14)
    df['Flag'] = df.apply(lambda row: 1 if row['Max_High_2w'] > 1.1*row['Close'] else (-1 if row['Min_Low_2w'] < 0.9*row['Close'] else 0), axis=1)
    return df

def calculate_relative_strength(df, market):
    df_market = yf.download(market, period="5y", interval="1d", multi_level_index=False)['Close'].rename(market)
    df['market_close'] = df.merge(df_market, how='left', left_index=True, right_index=True)[market]
    df['relative_strength'] = df['Close']/df['market_close']
    df['relative_strength_EMA_4'] = ta.trend.EMAIndicator(df['relative_strength'], window=4).ema_indicator()
    df['relative_strength_EMA_9'] = ta.trend.EMAIndicator(df['relative_strength'], window=9).ema_indicator()
    df['relative_strength_EMA_4_9'] = df['relative_strength_EMA_4']/df['relative_strength_EMA_9']

    return df

def data_prep(ticker, market):
    df = download_data(ticker)
    df = calculate_indicators(df)
    df = calculate_flags(df)
    df = calculate_relative_strength(df, market)
    df.drop(columns=['Max_High_2w', 'Min_Low_2w'], inplace=True)  # Clean up unnecessary columns
    df.dropna(inplace=True)

    return df
    # df.to_csv(f'{ticker}_processed.csv')
    # print(f'Data saved to {ticker}_processed.csv')

In [88]:
df =  data_prep("BTC-USD", "GDLC")



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [92]:
# Sort data by Date
df = df.sort_index()

# Split the data into train and test (20% test data)
# Define train-test split with a gap of 20 data points
train_size = int(0.8 * len(df))
train_data = df.iloc[:train_size]
test_data = df.iloc[train_size + 20:]  # Skipping 20 rows

# Features (excluding 'Date' and 'Flag' columns)
features = ['RSI', 'MACD', 'MACD_Signal',
            'MACD_Hist', 'ATR', 'TEMA', 'EMA4_Crossover_up',
            'EMA4_Crossover_down', 'SKDJ_K', 'SKDJ_D', 'relative_strength_EMA_4_9']

X_train = train_data[features]
y_train = train_data['Flag']>0

X_test = test_data[features]
y_test = test_data['Flag']>0

# 2) Logistic Regression with SequentialFeatureSelector and Cross-validation

# Initialize logistic regression model
# logreg = LogisticRegression(max_iter=10000)
# Initialize RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Initialize SequentialFeatureSelector for backward selection
selector = SequentialFeatureSelector(rf, n_features_to_select='auto', direction='backward', scoring='roc_auc', cv=5)

# Fit the feature selector on the training data
selector = selector.fit(X_train, y_train)

# Get the selected features
selected_features = selector.get_feature_names_out()

# 3) Build Logistic Regression using selected features and evaluate AUROC with cross-validation
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Initialize the Logistic Regression model
rf.fit(X_train_selected, y_train)

# Perform cross-validation with AUROC as scoring metric
cv_scores = cross_val_score(rf, X_train_selected, y_train, cv=StratifiedKFold(5), scoring='roc_auc')

# Print the cross-validation AUROC scores
print(f"Cross-validation AUROC scores: {cv_scores}")
print(f"Mean AUROC: {cv_scores.mean()}")

# Evaluate AUROC on test data
y_pred_proba = rf.predict_proba(X_test_selected)[:, 1]
test_auroc = roc_auc_score(y_test, y_pred_proba)

print(f"AUROC on test data: {test_auroc}")

Cross-validation AUROC scores: [0.56171514 0.38461951 0.49940476 0.55481602 0.5230535 ]
Mean AUROC: 0.5047217875643277
AUROC on test data: 0.4747820672478206


In [94]:
print(selected_features)


['RSI' 'MACD' 'MACD_Signal' 'EMA4_Crossover_up' 'SKDJ_K' 'SKDJ_D']


In [86]:
X_test = X_test.copy()
X_test['predicted'] = logreg.predict_proba(X_test_selected)[:, 1]
X_test['actual'] = y_test

In [87]:
X_test.to_csv('Data/model_prelim_results.csv')