In [1]:
# Remove Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# General
import numpy as np

# Data Management
import pandas as pd
from sklearn.model_selection import train_test_split

# Machine Learning
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Evaluation
from sklearn.metrics import precision_score

# Reporting
import matplotlib.pyplot as plt

# vectorbt
import vectorbtpro as vbt
vbt.settings.set_theme('dark')
vbt.settings['plotting']['layout']['width'] = 600
vbt.settings['plotting']['layout']['height'] = 300

In [2]:
df = pd.read_csv('data/1ySOLdata1hAllHassInd.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.set_index('timestamp', inplace=True)
pd.set_option('future.no_silent_downcasting', True)
df.loc[:, 'signal'] = 'SignalNone'

window_size = 10
data_trimmed = df.copy()
rolling_max = data_trimmed.loc[:,'price'].rolling(window=2*window_size+1, center=True, min_periods=1).max()
rolling_min = data_trimmed.loc[:,'price'].rolling(window=2*window_size+1, center=True, min_periods=1).min()

is_peak = (data_trimmed.loc[:, 'price'] == rolling_max)

is_low = (data_trimmed.loc[:, 'price'] == rolling_min)

# Update signal columns where conditions are met
data_trimmed.loc[is_peak, 'signal'] = 'SignalShort'
data_trimmed.loc[is_low, 'signal'] = 'SignalLong'
df = data_trimmed.copy()

df_filtered = df[df['signal'] != 'SignalNone']

# Iterate through the DataFrame and adjust the signals
for i in range(1, len(df_filtered)):
    current_signal = df_filtered.iloc[i]['signal']
    previous_signal = df_filtered.iloc[i - 1]['signal']
    current_close = df_filtered.iloc[i]['price']
    previous_close = df_filtered.iloc[i - 1]['price']

    if current_signal == previous_signal:
        if current_signal == 'SignalLong':
            if previous_close > current_close:
                df_filtered.iloc[i - 1, df_filtered.columns.get_loc('signal')] = 'SignalNone'
            else:
                df_filtered.iloc[i, df_filtered.columns.get_loc('signal')] = 'SignalNone'
        elif current_signal == 'SignalShort':
            if previous_close < current_close:
                df_filtered.iloc[i - 1, df_filtered.columns.get_loc('signal')] = 'SignalNone'
            else:
                df_filtered.iloc[i, df_filtered.columns.get_loc('signal')] = 'SignalNone'


df.update(df_filtered)


df['signal'] = df['signal'].replace({'SignalLong': 2, 'SignalShort': 0, 'SignalNone': 1})
df = df.ffill()


In [4]:
data = vbt.Data.from_data(df)
# features = data.run("talib", mavp=vbt.run_arg_dict(periods=14))
# data.data['symbol'] = pd.concat([data.data['symbol'], features], axis=1)
data.data['symbol'].drop(['Open', 'High', 'Low'], axis=1, inplace=True)

# This will drop columns from the DataFrame where all values are NaN
# data.data['symbol'] = data.data['symbol'].dropna(axis=1, how='all')

open_price = data.get('Open')
high_price = data.get('High')
low_price = data.get('Low')
close_price = data.get('Close')

data.data['symbol'] = data.data['symbol'].dropna()

df_tts = data.copy()

# df_tts.data['symbol']

# df_tts.data['symbol'].drop(columns=["Close"], inplace=True)
# df_tts.data['symbol'].drop(columns=["Volume"], inplace=True)



predictor_list = data.data['symbol'].drop('signal', axis=1).columns.tolist()
# predictor_list = [('midpoint', 'real'), ('stochf', 'fastk'), ('stochrsi', 'fastk'), ('willr', 'real'), ('wma', 'real')]


X = df_tts.data['symbol'][predictor_list]
y = df_tts.data['symbol']['signal']

X.columns = X.columns.astype(str)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, shuffle=False)

In [18]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')


In [19]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)


In [20]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Setup cross-validation method
cv = StratifiedKFold(n_splits=5)

# Initialize RFECV
selector = RFECV(estimator=clf, step=1, cv=cv, scoring='accuracy')


In [21]:
selector = selector.fit(X_train, y_train)

print("Optimal number of features: %d" % selector.n_features_)


Optimal number of features: 6
