In [None]:
# Remove Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# General
import numpy as np

# Data Management
import pandas as pd
from sklearn.model_selection import train_test_split

# Machine Learning
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Evaluation
from sklearn.metrics import precision_score

# Reporting
import matplotlib.pyplot as plt

# vectorbt
import vectorbtpro as vbt
vbt.settings.set_theme('dark')
vbt.settings['plotting']['layout']['width'] = 600
vbt.settings['plotting']['layout']['height'] = 300

In [None]:
# Select type of model to optimize for
is_binary = False
is_optimise_for_precision = True

In [None]:
df = pd.read_csv('data/1ySOLdata1hAllHassInd.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.set_index('timestamp', inplace=True)
pd.set_option('future.no_silent_downcasting', True)
df.loc[:, 'signal'] = 'SignalNone'

window_size = 10
data_trimmed = df.copy()
rolling_max = data_trimmed.loc[:,'price'].rolling(window=2*window_size+1, center=True, min_periods=1).max()
rolling_min = data_trimmed.loc[:,'price'].rolling(window=2*window_size+1, center=True, min_periods=1).min()

is_peak = (data_trimmed.loc[:, 'price'] == rolling_max)

is_low = (data_trimmed.loc[:, 'price'] == rolling_min)

# Update signal columns where conditions are met
data_trimmed.loc[is_peak, 'signal'] = 'SignalShort'
data_trimmed.loc[is_low, 'signal'] = 'SignalLong'
df = data_trimmed.copy()

df_filtered = df[df['signal'] != 'SignalNone']

# Iterate through the DataFrame and adjust the signals
for i in range(1, len(df_filtered)):
    current_signal = df_filtered.iloc[i]['signal']
    previous_signal = df_filtered.iloc[i - 1]['signal']
    current_close = df_filtered.iloc[i]['price']
    previous_close = df_filtered.iloc[i - 1]['price']

    if current_signal == previous_signal:
        if current_signal == 'SignalLong':
            if previous_close > current_close:
                df_filtered.iloc[i - 1, df_filtered.columns.get_loc('signal')] = 'SignalNone'
            else:
                df_filtered.iloc[i, df_filtered.columns.get_loc('signal')] = 'SignalNone'
        elif current_signal == 'SignalShort':
            if previous_close < current_close:
                df_filtered.iloc[i - 1, df_filtered.columns.get_loc('signal')] = 'SignalNone'
            else:
                df_filtered.iloc[i, df_filtered.columns.get_loc('signal')] = 'SignalNone'


df.update(df_filtered)

if is_binary:
    # Assuming df is your DataFrame
    previous_signal = None  # Initialize a variable to keep track of the previous non-"SignalNone" value

    for i in range(len(df)):
        if df.iloc[i, df_filtered.columns.get_loc('signal')] == "SignalNone" and previous_signal is not None:
            df.iloc[i, df_filtered.columns.get_loc('signal')] = previous_signal  # Replace "SignalNone" with the previous signal
        elif df.iloc[i, df_filtered.columns.get_loc('signal')] != "SignalNone":
            previous_signal = df.iloc[i, df_filtered.columns.get_loc('signal')]  # Update the previous signal to the current one if it's not "SignalNone"

    df = df.loc[df['signal'] != 'SignalNone']

df['signal'] = df['signal'].replace({'SignalLong': 2, 'SignalShort': 0, 'SignalNone': 1})
df = df.ffill()


In [None]:
data = vbt.Data.from_data(df)
# features = data.run("talib", mavp=vbt.run_arg_dict(periods=14))
# data.data['symbol'] = pd.concat([data.data['symbol'], features], axis=1)
data.data['symbol'].drop(['Open', 'High', 'Low'], axis=1, inplace=True)

# This will drop columns from the DataFrame where all values are NaN
# data.data['symbol'] = data.data['symbol'].dropna(axis=1, how='all')

open_price = data.get('Open')
high_price = data.get('High')
low_price = data.get('Low')
close_price = data.get('Close')

data.data['symbol'] = data.data['symbol'].dropna()

df_tts = data.copy()

# df_tts.data['symbol']

# df_tts.data['symbol'].drop(columns=["Close"], inplace=True)
# df_tts.data['symbol'].drop(columns=["Volume"], inplace=True)

df_tts.data['symbol'] = df_tts.data['symbol'].rename(columns={'signal': 'TARGET'})

predictor_list = data.data['symbol'].drop('TARGET', axis=1).columns.tolist()
# predictor_list = [('midpoint', 'real'), ('stochf', 'fastk'), ('stochrsi', 'fastk'), ('willr', 'real'), ('wma', 'real')]


X = df_tts.data['symbol'][predictor_list]
y = df_tts.data['symbol']['TARGET']

X.columns = X.columns.astype(str)

In [22]:

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint, uniform
# Perform Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, shuffle=False)

y_train = y_train.astype('int')
y_test = y_test.astype('int')



objective = "multi:softmax"
eval_metric = "mlogloss"
scoring = "f1_micro"

# Provide Gris for Hyperparams
param_distributions = {
    "max_depth": randint(2, 10),
    "n_estimators": randint(50, 500),
    "learning_rate": uniform(0.01, 0.5),
    "gamma": uniform(0, 1),
    "min_child_weight": randint(1, 10),
    "subsample": uniform(0.5, 1),
    "colsample_bytree": uniform(0.5, 1),
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(0, 1),
}

X_train.replace([np.inf, -np.inf], np.nan, inplace=True)

X_train.fillna(-999, inplace=True)  


estimator = XGBClassifier(
    objective="multi:softmax",  # For multiclass classification
    eval_metric="mlogloss",  # Evaluation metric for multiclass
    use_label_encoder=False,  # Avoid using the deprecated label encoder
    random_state=42  # For reproducibility
)

# Perform Random Search for Best Hyper params
random_search = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_distributions,
    n_iter=100,  # Number of random parameter settings to try
    cv=5,  # Number of cross-validation folds
    scoring='f1_micro',  # Evaluation metric for multiclass
    n_jobs=-1,  # Use all available CPU cores
    verbose=2,  # Print progress information
    random_state=42,  # For reproducibility
)
best_model = random_search.fit(X_train, y_train)
# hyperparams = best_model.best_params_
# ne = hyperparams["n_estimators"]
# lr = hyperparams["learning_rate"]
# md = hyperparams["max_depth"]
# gm = hyperparams["gamma"]
# print("Recommended Params >>", f"ne: {ne},", f"lr: {lr}", f"md: {md}", f"gm: {gm}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END colsample_bytree=0.8745401188473625, gamma=0.9507143064099162, learning_rate=0.37599697090570255, max_depth=6, min_child_weight=5, n_estimators=152, reg_alpha=0.44583275285359114, reg_lambda=0.09997491581800289, subsample=0.9592488919658672; total time=   1.2s
[CV] END colsample_bytree=0.8745401188473625, gamma=0.9507143064099162, learning_rate=0.37599697090570255, max_depth=6, min_child_weight=5, n_estimators=152, reg_alpha=0.44583275285359114, reg_lambda=0.09997491581800289, subsample=0.9592488919658672; total time=   1.1s
[CV] END colsample_bytree=0.8745401188473625, gamma=0.9507143064099162, learning_rate=0.37599697090570255, max_depth=6, min_child_weight=5, n_estimators=152, reg_alpha=0.44583275285359114, reg_lambda=0.09997491581800289, subsample=0.9592488919658672; total time=   1.0s
[CV] END colsample_bytree=0.8745401188473625, gamma=0.9507143064099162, learning_rate=0.37599697090570255, max_depth=6, min_chi

400 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/duncanmaclennan/anaconda3/envs/vectorbtpro/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/duncanmaclennan/anaconda3/envs/vectorbtpro/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/duncanmaclennan/anaconda3/envs/vectorbtpro/lib/python3.10/site-packages/xgboost/sklearn.py", line 1519, in fit
    self._Booster = train(
  File "/Users/duncanmaclennan/anaconda3/envs/vectorbtpro/lib/python3.10/s

In [None]:
# Build Classification Model 1
classifier_1 = XGBClassifier(
    objective=objective,
    booster="gbtree",
    eval_metric=eval_metric,
    n_estimators=ne,
    learning_rate=lr,
    max_depth=md,
    gamma=gm,
    subsample=0.8,
    colsample_bytree=1,
    random_state=1,
    use_label_encoder=False
)

In [None]:
# Fit Model
eval_set = [(X_train, y_train)]
classifier_1.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=False
)

In [None]:
# Get predictions for training data
train_yhat = classifier_1.predict(X_train)
print("Training Preds: \n", train_yhat[-50:])

In [None]:
test_yhat = classifier_1.predict(X_test)
print(set(y_train))
print(set(test_yhat))

In [None]:
def backtest(close, signal):
    entries = signal == 2
    exits = signal == 0
    pf = vbt.Portfolio.from_signals(
        close=close, 
        long_entries=entries, 
        long_exits=exits,
        size=100,
        size_type='value',
        # accumulate=True,
        init_cash='auto'
    )
    pf.plot({"orders", "cum_returns"}, settings=dict(bm_returns=False)).show()
    print(pf.stats())
df_split = df_tts.data['symbol'].iloc[-len(X_test):]
df_split['TARGET'] = test_yhat
backtest(df_split.Close, df_split.TARGET)

In [None]:
# Set K-Fold Cross Validation Levels
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

In [None]:
# Training Results
train_results = cross_val_score(classifier_1, X_train, y_train, scoring=scoring, cv=cv, n_jobs=1)

In [None]:
# Brief Review of Training Results

print("Average Accuracy K-Fold: ", round(train_results.mean(), 2))
print("Std Deviation K-Fold: ", round(train_results.std(), 2))
print("Precision Score 0: ", round(precision_score(y_train, train_yhat, average=None)[0], 3))
print("Precision Score 1: ", round(precision_score(y_train, train_yhat, average=None)[1], 3))

# print("")
# print("Just for reference. Right now, we are only focussed on getting some initial features.")
# print("If the results look too good to be true, they probably are.")

### Feature Selection

In [None]:
# Plot Feature Importances
fig = plt.figure(figsize=(15, 5))
importance_labels = X.columns
importance_features = classifier_1.feature_importances_
plt.barh(importance_labels, importance_features)
plt.tight_layout()  # Adjust layout to fit
plt.show()

In [None]:
# Select Best Features
mean_feature_importance = importance_features.mean()
i = 0
recommended_feature_labels = []
recommended_feature_score = []
for fi in importance_features:
    if fi > mean_feature_importance:
        recommended_feature_labels.append(importance_labels[i])
        recommended_feature_score.append(fi)
    i += 1

In [None]:
import ast

tuple_list = []
for item in recommended_feature_labels:
    if item.startswith("(") and item.endswith(")"):
        # It's a tuple representation, evaluate it
        try:
            evaluated_item = ast.literal_eval(item)
            tuple_list.append(evaluated_item)
        except ValueError as e:
            print(f"Skipping item {item}: {e}")
    else:
        # It's a simple string, use it directly
        tuple_list.append(item)

print(tuple_list)

In [None]:
import json

# Assuming tuple_list is already defined and filled with your data
with open('tuple_list.json', 'w') as f:
    # Convert the tuples to lists because tuples are not JSON serializable
    json.dump([list(item) if isinstance(item, tuple) else item for item in tuple_list], f)


In [None]:
# Plot Recommended Features
fig = plt.figure(figsize=(10, 6))
plt.barh(recommended_feature_labels, recommended_feature_score)
plt.tight_layout()  # Adjust layout to fit
plt.show()
