In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score
from xgboost import XGBClassifier


# Load the combined dataset, parse the 'timestamp' column to datetimes, and
# order the rows chronologically; the shift/rolling steps further down are
# row-order based and rely on this sort.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs elsewhere.
df = (
    pd.read_csv('/Users/cyruskurd/Documents/grad_programming/AML/Project work/combined_data_with_y.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values('timestamp')
)

# Labelling parameters: a row is positive when the close `num_days_maxout`
# rows later is at least `threshold` times the current close
# (1.095 compounded over the horizon, about 1.313 for a horizon of 3).
num_days_maxout = 3
threshold = 1.095 ** num_days_maxout

# Forward close ratio, measured in rows of `df` as currently sorted. The last
# `num_days_maxout` rows have no look-ahead close and evaluate to NaN.
# NOTE(review): the recorded output of this cell shows 2,222,392 of 5,294,780
# test rows labelled 1, which looks implausible for a >=31% move within a single
# price series. If the CSV interleaves several instruments, this shift crosses
# instruments and should be done per instrument -- confirm against the data.
df['future_return'] = df['close'].shift(-num_days_maxout) / df['close']

# Binary target (creates or overwrites column 'y'). NaN ratios compare False
# and would be labelled 0, so those rows are removed right after labelling.
df['y'] = df['future_return'].ge(threshold).astype(int)

# Drop the unlabeled tail rows, then discard the helper column so it cannot
# leak look-ahead information into the features.
df = df.dropna(subset=['future_return']).drop(columns=['future_return'])

# Feature engineering on the 'close' and 'vol' columns. Every window below is
# counted in rows of `df` in its current order.
# NOTE(review): if the frame holds more than one instrument, these windows span
# instruments and should be computed per instrument -- confirm against the data.
close_prices = df['close']

# Simple moving averages of the close.
df['SMA_5'] = close_prices.rolling(window=5).mean()
df['SMA_10'] = close_prices.rolling(window=10).mean()
df['SMA_20'] = close_prices.rolling(window=20).mean()

# The 20-row rolling standard deviation feeds both Bollinger bands and the
# Rolling_Std_20 feature, so it is computed once and reused.
rolling_std_20 = close_prices.rolling(window=20).std()
df['Bollinger_Upper'] = df['SMA_20'] + (rolling_std_20 * 2)
df['Bollinger_Lower'] = df['SMA_20'] - (rolling_std_20 * 2)

# Exponential moving average (recursive form, adjust=False).
df['EMA_10'] = close_prices.ewm(span=10, adjust=False).mean()

# RSI Calculation: plain 14-row rolling means of gains and losses.
# The first diff is NaN and is counted as zero gain / zero loss by `where`.
delta = close_prices.diff(1)
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()
# With zero average loss the ratio is infinite and RSI evaluates to 100; when
# both averages are zero the result is NaN and the row is dropped below.
rs = avg_gain / avg_loss
df['RSI'] = 100 - (100 / (1 + rs))

# Additional Indicators
# Despite its name, 'ATR' is the 14-row rolling std of the close, not an
# average true range; the column name is kept because it is a model feature.
df['ATR'] = close_prices.rolling(window=14).std()
df['Volume_SMA_10'] = df['vol'].rolling(window=10).mean()
# 1 when volume exceeds its own 10-row average. Rows whose average is still
# NaN compare False (0) and are removed by the dropna below.
df['Volume_Spike'] = (df['vol'] > df['Volume_SMA_10']).astype(int)
df['Rolling_Std_20'] = rolling_std_20

# Drop rows with NaN values (window warm-up rows, undefined RSI).
# NOTE(review): this checks every column of the frame, not only the engineered
# features, so NaNs in unrelated CSV columns also remove rows.
df = df.dropna()

# Model inputs: the engineered indicator columns; the target is the binary 'y'.
feature_columns = [
    'SMA_5', 'SMA_10', 'SMA_20', 'EMA_10',
    'Bollinger_Upper', 'Bollinger_Lower',
    'RSI', 'ATR', 'Volume_Spike', 'Rolling_Std_20',
]
X = df[feature_columns]
y = df['y']

# Chronological hold-out: rows strictly before `split_date` are used for
# training, rows on or after it for testing. Each mask is evaluated once and
# shared between the feature matrix and the target.
split_date = '2020-01-01'
is_train = df['timestamp'] < split_date
is_test = df['timestamp'] >= split_date

X_train, y_train = X[is_train], y[is_train]
X_test, y_test = X[is_test], y[is_test]

# Handle class imbalance using scale_pos_weight: weight positives by the
# negative/positive ratio observed in the training split.
negative_counts = int((y_train == 0).sum())
positive_counts = int((y_train == 1).sum())

# Without this guard an all-negative (or empty) training split would divide by
# zero and hand an inf/NaN weight to the model instead of failing clearly.
if positive_counts == 0:
    raise ValueError(
        "Training split contains no positive samples; cannot compute "
        "scale_pos_weight. Check split_date and the label threshold."
    )
scale_pos_weight = negative_counts / positive_counts

# Train XGBoost classifier (fixed seed; 100 trees of depth 5, each fitted on
# an 80% row subsample).
xgb_model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
)
xgb_model.fit(X_train, y_train)

# Predictions: positive-class probability, binarised at a fixed cut-off.
# Note: `threshold` is rebound here -- earlier in this cell it held the
# return ratio used for labelling; from this point on it is the probability
# cut-off.
threshold = 0.5
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluation: ROC AUC is computed from the probabilities, everything else
# from the hard 0/1 predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report
print(f"XGBoost Accuracy: {accuracy:.4f}")
print(f"XGBoost F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print(classification_report(y_test, y_pred))

XGBoost Accuracy: 0.6719
XGBoost F1 Score: 0.6757
ROC AUC Score: 0.7546
Confusion Matrix:
 [[1747689 1324699]
 [ 412641 1809751]]
              precision    recall  f1-score   support

           0       0.81      0.57      0.67   3072388
           1       0.58      0.81      0.68   2222392

    accuracy                           0.67   5294780
   macro avg       0.69      0.69      0.67   5294780
weighted avg       0.71      0.67      0.67   5294780

