In [4]:
# Core libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib


In [5]:
# Load the preprocessed and labeled dataset
data = pd.read_csv('../data/final_stock_data.csv', index_col=0)

# Check shape and basic info
print("📊 Loaded data shape:", data.shape)
print("🔍 Columns:", data.columns.tolist())


📊 Loaded data shape: (1208, 16)
🔍 Columns: ['open', 'high', 'low', 'close', 'volume', 'daily_return', 'SMA_20', 'SMA_50', 'RSI', 'MACD', 'Signal_Line', 'BB_upper', 'BB_middle', 'BB_lower', 'future_return', 'Signal']


In [6]:
# Cell 3
# Define features and target label
feature_cols = ['SMA_20', 'SMA_50', 'RSI', 'MACD', 'BB_upper', 'BB_middle', 'BB_lower']
feature_cols = [col for col in feature_cols if col in data.columns]  # safety check

X = data[feature_cols]
y = data['Signal']

# Drop rows with any missing values in X or y
X = X.dropna()
y = y.loc[X.index]

# 🔁 Map labels: -1 → 0 (Sell), 0 → 1 (Hold), 1 → 2 (Buy)
label_map = {-1: 0, 0: 1, 1: 2}
y = y.map(label_map)

# Confirm label distribution
print("✅ Label distribution after mapping:\n", y.value_counts())


✅ Label distribution after mapping:
 Signal
1    631
2    350
0    227
Name: count, dtype: int64


In [7]:
# CELL 4
# Split data into training and testing sets (no shuffle for time-series)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print("🧠 Training set:", X_train.shape)
print("🧪 Testing set:", X_test.shape)

🧠 Training set: (966, 7)
🧪 Testing set: (242, 7)


In [8]:
# CELL 5
from xgboost import XGBClassifier

# Train the model
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# CELL 6
from sklearn.metrics import classification_report, confusion_matrix

# Predict and evaluate
y_pred = model.predict(X_test)

print("📈 Classification Report:")
print(classification_report(y_test, y_pred))

print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


📈 Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.66      0.32        38
           1       0.63      0.43      0.51       145
           2       0.41      0.15      0.22        59

    accuracy                           0.40       242
   macro avg       0.42      0.41      0.35       242
weighted avg       0.51      0.40      0.41       242

📊 Confusion Matrix:
[[25 12  1]
 [70 63 12]
 [25 25  9]]


In [10]:
import joblib

# Save model to file
joblib.dump(model, '../models/xgb_model.pkl')
print("✅ Model saved to /models/xgb_model.pkl")


✅ Model saved to /models/xgb_model.pkl
