In [11]:
split = int(len(X) * 0.7)

X_train = X.iloc[:split]
y_train = y.iloc[:split]

X_test = X.iloc[split:]
y_test = y.iloc[split:]


In [13]:
print(y_train.nunique(), y_test.nunique())


1 1


In [15]:
df["returns"] = df["close"].pct_change()


In [17]:
# For example, if next candle goes up -> 1 else -1
df["target"] = df["returns"].shift(-1)

df["target"] = df["target"].apply(lambda x: 1 if x > 0 else -1)


In [19]:
df.dropna(inplace=True)


In [21]:
threshold = 0.0002  # 0.02% threshold

df["target"] = df["returns"].shift(-1)

df["target"] = df["target"].apply(
    lambda x: 1 if x > threshold else (-1 if x < -threshold else 0)
)

print(df["target"].value_counts())


target
 0    20824
 1     1303
-1     1275
Name: count, dtype: int64


In [25]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# Split data
X = df.drop(["Date","target"], axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)

# Oversample
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Train model
model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train_res, y_train_res)

# Predict
pred = model.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred, average="macro"))
print(classification_report(y_test, pred))


Accuracy: 0.7310403759880367
F1 Score: 0.46260183227843127
              precision    recall  f1-score   support

          -1       0.15      0.44      0.23       288
           0       0.98      0.77      0.86      4107
           1       0.21      0.49      0.30       286

    accuracy                           0.73      4681
   macro avg       0.45      0.57      0.46      4681
weighted avg       0.89      0.73      0.79      4681



In [34]:
!pip install lightgbm



Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 19.2 MB/s  0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [36]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

import lightgbm as lgb

# ---------------------------
# Load cleaned data
# ---------------------------
project_root = r"C:\Users\Diya\OneDrive\Desktop\quant-trading-system"
clean_path = os.path.join(project_root, "data", "cleaned", "nifty_spot_5min_cleaned.csv")

df = pd.read_csv(clean_path, parse_dates=["Date"])

# ---------------------------
# Feature Engineering (Advanced)
# ---------------------------
df["returns"] = df["close"].pct_change()
df["log_returns"] = np.log(df["close"] / df["close"].shift(1))

# RSI
delta = df["close"].diff()
gain = np.where(delta > 0, delta, 0)
loss = np.where(delta < 0, -delta, 0)
avg_gain = pd.Series(gain).rolling(14).mean()
avg_loss = pd.Series(loss).rolling(14).mean()
rs = avg_gain / avg_loss
df["RSI"] = 100 - (100 / (1 + rs))

# MACD
ema12 = df["close"].ewm(span=12, adjust=False).mean()
ema26 = df["close"].ewm(span=26, adjust=False).mean()
df["MACD"] = ema12 - ema26
df["MACD_signal"] = df["MACD"].ewm(span=9, adjust=False).mean()

# ATR
high_low = df["high"] - df["low"]
high_close = np.abs(df["high"] - df["close"].shift())
low_close = np.abs(df["low"] - df["close"].shift())
tr = np.max([high_low, high_close, low_close], axis=0)
df["ATR"] = pd.Series(tr).rolling(14).mean()

# Bollinger Bands
df["BB_mid"] = df["close"].rolling(20).mean()
df["BB_std"] = df["close"].rolling(20).std()
df["BB_upper"] = df["BB_mid"] + 2 * df["BB_std"]
df["BB_lower"] = df["BB_mid"] - 2 * df["BB_std"]

# Momentum
df["momentum"] = df["close"] - df["close"].shift(10)

# Drop missing
df.dropna(inplace=True)

# ---------------------------
# Target Label
# ---------------------------
future_returns = df["close"].pct_change(5).shift(-5)

df["target"] = 0
df.loc[future_returns > 0.001, "target"] = 1
df.loc[future_returns < -0.001, "target"] = -1

df.dropna(inplace=True)

# ---------------------------
# Train/Test Split
# ---------------------------
features = ["open","high","low","close","RSI","MACD","MACD_signal","ATR","BB_upper","BB_lower","momentum"]
X = df[features]
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# ---------------------------
# Scaling
# ---------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# Class weights (Imbalance handling)
# ---------------------------
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = dict(zip(np.unique(y_train), class_weights))

# ---------------------------
# LightGBM Model
# ---------------------------
model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    class_weight=weight_dict,
    n_estimators=500,
    learning_rate=0.05,
)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000850 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 3961, number of used features: 11
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Accuracy: 0.5529767911200807
F1 Score: 0.3635559140146296
              precision    recall  f1-score   support

          -1       0.19      0.16      0.17       178
           0       0.71      0.73      0.72       676
           1       0.19      0.21      0.20       137

    accuracy                           0.55       991
   macro avg       0.36      0.37      0.36       991
weighted avg       0.55      0.55      0.55       991



In [38]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
import lightgbm as lgb

# ----------------------------
# Load Data
# ----------------------------
project_root = r"C:\Users\Diya\OneDrive\Desktop\quant-trading-system"
path = os.path.join(project_root, "data", "features", "nifty_features_spot_5min.csv")

df = pd.read_csv(path, parse_dates=["Date"])

# ----------------------------
# Add Features
# ----------------------------
df["returns"] = df["close"].pct_change()

df["EMA_5"] = df["close"].ewm(span=5).mean()
df["EMA_15"] = df["close"].ewm(span=15).mean()

# RSI
delta = df["close"].diff()
gain = np.where(delta > 0, delta, 0)
loss = np.where(delta < 0, -delta, 0)
avg_gain = pd.Series(gain).rolling(14).mean()
avg_loss = pd.Series(loss).rolling(14).mean()
rs = avg_gain / avg_loss
df["RSI"] = 100 - (100 / (1 + rs))

# MACD
ema12 = df["close"].ewm(span=12).mean()
ema26 = df["close"].ewm(span=26).mean()
df["MACD"] = ema12 - ema26
df["MACD_signal"] = df["MACD"].ewm(span=9).mean()

# ATR
high_low = df["high"] - df["low"]
high_close = np.abs(df["high"] - df["close"].shift())
low_close = np.abs(df["low"] - df["close"].shift())
tr = np.max([high_low, high_close, low_close], axis=0)
df["ATR"] = pd.Series(tr).rolling(14).mean()

# Bollinger Bands
df["BB_mid"] = df["close"].rolling(20).mean()
df["BB_std"] = df["close"].rolling(20).std()
df["BB_upper"] = df["BB_mid"] + 2 * df["BB_std"]
df["BB_lower"] = df["BB_mid"] - 2 * df["BB_std"]

df["momentum"] = df["close"] - df["close"].shift(10)

df.dropna(inplace=True)

# ----------------------------
# Improved Target
# ----------------------------
future_returns = df["close"].pct_change(15).shift(-15)

df["target"] = 0
df.loc[future_returns > 0.0015, "target"] = 1
df.loc[future_returns < -0.0015, "target"] = -1

df.dropna(inplace=True)

# ----------------------------
# Train/Test
# ----------------------------
features = [
    "open","high","low","close",
    "EMA_5","EMA_15","RSI","MACD","MACD_signal",
    "ATR","BB_upper","BB_lower","momentum"
]

X = df[features]
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ----------------------------
# LightGBM Model
# ----------------------------
model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    class_weight="balanced",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 3961, number of used features: 13
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Accuracy: 0.4661957618567104
F1 Score: 0.37203129018075637
              precision    recall  f1-score   support

          -1       0.22      0.15      0.18       258
           0       0.65      0.63      0.64       570
           1       0.24      0.39      0.30       163

    accuracy                           0.47       991
   macro avg       0.37      0.39      0.37       991
weighted avg       0.47      0.47      0.46       991



In [40]:
import os

project_root = r"C:\Users\Diya\OneDrive\Desktop\quant-trading-system"
results_path = os.path.join(project_root, "results")

# Create results folder if not exist
os.makedirs(results_path, exist_ok=True)

# Save trades dataframe
trades.to_csv(os.path.join(results_path, "backtest_trades.csv"), index=False)

print("✅ Saved backtest trades at:", os.path.join(results_path, "backtest_trades.csv"))


✅ Saved backtest trades at: C:\Users\Diya\OneDrive\Desktop\quant-trading-system\results\backtest_trades.csv
