In [None]:
import pandas as pd
import numpy as np

# ----- Step 1: read the raw bars from the CSV -----
file_name = "nifty50_ticks.csv"
df = pd.read_csv(file_name)

print("Columns:", df.columns.tolist())
print(df.head())

# ----- Step 2: parse timestamps and order rows oldest -> newest -----
# The parsed column replaces the original text column; the index is rebuilt
# so that row 0 is the earliest bar.
df = (
    df.assign(timestamp=pd.to_datetime(df['timestamp']))
      .sort_values('timestamp')
      .reset_index(drop=True)
)

print("\nAfter sorting by time:")
print(
    df[['timestamp', 'close']].head(),
    df[['timestamp', 'close']].tail(),
    sep="\n",
)

# ----- Step 3: binary target = "does the next bar close higher?" -----
# The frame is treated as one instrument (all 'Nifty 50'), so a plain shift is
# used instead of a per-symbol groupby.
#   next_close : close of the following row (NaN where there is no following value)
#   target     : 1 if next_close > close, else 0
# Rows without a next_close are removed, then the helper column is dropped.
df = (
    df.assign(next_close=df['close'].shift(-1))
      .assign(target=lambda d: (d['next_close'] > d['close']).astype(int))
      .dropna(subset=['next_close'])
      .drop(columns=['next_close'])
      .reset_index(drop=True)
)

print("\nData with target column:")
print(df[['timestamp', 'close', 'target']].head(10))



Columns: ['id', 'symbol', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'open_interest', 'exchange']
       id    symbol                      timestamp      open      high  \
0  401738  Nifty 50  2025-09-10 15:29:00.000 +0530  24978.95  24980.45   
1  401739  Nifty 50  2025-09-10 15:28:00.000 +0530  24979.70  24982.05   
2  401740  Nifty 50  2025-09-10 15:27:00.000 +0530  24976.40  24980.10   
3  401741  Nifty 50  2025-09-10 15:26:00.000 +0530  24975.25  24977.90   
4  401742  Nifty 50  2025-09-10 15:25:00.000 +0530  24981.00  24981.70   

        low     close  volume  open_interest exchange  
0  24972.35  24977.55     0.0            0.0      NSE  
1  24973.40  24978.45     0.0            0.0      NSE  
2  24975.35  24980.10     0.0            0.0      NSE  
3  24974.65  24976.45     0.0            0.0      NSE  
4  24973.70  24975.15     0.0            0.0      NSE  

After sorting by time:
                  timestamp     close
0 2022-02-01 09:15:00+05:30  17489.95
1 2022-02

In [None]:
def compute_rsi(series, period=14):
    """Relative Strength Index from simple rolling means of gains and losses.

    Parameters
    ----------
    series : pandas.Series
        Price series in chronological order.
    period : int, default 14
        Number of bar-to-bar changes averaged in each window.

    Returns
    -------
    pandas.Series
        RSI values aligned to ``series``. The first ``period`` entries are NaN
        (warm-up: the first difference is undefined and a full window of
        ``period`` changes is required). A window with only gains yields 100,
        a window with only losses yields 0, and a completely flat window
        yields the neutral value 50.
    """
    delta = series.diff()

    # Split each change into its non-negative gain and non-negative loss part.
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()

    # avg_loss == 0 with avg_gain > 0 gives rs = inf and therefore RSI = 100.
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    # A flat window has avg_gain == avg_loss == 0, so rs is 0/0 = NaN. Left as
    # NaN, a later dropna() would silently delete those bars. Report the
    # neutral level instead. Warm-up rows are unaffected because NaN == 0 is
    # False, so they stay NaN.
    flat_window = (avg_gain == 0) & (avg_loss == 0)
    return rsi.mask(flat_window, 50.0)


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.pipeline import Pipeline


# ===== FEATURE ENGINEERING =====
# All derived columns are added in a single assign() call, in this order:
#   range, body      : bar geometry (high-low, close-open)
#   ret_1            : 1-bar percentage change of close
#   ma_5, ma_15      : rolling means of close
#   ret_2, ret_3     : 2- and 3-bar percentage changes (short-term momentum)
#   vol_5, vol_15    : rolling standard deviations of close (volatility regime)
#   dir_1            : 1 if ret_1 > 0 else 0 (direction of the latest bar)
#   rsi_14           : 14-period RSI of close
# Rows holding any NaN (e.g. rolling-window warm-up) are then removed and the
# index is rebuilt.
df = (
    df.assign(
        range=df['high'] - df['low'],
        body=df['close'] - df['open'],
        ret_1=df['close'].pct_change(),
        ma_5=df['close'].rolling(window=5).mean(),
        ma_15=df['close'].rolling(window=15).mean(),
        ret_2=df['close'].pct_change(2),
        ret_3=df['close'].pct_change(3),
        vol_5=df['close'].rolling(window=5).std(),
        vol_15=df['close'].rolling(window=15).std(),
        dir_1=lambda d: (d['ret_1'] > 0).astype(int),
        rsi_14=compute_rsi(df['close'], 14),
    )
    .dropna()
    .reset_index(drop=True)
)

# ===== FEATURE MATRIX (X) AND TARGET (y) =====
feature_cols = [
    'open', 'high', 'low', 'close',
    'volume', 'open_interest',
    'range', 'body',
    'ret_1', 'ret_2', 'ret_3',
    'ma_5', 'ma_15',
    'vol_5', 'vol_15',
    'dir_1', 'rsi_14',
]

X = df[feature_cols].values
y = df['target'].values

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# ===== TIME-BASED SPLIT =====
# First 80% of rows (by position) train, the remaining 20% test; no shuffling.
n = len(df)
split_idx = int(0.8 * n)

X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


# ========= 4. LOGISTIC REGRESSION MODEL WITH SCALING =========
# The scaler lives inside the pipeline, so it is fitted on the training rows
# only and then reused unchanged when predicting on the test rows.
log_reg_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
]).fit(X_train, y_train)

# ========= 5. EVALUATION ON TEST SET =========
y_pred = log_reg_clf.predict(X_test)

acc, prec, rec = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred, zero_division=0),
    recall_score(y_test, y_pred, zero_division=0),
)

print("\n==== Logistic Regression Performance on Test Set ====")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


Shape of X: (318955, 17)
Shape of y: (318955,)
Train size: 255164, Test size: 63791

==== Logistic Regression Performance on Test Set ====
Accuracy : 0.5086
Precision: 0.5052
Recall   : 0.5573

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.46      0.49     32083
           1       0.51      0.56      0.53     31708

    accuracy                           0.51     63791
   macro avg       0.51      0.51      0.51     63791
weighted avg       0.51      0.51      0.51     63791



In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the same train/test arrays as the logistic regression.
# 200 trees, depth capped at 8, fixed seed so reruns give the same forest.
rf_clf = RandomForestClassifier(
    n_estimators=200, max_depth=8, random_state=42
).fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_test)

# Headline metrics on the held-out rows (acc / prec / rec are overwritten here).
acc, prec, rec = (
    accuracy_score(y_test, y_pred_rf),
    precision_score(y_test, y_pred_rf, zero_division=0),
    recall_score(y_test, y_pred_rf, zero_division=0),
)

print("\n==== Random Forest Performance ====")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=0))



==== Random Forest Performance ====
Accuracy : 0.5050
Precision: 0.5022
Recall   : 0.4884

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.52      0.51     32083
           1       0.50      0.49      0.50     31708

    accuracy                           0.51     63791
   macro avg       0.50      0.50      0.50     63791
weighted avg       0.50      0.51      0.50     63791



In [None]:
!pip install xgboost




In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees on the same train/test arrays as the other models.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',        # histogram-based split finding for faster training
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,             # fraction of rows sampled per tree
    colsample_bytree=0.8,      # fraction of columns sampled per tree
    random_state=42,
)
xgb_clf.fit(X_train, y_train)

y_pred_xgb = xgb_clf.predict(X_test)

# Headline metrics on the held-out rows (acc / prec / rec are overwritten here).
acc, prec, rec = (
    accuracy_score(y_test, y_pred_xgb),
    precision_score(y_test, y_pred_xgb, zero_division=0),
    recall_score(y_test, y_pred_xgb, zero_division=0),
)

print("\n==== XGBoost Performance ====")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, zero_division=0))



==== XGBoost Performance ====
Accuracy : 0.5039
Precision: 0.5009
Recall   : 0.4997

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.51      0.51     32083
           1       0.50      0.50      0.50     31708

    accuracy                           0.50     63791
   macro avg       0.50      0.50      0.50     63791
weighted avg       0.50      0.50      0.50     63791



In [None]:
import numpy as np

# ========= 1. Rebuilding the test slice of df =========
# Same positional 80/20 cut that produced X_test, so row i of df_test
# corresponds to row i of X_test (df must not have changed since the split).
n = len(df)
split_idx = int(0.8 * n)

df_test = df.iloc[split_idx:].copy().reset_index(drop=True)

print("Test rows in df_test:", len(df_test), " | Test rows in X_test:", len(X_test))

# ========= 2. Getting predictions from BEST model (Logistic Regression) =========

y_pred_lr = log_reg_clf.predict(X_test)

df_test['pred_label'] = y_pred_lr   # 1 => expecting price to go UP, 0 => DOWN/SAME

# Converting prediction to model_call
df_test['model_call'] = np.where(df_test['pred_label'] == 1, 'buy', 'sell')

# ========= 3. Computing step cash flow, position and cumulative PnL =========
# Every bar trades one unit at that bar's close:
#   buy  -> pay close     (cash flow = -close, position + 1)
#   sell -> receive close (cash flow = +close, position - 1)
is_buy = df_test['model_call'] == 'buy'

# pnl_step is the cash flow of the single trade made on this bar.
df_test['pnl_step'] = np.where(is_buy, -df_test['close'], df_test['close'])

# Running inventory in units (positive = net long, negative = net short).
df_test['position'] = np.where(is_buy, 1, -1).cumsum()

# Cumulative PnL = cash collected so far + open inventory valued at the
# current close. Summing the cash flows alone is not a PnL: it leaves the
# open position unvalued, so a net-long book shows up as a huge loss equal
# to the purchase cost of the units still held.
df_test['model_pnl'] = (
    df_test['pnl_step'].cumsum() + df_test['position'] * df_test['close']
)


print("\nSample of test predictions with PnL:")
display(df_test[['timestamp', 'close', 'pred_label', 'model_call',
                 'pnl_step', 'position', 'model_pnl']].head(15))

# =========  Final cumulative PnL from the strategy =========

final_pnl = df_test['model_pnl'].iloc[-1]
print("\nFinal cumulative PnL from Logistic Regression strategy:", final_pnl)


Test rows in df_test: 63791  | Test rows in X_test: 63791

Sample of test predictions with PnL:


Unnamed: 0,timestamp,close,pred_label,model_call,pnl_step,model_pnl
0,2024-11-01 18:24:00+05:30,24331.3,1,buy,-24331.3,-24331.3
1,2024-11-01 18:25:00+05:30,24328.9,0,sell,24328.9,-2.4
2,2024-11-01 18:26:00+05:30,24341.25,1,buy,-24341.25,-24343.65
3,2024-11-01 18:27:00+05:30,24332.1,0,sell,24332.1,-11.55
4,2024-11-01 18:28:00+05:30,24328.4,1,buy,-24328.4,-24339.95
5,2024-11-01 18:29:00+05:30,24326.8,0,sell,24326.8,-13.15
6,2024-11-01 18:30:00+05:30,24329.75,1,buy,-24329.75,-24342.9
7,2024-11-01 18:31:00+05:30,24328.5,0,sell,24328.5,-14.4
8,2024-11-01 18:32:00+05:30,24329.9,1,buy,-24329.9,-24344.3
9,2024-11-01 18:33:00+05:30,24329.8,0,sell,24329.8,-14.5



Final cumulative PnL from Logistic Regression strategy: -151094855.4500012


In [None]:
# Count how many test bars carry each signal in df_test['model_call'].
num_buys, num_sells = (
    (df_test['model_call'] == side).sum() for side in ('buy', 'sell')
)

print("Number of BUY signals :", num_buys)
print("Number of SELL signals:", num_sells)


Number of BUY signals : 34979
Number of SELL signals: 28812
