In [1]:
# model_training.ipynb

# =========================
# 1. Import Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

import pickle

In [2]:
# =========================
# 2. Load Data
# =========================
# Suppose your CSV has columns: Date, Open, High, Low, Close, Shares Traded, Turnover
df = pd.read_csv('/content/NIFTY 500-02-01-2024-to-02-01-2025.csv')  # Adjust path if needed

# Quick look
print(df.head())

         Date     Open      High       Low     Close   Shares Traded   \
0  02-JAN-2024  19496.8  19499.80  19271.65  19418.40    4.059854e+09   
1  03-JAN-2024  19434.3  19442.20  19343.15  19367.15    3.635617e+09   
2  04-JAN-2024  19451.8  19560.40  19433.50  19541.25    3.245963e+09   
3  05-JAN-2024  19602.1  19637.10  19490.95  19590.60    4.034292e+09   
4  08-JAN-2024  19636.0  19641.85  19400.60  19417.70    3.520353e+09   

   Turnover (₹ Cr)  
0         79033.69  
1         89160.29  
2         86566.76  
3         84305.85  
4         71596.21  


In [3]:
# Check the column names and clean them
df.columns = df.columns.str.strip()  # Remove any extra spaces
print("Column Names:", df.columns)  # Debugging step

# Rename columns explicitly if necessary
if 'Date' not in df.columns:
    df.rename(columns={df.columns[0]: 'Date'}, inplace=True)  # Ensure 'Date' is correct

# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%Y')  # Adjust format if needed

# Print first few rows to verify
print(df.head())

Column Names: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Shares Traded',
       'Turnover (₹ Cr)'],
      dtype='object')
        Date     Open      High       Low     Close  Shares Traded  \
0 2024-01-02  19496.8  19499.80  19271.65  19418.40   4.059854e+09   
1 2024-01-03  19434.3  19442.20  19343.15  19367.15   3.635617e+09   
2 2024-01-04  19451.8  19560.40  19433.50  19541.25   3.245963e+09   
3 2024-01-05  19602.1  19637.10  19490.95  19590.60   4.034292e+09   
4 2024-01-08  19636.0  19641.85  19400.60  19417.70   3.520353e+09   

   Turnover (₹ Cr)  
0         79033.69  
1         89160.29  
2         86566.76  
3         84305.85  
4         71596.21  


In [4]:
# =========================
# 3. Preprocessing
# =========================
# Example approach: Predict 'Close' from previous columns
# Drop rows with missing values
df.dropna(inplace=True)

# Convert Date to datetime if needed
df['Date'] = pd.to_datetime(df['Date'])

# Sort by date if not already
df.sort_values(by='Date', inplace=True)

# For a simple approach, let's just pick 'Close' as y and maybe use 'Open', 'High', 'Low', 'Shares Traded', 'Turnover' as features.
features = ['Open', 'High', 'Low', 'Shares Traded', 'Turnover (₹ Cr)']  # Adjust to your actual column names
target = 'Close'

X = df[features].values
y = df[target].values

# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, shuffle=False
)


In [5]:
# =========================
# 4. Linear Regression
# =========================
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

# Evaluate
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression MSE:", mse_lr)
print("Linear Regression R2:", r2_lr)


Linear Regression MSE: 6197.864474850521
Linear Regression R2: 0.9614536852891495


In [6]:
# =========================
# 5. LSTM Model
# =========================
# For an LSTM, we typically use time-series sequences.
# We'll do a simple approach: each sample is one day, predicting the next day.
# More advanced approach: create sequences of N days to predict day N+1.

# Let's demonstrate a naive approach: shift 'Close' by 1 day.
# (In real scenarios, you'd create sequences of length T.)

sequence_length = 1  # simple approach

# We already have X_scaled. Let's reshape for LSTM: (samples, time_steps, features)
X_train_lstm = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm  = np.reshape(X_test,  (X_test.shape[0], 1, X_test.shape[1]))

model_lstm = Sequential()
model_lstm.add(LSTM(64, input_shape=(1, X_train.shape[1]), activation='relu', return_sequences=False))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(1))  # predicting a single value (Close)

model_lstm.compile(loss='mean_squared_error', optimizer='adam')
model_lstm.summary()

# Train
history = model_lstm.fit(
    X_train_lstm, y_train,
    epochs=500,
    batch_size=32,
    validation_split=0.1,
    shuffle=False
)

# Predict
y_pred_lstm = model_lstm.predict(X_test_lstm)

mse_lstm = mean_squared_error(y_test, y_pred_lstm)
r2_lstm = r2_score(y_test, y_pred_lstm)
print("LSTM MSE:", mse_lstm)
print("LSTM R2:", r2_lstm)

  super().__init__(**kwargs)


Epoch 1/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 308ms/step - loss: 427735136.0000 - val_loss: 568179456.0000
Epoch 2/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 427733856.0000 - val_loss: 568177344.0000
Epoch 3/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 427732704.0000 - val_loss: 568175360.0000
Epoch 4/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 427731584.0000 - val_loss: 568173376.0000
Epoch 5/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 427730336.0000 - val_loss: 568171264.0000
Epoch 6/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 427729056.0000 - val_loss: 568169024.0000
Epoch 7/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 427727616.0000 - val_loss: 568166400.0000
Epoch 8/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [7]:
# =========================
# 6. Compare Results
# =========================
print(f"Linear Regression -> MSE: {mse_lr}, R2: {r2_lr}")
print(f"LSTM -> MSE: {mse_lstm}, R2: {r2_lstm}")

# =========================
# 7. Save Models
# =========================
# (A) Save Linear Regression model as .pkl
with open('linear_regression_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

# (B) Save LSTM model as .h5
model_lstm.save('lstm_model.h5')



Linear Regression -> MSE: 6197.864474850521, R2: 0.9614536852891495
LSTM -> MSE: 450529008.19087434, R2: -2800.970421676828


In [8]:
# =========================
# 7. Save Models and Scaler
# =========================
# (A) Save Linear Regression model as .pkl
with open('linear_regression_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

# (B) Save LSTM model as .h5
model_lstm.save('lstm_model.h5')

# (C) Save the scaler (fitted on the training data)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


