# Live Stock Price Prediction

## Tasks:

1. Write main.py

## Import Libraries

In [22]:
import pandas as pd
import numpy as np
import joblib
import time
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

## Load data

In [102]:
# Read the raw minute-level bars from disk; the trailing bare name makes the
# notebook render the resulting frame.
csv_path = "SWIGGY_minute.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,date,open,high,low,close,volume
0,2024-11-13 09:15:00,364.0,364.0,364.0,364.0,0
1,2024-11-13 09:38:00,420.0,420.0,420.0,420.0,2037114
2,2024-11-13 09:39:00,420.0,420.0,420.0,420.0,0
3,2024-11-13 09:42:00,420.0,420.0,420.0,420.0,0
4,2024-11-13 09:45:00,420.0,420.0,420.0,420.0,0
...,...,...,...,...,...,...
68239,2025-08-06 16:44:00,392.7,392.7,392.7,392.7,0
68240,2025-08-06 17:54:00,392.7,392.7,392.7,392.7,0
68241,2025-08-06 17:57:00,392.7,392.7,392.7,392.7,0
68242,2025-08-06 18:13:00,392.7,392.7,392.7,392.7,0


## Clean Data

### Convert date column to index

In [103]:
# Parse the timestamp strings into datetimes, then promote that column to
# the frame's index in a single chained expression.
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
df

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-11-13 09:15:00,364.0,364.0,364.0,364.0,0
2024-11-13 09:38:00,420.0,420.0,420.0,420.0,2037114
2024-11-13 09:39:00,420.0,420.0,420.0,420.0,0
2024-11-13 09:42:00,420.0,420.0,420.0,420.0,0
2024-11-13 09:45:00,420.0,420.0,420.0,420.0,0
...,...,...,...,...,...
2025-08-06 16:44:00,392.7,392.7,392.7,392.7,0
2025-08-06 17:54:00,392.7,392.7,392.7,392.7,0
2025-08-06 17:57:00,392.7,392.7,392.7,392.7,0
2025-08-06 18:13:00,392.7,392.7,392.7,392.7,0


### Clean missing values

We forward-fill gaps of at most two consecutive missing minutes. Any rows that are still missing after that are dropped.

In [104]:
# Order by timestamp, reindex onto a regular one-minute grid (minutes absent
# from the data become NaN rows), forward-fill at most two consecutive NaN
# rows, and drop whatever is still missing.
df = (
    df.sort_index()
    .asfreq('1min')
    .ffill(limit=2)
    .dropna()
)
# Persist the cleaned frame.
df.to_csv('clean_data2.csv')

## Convert Data

Now, we will convert the "close" column into sliding window type data.\
Sliding window type data can be explained using an example:\
[1,2,3] -> 4\
[2,3,4] -> 5\
[3,4,5] -> 6\
...

Here, the arrays are features and numbers after '->' are labels.

Note: Windows are built separately for each calendar day, so no window spans two different days.

In [105]:
# Build (window -> next value) samples from the "close" column, one calendar
# day at a time so that no window straddles two days.
#
# Each sample uses WINDOW consecutive closes as features and the close that
# immediately follows them as the label. Days with WINDOW or fewer rows
# contribute no samples.
WINDOW = 3

grp_obj = df.groupby(df.index.date)
features = []
labels = []
for _, day in grp_obj:
    # Work on a plain NumPy array: integer indexing is then unambiguously
    # positional. Indexing the datetime-indexed Series with `series[int]`
    # relies on a deprecated positional fallback in pandas.
    closes = day['close'].to_numpy()
    for start in range(len(closes) - WINDOW):
        stop = start + WINDOW
        features.append(closes[start:stop])
        labels.append(closes[stop])
features = np.array(features)
labels = np.array(labels)

  features.append([arr[a],arr[b],arr[c]])
  labels.append(arr[d])


## Split Data

In [106]:
# Chronological 80/20 split: the first 80% of the samples are used for
# training and the remainder for testing (no shuffling).
split_len = int(len(features) * 0.8)
train_features, test_features = features[:split_len], features[split_len:]
train_labels, test_labels = labels[:split_len], labels[split_len:]

# Bundle each split as a (features, labels) tuple and persist it.
train = (train_features, train_labels)
test = (test_features, test_labels)
joblib.dump(train, 'train.pkl')
joblib.dump(test, 'test.pkl')

['test.pkl']

## Load Training Data

In [3]:
# Reload the persisted training split and unpack it into features and labels.
features, labels = train = joblib.load('train.pkl')

## Scaling

We will standardise the features (zero mean, unit variance per column) using sklearn's StandardScaler.

In [4]:
# Learn per-column scaling statistics from the training features only, then
# apply them to those same features.
scaler = StandardScaler().fit(features)
arr = scaler.transform(features)
arr

array([[-0.45735943, -0.45734495, -0.4573356 ],
       [-0.45735943, -0.45734495,  0.21269199],
       [-0.45735943,  0.21265619,  0.21269199],
       ...,
       [-0.56922245, -0.57220229, -0.57159924],
       [-0.57221344, -0.57160408, -0.57339395],
       [-0.57161524, -0.57339872, -0.57040276]])

## Evaluate Performance

### 1. Baseline

We first measure how well a dumb guess performs: next minute's price = last seen price. If a model can't beat this, it's not worth training. This is the naive forecast technique: the last value of each test feature window is used as the baseline prediction.

In [5]:
# Reload the persisted test split and scale its features with the scaler
# that was fitted on the training features.
test_features, test_labels = test = joblib.load('test.pkl')
arr2 = scaler.transform(test_features)
arr2

array([[-0.57340984, -0.57040765, -0.571001  ],
       [-0.57041885, -0.57100586, -0.57339395],
       [-0.57101705, -0.57339872, -0.57279571],
       ...,
       [-0.11399378, -0.11396937, -0.11394646],
       [-0.11399378, -0.11396937, -0.11394646],
       [-0.11399378, -0.11396937, -0.11394646]])

#### 1. Linear Regression

In [24]:
# Fit ordinary least squares on the scaled training windows and predict the
# scaled test windows.
lrg = LinearRegression().fit(arr, labels)
predictions = lrg.predict(arr2)

# Naive baseline: the last (unscaled) value of each test window.
baseline = test_features[:, -1]

# Per-sample absolute errors for the baseline and for the model.
baseline_error = np.abs(test_labels - baseline)
lrg_error = np.abs(test_labels - predictions)

# Mean absolute errors, baseline first, one per line.
print(baseline_error.mean(), lrg_error.mean(), sep='\n')

0.2692061976795414
0.26906061433576717


#### 2. Random Forests

In [25]:
# Fit a 200-tree random forest on the unscaled training windows and predict
# the unscaled test windows.
rfg = RandomForestRegressor(n_estimators=200, random_state=42).fit(features, labels)
predictions = rfg.predict(test_features)

# Per-sample absolute errors for the naive baseline and for the forest.
baseline_error = np.abs(test_labels - baseline)
rfg_error = np.abs(test_labels - predictions)

# Mean absolute errors: baseline, then model.
print(baseline_error.mean(), rfg_error.mean())

0.2692061976795414 0.44662859531328314


#### 3. Ridge Regressor

In [28]:
# Fit ridge regression (alpha = 1.0) on the scaled training windows and
# predict the scaled test windows.
ridge = Ridge(alpha=1.0).fit(arr, labels)
predictions = ridge.predict(arr2)

# Per-sample absolute errors for the naive baseline and for the model.
baseline_error = np.abs(test_labels - baseline)
ridge_error = np.abs(test_labels - predictions)

# Mean absolute errors: baseline, then model.
print(baseline_error.mean(), ridge_error.mean())

0.2692061976795414 0.2709214683334227


### 2. MSE

#### 1. Linear Regression

In [5]:
# Refit linear regression on the scaled training windows and score it on the
# scaled test windows.
lrg = LinearRegression().fit(arr, labels)
predictions = lrg.predict(arr2)

# Mean of the squared residuals (no square root is taken).
residuals = test_labels - predictions
lrg_error = (residuals ** 2).mean()
lrg_error

np.float64(0.20137917346042075)

#### 2. Random Forests

In [8]:
# Refit the 200-tree random forest on the unscaled training windows, using
# all available cores, and score it on the unscaled test windows.
rfg = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(features, labels)
predictions = rfg.predict(test_features)

# Mean of the squared residuals (no square root is taken).
residuals = test_labels - predictions
rfg_error = (residuals ** 2).mean()
rfg_error

np.float64(0.3891968366431846)

#### 3. Ridge Regression

In [9]:
# Refit ridge regression (alpha = 1.0) on the scaled training windows and
# score it on the scaled test windows.
rdg = Ridge(alpha=1.0).fit(arr, labels)
predictions = rdg.predict(arr2)

# Mean of the squared residuals (no square root is taken).
residuals = test_labels - predictions
rdg_error = (residuals ** 2).mean()
rdg_error

np.float64(0.20599582056673205)

## Conclusion

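We can conclude that Linear Regression performed best overall in this evaluation. It beat the Ridge regressor by a very small margin and the Random Forest regressor by a considerable margin. Note, however, that its mean absolute error (0.26906) is only marginally lower than the naive baseline's (0.26921).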

## Simulate Live

We will write code to simulate a live experience using the test data. It will print one prediction, alongside the actual price, every 3 seconds.

In [21]:
# Refit the linear model on the scaled training windows, then replay the
# test set one sample at a time as if prices were arriving live.
lrg = LinearRegression().fit(arr, labels)
for i, actual in enumerate(test_labels):
    # Stop after five samples to keep the demo output short.
    if i == 5:
        print("That's enough!")
        break
    # predict() expects a 2-D input, so wrap the single scaled window in a list.
    predict = lrg.predict([arr2[i]])[0]
    print(f"Prediction: {round(predict, 2)}\nActual price: {actual}")
    # Pause to mimic the delay between live ticks.
    time.sleep(3)
    print("\n")

Prediction: 354.5
Actual price: 354.3


Prediction: 354.31
Actual price: 354.35


Prediction: 354.35
Actual price: 354.45


Prediction: 354.45
Actual price: 354.5


Prediction: 354.5
Actual price: 354.5


That's enough!
