#  Machine Learning Models Built

In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score


### Load the Cleaned Datase

In [2]:
# Load dataset
df = pd.read_csv("clean_stock_data.csv")

# Display dataset info
print(df.head())
print(df.info())


         date symbol    open    high     low   close    volume  change  \
0  2021-01-06   META  262.00  267.75  260.01  263.31  24354149    1.31   
1  2021-01-07   META  265.90  271.61  264.78  268.74  15789800    2.84   
2  2021-01-08   META  268.31  268.95  263.18  267.57  18528300   -0.74   
3  2021-01-11   META  260.48  263.47  255.90  256.84  30412300   -3.64   
4  2021-01-12   META  256.63  259.72  250.30  251.09  26449943   -5.54   

   changePercent      vwap  changeOverTime       label  unadjustedVolume  \
0         0.5000  263.2675        0.000000  2021-01-06          24354149   
1         1.0700  267.7575        0.020622  2021-01-07          15789800   
2        -0.2758  267.0025        0.016179  2021-01-08          18528300   
3        -1.4000  259.1725       -0.024572  2021-01-11          30412300   
4        -2.1600  254.4350       -0.046409  2021-01-12          26449943   

   adjClose  
0    263.31  
1    268.74  
2    267.57  
3    256.84  
4    251.09  
<class 'pandas

#### Feature Selection and Target Variable

In [3]:
# Select input features
features = [
    "open",
    "high",
    "low",
    "volume",
    "vwap",
    "change",
    "changePercent"
]

# Target variable
target = "close"

X = df[features]
y = df[target]


### Train-Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


#### Model 1: Linear Regression

In [6]:
# Initialize model
linear_model = LinearRegression()

# Train model
linear_model.fit(X_train, y_train)

# Predictions
lr_predictions = linear_model.predict(X_test)
lr_predictions

array([705.29944411, 329.14965095, 293.53940889, 313.01956723,
       546.28905008, 497.3691171 , 338.68955187, 109.45979562,
       504.78946633, 516.89922591, 567.35947313, 647.50910235,
       702.90935502, 129.71950599, 218.85971623, 531.92936839,
       214.74967153, 220.1790969 , 233.36972541, 727.04907143,
       577.15938053, 164.25943774, 717.83903681, 285.28967088,
       584.05910111, 159.92959602, 716.36912145, 147.06963694,
       305.20950865, 264.89963466, 126.75954503, 504.15946443,
       310.61949128, 169.1498272 , 306.18951959, 186.98970358,
       324.75955657, 476.1994165 , 382.17940862, 317.86935227,
       656.96936659, 294.36939072, 373.91943424, 354.699644  ,
       339.99457984, 694.05938896, 101.46923236, 269.44965059,
       342.95958769, 316.96956452, 300.30942934, 324.81957417,
       243.17955278, 172.03971933, 747.71884272, 619.31922054,
       597.98909541, 302.54961961, 485.5795644 , 167.10970169,
       171.1197448 , 220.34969907, 193.63963671, 318.35

#### Model 2: Random Forest Regressor

In [7]:
# Initialize model
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

# Train model
rf_model.fit(X_train, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test)
rf_predictions

array([713.5063 , 328.7199 , 291.23705, 311.6166 , 560.23435, 495.1784 ,
       338.32665, 111.13615, 504.48085, 513.86905, 568.0513 , 645.37575,
       704.98425, 130.4542 , 218.80455, 533.5338 , 213.1497 , 221.9144 ,
       232.8866 , 726.8364 , 576.81525, 166.3461 , 715.0882 , 287.1541 ,
       583.29725, 159.50475, 719.48025, 149.46035, 305.97015, 263.80375,
       129.25915, 503.8067 , 308.358  , 168.99425, 304.8326 , 185.1148 ,
       324.57845, 473.85275, 377.1799 , 315.16585, 657.509  , 290.63775,
       373.74135, 353.61505, 339.73585, 695.07375,  99.2135 , 270.4015 ,
       342.0216 , 318.3006 , 302.8164 , 324.0106 , 238.8691 , 170.6873 ,
       742.37195, 617.8371 , 593.10535, 301.71455, 489.19995, 167.86545,
       170.6068 , 218.28975, 195.97855, 316.88905, 292.5453 , 168.8176 ,
       513.97525, 326.76225, 766.6186 , 281.9032 , 623.1868 , 302.5553 ,
       247.2103 , 205.32845, 712.002  , 178.7132 , 216.9073 , 357.9738 ,
       148.41215, 656.39955, 207.04365, 213.37985, 

##### Model 3: Support Vector Regressor (SVR)

In [8]:
# Initialize model
svr_model = SVR()

# Train model
svr_model.fit(X_train, y_train)

# Predictions
svr_predictions = svr_model.predict(X_test)
svr_predictions

array([349.30106814, 378.53560972, 281.23659249, 324.6307381 ,
       281.73922184, 381.15752864, 346.18182746, 305.80291489,
       381.30556126, 382.04108642, 388.10875529, 357.86434211,
       371.57027872, 283.97358454, 358.25632189, 356.93175337,
       354.57805452, 308.07306324, 349.99991081, 379.64364099,
       381.27921681, 265.80910993, 382.43815521, 350.87261982,
       326.90498894, 247.56467868, 317.99654281, 282.4172474 ,
       330.6158269 , 365.46818696, 307.88148202, 384.76121377,
       297.31328047, 316.79003404, 357.3212704 , 276.32924859,
       320.16423334, 382.61456967, 382.40192321, 319.1377555 ,
       371.93370838, 258.63257023, 343.36566093, 379.13946956,
       338.04787978, 373.07704259, 310.59256166, 363.2265642 ,
       341.85908056, 350.08885037, 281.95392646, 359.44866979,
       276.97743401, 330.22044422, 377.38330773, 381.7883051 ,
       317.60959262, 365.33660135, 359.81571087, 294.37601651,
       314.88086053, 309.41530593, 292.68409092, 304.99

##### Model Evaluation

In [9]:
models = {
    "Linear Regression": lr_predictions,
    "Random Forest": rf_predictions,
    "SVR": svr_predictions
}

for model_name, preds in models.items():
    rmse = mean_squared_error(y_test, preds, squared=False)
    r2 = r2_score(y_test, preds)
    
    print(f"{model_name}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2 Score: {r2:.4f}")
    print("-" * 30)


Linear Regression
RMSE: 0.0020
R2 Score: 1.0000
------------------------------
Random Forest
RMSE: 3.0868
R2 Score: 0.9997
------------------------------
SVR
RMSE: 161.7404
R2 Score: 0.2127
------------------------------


## Model Evaluation and Best Model Selection

We evaluate the three regression models using standard regression metrics:

RMSE (Root Mean Squared Error) → lower is better

R² Score → closer to 1 is better

#### Evaluation Code

In [10]:
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Create evaluation table
evaluation_results = []

# Linear Regression evaluation
lr_rmse = mean_squared_error(y_test, lr_predictions, squared=False)
lr_r2 = r2_score(y_test, lr_predictions)

evaluation_results.append({
    "Model": "Linear Regression",
    "RMSE": lr_rmse,
    "R2 Score": lr_r2
})

# Random Forest evaluation
rf_rmse = mean_squared_error(y_test, rf_predictions, squared=False)
rf_r2 = r2_score(y_test, rf_predictions)

evaluation_results.append({
    "Model": "Random Forest Regressor",
    "RMSE": rf_rmse,
    "R2 Score": rf_r2
})

# SVR evaluation
svr_rmse = mean_squared_error(y_test, svr_predictions, squared=False)
svr_r2 = r2_score(y_test, svr_predictions)

evaluation_results.append({
    "Model": "Support Vector Regressor",
    "RMSE": svr_rmse,
    "R2 Score": svr_r2
})

# Convert to DataFrame
evaluation_df = pd.DataFrame(evaluation_results)

print(evaluation_df)


                      Model        RMSE  R2 Score
0         Linear Regression    0.002013  1.000000
1   Random Forest Regressor    3.086793  0.999713
2  Support Vector Regressor  161.740374  0.212682


### Best Model Selection

In [11]:
# Select best model based on lowest RMSE
best_model = evaluation_df.loc[evaluation_df["RMSE"].idxmin()]

print("Best Model Selected:")
print(best_model)


Best Model Selected:
Model       Linear Regression
RMSE                 0.002013
R2 Score                  1.0
Name: 0, dtype: object


#### Save the Best Model

In [12]:
import joblib

# Save Linear Regression model
joblib.dump(linear_model, "best_stock_price_model.pkl")

print("Model saved successfully!")


Model saved successfully!


In [17]:
# Load trained model
model = joblib.load("best_stock_price_model.pkl")
print(model)

LinearRegression()
