#  Machine Learning Models Built

In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score


### Load the Cleaned Datase

In [2]:
# Load dataset
df = pd.read_csv("clean_stock_data.csv")

# Display dataset info
print(df.head())
print(df.info())


         date symbol    open    high     low   close    volume  change  \
0  2021-01-14   META  253.40  255.03  244.61  245.64  29739404   -7.76   
1  2021-01-15   META  247.90  253.86  247.16  251.36  24942930    3.46   
2  2021-01-19   META  256.90  262.20  252.72  261.10  28028546    4.20   
3  2021-01-20   META  268.93  270.32  263.60  267.48  25199919   -1.45   
4  2021-01-21   META  269.26  273.60  267.49  272.87  20838700    3.61   

   changePercent      vwap  changeOverTime       label  unadjustedVolume  \
0       -3.06000  249.6700        0.000000  2021-01-14          29739404   
1        1.40000  250.0700        0.023286  2021-01-15          24942930   
2        1.63000  258.2300        0.062938  2021-01-19          28028546   
3       -0.53917  267.5825        0.088911  2021-01-20          25199919   
4        1.34000  270.8050        0.110853  2021-01-21          20838700   

   adjClose  
0    245.64  
1    251.36  
2    261.10  
3    267.48  
4    272.87  
<class 'pandas

#### Feature Selection and Target Variable

In [3]:
# Select input features
features = [
    "open",
    "high",
    "low",
    "volume",
    "vwap",
    "change",
    "changePercent"
]

# Target variable
target = "close"

X = df[features]
y = df[target]


### Train-Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


#### Model 1: Linear Regression

In [5]:
# Initialize model
linear_model = LinearRegression()

# Train model
linear_model.fit(X_train, y_train)

# Predictions
lr_predictions = linear_model.predict(X_test)
lr_predictions

array([732.16881766, 332.45964815, 287.99972942, 306.17974799,
       484.66945222, 519.82904105, 338.02964182, 113.92986514,
       536.31936409, 494.1694402 , 584.77954212, 664.93914892,
       714.7992982 ,  95.19923938, 207.55980814, 521.30939598,
       217.88983906, 216.53955594, 242.4897059 , 705.29951316,
       559.13943559, 155.84960717, 712.06913778, 290.52963777,
       602.5793347 , 168.52985493, 673.6986677 , 191.62953232,
       285.08968123, 278.61974544, 129.71951426, 510.59900176,
       294.46967124, 148.01990303, 287.59977651, 212.02957079,
       328.68929177, 468.83955736, 373.91950671, 333.73951214,
       649.49938979, 313.40955067, 345.96457894, 352.08963512,
       312.21961986, 697.22935339, 111.44984633, 269.39472172,
       341.05950619, 288.34910039, 300.82955879, 325.27951662,
       233.36979463, 185.24954109, 751.10890842, 597.18956944,
       582.35934014, 298.95966893, 519.24958935, 180.88988252,
       174.52987376, 212.88980676, 196.63979364, 302.54

#### Model 2: Random Forest Regressor

In [6]:
# Initialize model
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

# Train model
rf_model.fit(X_train, y_train)

# Predictions
rf_predictions = rf_model.predict(X_test)
rf_predictions

array([729.9398   , 332.6274   , 290.83705  , 305.9546   , 484.4213   ,
       515.6069   , 336.51805  , 113.88985  , 535.17235  , 491.998    ,
       590.4583   , 661.3572   , 718.4339   ,  95.6473   , 209.53065  ,
       519.7453   , 218.47605  , 214.7932   , 241.8442   , 712.8446   ,
       562.3937   , 158.21505  , 712.3134   , 292.75315  , 609.4573   ,
       168.4623   , 670.53605  , 190.0397   , 288.94415  , 281.8049   ,
       130.5199   , 508.44275  , 294.9485   , 146.5493   , 290.49835  ,
       210.91405  , 327.3262   , 468.6299   , 373.84155  , 334.35365  ,
       655.2183   , 313.1674   , 345.6912   , 353.1064   , 316.7402   ,
       699.67085  , 111.4221   , 270.7374   , 339.782    , 287.35785  ,
       298.37385  , 324.6627   , 233.595    , 180.02165  , 747.6298   ,
       610.05915  , 582.3255   , 297.86295  , 524.13185  , 179.94735  ,
       173.85215  , 211.99535  , 196.40175  , 302.3174   , 312.51525  ,
       159.6408   , 527.91295  , 318.01535  , 780.95195  , 283.1

##### Model 3: Support Vector Regressor (SVR)

In [7]:
# Initialize model
svr_model = SVR()

# Train model
svr_model.fit(X_train, y_train)

# Predictions
svr_predictions = svr_model.predict(X_test)
svr_predictions

array([389.42867371, 370.63100716, 346.79660699, 371.5488167 ,
       354.56470802, 379.37420385, 351.71792169, 278.25544812,
       378.14009226, 375.79122585, 376.63989723, 390.8293532 ,
       381.55902445, 312.04634154, 336.2461564 , 388.51976022,
       375.3747744 , 254.76182463, 323.87524996, 349.76317429,
       388.49657183, 257.1019647 , 389.01960011, 297.77075159,
       383.59470499, 339.59707059, 364.03954426, 256.36338627,
       307.15252661, 338.98298996, 287.35223016, 376.18954429,
       333.84691086, 317.86408907, 371.84717879, 252.10519887,
       253.63649375, 377.89079569, 343.97330322, 328.41334887,
       357.839448  , 274.73520529, 320.0922843 , 379.20584819,
       276.03511174, 385.08949989, 259.14264167, 359.82238943,
       368.95315626, 292.11385263, 337.55124886, 296.57172528,
       350.4450993 , 254.83098856, 393.96303892, 349.19348734,
       329.42726177, 334.89054496, 370.52709433, 356.23075566,
       347.48254914, 345.32883853, 317.26665716, 303.09

##### Model Evaluation

In [8]:
models = {
    "Linear Regression": lr_predictions,
    "Random Forest": rf_predictions,
    "SVR": svr_predictions
}

for model_name, preds in models.items():
    rmse = mean_squared_error(y_test, preds, squared=False)
    r2 = r2_score(y_test, preds)
    
    print(f"{model_name}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2 Score: {r2:.4f}")
    print("-" * 30)


Linear Regression
RMSE: 0.0025
R2 Score: 1.0000
------------------------------
Random Forest
RMSE: 2.8518
R2 Score: 0.9998
------------------------------
SVR
RMSE: 159.6256
R2 Score: 0.2451
------------------------------


## Model Evaluation and Best Model Selection

We evaluate the three regression models using standard regression metrics:

RMSE (Root Mean Squared Error) → lower is better

R² Score → closer to 1 is better

#### Evaluation Code

In [9]:
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Create evaluation table
evaluation_results = []

# Linear Regression evaluation
lr_rmse = mean_squared_error(y_test, lr_predictions, squared=False)
lr_r2 = r2_score(y_test, lr_predictions)

evaluation_results.append({
    "Model": "Linear Regression",
    "RMSE": lr_rmse,
    "R2 Score": lr_r2
})

# Random Forest evaluation
rf_rmse = mean_squared_error(y_test, rf_predictions, squared=False)
rf_r2 = r2_score(y_test, rf_predictions)

evaluation_results.append({
    "Model": "Random Forest Regressor",
    "RMSE": rf_rmse,
    "R2 Score": rf_r2
})

# SVR evaluation
svr_rmse = mean_squared_error(y_test, svr_predictions, squared=False)
svr_r2 = r2_score(y_test, svr_predictions)

evaluation_results.append({
    "Model": "Support Vector Regressor",
    "RMSE": svr_rmse,
    "R2 Score": svr_r2
})

# Convert to DataFrame
evaluation_df = pd.DataFrame(evaluation_results)

print(evaluation_df)


                      Model        RMSE  R2 Score
0         Linear Regression    0.002453  1.000000
1   Random Forest Regressor    2.851793  0.999759
2  Support Vector Regressor  159.625626  0.245104


### Best Model Selection

In [10]:
# Select best model based on lowest RMSE
best_model = evaluation_df.loc[evaluation_df["RMSE"].idxmin()]

print("Best Model Selected:")
print(best_model)


Best Model Selected:
Model       Linear Regression
RMSE                 0.002453
R2 Score                  1.0
Name: 0, dtype: object


#### Save the Best Model

In [11]:
import joblib

# Save Linear Regression model
joblib.dump(linear_model, "best_stock_price_model.pkl")

print("Model saved successfully!")


Model saved successfully!


In [12]:
# Load trained model
model = joblib.load("best_stock_price_model.pkl")
print(model)

LinearRegression()
