Import the neccessary libraries

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error,r2_score

Data Loading

In [68]:
# This block of code handles loading the dataset from a CSV file and preparing it for the models
file_path = "sp500_stocks.csv"

# Check if the file exists before attempting to read it
if not os.path.exists(file_path):
    print(f"Error: The file '{file_path} was not found")
    print("Please make sure the file is in the same directory as the script")
else:
    try:
        # Read the CSV file into a Pandas DataFrame
        df = pd.read_csv("sp500_stocks.csv")
        print("Data loaded successfully!")
    except ValueError:
        print("An Error Occurred")

print("First 5 rows")
print(df.head())
print("Data Information")
print(df.info())

Data loaded successfully!
First 5 rows
         Date Symbol  Adj Close  Close  High  Low  Open  Volume
0  2010-01-04    MMM        NaN    NaN   NaN  NaN   NaN     NaN
1  2010-01-05    MMM        NaN    NaN   NaN  NaN   NaN     NaN
2  2010-01-06    MMM        NaN    NaN   NaN  NaN   NaN     NaN
3  2010-01-07    MMM        NaN    NaN   NaN  NaN   NaN     NaN
4  2010-01-08    MMM        NaN    NaN   NaN  NaN   NaN     NaN
Data Information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1891536 entries, 0 to 1891535
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Date       object 
 1   Symbol     object 
 2   Adj Close  float64
 3   Close      float64
 4   High       float64
 5   Low        float64
 6   Open       float64
 7   Volume     float64
dtypes: float64(6), object(2)
memory usage: 115.5+ MB
None


Data Preprocessing

In [69]:
# Check for missing values
df_missing = df.isnull().sum()
print("Missing Values")
print(df_missing)

# Check for duplicated rows
df_duplicated = df.duplicated().sum()
print("Duplicated Values")
print(df_duplicated)

# Drop rows with missing values
df.dropna(inplace=True)

# Check for Missing Values
df_missing = df.isnull().sum()
print("Missing Values")
print(df_missing)

# Convert the "Date" column to datetime objects
df["Date"] = pd.to_datetime(df["Date"])
print(df.info())
#Symbol	Adj Close	Close	High	Low	Open	Volume

# Rename the columns for Clarity
df.rename(columns={
    "Date": "date",
    "Symbol": "symbol",
    "Adj Close": "adj_close",
    "Close": "close",
    "High": "high",
    "Low": "low",
    "Open": "open",
    "Volume": "volume"
}, inplace=True)
print(df.info())

Missing Values
Date               0
Symbol             0
Adj Close    1273705
Close        1273705
High         1273705
Low          1273705
Open         1273705
Volume       1273705
dtype: int64
Duplicated Values
0
Missing Values
Date         0
Symbol       0
Adj Close    0
Close        0
High         0
Low          0
Open         0
Volume       0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 617831 entries, 3768 to 1876463
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       617831 non-null  datetime64[ns]
 1   Symbol     617831 non-null  object        
 2   Adj Close  617831 non-null  float64       
 3   Close      617831 non-null  float64       
 4   High       617831 non-null  float64       
 5   Low        617831 non-null  float64       
 6   Open       617831 non-null  float64       
 7   Volume     617831 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(1

Features Engineering

In [70]:
# We will focus on a single company for this prediction task. Let's use Apple (AAPL)
company_symbol = "AAPL"
company_data = df[df["symbol"] == company_symbol].copy()

# Drop rows with any missing values, which are common in financial data
# We are only interested in the features needed for the Prediction
features = ["open","high","low","volume","close"]
company_data.dropna(subset=features,inplace=True)

# Create the target variable: the next day's closing price. We do this by shifting the "Close" column up by one row
# The last row will now have a NaN, so we drop it
company_data["next_day_close"] = company_data["close"].shift(-1)
#company_data.drop(company_symbol.tail(1).index,inplace=True)

Data Spiltting