Import the necessary libraries

In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt

Data Loading

In [85]:
# Load the raw daily price history into `df`.
# The path is relative, so 'meta.csv' must be in the current working directory.
try:
    df = pd.read_csv('meta.csv')
except FileNotFoundError:
    print("Error: The file 'meta.csv' was not found. Please make sure it is in the same directory as the script.")
    # Re-raise rather than calling exit(): exit() stops the kernel/interpreter
    # and discards the traceback, whereas re-raising halts execution here
    # (including "Run All") while keeping the session and the error visible.
    raise
else:
    # Only reached when the read succeeded, so the try block guards nothing
    # but the file access itself.
    print("Data loaded successfully!")
    print("\nFirst 5 rows of the dataset:")
    print(df.head().to_string())

Data loaded successfully!

First 5 rows of the dataset:
         Date       Open       High        Low      Close  Adj Close     Volume
0  2012-05-18  42.049999  45.000000  38.000000  38.230000  38.230000  573576400
1  2012-05-21  36.529999  36.660000  33.000000  34.029999  34.029999  168192700
2  2012-05-22  32.610001  33.590000  30.940001  31.000000  31.000000  101786600
3  2012-05-23  31.370001  32.500000  31.360001  32.000000  32.000000   73600000
4  2012-05-24  32.950001  33.209999  31.770000  33.029999  33.029999   50237200


Data Preprocessing and Feature Engineering

In [86]:
# Inspect the raw frame: column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Check for missing values (per-column count of nulls)
df_missing = df.isnull().sum()
print(df_missing)

# Check for duplicates (number of fully duplicated rows)
df_duplicates = df.duplicated().sum()
print(df_duplicates)

# Rename the columns to snake_case for clarity and consistency.
# Assignment form is used instead of inplace=True so the step reads as
# "new frame from old frame" and stays safe to re-run.
df = df.rename(columns={
    "Date": "date",
    "Open": "open",
    "High": "high",
    "Low": "low",
    "Close": "close",
    "Adj Close": "adj_close",
    "Volume": "volume",
})

# Display the first rows and the frame information after renaming
print(df.head().to_string())
df.info()

# Create a new feature called "average_price": the mean of the "open",
# "high" and "low" prices of each row.
# This new feature might have a strong relationship with the "close" price.
# NOTE(review): average_price is an exact linear combination of open/high/low,
# so it carries no independent information for a linear model — confirm that
# keeping it alongside those three columns is intended.
df["average_price"] = (df["open"] + df["high"] + df["low"]) / 3

# Select the columns that will be our features (X) and our target (y).
# The target variable is what we want to predict: the "close" price.
# The features are the input variables the model will use to make predictions.
features = ["open", "high", "low", "volume", "average_price"]
target = "close"
X = df[features]
y = df[target]

print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906 entries, 0 to 2905
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2906 non-null   object 
 1   Open       2906 non-null   float64
 2   High       2906 non-null   float64
 3   Low        2906 non-null   float64
 4   Close      2906 non-null   float64
 5   Adj Close  2906 non-null   float64
 6   Volume     2906 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 159.1+ KB
None
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64
0
         date       open       high        low      close  adj_close     volume
0  2012-05-18  42.049999  45.000000  38.000000  38.230000  38.230000  573576400
1  2012-05-21  36.529999  36.660000  33.000000  34.029999  34.029999  168192700
2  2012-05-22  32.610001  33.590000  30.940001  31.000000  31.000000  101786600
3  2012-05-23  31.370001 

Data Splitting