# Import dependencies

Let's import the necessary packages and modules.

In [1]:
import os
import sys
import yfinance as yf
import pandas as pd
from datetime import date, timedelta 
# Lift pandas' display limits so previews are never truncated:
# None means "no cap" for both the column and the row count.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Fetching and Saving Data

Let's start by fetching stock price data using the `yfinance` library and saving it as `stock_data.parquet` in the `data` directory.


## Step 1: Create the directory structure if it doesn't exist

In [2]:
# Step 1: Resolve the project root (the parent of the current working
# directory) and make sure its "data" sub-directory exists.
root_dir = os.path.abspath(os.pardir)
data_dir = os.path.join(root_dir, "data")
os.makedirs(data_dir, exist_ok=True)  # no error if the folder is already there

## Step 2: Define the stock ticker and time period for data extraction

In [3]:
# Symbol to download (Apple Inc.) and the date window passed to yfinance.
ticker = "AAPL"
start_date, end_date = "2015-01-01", "2023-01-01"

## Step 3: Fetch stock data using the yfinance library

In [4]:
# Download daily price history for `ticker`.
# auto_adjust=False is passed explicitly because the rest of the notebook
# relies on the separate "Adj Close" column; newer yfinance releases default
# to auto_adjust=True, which drops that column.
stock_data = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

# Fail fast instead of silently saving an empty file if nothing was returned.
if stock_data.empty:
    raise ValueError(
        f"No data returned for ticker '{ticker}' between {start_date} and {end_date}."
    )

stock_data.head()

[*********************100%***********************]  1 of 1 completed


Price,Adj Close,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2015-01-02,24.347168,27.3325,27.860001,26.8375,27.8475,212818400
2015-01-05,23.661274,26.5625,27.1625,26.352501,27.0725,257142000
2015-01-06,23.663498,26.565001,26.8575,26.157499,26.635,263188400
2015-01-07,23.995317,26.9375,27.049999,26.674999,26.799999,160423600
2015-01-08,24.917269,27.9725,28.0375,27.174999,27.307501,237458000


## Step 4: Flatten column names (remove multi-index)

In [5]:
# Collapse the (Price, Ticker) MultiIndex into single-level names such as
# "Adj_Close_AAPL". Empty level values are skipped so a column like
# ("Date", "") becomes "Date" rather than "Date_", and levels are passed
# through str() so non-string labels do not break the join.
if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = [
        '_'.join(
            part for part in (str(level).strip() for level in col) if part
        ).replace(' ', '_')
        for col in stock_data.columns
    ]

## Step 4 (continued): Reset the index so that `Date` becomes a regular column

In [6]:
# Move the date index into a regular column.
# The guard makes the cell safe to re-run: once the index has been reset it is
# a plain RangeIndex, and resetting again would insert a spurious "index" column.
if not isinstance(stock_data.index, pd.RangeIndex):
    stock_data = stock_data.reset_index()

## Step 5: Convert the "Date" column to datetime format if not already

In [7]:
# Parse the "Date" column into pandas datetimes and store it back in place.
parsed_dates = pd.to_datetime(stock_data['Date'])
stock_data['Date'] = parsed_dates

## Step 6: Save the data as a Parquet file in the `data` directory

In [8]:
# Write the prepared frame to <data_dir>/stock_data.parquet.
# index=False keeps the row index out of the file.
parquet_file_path = os.path.join(data_dir, "stock_data.parquet")
stock_data.to_parquet(path=parquet_file_path, index=False)

## Step 7: Confirm the data has been saved correctly

In [9]:
# Read the file back from disk so the check reflects what was actually written,
# not just the in-memory frame.
saved_data = pd.read_parquet(parquet_file_path)
assert len(saved_data) == len(stock_data), "Saved file has a different number of rows than the source frame."
saved_data.head()

Unnamed: 0,Date,Adj_Close_AAPL,Close_AAPL,High_AAPL,Low_AAPL,Open_AAPL,Volume_AAPL
0,2015-01-02,24.347168,27.3325,27.860001,26.8375,27.8475,212818400
1,2015-01-05,23.661274,26.5625,27.1625,26.352501,27.0725,257142000
2,2015-01-06,23.663498,26.565001,26.8575,26.157499,26.635,263188400
3,2015-01-07,23.995317,26.9375,27.049999,26.674999,26.799999,160423600
4,2015-01-08,24.917269,27.9725,28.0375,27.174999,27.307501,237458000


# Loading and Processing Data

## Loading Data

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Relative path of the Parquet file that is loaded below via load_data().
file_path = "../data/stock_data.parquet"
    
def load_data(parquet_path):
    """Read a Parquet file and return its contents.

    Args:
        parquet_path: Location of the Parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        Whatever ``pd.read_parquet`` returns for that path (a DataFrame).
    """
    frame = pd.read_parquet(parquet_path)
    return frame

# Load the stored price history and preview its first five rows.
df = load_data(parquet_path=file_path)
df.head(n=5)

Unnamed: 0,Date,Adj_Close_AAPL,Close_AAPL,High_AAPL,Low_AAPL,Open_AAPL,Volume_AAPL
0,2015-01-02,24.347168,27.3325,27.860001,26.8375,27.8475,212818400
1,2015-01-05,23.661274,26.5625,27.1625,26.352501,27.0725,257142000
2,2015-01-06,23.663498,26.565001,26.8575,26.157499,26.635,263188400
3,2015-01-07,23.995317,26.9375,27.049999,26.674999,26.799999,160423600
4,2015-01-08,24.917269,27.9725,28.0375,27.174999,27.307501,237458000


## Normalizing Data

In [11]:
from sklearn.preprocessing import MinMaxScaler

target_column = "Adj_Close_AAPL"

# Fail early with a clear message if the expected column is missing
if target_column not in df.columns:
    raise KeyError(f"Column '{target_column}' not found in DataFrame.")

# Select the target series and reshape it to a single-feature column vector
# (n_samples, 1), the 2-D layout scikit-learn scalers operate on
data = df[target_column].values
data = data.reshape(-1, 1)

# Normalize data to range [0, 1]
# NOTE(review): the scaler is fitted on the full series, including rows that
# later land in the test set — consider fitting on the training span only.
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data)

sequence_length = 60

# At least one full window plus one label is needed to build any sample
if len(data_normalized) <= sequence_length:
    raise ValueError(
        f"Need more than {sequence_length} rows to build sequences, got {len(data_normalized)}."
    )

# Build sliding windows from the *normalized* series: each sample is
# `sequence_length` consecutive values and its label is the value that follows.
# (Previously the raw `data` was windowed, leaving `data_normalized` unused.)
sequences = []
labels = []
for i in range(len(data_normalized) - sequence_length):
    sequences.append(data_normalized[i:i + sequence_length])
    labels.append(data_normalized[i + sequence_length])

## Splitting Data

In [12]:
from sklearn.model_selection import train_test_split

# Convert sequences and labels to NumPy arrays
sequences = np.array(sequences)
labels = np.array(labels)

# Split chronologically: the first 80% of windows train the model and the last
# 20% test it. Consecutive windows overlap almost entirely, so a shuffled split
# would place near-duplicates of test windows (and future prices) in the
# training set; shuffle=False keeps the evaluation free of that look-ahead leak.
X_train, X_test, y_train, y_test = train_test_split(
    sequences, labels, test_size=0.2, shuffle=False
)

print(f"Training data shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing data shape: X_test={X_test.shape}, y_test={y_test.shape}")

Training data shape: X_train=(1563, 60, 1), y_train=(1563, 1)
Testing data shape: X_test=(391, 60, 1), y_test=(391, 1)
