In [None]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

def crawl_data(
    ticker: str,
    previous_days: int = 120
) -> pd.DataFrame:
    """Download recent price history for a ticker from yfinance.

    The requested window runs from ``previous_days`` days before the current
    local time up to the current local time.

    Args:
        ticker (str): Ticker symbol of the stock.
        previous_days (int, optional): Number of calendar days to look back
            from now. Defaults to 120.

    Returns:
        pd.DataFrame: The columns yfinance returned, minus "Dividends" and
            "Stock Splits", on a fresh 0..n-1 integer index (the date index
            is discarded). Empty when yfinance returns no rows.
    """
    current = datetime.now()
    previous = current - timedelta(days=previous_days)
    history = yf.Ticker(ticker).history(start=previous, end=current)
    # reset_index(drop=True) discards the date index directly instead of
    # turning it into a "Date" column and dropping it by name, and
    # errors="ignore" tolerates absent columns. Together they keep an empty
    # (or differently indexed) response from raising KeyError.
    return history.reset_index(drop=True).drop(
        columns=["Dividends", "Stock Splits"], errors="ignore"
    )

# Fetch AAPL over the default look-back window.
df = crawl_data("AAPL")
# reset_index() turns the positional index into an "index" column, so it is
# included as a key in the column -> list mapping.
raw_data = df.reset_index().to_dict(orient="list")
raw_data

dict

In [1]:
# Add the parent directory to the Python path
# (presumably where the `utils` package imported below lives — confirm).
import sys

# Guarded so that re-running this cell does not keep appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional, GRU
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from utils.data_helper import * 

2025-05-13 19:01:20.474115: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import time
from datetime import datetime
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [87]:
from datetime import datetime, timedelta

def crawl_data(
    ticker: str,
    previous_days: int = 120
) -> pd.DataFrame:
    """Download recent price history for a ticker from yfinance.

    The requested window runs from ``previous_days`` days before the current
    local time up to the current local time.

    Args:
        ticker (str): Ticker symbol of the stock.
        previous_days (int, optional): Number of calendar days to look back
            from now. Defaults to 120.

    Returns:
        pd.DataFrame: The columns yfinance returned, minus "Dividends" and
            "Stock Splits", on a fresh 0..n-1 integer index (the date index
            is discarded). Empty when yfinance returns no rows.
    """
    current = datetime.now()
    previous = current - timedelta(days=previous_days)
    history = yf.Ticker(ticker).history(start=previous, end=current)
    # reset_index(drop=True) discards the date index directly instead of
    # turning it into a "Date" column and dropping it by name, and
    # errors="ignore" tolerates absent columns. Together they keep an empty
    # (or differently indexed) response from raising KeyError.
    return history.reset_index(drop=True).drop(
        columns=["Dividends", "Stock Splits"], errors="ignore"
    )


In [None]:
# Smoke-test the crawler on LCID with the default look-back window and
# preview the first rows of the result.
test = crawl_data("LCID")
test.head()

Unnamed: 0,Open,High,Low,Close,Volume
0,3.06,3.06,2.8,2.86,92853600
1,2.82,2.84,2.67,2.67,82754500
2,2.66,2.74,2.61,2.72,65568900
3,2.755,2.89,2.73,2.79,80613300
4,2.75,2.97,2.69,2.75,96738500


In [None]:
# Columns selected from the input frame by preprocess_api (data[features]).
features = ["Close"]
# Position of "Close" within `features`.
CLOSE_IDX = features.index("Close")
# Number of consecutive rows in each window passed to create_sequences_api.
SEQ_LEN = 60

# Sliding-window builder used to shape inputs for prediction
def create_sequences_api(data, seq_len=60):
    """Stack overlapping windows of ``seq_len`` consecutive items from ``data``.

    Windows start at positions ``0 .. len(data) - seq_len - 1``.

    NOTE(review): the window that ends on the very last item is therefore not
    produced — confirm this is intended for inference.

    Args:
        data: Sliceable sequence (list or array) supporting ``len``.
        seq_len (int, optional): Items per window. Defaults to 60.

    Returns:
        np.ndarray: The stacked windows; an empty array when ``data`` has
            ``seq_len`` items or fewer.
    """
    window_count = len(data) - seq_len
    windows = [data[start:start + seq_len] for start in range(window_count)]
    return np.array(windows)

def preprocess_api(
    ticker: str,
    data: pd.DataFrame,
    scaler_path: str,
) -> np.ndarray:
    """
    Preprocess the data for prediction

    Selects the ``features`` columns, scales them with the scaler stored at
    ``scaler_path`` and cuts the result into windows of ``SEQ_LEN`` rows.

    Args:
        ticker (str): The ticker of the stock. Not used by the body; kept so
            existing callers keep working.
        data (pd.DataFrame): The data to preprocess. Must contain every
            column named in ``features``.
        scaler_path (str): Path to the scaler file loaded with ``joblib.load``
            and used for normalisation.

    Returns:
        np.ndarray: Array of shape ``(n_windows, SEQ_LEN, len(features))``.
    """
    # Imported here so the function does not depend on `joblib` reaching the
    # notebook namespace through another cell or a star import.
    import joblib

    # NOTE(review): joblib.load unpickles the file, so scaler_path must point
    # to a trusted artifact.
    scaler = joblib.load(scaler_path)
    values = data[features].values
    normalised = scaler.transform(values)
    windows = create_sequences_api(normalised, SEQ_LEN)
    # The reshape also gives the "no windows" result the same 3-D layout as a
    # non-empty one.
    return windows.reshape((windows.shape[0], SEQ_LEN, len(features)))
    

In [None]:
def predict_price(
    model_path: str,
    scaler_path: str,
    data: np.ndarray,
) -> np.ndarray:
    """
    Predict the price of the stock

    Loads the model and the scaler from disk, runs the model on ``data`` and
    maps the output back through ``scaler.inverse_transform``.

    Args:
        model_path (str): The path to the model
        scaler_path (str): The path to the scaler
        data (np.ndarray): The data to predict, passed to ``model.predict``
            unchanged. Presumably the scaled windows produced by
            preprocess_api — confirm against the caller.

    Returns:
        np.ndarray: The predicted price (the model output after
            ``inverse_transform``)
    """
    # Imported here so the function does not depend on `joblib` / `keras`
    # reaching the notebook namespace through another cell or a star import;
    # the visible import cells only bind `tf` and individual
    # tensorflow.keras members.
    import joblib
    from tensorflow import keras

    # NOTE(review): joblib.load unpickles the file, so scaler_path must point
    # to a trusted artifact.
    scaler = joblib.load(scaler_path)
    model = keras.models.load_model(model_path)
    scaled_prediction = model.predict(data)
    return scaler.inverse_transform(scaled_prediction)


# Preprocessing

In [3]:
# # Preprocessing and save the processed file
# for data_path in os.listdir("../data/raw"):
#     df = pd.read_csv(os.path.join("../data/raw", data_path))
#     df = preprocess(df, "Close")
#     df.to_csv(os.path.join("../data/processed", data_path), index=False)

# Train Models

In [4]:
# Set random seed for reproducibility.
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call;
# tf.random.set_seed alone only covers TensorFlow.
tf.keras.utils.set_random_seed(42)

In [24]:
# Split data into training and validation
# NOTE(review): split_data is not defined in this notebook; presumably it is
# provided by the `from utils.data_helper import *` star import — confirm.
# `df` is rebound here; an earlier cell used the same name for the AAPL frame.
df = pd.read_csv("../data/processed/NVDA.csv")
train_data, val_data = split_data(df)

In [25]:
# Format data into time series
# NOTE(review): timedata_format is not defined in this notebook; presumably it
# is provided by the `from utils.data_helper import *` star import and returns
# an (inputs, targets) pair — confirm.
trainX, trainY = timedata_format(train_data)
valX, valY = timedata_format(val_data)
# Per-sample input shape (everything after the leading sample axis); reused by
# the model cells below.
input_shape = trainX.shape[1:]  # Should be (time_steps, features)

## LSTM

In [26]:
# LSTM model: three stacked LSTM layers with dropout, followed by a dense
# head that ends in a single output unit.
lstm = Sequential([
    # Declare the input with an explicit Input object. Keras recommends this
    # over passing `input_shape` to the first layer, which emits a warning.
    tf.keras.Input(shape=input_shape),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=True),
    # Last recurrent layer returns only its final state for the dense head.
    LSTM(128, return_sequences=False),
    Dropout(0.2),
    Dense(128),
    Dense(1),
])
lstm.compile(optimizer='adam',
             loss='mean_squared_error')

  super().__init__(**kwargs)


In [27]:
# Show the layer-by-layer summary of the LSTM model built above.
lstm.summary()

In [28]:
# Train the LSTM, evaluating on the validation split after every epoch.
lstm.fit(
    x=trainX,
    y=trainY,
    epochs=50,
    batch_size=32,
    validation_data=(valX, valY),
)

Epoch 1/50


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - loss: 1123.5239 - val_loss: 14434.7578
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 779.0369 - val_loss: 1712.7471
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 144.7214 - val_loss: 562.3677
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 38.7067 - val_loss: 64.2546
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 22.5376 - val_loss: 75.7961
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 18.2968 - val_loss: 83.3312
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 13.2483 - val_loss: 102.0648
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 10.0710 - val_loss: 154.7838
Epoch 9/50
[1m32/32[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x1444f0800>

## GRU

In [None]:
# GRU model: three stacked GRU layers with dropout, followed by a single
# output unit.
gru = Sequential([
    # Declare the input with an explicit Input object. Keras recommends this
    # over passing `input_shape` to the first layer, which emits a warning.
    tf.keras.Input(shape=input_shape),
    GRU(64, return_sequences=True),
    Dropout(0.2),
    GRU(64, return_sequences=True),
    # Last recurrent layer returns only its final state for the output layer.
    GRU(128, return_sequences=False),
    Dropout(0.2),
    Dense(1),
])
gru.compile(optimizer='adam',
             loss='mean_squared_error')

In [33]:
# Show the layer-by-layer summary of the GRU model built above.
gru.summary()

In [34]:
# Train the GRU, evaluating on the validation split after every epoch.
gru.fit(
    x=trainX,
    y=trainY,
    epochs=50,
    batch_size=32,
    validation_data=(valX, valY),
)

Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - loss: 1123.2024 - val_loss: 14263.3389
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 834.6542 - val_loss: 10394.8662
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 392.1847 - val_loss: 9450.4375
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 357.6220 - val_loss: 9094.3223
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 324.9839 - val_loss: 8595.3193
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 268.6857 - val_loss: 7914.6396
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 206.9176 - val_loss: 7274.7964
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 161.1615 - val_loss: 6741.7231
Epoch

<keras.src.callbacks.history.History at 0x1461a2d50>

## BiLSTM