# Initialize the Python Notebook

Install the necessary Python packages found in requirements.txt and import the necessary Python libraries.

In [None]:
%%capture
!pip3 install -r requirements.txt

In [None]:
import datetime as dt
import json
import math
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import shutil
import uuid
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.types import (
    DateType,
    DoubleType,
    IntegerType,
    LongType,
    StructType,
    StructField,
    StringType,
    TimestampType,
)
from numpy import newaxis
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

Initialize the Python notebook environment.

In [None]:
# Download the NLTK "punkt_tab" resource: a sentence tokenizer used to split
# text into sentences.
nltk.download("punkt_tab")

# Initialize a Spark session with explicit executor/driver memory, default
# parallelism and the LEGACY time-parser policy.
# Minimal alternative without any tuning:
# spark = SparkSession.builder.appName("Stock Forecasting").getOrCreate()
spark = (
    SparkSession.builder.appName("Stock Forecasting")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.default.parallelism", "8")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

Define global constants.

In [None]:
# Stock symbol
# Only STOCK_SYMBOL_UPPER has to be edited when switching stocks; the lowercase
# form is derived from it so the two spellings can never drift apart.
STOCK_SYMBOL_UPPER = "AA"  # AA, AAPL, AMZN, MSFT, TSLA, JPM, LLY
STOCK_SYMBOL_LOWER = STOCK_SYMBOL_UPPER.lower()  # aa, aapl, amzn, msft, tsla, jpm, lly

# File names
FILE_NAME_EXTERNAL_NEWS = f"external_{STOCK_SYMBOL_LOWER}.csv"
FILE_NAME_NASDAQ_NEWS = f"nasdaq_{STOCK_SYMBOL_LOWER}.csv"
FILE_NAME_NEWS = f"news_{STOCK_SYMBOL_LOWER}.csv"
FILE_NAME_PRICE = f"price_{STOCK_SYMBOL_LOWER}.csv"
FILE_NAME_COMBINED = f"{STOCK_SYMBOL_LOWER}.csv"

# File paths
# Every per-stock path is composed from the FILE_NAME_* constants above so a
# file name is spelled in exactly one place.
FILE_PATH_ALL_EXTERNAL_NEWS = "stock_news/external/all_external.csv"
FILE_PATH_EXTERNAL_NEWS = f"stock_news/external/{FILE_NAME_EXTERNAL_NEWS}"
FILE_PATH_ALL_NASDAQ_NEWS = "stock_news/nasdaq/nasdaq_external_data.csv"
FILE_PATH_NASDAQ_NEWS = f"stock_news/nasdaq/{FILE_NAME_NASDAQ_NEWS}"
FILE_PATH_MERGED_NEWS = f"stock_news/merged/{FILE_NAME_NEWS}"
FILE_PATH_SUMMARIZED_NEWS = f"stock_news/summarized/{FILE_NAME_NEWS}"
FILE_PATH_SCORED_NEWS = f"stock_news/scored/{FILE_NAME_NEWS}"
FILE_PATH_DECAYED_NEWS = f"stock_news/decayed/{FILE_NAME_NEWS}"
FILE_PATH_SENTIMENT_SCORES = f"stock_news/scored/sentiments_{STOCK_SYMBOL_LOWER}.csv"
FILE_PATH_FULL_HISTORY_PRICE = f"stock_price/full_history/{STOCK_SYMBOL_UPPER}.csv"
FILE_PATH_PREPROCESSED_PRICE = f"stock_price/preprocessed/{FILE_NAME_PRICE}"
FILE_PATH_COMBINED = f"stock_combined/{FILE_NAME_COMBINED}"

# Used by the Tokenizer
LANGUAGE = "english"  # Set the language

# Used by the LSA Summarizer
SENTENCES_COUNT = 3  # Set the max number of sentences in a summary

# Used by the LLM
# MODEL = "llama3.2:1b-instruct-fp16"
MODEL = "llama3.2:3b-instruct-fp16"  # Set the large-language model
BATCH_SIZE = 5  # Set the max number of sentences in a batch to be fed to the LLM
TEMPERATURE = 0.0  # Set the temperature to 0.0 for deterministic output
MAX_OUTPUT_TOKENS = 14  # Set the maximum number of output tokens

# Used by LLM for sentiment scoring
MIN_VALUE = 1  # The minimum value of the sentiment score
BASE_VALUE = 3  # The midpoint between 1 and 5 of the sentiment score
MAX_VALUE = 5  # The maximum value of the sentiment score

# Used by exponential decay algorithm
DECAY_RATE = 0.5  # Determines how quickly the sentiment decays over time
# Define the schemas
# The news schemas share a run of eight columns, and several of them only
# differ by extra nullable string/integer columns, so they are assembled from
# small builders instead of being spelled out field by field.
def _nullable_fields(data_type, *names):
    """Return one nullable StructField of ``data_type`` per name, in order."""
    return [StructField(name, data_type(), True) for name in names]


def _news_core_fields():
    """Return the columns common to every raw/merged news file, in file order."""
    return _nullable_fields(TimestampType, "Date") + _nullable_fields(
        StringType,
        "Article_title",
        "Stock_symbol",
        "Url",
        "Publisher",
        "Author",
        "Article",
        "Lsa_summary",
    )


def _extra_summary_fields():
    """Return the three alternative-summary columns present in the raw files."""
    return _nullable_fields(
        StringType, "Luhn_summary", "Textrank_summary", "Lexrank_summary"
    )


def _merged_news_fields():
    """Return the core columns plus the bookkeeping columns added when merging."""
    return (
        _news_core_fields()
        + _nullable_fields(IntegerType, "Summarized", "Sentiment_score")
        + _nullable_fields(StringType, "UUID")
    )


EXTERNAL_NEWS_SCHEMA = StructType(_news_core_fields() + _extra_summary_fields())

# Same as the external schema, preceded by the unnamed index column.
NASDAQ_NEWS_SCHEMA = StructType(
    _nullable_fields(StringType, "Unnamed: 0")
    + _news_core_fields()
    + _extra_summary_fields()
)

MERGED_NEWS_SCHEMA = StructType(_merged_news_fields())

# Merged schema plus the concatenated title/article text.
SUMMARIZED_NEWS_SCHEMA = StructType(
    _merged_news_fields() + _nullable_fields(StringType, "Text")
)

SCORED_NEWS_SCHEMA = StructType(
    _nullable_fields(StringType, "UUID")
    + _nullable_fields(TimestampType, "Date")
    + _nullable_fields(IntegerType, "Sentiment_score")
    + _nullable_fields(StringType, "Lsa_summary")
)

DECAYED_SENTIMENT_SCHEMA = StructType(
    _nullable_fields(DateType, "Date")
    + _nullable_fields(DoubleType, "Sentiment_avg", "Last_valid_sentiment")
    + _nullable_fields(DateType, "Last_valid_date")
    + _nullable_fields(IntegerType, "Days_since_last_valid")
    + _nullable_fields(DoubleType, "Decayed_sentiment")
)

PREPROCESSED_PRICE_SCHEMA = StructType(
    _nullable_fields(DateType, "Date")
    + _nullable_fields(DoubleType, "Open", "High", "Low", "Close", "Adj_close")
    + _nullable_fields(LongType, "Volume")
)

# Preprocess the Price Dataset

Ingest the price dataset.

In [None]:
# Read the full-history price CSV of the selected stock into a Spark dataframe.
# inferSchema=True lets Spark choose the column types, so the type of the date
# column is not fixed by this cell.
df_price = spark.read.csv(FILE_PATH_FULL_HISTORY_PRICE, header=True, inferSchema=True)

# Verify: row count, inferred schema and a five-row preview
print(f"Row count for df_price: {df_price.count()}")
df_price.printSchema()
df_price.show(5, truncate=True)

Preprocess the price dataset.

In [None]:
# Raw CSV header -> column name used by the rest of the notebook.
# The insertion order of this mapping is also the final column order.
_price_header_map = {
    "date": "Date",
    "open": "Open",
    "high": "High",
    "low": "Low",
    "close": "Close",
    "adj close": "Adj_close",
    "volume": "Volume",
}

# Rename the headers one at a time
df_price_preprocessed = df_price
for _raw_header, _clean_header in _price_header_map.items():
    df_price_preprocessed = df_price_preprocessed.withColumnRenamed(
        _raw_header, _clean_header
    )

# Keep only the renamed columns, in the mapping's order
df_price_preprocessed = df_price_preprocessed.select(*_price_header_map.values())


# Define a UDF to parse dates
def parse_date(date_string):
    """Convert a raw date value to a ``datetime.date``.

    Parameters:
    date_string: The value of the date column. Usually a string in either
        ``MM/DD/YY`` or ``YYYY-MM-DD`` form. ``None`` and values that are
        already ``datetime.date``/``datetime.datetime`` objects are accepted
        too, because the price CSV is read with ``inferSchema=True`` and the
        column therefore does not have to arrive as a string.

    Returns:
    datetime.date or None: The parsed date, or None when the value is missing
    or matches none of the supported formats.
    """
    if date_string is None:
        return None

    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(date_string, dt.datetime):
        return date_string.date()
    if isinstance(date_string, dt.date):
        return date_string

    text = str(date_string).strip()
    for date_format in ("%m/%d/%y", "%Y-%m-%d"):
        try:
            return dt.datetime.strptime(text, date_format).date()
        except ValueError:
            continue
    return None  # Return None if no format matches


# Wrap parse_date as a Spark UDF that produces DateType values
parse_date_udf = F.udf(parse_date, DateType())

# Standardize the date format, discard rows whose date could not be parsed
# (the UDF yields null for them) and sort chronologically
_parsed_date = parse_date_udf(F.col("Date"))
df_price_preprocessed = df_price_preprocessed.withColumn("Date", _parsed_date)
df_price_preprocessed = df_price_preprocessed.filter(F.col("Date").isNotNull())
df_price_preprocessed = df_price_preprocessed.orderBy("Date")

# Cast Date to DateType, round every price column to eight decimal places,
# and cast Volume to LongType
df_price_preprocessed = df_price_preprocessed.withColumn(
    "Date", F.col("Date").cast(DateType())
)
for _price_column in ("Open", "High", "Low", "Close", "Adj_close"):
    df_price_preprocessed = df_price_preprocessed.withColumn(
        _price_column, F.round(_price_column, 8)
    )
df_price_preprocessed = df_price_preprocessed.withColumn(
    "Volume", F.col("Volume").cast(LongType())
)

# Verify
print(f"Row count for df_price: {df_price_preprocessed.count()}")
df_price_preprocessed.show(5, truncate=True)

Save a copy of the dataset.

In [None]:
# Specify directories
# FOLDER_PRICE is the final destination; TEMP_FOLDER_PRICE is the directory
# Spark writes into before the part file is renamed.
FOLDER_PRICE = "stock_price/preprocessed"
TEMP_FOLDER_PRICE = f"stock_price/preprocessed/price_{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
# coalesce(1) reduces the dataframe to one partition, so a single part file is
# expected inside TEMP_FOLDER_PRICE.
df_price_preprocessed.coalesce(1).write.csv(
    TEMP_FOLDER_PRICE, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
# Only entries whose name starts with "part-" are moved; everything else in the
# temporary directory is discarded with it below.
for filename in os.listdir(TEMP_FOLDER_PRICE):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_PRICE, filename),
            os.path.join(FOLDER_PRICE, FILE_NAME_PRICE),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_PRICE)

# Names created by the price-preprocessing cells that are no longer needed
local_variables = [
    "df_price",
    "df_price_preprocessed",
    "parse_date_udf",
]

# Delete local variables if they exist
# NOTE(review): this relies on locals() being the notebook's top-level
# namespace when the cell runs at module level.
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Extract News Dataset A of a Stock

Read in the all-external news CSV dataset.

In [None]:
# Read the all-stocks external news CSV into a Spark dataframe, applying
# EXTERNAL_NEWS_SCHEMA instead of inferring the column types
df_all_news_external = spark.read.csv(
    FILE_PATH_ALL_EXTERNAL_NEWS, header=True, schema=EXTERNAL_NEWS_SCHEMA
)

# Verify: row count, schema and a five-row preview
print(f"Row count for df_all_news_external: {df_all_news_external.count()}")
df_all_news_external.printSchema()
df_all_news_external.show(5, truncate=True)

Extract only the stock of interest.

In [None]:
# Extract only the stock symbol of interest from the external news dataset.
# The comparison is an exact match against STOCK_SYMBOL_UPPER; the result is
# sorted by Date in ascending order.
df_news_external_filtered = df_all_news_external.filter(
    F.col("Stock_symbol") == STOCK_SYMBOL_UPPER
).orderBy("Date")

# Verify
print(f"Row count for df_news_external_filtered: {df_news_external_filtered.count()}")
df_news_external_filtered.show(5, truncate=True)


Save a copy of the external news dataset.

In [None]:
# Specify directories
# FOLDER_EXTERNAL is the final destination; TEMP_FOLDER_EXTERNAL is the
# directory Spark writes into before the part file is renamed.
FOLDER_EXTERNAL = "stock_news/external"
TEMP_FOLDER_EXTERNAL = f"stock_news/external/external_{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
# coalesce(1) reduces the dataframe to one partition, so a single part file is
# expected inside TEMP_FOLDER_EXTERNAL.
df_news_external_filtered.coalesce(1).write.csv(
    TEMP_FOLDER_EXTERNAL, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
for filename in os.listdir(TEMP_FOLDER_EXTERNAL):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_EXTERNAL, filename),
            os.path.join(FOLDER_EXTERNAL, FILE_NAME_EXTERNAL_NEWS),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_EXTERNAL)

# Names created by the external-news extraction cells that are no longer needed
local_variables = [
    "df_all_news_external",
    "df_news_external_filtered",
]

# Delete local variables if they exist
# NOTE(review): this relies on locals() being the notebook's top-level
# namespace when the cell runs at module level.
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Extract News Dataset B of a Stock

Read in the all-Nasdaq news CSV dataset.

In [None]:
# Read the all-stocks Nasdaq news CSV into a Spark dataframe, applying
# NASDAQ_NEWS_SCHEMA instead of inferring the column types
df_all_news_nasdaq = spark.read.csv(
    FILE_PATH_ALL_NASDAQ_NEWS, header=True, schema=NASDAQ_NEWS_SCHEMA
)

# Verify: row count, schema and a five-row preview
print(f"Row count for df_all_news_nasdaq: {df_all_news_nasdaq.count()}")
df_all_news_nasdaq.printSchema()
df_all_news_nasdaq.show(5, truncate=True)

Extract only the stock of interest.

In [None]:
# Extract only the stock symbol of interest from the Nasdaq news dataset.
# The comparison is an exact match against STOCK_SYMBOL_UPPER; the result is
# sorted by Date in ascending order.
df_news_nasdaq_filtered = df_all_news_nasdaq.filter(
    F.col("Stock_symbol") == STOCK_SYMBOL_UPPER
).orderBy("Date")

# Verify
print(f"Row count for df_news_nasdaq_filtered: {df_news_nasdaq_filtered.count()}")
df_news_nasdaq_filtered.show(5, truncate=True)

Save a copy of the Nasdaq news dataset.

In [None]:
# Specify directories
# FOLDER_NASDAQ is the final destination; TEMP_FOLDER_NASDAQ is the directory
# Spark writes into before the part file is renamed.
FOLDER_NASDAQ = "stock_news/nasdaq"
TEMP_FOLDER_NASDAQ = f"stock_news/nasdaq/nasdaq_{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
# coalesce(1) reduces the dataframe to one partition, so a single part file is
# expected inside TEMP_FOLDER_NASDAQ.
df_news_nasdaq_filtered.coalesce(1).write.csv(
    TEMP_FOLDER_NASDAQ, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
for filename in os.listdir(TEMP_FOLDER_NASDAQ):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_NASDAQ, filename),
            os.path.join(FOLDER_NASDAQ, FILE_NAME_NASDAQ_NEWS),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_NASDAQ)

# Names created by the Nasdaq-news extraction cells that are no longer needed.
# These are the Nasdaq dataframes defined in this section (the list previously
# named the external-news dataframes, so the Nasdaq ones were never released).
local_variables = [
    "df_all_news_nasdaq",
    "df_news_nasdaq_filtered",
]

# Delete local variables if they exist
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Preprocess the News Datasets

Read in the external news and Nasdaq news datasets from CSV files to Spark dataframes.

In [None]:
# Read the per-stock external and Nasdaq news CSV files into Spark dataframes,
# each with its own explicit schema
df_external_news = spark.read.csv(
    FILE_PATH_EXTERNAL_NEWS, header=True, schema=EXTERNAL_NEWS_SCHEMA
)
df_nasdaq_news = spark.read.csv(
    FILE_PATH_NASDAQ_NEWS, header=True, schema=NASDAQ_NEWS_SCHEMA
)

# Store in memory
# Both frames are cached because they are counted, previewed and then merged.
df_external_news.persist()
df_nasdaq_news.persist()

# Verify the dataframes
print(f"Row count for df_external_news: {df_external_news.count()}")
df_external_news.show(5, truncate=True)

print(f"Row count for df_nasdaq_news: {df_nasdaq_news.count()}")
df_nasdaq_news.show(5, truncate=True)

Drop unused fields and merge the external and Nasdaq news datasets into one dataframe.

In [None]:
# Drop unused fields
# After these drops both frames expose the same set of columns, which is what
# unionByName below requires. "Unnamed: 0" only exists in the Nasdaq file.
df_external_news = df_external_news.drop(
    "Luhn_summary", "Textrank_summary", "Lexrank_summary"
)
df_nasdaq_news = df_nasdaq_news.drop(
    "Unnamed: 0", "Luhn_summary", "Textrank_summary", "Lexrank_summary"
)

# Merge two dataframes
# Columns are matched by name, not by position; Nasdaq rows come first.
df_news = df_nasdaq_news.unionByName(df_external_news)

# Store in memory
df_news.persist()

# Verify
# Expect count is 7419, where 2945 + 4474 = 7419
# NOTE(review): these figures are hard-coded, presumably for one particular
# stock symbol; the expected total is the sum of the two input row counts.
print(f"Row count for df_news: {df_news.count()}")
df_news.show(5)

Preprocess the merged news dataset.

In [None]:
# Standardize the timestamps to UTC timezone. Example: Convert "2019-01-15 00:00:00 UTC" to "2019-01-15 00:00:00".
# Rows whose Date is null after the conversion are discarded.
df_news = df_news.withColumn(
    "Date", F.to_utc_timestamp(F.to_timestamp("Date", "yyyy-MM-dd HH:mm:ss zzz"), "UTC")
).filter(F.col("Date").isNotNull())

# Add a "Summarized" field with all values set to 0.
# Add a "Sentiment_score" field with all values set to 0.
# Sort by Date field in descending order.
df_news = (
    df_news.withColumn("Summarized", F.lit(0))
    .withColumn("Sentiment_score", F.lit(0))
    .orderBy("Date", ascending=False)
)

# Add a unique identifier field
# uuid4() returns a different value on every call, so the UDF is declared
# non-deterministic; otherwise Spark is free to treat it like a pure function
# when optimizing the plan.
# NOTE(review): this dataframe is not cached after the UUID column is added, so
# the identifiers previewed below may differ from the ones written to disk later.
uuid_udf = F.udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()
df_news = df_news.withColumn("UUID", uuid_udf())

# Verify
print(f"Row count for df_news: {df_news.count()}")
df_news.show(5, truncate=True)

Save a copy of the preprocessed merged news dataset.

In [None]:
# Specify directories
# FOLDER_MERGED_NEWS is the final destination; TEMP_FOLDER_MERGED_NEWS is the
# directory Spark writes into before the part file is renamed.
FOLDER_MERGED_NEWS = "stock_news/merged"
TEMP_FOLDER_MERGED_NEWS = f"stock_news/merged/news_{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
# coalesce(1) reduces the dataframe to one partition, so a single part file is
# expected inside TEMP_FOLDER_MERGED_NEWS.
df_news.coalesce(1).write.csv(
    TEMP_FOLDER_MERGED_NEWS, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
for filename in os.listdir(TEMP_FOLDER_MERGED_NEWS):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_MERGED_NEWS, filename),
            os.path.join(FOLDER_MERGED_NEWS, FILE_NAME_NEWS),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_MERGED_NEWS)

# Uncache the dataframe in a non-blocking operation to free up memory.
# NOTE(review): df_news was re-assigned to derived dataframes after persist()
# was called on it, so confirm this call releases the cached data.
df_external_news.unpersist(blocking=False)
df_nasdaq_news.unpersist(blocking=False)
df_news.unpersist(blocking=False)

# Names created by the news-preprocessing cells that are no longer needed
local_variables = [
    "df_external_news",
    "df_nasdaq_news",
    "df_news",
    "uuid_udf",
]

# Delete local variables if they exist
# NOTE(review): this relies on locals() being the notebook's top-level
# namespace when the cell runs at module level.
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Summarize the News Dataset

Retrieve the merged news dataset.

In [None]:
# Read the merged news CSV written by the preprocessing section into a Spark
# dataframe, applying MERGED_NEWS_SCHEMA
df_merged_news = spark.read.csv(
    FILE_PATH_MERGED_NEWS, header=True, schema=MERGED_NEWS_SCHEMA
)

# Store in memory
df_merged_news.persist()

# Verify
print(f"Row count for df_merged_news: {df_merged_news.count()}")
df_merged_news.show(5, truncate=True)

Summarize the news texts using an LSA summarizer.

In [None]:
def summarize_article(text):
    """
    Description:
    Takes the article text as input, parses it using PlaintextParser, and summarizes it using LsaSummarizer.

    Parameters:
    text (string): The news article. None or a blank string is accepted.

    Returns:
    string: The summarized text, i.e. at most SENTENCES_COUNT sentences joined
    by single spaces. An empty string is returned when there is no text to
    summarize.
    """
    # Nothing to parse: skip the tokenizer/summarizer entirely.
    if text is None or not text.strip():
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))

    # Stemming reduces words to their root form for the summarizer to identify similar concepts expressed with
    # different word forms.
    stemmer = Stemmer(LANGUAGE)

    # Initializes the summarizer with the stemmer
    summarizer = LsaSummarizer(stemmer)

    # Removes stop word to eliminate common non-keyword words.
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Generates the summarized text
    summary = summarizer(parser.document, SENTENCES_COUNT)
    return " ".join(str(sentence) for sentence in summary)


# Spark UDF wrapper around summarize_article; it yields a string column.
summarize_udf = F.udf(summarize_article, StringType())

# Build the summarizer input by joining title and article with ". ",
# then overwrite Lsa_summary with the freshly generated summary.
_article_text = F.concat_ws(". ", "Article_title", "Article")
df_merged_news = df_merged_news.withColumn("Text", _article_text)
df_merged_news = df_merged_news.withColumn("Lsa_summary", summarize_udf("Text"))

# Flag rows that ended up with a non-null, non-empty summary (1) or not (0)
_has_summary = F.col("Lsa_summary").isNotNull() & (F.col("Lsa_summary") != "")
df_merged_news = df_merged_news.withColumn(
    "Summarized", F.when(_has_summary, F.lit(1)).otherwise(0)
)

# Optional check: number of news items with no summary
# print(
#     f"Number of news with no summary: {df_merged_news.filter(F.col('Summarized') == 0).count()}"
# )

# Keep only the rows that have a summary
df_merged_news = df_merged_news.filter(F.col("Summarized") == 1)

Verify the results of the summarization.

In [None]:
# Verify: number of rows that kept a summary
print(f"Row count for df_merged_news: {df_merged_news.count()}")

# Preview the first ten rows without truncation so the source text and its
# summary can be compared side by side
df_merged_news.select(
    "Date",
    "Summarized",
    "Text",
    "Lsa_summary",
).show(10, truncate=False)

Save a copy of the summarized news dataset.

In [None]:
# Specify the directories
# FOLDER_SUMMARIZED_NEWS is the final destination; TEMP_FOLDER_SUMMARIZED_NEWS
# is the directory Spark writes into before the part file is renamed.
# The temporary path is an f-string so it contains the actual stock symbol
# (without the prefix the folder was literally named "news_{STOCK_SYMBOL_LOWER}").
FOLDER_SUMMARIZED_NEWS = "stock_news/summarized"
TEMP_FOLDER_SUMMARIZED_NEWS = f"stock_news/summarized/news_{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
# coalesce(1) reduces the dataframe to one partition, so a single part file is
# expected inside TEMP_FOLDER_SUMMARIZED_NEWS.
df_merged_news.coalesce(1).write.csv(
    TEMP_FOLDER_SUMMARIZED_NEWS, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
for filename in os.listdir(TEMP_FOLDER_SUMMARIZED_NEWS):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_SUMMARIZED_NEWS, filename),
            os.path.join(FOLDER_SUMMARIZED_NEWS, FILE_NAME_NEWS),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_SUMMARIZED_NEWS)

# Uncache the dataframe in a non-blocking operation to free up memory.
df_merged_news.unpersist(blocking=False)

# Names created by the summarization cells that are no longer needed
local_variables = [
    "df_merged_news",
    "summarize_udf",
]

# Delete local variables if they exist
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Calculate the Sentiment Score of the News

Retrieve the summarized news dataset.

In [None]:
# Read the summarized news CSV written by the previous section into a Spark
# dataframe, applying SUMMARIZED_NEWS_SCHEMA
df_summarized_news = spark.read.csv(
    FILE_PATH_SUMMARIZED_NEWS, header=True, schema=SUMMARIZED_NEWS_SCHEMA
)

# Store in memory
df_summarized_news.persist()

# Verify
print(f"Row count for df_summarized_news: {df_summarized_news.count()}")
df_summarized_news.show(5, truncate=True)

Retrieve the summaries and initialize the large-language model.

In [None]:
# Bring the UUID, Date and Lsa_summary of every news item to the driver.
# Rows are collected whole (rather than one collect_list per column) so the
# three lists below are guaranteed to stay aligned: a row with a null in any of
# the three columns is dropped as a unit instead of shortening just one list.
_scoring_rows = (
    df_summarized_news.select("UUID", "Date", "Lsa_summary")
    .na.drop(how="any", subset=["UUID", "Date", "Lsa_summary"])
    .collect()
)
collection = (
    [row["UUID"] for row in _scoring_rows],
    [row["Date"] for row in _scoring_rows],
    [row["Lsa_summary"] for row in _scoring_rows],
)
uuids = collection[0]
dates = collection[1]
lsa_summaries = collection[2]

# Verify
print(f"Number of uuids: {len(uuids)}")
print(f"Number of dates: {len(dates)}")
print(f"Number of lsa_summaries: {len(lsa_summaries)}")

# Initializing the OllamaLLM
try:
    llm = ChatOllama(
        model=MODEL, temperature=TEMPERATURE, num_predict=MAX_OUTPUT_TOKENS
    )
    print("ChatOllama instance created successfully!")
except Exception as e:
    print("Error creating ChatOllama instance:", e)
    # Re-raise: without an llm object the scoring cell cannot run, and failing
    # here is clearer than a NameError later on.
    raise

Feed the summaries to the model in batches and capture the resulting sentiment scores.

In [None]:
# Holds the resulting output, i.e. the sentiment scores
sentiments = []

# The prompt is built once. {num_text} and {batch_text} are template variables
# filled in per batch by format_messages(); passing the news text as a variable
# (instead of embedding it in the template) keeps any "{" or "}" inside a
# summary from being interpreted as a placeholder.
chat_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Forget all previous instructions. You are a financial expert with stock recommendation experience. Based on news for a specific stock, provide a sentiment score in the range of 1 to 5 inclusive, where 1 is negative, 2 is somewhat negative, 3 is neutral, 4 is somewhat positive, and 5 is positive. {num_text} summerized news will be passed in each time. You will provide {num_text} scores, one score for each of the summerized news in the format as shown below in the response from the assistant.",
        ),
        (
            "user",
            "### AAPL Stock News: Apple (AAPL) increased 22%. ### AAPL Stock News: Apple (AAPL) price decreased 30%. ### MSFT Stock News: Microsoft (MSTF) price has not changed. ### AAPL Stock News: Apple (AAPL) announced the new iPhone 15. ### AAPL Stock News: Apple (AAPL) will release the Vison Pro on Feb 2, 2024.",
        ),
        ("ai", "5, 1, 3, 4, 4"),
        ("user", "{batch_text}"),
    ]
)

# Iterate in batches
for i in range(0, len(lsa_summaries), BATCH_SIZE):
    batch_summaries = lsa_summaries[i : i + BATCH_SIZE]
    batch_uuids = uuids[i : i + BATCH_SIZE]

    num_text = len(batch_summaries)
    # print(f"#{i}, num_text: {num_text}")

    batch_text = " ".join(
        [
            f"### {STOCK_SYMBOL_UPPER} Stock News: {summary} "
            for summary in batch_summaries
        ]
    )
    # print(f"#{i}, batch_text: {batch_text}")

    messages = chat_template.format_messages(num_text=num_text, batch_text=batch_text)
    # print(f"#{i}, messages: {messages}")

    response = llm.invoke(messages)
    print(f"#{i}: {response.content}")

    # Parse the comma-separated reply into this batch's scores
    batch_sentiments = []
    for sentiment in response.content.split(","):
        stripped_sentiment = sentiment.strip()
        try:
            if stripped_sentiment:  # Check if the stripped sentiment is not empty
                # Round the sentiment and clamp it between 1 and 5
                rounded_sentiment = max(
                    MIN_VALUE, min(MAX_VALUE, round(float(stripped_sentiment)))
                )
                batch_sentiments.append(rounded_sentiment)
        except (ValueError, OverflowError):
            # ValueError: not a number (or NaN); OverflowError: "inf"
            print(f"Invalid sentiment value: {stripped_sentiment}")
        # print(f"#{i}, sentiment: {sentiment}")

    sentiments.extend(batch_sentiments)

    # Point out the batch that has to be re-run when the model did not return
    # exactly one score per summary.
    if len(batch_sentiments) != num_text:
        print(
            f"WARNING: batch #{i} produced {len(batch_sentiments)} scores for "
            f"{num_text} summaries (UUIDs: {batch_uuids})"
        )

In [None]:
# Save the sentiments to a file, one score per line, in list order.
# The file is opened in "w" mode, so an existing file is overwritten.
with open(FILE_PATH_SENTIMENT_SCORES, "w") as f:
    f.writelines(f"{sentiment}\n" for sentiment in sentiments)

In [None]:
# Verify
# The number of sentiments has to equal the number of summaries; the cell that
# saves the scored news raises a ValueError otherwise.
print(f"Number of lsa_summaries: {len(lsa_summaries)}")
print(f"Number of sentiments: {len(sentiments)}")
print(f"Row count for df_summarized_news: {df_summarized_news.count()}")

In [None]:
# TO BE EXECUTED ONLY IF THE NUMBER OF SENTIMENTS DOES NOT MATCH THE NUMBER OF SUMMARIES
# Step 1: Re-run the sentiment score for the affected batch
# Step 2: Update the missing scores to the sentiments.csv file
# Step 3: Verify the number of sentiments in the sentiments.csv file matches the number of summaries
# Step 4: Execute code below to copy out the sentiments from sentiments.csv file to the sentiments list

# sentiments = []
# with open(FILE_PATH_SENTIMENT_SCORES, "r") as f:
#     sentiments = [int(sentiment.strip()) for sentiment in f if sentiment.strip()]
# print(f"Number of sentiments: {len(sentiments)}")

Visualize the sentiment distribution.

In [None]:
# Create a dictionary to store the count of each sentiment score.
# The score range comes from MIN_VALUE/MAX_VALUE so the chart follows the
# scoring constants instead of a hard-coded 1..5.
sentiment_counts = {
    score: sentiments.count(score) for score in range(MIN_VALUE, MAX_VALUE + 1)
}

# Calculate the total count
total_count = sum(sentiment_counts.values())

# Create the bar chart
plt.bar(list(sentiment_counts.keys()), list(sentiment_counts.values()))
plt.xlabel("Sentiment Score")
plt.ylabel("Count")
plt.title("Distribution of the Sentiment Scores")

# Add percentage labels above each bar.
# The label is placed at the bar's own score on the x axis; an empty sentiment
# list yields 0.0% labels instead of a division by zero.
for score, count in sentiment_counts.items():
    percentage = count / total_count * 100 if total_count else 0.0
    plt.text(score, count, f"{percentage:.1f}%", ha="center", va="bottom")

plt.show()

Save a copy of the sentiment scored news dataset.

In [None]:
# Refuse to build the dataset when scores and summaries cannot be paired 1:1
if len(sentiments) != len(lsa_summaries):
    raise ValueError("The number of sentiments does not match the number of summaries.")

# Combine lists into a list of tuples, in SCORED_NEWS_SCHEMA column order
scored_news = list(zip(uuids, dates, sentiments, lsa_summaries))

# Create a dataframe with the specified schema
df_scored_news = spark.createDataFrame(scored_news, schema=SCORED_NEWS_SCHEMA)

# Specify the directories
# FOLDER_SCORED_NEWS is the final destination; TEMP_FOLDER_SCORED_NEWS is the
# directory Spark writes into before the part file is renamed.
# The temporary path is an f-string so it contains the actual stock symbol
# (without the prefix the folder was literally named "news_{STOCK_SYMBOL_LOWER}").
FOLDER_SCORED_NEWS = "stock_news/scored"
TEMP_FOLDER_SCORED_NEWS = f"stock_news/scored/news_{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
df_scored_news.coalesce(1).write.csv(
    TEMP_FOLDER_SCORED_NEWS, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
for filename in os.listdir(TEMP_FOLDER_SCORED_NEWS):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_SCORED_NEWS, filename),
            os.path.join(FOLDER_SCORED_NEWS, FILE_NAME_NEWS),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_SCORED_NEWS)

# Uncache the dataframe in a non-blocking operation to free up memory.
df_summarized_news.unpersist(blocking=False)

# Names created by the sentiment-scoring cells that are no longer needed
local_variables = [
    "collection",
    "uuids",
    "dates",
    "lsa_summaries",
    "sentiments",
    "llm",
    "chat_template",
    "messages",
    "response",
    "scored_news",
    "df_summarized_news",
    "df_scored_news",
]

# Delete local variables if they exist
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Exponentially Decay the Sentiment Score of the News

Read in the sentiment scored stock news.

In [None]:
# Read the sentiment-scored news CSV into a Spark dataframe, applying
# SCORED_NEWS_SCHEMA
df_scored_news = spark.read.csv(
    FILE_PATH_SCORED_NEWS, header=True, schema=SCORED_NEWS_SCHEMA
)

# Verify
print(f"Row count for df_scored_news: {df_scored_news.count()}")
df_scored_news.show(5, truncate=True)

Calculate the average sentiment scores for each date. Then, populate the missing dates.

In [None]:
# Keep only the date component of the timestamp.
# No sort is applied here: the rows are aggregated by Date right below, and the
# aggregate does not depend on the input order.
df_scored_news_copy = df_scored_news.select(
    "UUID", F.to_date("Date").alias("Date"), "Sentiment_score", "Lsa_summary"
)

# Group by "Date" and calculate the average "Sentiment_score"
df_avg_sentiment = df_scored_news_copy.groupBy("Date").agg(
    F.avg("Sentiment_score").alias("Sentiment_avg")
)

# Retrieve the start and end date in a single aggregation
_date_bounds = df_avg_sentiment.agg(
    F.min("Date").alias("start_date"), F.max("Date").alias("end_date")
).first()
start_date = _date_bounds["start_date"]
end_date = _date_bounds["end_date"]

# Without at least one dated score there is no range to generate
if start_date is None or end_date is None:
    raise ValueError("No dated sentiment scores found; cannot build a date range.")

# Initialize a dataframe with the start and end dates
df_date_range = spark.createDataFrame(
    [(start_date, end_date)], ["start_date", "end_date"]
)

# Generate a date range using sequence (one array holding every calendar day
# from start_date to end_date inclusive)
df_dates = df_date_range.select(
    F.expr("sequence(to_date(start_date), to_date(end_date), interval 1 day)").alias(
        "Date"
    )
)

# Explode the array to get individual dates
df_dates = df_dates.select(F.explode("Date").alias("Date"))

# Join the complete date range with the original dataframe.
# Left join: days without news keep a null Sentiment_avg.
df_avg_sentiment_filled = df_dates.join(df_avg_sentiment, on="Date", how="left")

# Store in memory
df_avg_sentiment_filled.persist()

# Verify
print(f"Row count for df_avg_sentiment_filled: {df_avg_sentiment_filled.count()}")
df_avg_sentiment_filled.show(df_avg_sentiment_filled.count(), truncate=False)

Apply an exponential decay algorithm to the missing average sentiment scores.

In [None]:
# Window covering every row from the first date up to the current one
window_spec = Window.orderBy("Date").rowsBetween(
    Window.unboundedPreceding, Window.currentRow
)

# Column expressions used below (evaluated lazily, when attached to the frame)
_has_score = F.col("Sentiment_avg").isNotNull()
_date_if_scored = F.when(_has_score, F.col("Date"))
_decay_factor = F.exp(-DECAY_RATE * F.col("Days_since_last_valid"))
_decayed_value = (
    BASE_VALUE + (F.col("Last_valid_sentiment") - BASE_VALUE) * _decay_factor
)

# Most recent non-null average sentiment seen so far
df_avg_sentiment_filled = df_avg_sentiment_filled.withColumn(
    "Last_valid_sentiment",
    F.last("Sentiment_avg", ignorenulls=True).over(window_spec),
)

# Date of that most recent non-null average sentiment
df_avg_sentiment_filled = df_avg_sentiment_filled.withColumn(
    "Last_valid_date",
    F.last(_date_if_scored, ignorenulls=True).over(window_spec),
)

# Number of days elapsed since that date
df_avg_sentiment_filled = df_avg_sentiment_filled.withColumn(
    "Days_since_last_valid",
    F.datediff(F.col("Date"), F.col("Last_valid_date")),
)

# Days with a score keep it; days without one get the last valid score pulled
# towards BASE_VALUE by exp(-DECAY_RATE * days)
df_avg_sentiment_filled = df_avg_sentiment_filled.withColumn(
    "Decayed_sentiment",
    F.when(F.col("Sentiment_avg").isNull(), _decayed_value).otherwise(
        F.col("Sentiment_avg")
    ),
)

# Round the DoubleType values to eight decimal places
for _double_column in ("Sentiment_avg", "Last_valid_sentiment", "Decayed_sentiment"):
    df_avg_sentiment_filled = df_avg_sentiment_filled.withColumn(
        _double_column, F.round(F.col(_double_column), 8)
    )

# Verify
print(f"Row count for df_avg_sentiment_filled: {df_avg_sentiment_filled.count()}")
df_avg_sentiment_filled.show(df_avg_sentiment_filled.count(), truncate=False)

Save a copy of the dataset.

In [None]:
# Specify the directories
# FOLDER_DECAYED_NEWS is the final destination; TEMP_FOLDER_DECAYED_NEWS is the
# directory Spark writes into before the part file is renamed.
# The temporary path is an f-string so it contains the actual stock symbol
# (without the prefix the folder was literally named "news_{STOCK_SYMBOL_LOWER}").
FOLDER_DECAYED_NEWS = "stock_news/decayed"
TEMP_FOLDER_DECAYED_NEWS = f"stock_news/decayed/news_{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
df_avg_sentiment_filled.coalesce(1).write.csv(
    TEMP_FOLDER_DECAYED_NEWS, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
for filename in os.listdir(TEMP_FOLDER_DECAYED_NEWS):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_DECAYED_NEWS, filename),
            os.path.join(FOLDER_DECAYED_NEWS, FILE_NAME_NEWS),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_DECAYED_NEWS)

# Uncache the dataframe in a non-blocking operation to free up memory.
df_avg_sentiment_filled.unpersist(blocking=False)

# Names created by the decay cells that are no longer needed
local_variables = [
    "window_spec",
    "start_date",
    "end_date",
    "df_scored_news",
    "df_scored_news_copy",
    "df_avg_sentiment",
    "df_date_range",
    "df_dates",
    "df_avg_sentiment_filled",
]

# Delete local variables if they exist
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Integrate the News and Price Datasets

Ingest the decayed sentiment dataset and the price dataset.

In [None]:
# Read the decayed sentiment CSV and the preprocessed price CSV into Spark
# dataframes, each with its explicit schema
df_decayed_sentiment = spark.read.csv(
    FILE_PATH_DECAYED_NEWS, header=True, schema=DECAYED_SENTIMENT_SCHEMA
)
df_price = spark.read.csv(
    FILE_PATH_PREPROCESSED_PRICE, header=True, schema=PREPROCESSED_PRICE_SCHEMA
)

# Verify: row count, schema and a five-row preview of each dataframe
print(f"Row count for df_decayed_sentiment: {df_decayed_sentiment.count()}")
df_decayed_sentiment.printSchema()
df_decayed_sentiment.show(5, truncate=True)

print(f"Row count for df_price: {df_price.count()}")
df_price.printSchema()
df_price.show(5, truncate=True)

Merge the price dataset with the average sentiment score taken from news dataset.

In [None]:
# Left join the price data with the average sentiment data.
# Every price row is kept; dates without a sentiment row get nulls.
df_combined = df_price.join(df_decayed_sentiment, on="Date", how="left")

# Store in memory
df_combined.persist()

# Drop unused fields
df_combined = df_combined.drop(
    "Last_valid_sentiment", "Last_valid_date", "Days_since_last_valid"
)

# Convert the decayed sentiment values to 3 if they are null.
# Decayed_sentiment is a double column, so a missing value can only be null;
# coalesce replaces it with the neutral BASE_VALUE without comparing the
# numeric column against an empty string.
df_combined = df_combined.withColumn(
    "Decayed_sentiment",
    F.coalesce(F.col("Decayed_sentiment"), F.lit(float(BASE_VALUE))),
)

# Verify
df_combined.show(20, truncate=False)

Normalize the sentiments to between 0 and 1.

In [None]:
# Normalize the decayed sentiment
# Min-max scaling over the score range: MIN_VALUE maps to 0 and MAX_VALUE to 1.
df_combined = df_combined.withColumn(
    "Normalized_sentiment",
    (F.col("Decayed_sentiment") - MIN_VALUE) / (MAX_VALUE - MIN_VALUE),
)

# Round the DoubleType values to eight decimal places
df_combined = df_combined.withColumn(
    "Decayed_sentiment", F.round(F.col("Decayed_sentiment"), 8)
).withColumn("Normalized_sentiment", F.round(F.col("Normalized_sentiment"), 8))

# Verify
df_combined.show(20, truncate=False)

Save a copy of the dataset.

In [None]:
# Specify the directories
# FOLDER_COMBINED is the final destination; TEMP_FOLDER_COMBINED is the
# directory Spark writes into before the part file is renamed.
# NOTE(review): the output goes to "lstm/data", while the FILE_PATH_COMBINED
# constant points to "stock_combined/..." — confirm which location is intended.
FOLDER_COMBINED = "lstm/data"
TEMP_FOLDER_COMBINED = f"lstm/data/{STOCK_SYMBOL_LOWER}"

# Write to a single CSV file
# coalesce(1) reduces the dataframe to one partition, so a single part file is
# expected inside TEMP_FOLDER_COMBINED.
df_combined.coalesce(1).write.csv(
    TEMP_FOLDER_COMBINED, sep=",", header=True, mode="overwrite"
)

# Move the part file to the desired filename
for filename in os.listdir(TEMP_FOLDER_COMBINED):
    if filename.startswith("part-"):
        shutil.move(
            os.path.join(TEMP_FOLDER_COMBINED, filename),
            os.path.join(FOLDER_COMBINED, FILE_NAME_COMBINED),
        )

# Remove the temporary directory
shutil.rmtree(TEMP_FOLDER_COMBINED)

# Uncache the dataframe in a non-blocking operation to free up memory.
# NOTE(review): df_combined was re-assigned to derived dataframes after
# persist() was called on it, so confirm this call releases the cached data.
df_combined.unpersist(blocking=False)

# Names created by the integration cells that are no longer needed
local_variables = [
    "df_decayed_sentiment",
    "df_price",
    "df_combined",
]

# Delete local variables if they exist
# NOTE(review): this relies on locals() being the notebook's top-level
# namespace when the cell runs at module level.
for var in local_variables:
    if var in locals():  # Check if the variable exists in the local scope
        del locals()[var]

# Initialize the Model

In [None]:
import pandas as pd

Initialize the Timer

In [None]:
class Timer:
    """Measure and print the wall-clock time between start() and stop()."""

    def __init__(self):
        # Set by start(); None means the timer has not been started yet.
        self.start_date = None

    def start(self):
        """Record the current time as the beginning of the measured interval."""
        self.start_date = dt.datetime.now()

    def stop(self):
        """Print the time elapsed since start() was called.

        Raises:
            RuntimeError: If start() has not been called on this timer.
        """
        if self.start_date is None:
            # Without this guard the subtraction below fails with an opaque
            # "unsupported operand" TypeError.
            raise RuntimeError("Timer.stop() called before Timer.start()")
        end_date = dt.datetime.now()
        print(f"Time taken: {end_date - self.start_date}")

Load the Data

In [None]:
class DataLoader:
    """Load and transform data for the LSTM model.

    The CSV is read once, reduced to the requested columns and split by row
    position into a training part and a test part. Sliding windows of
    ``seq_len`` rows are cut from each part: the first ``seq_len - 1`` rows of
    a window are the model input and column 0 of the last row is the target.
    """

    def __init__(self, filename, split, cols, cols_to_norm, pred_len):
        """
        Args:
            filename: Path of the CSV file to read.
            split: Fraction of rows (from the top) assigned to the training set.
            cols: Column name(s) to keep, in model-input order.
            cols_to_norm: Indices (positions within ``cols``) of the columns
                normalised for training windows.
            pred_len: Prediction length; stored on the instance.

        Raises:
            KeyError: If ``cols`` cannot be selected from the CSV.
        """
        dataframe = pd.read_csv(filename)
        selected = dataframe.get(cols)
        if selected is None:
            # DataFrame.get returns None for unknown columns; fail with a clear
            # message instead of an AttributeError on ``None.values``.
            raise KeyError(f"Columns {cols} not found in {filename}")
        values = selected.values
        i_split = int(len(dataframe) * split)
        self.data_train = values[:i_split]
        self.data_test = values[i_split:]
        self.cols_to_norm = cols_to_norm
        self.pred_len = pred_len
        self.len_train = len(self.data_train)
        self.len_test = len(self.data_test)
        self.len_train_windows = None

    def get_test_data(self, seq_len, normalise, cols_to_norm):
        """
        Create x, y test data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise reduce size of the training split.

        Returns:
            A tuple ``(x, y, y_base)`` where ``x`` holds the first
            ``seq_len - 1`` rows of every window, ``y`` holds column 0 of the
            last row of every window, and ``y_base`` holds the un-normalised
            column 0 of the first row of every window.

        Raises:
            ValueError: If the test split has too few rows to form a window.
        """
        n_windows = self.len_test - seq_len
        if n_windows <= 0:
            raise ValueError(
                f"Test split has {self.len_test} rows, which is not enough to "
                f"build a window of length {seq_len}"
            )

        data_windows = np.array(
            [self.data_test[i : i + seq_len] for i in range(n_windows)]
        ).astype(float)
        # Taken before normalisation so callers can map predictions back.
        y_base = data_windows[:, 0, [0]]
        if normalise:
            data_windows = self.normalise_selected_columns(
                data_windows, cols_to_norm, single_window=False
            )
        x = data_windows[:, :-1, :]
        y = data_windows[:, -1, [0]]
        return x, y, y_base

    def get_train_data(self, seq_len, normalise):
        """
        Create x, y train data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise use generate_train_batch() method.
        """
        data_x = []
        data_y = []
        for i in range(self.len_train - seq_len):
            x, y = self._next_window(i, seq_len, normalise)
            data_x.append(x)
            data_y.append(y)
        return np.array(data_x), np.array(data_y)

    def generate_train_batch(self, seq_len, batch_size, normalise):
        """Yield (x_batch, y_batch) training batches, cycling over the data forever.

        One pass over the training windows produces
        ``ceil(n_windows / batch_size)`` batches; the last batch of a pass is
        smaller when the windows do not divide evenly. After a pass the
        generator restarts from the first window with a fresh batch, so every
        pass yields the same sequence of batches. Nothing is yielded when the
        training split is too short to form a single window.
        """
        n_windows = self.len_train - seq_len
        if n_windows <= 0:
            return

        i = 0
        while True:
            x_batch = []
            y_batch = []
            for _ in range(batch_size):
                if i >= n_windows:
                    break
                x, y = self._next_window(i, seq_len, normalise)
                x_batch.append(x)
                y_batch.append(y)
                i += 1
            yield np.array(x_batch), np.array(y_batch)
            if i >= n_windows:
                # End of a pass: restart from the first window. The next batch
                # starts empty, so the tail windows are not repeated.
                i = 0

    def _next_window(self, i, seq_len, normalise):
        """Generates the next data window from the given index location i"""
        window = self.data_train[i : i + seq_len]
        if normalise:
            window = self.normalise_selected_columns(
                window, self.cols_to_norm, single_window=True
            )[0]
        x = window[:-1]
        y = window[-1, [0]]
        return x, y

    @staticmethod
    def _normalise_window(window, columns_to_normalise=None):
        """Normalise one 2-D window column by column.

        A normalised column is expressed relative to its first value ``w`` as
        ``p / w - 1`` (``w`` is replaced by 1 when it is 0). Columns whose
        index is not in ``columns_to_normalise`` are copied unchanged;
        ``None`` means every column is normalised.
        """
        normalised_columns = []
        for col_i in range(window.shape[1]):
            column = window[:, col_i]
            if columns_to_normalise is None or col_i in columns_to_normalise:
                base = column[0]
                if base == 0:
                    base = 1
                base = float(base)
                normalised_columns.append([(float(p) / base) - 1 for p in column])
            else:
                normalised_columns.append(column.tolist())
        # Columns were collected as rows; transpose back to (rows, columns).
        return np.array(normalised_columns).T

    def normalise_windows(self, window_data, single_window=False):
        """Normalise window with a base value of zero"""
        windows = [window_data] if single_window else window_data
        return np.array([self._normalise_window(window) for window in windows])

    # Modified normalization function to normalize only specific columns
    def normalise_selected_columns(
        self, window_data, columns_to_normalise, single_window=False
    ):
        """Normalise only the columns whose index is in ``columns_to_normalise``.

        Returns an array with one normalised window per input window; a single
        window is returned wrapped in a length-1 leading axis.
        """
        selected = set(columns_to_normalise)
        windows = [window_data] if single_window else window_data
        return np.array(
            [self._normalise_window(window, selected) for window in windows]
        )


Initialize the LSTM model.

In [None]:
class Model:
    """Build, train, and inference an LSTM model with Tensorflow Keras."""

    def __init__(self):
        self.model = Sequential()

    def load_model(self, filepath):
        """Replace the wrapped model with the one stored at ``filepath``."""
        print(f"MODEL: Loading model from file {filepath}")
        self.model = load_model(filepath)

    def build_model(self, configs):
        """Add the configured layers to the model and compile it.

        ``configs["model"]["layers"]`` is a list of dicts. Each dict has a
        "type" of "dense", "lstm" or "dropout" plus the optional keys
        "neurons", "rate", "activation", "return_seq", "input_timesteps" and
        "input_dim"; missing keys are passed on as None. Layers of any other
        type are ignored. The loss and optimizer are read from
        ``configs["model"]``.
        """
        timer = Timer()
        timer.start()

        for layer in configs["model"]["layers"]:
            layer_type = layer["type"]
            neurons = layer.get("neurons")
            dropout_rate = layer.get("rate")
            activation = layer.get("activation")
            return_seq = layer.get("return_seq")
            input_timesteps = layer.get("input_timesteps")
            input_dim = layer.get("input_dim")

            if layer_type == "dense":
                self.model.add(Dense(neurons, activation=activation))
            elif layer_type == "lstm":
                self.model.add(
                    LSTM(
                        neurons,
                        input_shape=(input_timesteps, input_dim),
                        return_sequences=return_seq,
                    )
                )
            elif layer_type == "dropout":
                self.model.add(Dropout(dropout_rate))

        self.model.compile(
            loss=configs["model"]["loss"], optimizer=configs["model"]["optimizer"]
        )

        print("MODEL: Model Compiled")
        timer.stop()

    def train(self, x, y, epochs, batch_size, save_dir):
        """Train on in-memory arrays and save the model under ``save_dir``.

        The file name is ``<ddmmYYYY-HHMMSS>-e<epochs>.keras``.
        """
        timer = Timer()
        timer.start()
        print("MODEL: Training Started")
        print(f"MODEL: epochs: {epochs}, batch size: {batch_size}")

        # Format the timestamp outside the f-string: reusing double quotes
        # inside a double-quoted f-string is a SyntaxError before Python 3.12.
        timestamp = dt.datetime.now().strftime("%d%m%Y-%H%M%S")
        save_fname = os.path.join(save_dir, f"{timestamp}-e{epochs}.keras")

        # fit() below receives no validation data, so "val_loss" is never
        # reported and callbacks monitoring it would never act. Monitor the
        # training loss instead, as train_generator() does.
        callbacks = [
            EarlyStopping(monitor="loss", patience=2),
            ModelCheckpoint(filepath=save_fname, monitor="loss", save_best_only=True),
        ]
        self.model.fit(x, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks)
        # Persists the final weights to the same path the checkpoint wrote to.
        self.model.save(save_fname)

        print(f"MODEL: Training Completed. Model saved as {save_fname}")
        timer.stop()

    def train_generator(
        self,
        data_gen,
        epochs,
        batch_size,
        steps_per_epoch,
        save_dir,
        sentiment_type,
        model_name,
        num_csvs,
    ):
        """Train from a batch generator, checkpointing the best model by loss.

        The checkpoint is written to
        ``<save_dir>/<model_name>_<sentiment_type>_<num_csvs>.keras``.
        ``batch_size`` is only used in the log message; the batch size itself
        is determined by ``data_gen``.
        """
        timer = Timer()
        timer.start()
        print("MODEL: Training Started")
        print(
            f"MODEL: epochs: {epochs}, batch size: {batch_size}, batches per epoch: {steps_per_epoch}"
        )
        model_path = f"{model_name}_{sentiment_type}_{num_csvs}.keras"
        save_fname = os.path.join(save_dir, model_path)

        callbacks = [
            ModelCheckpoint(filepath=save_fname, monitor="loss", save_best_only=True)
        ]
        self.model.fit(
            data_gen,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            callbacks=callbacks,
        )

        print(f"MODEL: Training Completed. Model saved as {save_fname}")
        timer.stop()

    def predict_sequences_multiple_modified(self, data, window_size, prediction_len):
        """
        window_size = 50, prediction_len = 3
        Pros: Computationally less expensive.
        Cons: Less accurate

        Starting at every ``prediction_len``-th window of ``data``, predict
        ``prediction_len`` steps ahead, feeding each prediction back into the
        window. Returns a list with one list of predictions per starting window.
        """
        print("MODEL: Predicting Sequences Multiple Modified...")
        prediction_seqs = []
        for i in range(0, len(data), prediction_len):
            curr_frame = data[i]
            predicted = []
            for _ in range(prediction_len):
                predicted.append(
                    self.model.predict(curr_frame[newaxis, :, :], verbose=0)[0, 0]
                )
                # Slide the window: drop the oldest row and insert the
                # prediction as a new row at index window_size - 2.
                # NOTE(review): the scalar prediction is broadcast to every
                # feature column of the inserted row - confirm this is intended
                # for multi-feature inputs.
                curr_frame = curr_frame[1:]
                curr_frame = np.insert(
                    curr_frame, [window_size - 2], predicted[-1], axis=0
                )
            prediction_seqs.append(predicted)
        return prediction_seqs

    def predict_sequences_full(self, data, window_size):
        """
        window_size = 50
        Pros: By shifting the window one step at a time, it can effectively capture the context of the input data.
        Cons: Computationally expensive to predict the entire dataset and will require more memory.

        Starts from the first window of ``data`` and produces ``len(data)``
        predictions, each one fed back into the window for the next step.
        """
        # Shift the window by 1 new prediction each time, re-run predictions on new window
        print("MODEL: Predicting Sequences Full...")
        curr_frame = data[0]
        predicted = []
        for _ in range(len(data)):
            predicted.append(
                self.model.predict(curr_frame[newaxis, :, :], verbose=0)[0, 0]
            )
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size - 2], predicted[-1], axis=0)
        return predicted


# Run the Model

In [None]:
# Hour-resolution timestamp (YYYYMMDDHH) embedded in the result folder and file
# names below; the visualisation cells rebuild the same paths from it.
CURRENT_TIME = dt.datetime.now().strftime("%Y%m%d%H")
# NOTE(review): presumably used to pin the timestamp of an earlier run so its
# result files can be re-read - confirm.
# CURRENT_TIME = "2024102709"


def output_results_and_errors_multiple(
    predicted_data,
    true_data,
    true_data_base,
    prediction_len,
    file_name,
    sentiment_type,
    num_csvs,
):
    """Save predictions next to the true values and write MAE / MSE / R2.

    Two CSV files are written to
    ``test_result_<num_csvs>/<file_name>_<sentiment_type>_<CURRENT_TIME>/``:
    ``..._predicted_data.csv`` with the per-row values and ``..._eval.csv``
    with the three metrics.

    Args:
        predicted_data: Sequence of prediction sequences; they are concatenated
            in order and aligned with ``true_data`` by position.
        true_data: Array of target values (flattened here).
        true_data_base: Array of base values, one per target (flattened here).
        prediction_len: Unused; kept for interface compatibility.
        file_name: Symbol or file name; anything after the first "." is dropped.
        sentiment_type: Label used in the output names.
        num_csvs: Number used in the result folder name.

    Raises:
        ValueError: If no row has a prediction to score.
    """
    save_df = pd.DataFrame(
        {
            "True_Data": true_data.reshape(-1),
            "Base": true_data_base.reshape(-1),
        }
    )
    # NOTE(review): (value + 1) * base inverts the "p / base - 1" window
    # normalisation; this assumes the targets were normalised that way - confirm.
    save_df["True_Data_origin"] = (save_df["True_Data"] + 1) * save_df["Base"]

    # len() rather than truthiness, so array-like inputs are also accepted.
    if len(predicted_data) > 0:
        all_predicted_data = np.concatenate(list(predicted_data))
    else:
        all_predicted_data = []

    file_name = file_name.split(".")[0]
    sentiment_type = str(sentiment_type)

    # Assignment aligns on the row index: surplus predictions are dropped and
    # rows without a prediction are left as NaN.
    save_df["Predicted_Data"] = pd.Series(all_predicted_data, dtype=float)
    save_df["Predicted_Data_origin"] = (save_df["Predicted_Data"] + 1) * save_df["Base"]

    result_folder = f"test_result_{num_csvs}"
    run_name = f"{file_name}_{sentiment_type}_{CURRENT_TIME}"
    run_folder = os.path.join(result_folder, run_name)
    os.makedirs(run_folder, exist_ok=True)

    save_file_path = os.path.join(run_folder, f"{run_name}_predicted_data.csv")
    save_df.to_csv(save_file_path, index=False)
    print(f"Data saved to {save_file_path}")

    # Score only rows that actually have a prediction; NaN rows would make the
    # sklearn metrics raise.
    scored = save_df.dropna(subset=["Predicted_Data"])
    if scored.empty:
        raise ValueError("No predictions available to evaluate")
    true_values = scored["True_Data"]
    predicted_values = scored["Predicted_Data"]

    mae = mean_absolute_error(true_values, predicted_values)
    mse = mean_squared_error(true_values, predicted_values)
    r2 = r2_score(true_values, predicted_values)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"R²: {r2}")
    results_df = pd.DataFrame({"MAE": [mae], "MSE": [mse], "R2": [r2]})

    eval_file_path = os.path.join(run_folder, f"{run_name}_eval.csv")
    results_df.to_csv(eval_file_path, index=False)
    print(f"\nResults saved to {eval_file_path}")


# Main Function
def main(configs, data_filename, sentiment_type, flag_pred, model_name, num_csvs):
    """Train the LSTM on one stock CSV and optionally predict on its test split.

    Args:
        configs: Parsed configuration dict; its "data", "training" and "model"
            sections are read here.
        data_filename: CSV file name inside the "data" directory, e.g. "aa.csv".
        sentiment_type: Label used in model and result file names.
        flag_pred: When true, predictions are produced after training, provided
            the symbol is listed in the module-level ``pred_names``.
        model_name: Prefix of the saved model file name.
        num_csvs: Number of stock CSVs in the current run; used in file names.

    Note:
        ``pred_names`` is read from module scope and must be defined before
        this function is called with ``flag_pred`` set.
    """
    # Derive the symbol from the argument, not from a loop variable that
    # happens to exist at module level in the calling cell.
    symbol_name = data_filename.split(".")[0]
    save_dir = configs["model"]["save_dir"]
    os.makedirs(save_dir, exist_ok=True)

    data_cfg = configs["data"]
    training_cfg = configs["training"]
    seq_len = data_cfg["sequence_length"]
    batch_size = training_cfg["batch_size"]

    data = DataLoader(
        os.path.join("data", data_filename),
        data_cfg["train_test_split"],
        data_cfg["columns"],
        data_cfg["columns_to_normalise"],
        data_cfg["prediction_length"],
    )

    model = Model()
    # Resume from the checkpoint train_generator() writes, i.e. look in the
    # configured save directory rather than a hard-coded folder.
    model_path = os.path.join(
        save_dir, f"{model_name}_{sentiment_type}_{num_csvs}.keras"
    )
    if os.path.exists(model_path):
        model.load_model(model_path)
    else:
        model.build_model(configs)

    # The full window arrays are only built to report their shapes; training
    # below streams batches from the generator.
    x, y = data.get_train_data(seq_len=seq_len, normalise=data_cfg["normalise"])
    print(f"X: {x.shape}")
    print(f"Y: {y.shape}")

    # In-memory training alternative (disabled):
    # model.train(
    #     x,
    #     y,
    #     epochs=configs["training"]["epochs"],
    #     batch_size=configs["training"]["batch_size"],
    #     save_dir=configs["model"]["save_dir"],
    # )

    # Out-of-memory training
    steps_per_epoch = math.ceil((data.len_train - seq_len) / batch_size)
    model.train_generator(
        data_gen=data.generate_train_batch(
            seq_len=seq_len,
            batch_size=batch_size,
            normalise=data_cfg["normalise"],
        ),
        epochs=training_cfg["epochs"],
        batch_size=batch_size,
        steps_per_epoch=steps_per_epoch,
        save_dir=save_dir,
        sentiment_type=sentiment_type,
        model_name=model_name,
        num_csvs=num_csvs,
    )

    if flag_pred and symbol_name in pred_names:
        print("-----Predicting-----")
        x_test, y_test, y_base = data.get_test_data(
            seq_len=seq_len,
            normalise=data_cfg["normalise"],
            cols_to_norm=data_cfg["columns_to_normalise"],
        )
        print("Test Data:")
        print(f"x_test.shape: {x_test.shape}")
        print(f"y_test.shape: {y_test.shape}")
        predictions = model.predict_sequences_multiple_modified(
            x_test,
            seq_len,
            data_cfg["prediction_length"],
        )

        output_results_and_errors_multiple(
            predictions,
            y_test,
            y_base,
            data_cfg["prediction_length"],
            symbol_name,
            sentiment_type,
            num_csvs,
        )


if __name__ == "__main__":
    model_name = "lstm"
    sentiment_types = ["sentiment", "non-sentiment"]

    # Groups of stock CSVs; the group size is passed on as the number of
    # stocks and ends up in model and result names.
    names_1 = ["aa.csv"]
    names_5 = ["aa.csv", "aapl.csv", "amzn.csv", "msft.csv", "tsla.csv"]
    names_25 = []
    names_50 = []
    all_names = [names_1]
    # all_names = [names_1, names_5, names_25, names_50]

    # Symbols for which predictions are produced; read by main() as a global.
    pred_names = ["aa"]
    for names in all_names:
        num_stocks = len(names)
        # Three runs are enumerated, but runs 0 and 1 are skipped below, so
        # only run 2 executes: it trains and then makes predictions.
        for i in range(3):
            if i < 2:
                continue
            if_pred = i == 2
            for sentiment_type in sentiment_types:
                for name in names:
                    # Read the config with a context manager so the file
                    # handle is closed deterministically.
                    with open(sentiment_type + "-config.json", "r") as config_file:
                        configs = json.load(config_file)
                    print(f"#{i}")
                    main(configs, name, sentiment_type, if_pred, model_name, num_stocks)


# Visualize Data

In [None]:
# NOTE(review): the run cell writes to "test_result_<num_csvs>" relative to the
# working directory, and num_csvs is 1 for names_1 - confirm this folder
# matches the run whose results should be plotted.
FOLDER_TEST_RESULT = "lstm/test_result_5"

FILE_PATH_RESULTS_SENTI_PREDICTED = f"{FOLDER_TEST_RESULT}/{STOCK_SYMBOL_LOWER}_sentiment_{CURRENT_TIME}/{STOCK_SYMBOL_LOWER}_sentiment_{CURRENT_TIME}_predicted_data.csv"
FILE_PATH_RESULTS_SENTI_EVAL = f"{FOLDER_TEST_RESULT}/{STOCK_SYMBOL_LOWER}_sentiment_{CURRENT_TIME}/{STOCK_SYMBOL_LOWER}_sentiment_{CURRENT_TIME}_eval.csv"
FILE_PATH_RESULTS_NON_SENTI_PREDICTED = f"{FOLDER_TEST_RESULT}/{STOCK_SYMBOL_LOWER}_non-sentiment_{CURRENT_TIME}/{STOCK_SYMBOL_LOWER}_non-sentiment_{CURRENT_TIME}_predicted_data.csv"
FILE_PATH_RESULTS_NON_SENTI_EVAL = f"{FOLDER_TEST_RESULT}/{STOCK_SYMBOL_LOWER}_non-sentiment_{CURRENT_TIME}/{STOCK_SYMBOL_LOWER}_non-sentiment_{CURRENT_TIME}_eval.csv"

# Read the de-normalised true and predicted prices of both runs
predicted_csv_options = {"header": True, "inferSchema": True}
price_columns = ["True_Data_origin", "Predicted_Data_origin"]

df_base_sentiment = spark.read.csv(
    FILE_PATH_RESULTS_SENTI_PREDICTED, **predicted_csv_options
).select(*price_columns)

df_base_non_sentiment = spark.read.csv(
    FILE_PATH_RESULTS_NON_SENTI_PREDICTED, **predicted_csv_options
).select(*price_columns)

# Bring the rows to the driver for plotting
data_sentiment = df_base_sentiment.collect()
data_non_sentiment = df_base_non_sentiment.collect()

# One list per plotted series; the true prices are taken from the sentiment run
true_data_origin = []
predicted_data_origin_sentiment = []
for row in data_sentiment:
    true_data_origin.append(row["True_Data_origin"])
    predicted_data_origin_sentiment.append(row["Predicted_Data_origin"])
predicted_data_origin_non_sentiment = [
    row["Predicted_Data_origin"] for row in data_non_sentiment
]

# Draw the three series on one chart
price_series = [
    (true_data_origin, "Closing Price", "-"),
    (predicted_data_origin_sentiment, "Predicted Closing Price with Sentiment", "-."),
    (
        predicted_data_origin_non_sentiment,
        "Predicted Closing Price without Sentiment",
        ":",
    ),
]
for series_values, series_label, series_style in price_series:
    plt.plot(series_values, label=series_label, linestyle=series_style)

plt.xlabel("Time")
plt.ylabel("Price")
plt.title(
    f"Stock {STOCK_SYMBOL_UPPER} - Predicted Closing Price Comparison between Sentiment and Non-Sentiment"
)
plt.legend()
plt.show()


In [None]:
# Read the evaluation metrics written for both runs
eval_csv_options = {"header": True, "inferSchema": True}
df_senti_eval = spark.read.csv(FILE_PATH_RESULTS_SENTI_EVAL, **eval_csv_options)
df_non_senti_eval = spark.read.csv(FILE_PATH_RESULTS_NON_SENTI_EVAL, **eval_csv_options)

# Average each metric column (MAE, MSE, R2) per run
metric_names = ("MAE", "MSE", "R2")
senti_mae, senti_mse, senti_r2 = (
    df_senti_eval.agg({metric: "mean"}).first()[f"avg({metric})"]
    for metric in metric_names
)
non_senti_mae, non_senti_mse, non_senti_r2 = (
    df_non_senti_eval.agg({metric: "mean"}).first()[f"avg({metric})"]
    for metric in metric_names
)

# One bar-chart panel per metric, comparing the two runs side by side
plt.figure(figsize=(10, 6))
metric_panels = [
    ("MAE", senti_mae, non_senti_mae),
    ("MSE", senti_mse, non_senti_mse),
    ("R2", senti_r2, non_senti_r2),
]
for panel_index, (metric_name, senti_value, non_senti_value) in enumerate(
    metric_panels, start=1
):
    plt.subplot(1, 3, panel_index)
    plt.bar(["Sentiment", "Non-Sentiment"], [senti_value, non_senti_value])
    plt.xlabel("Evaluation Type")
    plt.ylabel(metric_name)
    plt.title(f"{metric_name} Comparison")
    # Label each bar with its value rounded to four decimals
    plt.text(0, senti_value, str(round(senti_value, 4)), ha="center")
    plt.text(1, non_senti_value, str(round(non_senti_value, 4)), ha="center")

plt.tight_layout()
plt.show()