# Initialize the Python Notebook

Install the Python packages listed in `requirements.txt`, then import the libraries used throughout the notebook.

In [1]:
%%capture
!pip3 install -r requirements.txt

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DateType,
    DoubleType,
    IntegerType,
    LongType,
    StructType,
    StructField,
    StringType,
    TimestampType,
)

2024-10-27 02:13:51.963533: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Define the notebook configuration: the stock symbol, the path of the combined CSV file, and the Spark schema used to read it.

In [5]:
# Stock symbol -- change only this line to switch tickers.
STOCK_SYMBOL_UPPER = "TSLA"  # AA, AAPL, AMZN, MSFT, TSLA, JPM, LLY
# The lower-case spelling is derived so the two constants can never drift apart.
STOCK_SYMBOL_LOWER = STOCK_SYMBOL_UPPER.lower()

# File names
FILE_NAME_COMBINED = f"{STOCK_SYMBOL_LOWER}.csv"

# File paths
FILE_PATH_COMBINED = f"stock_combined/{FILE_NAME_COMBINED}"

# Schema of the combined CSV: one nullable field per column, in file order.
COMBINED_SCHEMA = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("Date", DateType()),
            ("Open", DoubleType()),
            ("High", DoubleType()),
            ("Low", DoubleType()),
            ("Close", DoubleType()),
            ("Adj_close", DoubleType()),
            ("Volume", LongType()),
            ("Normalized_sentiment", DoubleType()),
        )
    ]
)

# Initialize Model

# Run Model

In [13]:
import os
import json
import time
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
from core.data_processor import DataLoader
from core.LSTM_modified_model import Model

# Hour-resolution timestamp (YYYYMMDDHH) embedded in the result folder and file names.
current_time = f"{datetime.now():%Y%m%d%H}"


def output_results_and_errors_multiple(
    predicted_data,
    true_data,
    true_data_base,
    prediction_len,
    file_name,
    sentiment_type,
    num_csvs,
):
    """Save predictions next to the true values and write MAE / MSE / R2.

    Two CSV files are written under
    ``test_result_{num_csvs}/{symbol}_{sentiment_type}_{current_time}/``:
    ``..._predicted_data.csv`` (per-row values) and ``..._eval.csv`` (metrics).

    Args:
        predicted_data: Sequence of prediction windows (each a list/array of
            values); the windows are flattened and concatenated in order.
            May be empty or ``None``.
        true_data: Array-like of true (normalised) targets; flattened to 1-D.
        true_data_base: Array-like of base values, same number of elements as
            ``true_data``; used to map values back via ``(value + 1) * base``.
        prediction_len: Unused; kept so existing callers keep working.
        file_name: Symbol or file name; everything after the first ``.`` is dropped.
        sentiment_type: Label embedded in the output paths (converted to ``str``).
        num_csvs: Number embedded in the result folder name.

    Notes:
        Relies on the module-level ``current_time`` string for the path names.
        Metrics are computed only on rows where both a prediction and a true
        value exist. Previously the NaN padding of a shorter prediction series
        was passed straight to scikit-learn, which rejects NaN input.
    """
    ### Predictions and true values
    true_values = np.asarray(true_data).reshape(-1)
    base_values = np.asarray(true_data_base).reshape(-1)
    save_df = pd.DataFrame({"True_Data": true_values, "Base": base_values})

    # Map back to the original scale.
    save_df["True_Data_origin"] = (save_df["True_Data"] + 1) * save_df["Base"]

    # Flatten every window so lists, 1-D arrays and (k, 1) arrays all concatenate.
    # An explicit length check avoids the ambiguous truth value of numpy arrays.
    if predicted_data is not None and len(predicted_data) > 0:
        all_predicted_data = np.concatenate(
            [np.ravel(window) for window in predicted_data]
        )
    else:
        all_predicted_data = np.array([], dtype=float)

    file_name = file_name.split(".")[0]
    sentiment_type = str(sentiment_type)

    # Assigning a Series aligns on the row index: a shorter prediction series is
    # padded with NaN, a longer one is cut to the number of true rows.
    save_df["Predicted_Data"] = pd.Series(all_predicted_data)

    # Map back to the original scale.
    save_df["Predicted_Data_origin"] = (save_df["Predicted_Data"] + 1) * save_df["Base"]

    result_folder = f"test_result_{num_csvs}"
    run_name = f"{file_name}_{sentiment_type}_{current_time}"
    run_folder = os.path.join(result_folder, run_name)
    # Create the output directory if it does not exist yet.
    os.makedirs(run_folder, exist_ok=True)

    save_file_path = os.path.join(run_folder, f"{run_name}_predicted_data.csv")
    save_df.to_csv(save_file_path, index=False)
    print(f"Data saved to {save_file_path}")

    ### Evaluation
    # Keep only rows that have both values; NaN padding must not reach sklearn.
    valid_rows = save_df["Predicted_Data"].notna() & save_df["True_Data"].notna()
    eval_true = save_df.loc[valid_rows, "True_Data"]
    eval_pred = save_df.loc[valid_rows, "Predicted_Data"]

    if valid_rows.any():
        # MAE, MSE, R2 on the normalised scale.
        mae = mean_absolute_error(eval_true, eval_pred)
        mse = mean_squared_error(eval_true, eval_pred)
        r2 = r2_score(eval_true, eval_pred)
    else:
        # Nothing to score: record NaN instead of aborting the whole batch run.
        print("No rows with both a prediction and a true value; metrics set to NaN.")
        mae = mse = r2 = float("nan")

    print("MAE:", mae)
    print("MSE:", mse)
    print("R²:", r2)
    results_df = pd.DataFrame({"MAE": [mae], "MSE": [mse], "R2": [r2]})

    eval_file_path = os.path.join(run_folder, f"{run_name}_eval.csv")

    # Save the metrics to CSV.
    results_df.to_csv(eval_file_path, index=False)
    print(f"\nResults saved to {eval_file_path}")


# Main Function
def main(configs, data_filename, sentiment_type, flag_pred, model_name, num_csvs):
    """Train the model on one CSV file and optionally predict on its test split.

    Args:
        configs: Parsed configuration dict; the ``data``, ``training`` and
            ``model`` sections are read here.
        data_filename: CSV file name inside the ``data`` directory (e.g. ``aa.csv``).
        sentiment_type: Label used in the saved-model path and in output names.
        flag_pred: When true, run prediction after training (only for symbols
            listed in the module-level ``pred_names``).
        model_name: Model label used in the saved-model path.
        num_csvs: Number used in the saved-model path and result folder name.

    Notes:
        An existing model at ``saved_models/{model_name}_{sentiment_type}_{num_csvs}.keras``
        is loaded; otherwise a new one is built from ``configs``. Training runs
        in both cases.
        NOTE(review): the load path is hard-coded to ``saved_models/`` while
        training saves to ``configs["model"]["save_dir"]`` -- confirm they match.
    """
    print(
        f"flag_pred: {flag_pred}, sentiment_type: {sentiment_type}, data_filename: {data_filename}"
    )
    # Use this call's own argument; the previous code read the global loop
    # variable `name`, which only worked when called from the driver loop.
    symbol_name = data_filename.split(".")[0]
    os.makedirs(configs["model"]["save_dir"], exist_ok=True)

    data = DataLoader(
        os.path.join("data", data_filename),
        configs["data"]["train_test_split"],
        configs["data"]["columns"],
        configs["data"]["columns_to_normalise"],
        configs["data"]["prediction_length"],
    )

    model = Model()
    model_path = f"saved_models/{model_name}_{sentiment_type}_{num_csvs}.keras"
    if os.path.exists(model_path):
        model.load_model(model_path)
    else:
        model.build_model(configs)

    x, y = data.get_train_data(
        seq_len=configs["data"]["sequence_length"],
        normalise=configs["data"]["normalise"],
    )
    print("X:", x.shape)
    print("Y:", y.shape)

    # Generator-based training: batches are produced on demand instead of
    # fitting on the in-memory x / y arrays above.
    steps_per_epoch = math.ceil(
        (data.len_train - configs["data"]["sequence_length"])
        / configs["training"]["batch_size"]
    )
    model.train_generator(
        data_gen=data.generate_train_batch(
            seq_len=configs["data"]["sequence_length"],
            batch_size=configs["training"]["batch_size"],
            normalise=configs["data"]["normalise"],
        ),
        epochs=configs["training"]["epochs"],
        batch_size=configs["training"]["batch_size"],
        steps_per_epoch=steps_per_epoch,
        save_dir=configs["model"]["save_dir"],
        sentiment_type=sentiment_type,
        model_name=model_name,
        num_csvs=num_csvs,
    )

    # `pred_names` is a module-level list defined by the driver cell; it is only
    # looked up when prediction was requested.
    if flag_pred and symbol_name in pred_names:
        print("-----Predicting-----")
        x_test, y_test, y_base = data.get_test_data(
            seq_len=configs["data"]["sequence_length"],
            normalise=configs["data"]["normalise"],
            cols_to_norm=configs["data"]["columns_to_normalise"],
        )
        print("test data:")
        print("X:", x_test.shape)
        print("Y:", y_test.shape)
        predictions = model.predict_sequences_multiple_modified(
            x_test,
            configs["data"]["sequence_length"],
            configs["data"]["prediction_length"],
        )

        output_results_and_errors_multiple(
            predictions,
            y_test,
            y_base,
            configs["data"]["prediction_length"],
            symbol_name,
            sentiment_type,
            num_csvs,
        )


if __name__ == "__main__":
    model_name = "LSTM"
    sentiment_types = ["sentiment", "non-sentiment"]  # "sentiment", "non-sentiment"

    # Test csvs = 5
    names_5 = ["aa.csv", "aapl.csv", "amzn.csv", "msft.csv", "tsla.csv"]
    names_25 = []
    names_50 = []
    # Extend with names_25 / names_50 once those lists are filled in.
    all_names = [names_5]
    # Symbols for which main() produces predictions (read there as a global).
    pred_names = ["aa", "aapl", "amzn", "msft", "tsla"]
    for names in all_names:
        num_stocks = len(names)
        # Three passes are defined: passes 0 and 1 are training-only, pass 2
        # trains and predicts. Passes 0 and 1 are currently skipped by the
        # `continue` below, so only pass 2 actually runs.
        for i in range(3):
            if_pred = i == 2
            if i < 2:
                continue
            for sentiment_type in sentiment_types:
                for name in names:
                    # Reload the config for every file so each main() call gets
                    # a fresh dict; the context manager closes the file handle.
                    with open(sentiment_type + "-config.json", "r") as config_file:
                        configs = json.load(config_file)
                    print(f"#{i}")
                    main(configs, name, sentiment_type, if_pred, model_name, num_stocks)


#2
flag_pred: True, sentiment_type: sentiment, data_filename: aa.csv
[Model] Loading model from file saved_models/LSTM_sentiment_5.keras


X: (13214, 49, 3)
Y: (13214, 1)
-----Predicting-----
test data:
X: (2291, 49, 3)
Y: (2291, 1)
Data saved to test_result_5/aa_sentiment_2024102709/aa_sentiment_2024102709_predicted_data.csv

Results saved to test_result_5/aa_sentiment_2024102709/aa_sentiment_2024102709_eval.csv
#2
flag_pred: True, sentiment_type: sentiment, data_filename: aapl.csv
[Model] Loading model from file saved_models/LSTM_sentiment_5.keras
X: (9174, 49, 3)
Y: (9174, 1)
-----Predicting-----
test data:
X: (1578, 49, 3)
Y: (1578, 1)
Data saved to test_result_5/aapl_sentiment_2024102709/aapl_sentiment_2024102709_predicted_data.csv

Results saved to test_result_5/aapl_sentiment_2024102709/aapl_sentiment_2024102709_eval.csv
#2
flag_pred: True, sentiment_type: sentiment, data_filename: amzn.csv
[Model] Loading model from file saved_models/LSTM_sentiment_5.keras
X: (5645, 49, 3)
Y: (5645, 1)
-----Predicting-----
test data:
X: (955, 49, 3)
Y: (955, 1)
Data saved to test_result_5/amzn_sentiment_2024102709/amzn_sentiment_2

# Sentiment Training

Define global constants for sentiment training.

In [None]:
# Constants for sentiment training. The names mirror the configs["data"] keys
# read in main(); NOTE(review): nothing visible in this notebook reads these
# constants yet -- confirm whether they are meant to replace the JSON config.

# Presumably the fraction of rows used for training -- TODO confirm against DataLoader.
TRAIN_TEST_SPLIT = 0.85
# Input columns; all three names appear in COMBINED_SCHEMA.
COLUMNS = ["Close", "Volume", "Normalized_sentiment"]
# Presumably positional indices into COLUMNS (Close, Volume) -- TODO confirm.
COLUMNS_TO_NORMALISE = [0, 1]
# Presumably the number of steps predicted per window -- TODO confirm.
PREDICTION_LENGTH = 3


# Non-Sentiment Training

In [35]:
import keras


# Build a minimal Sequential model (a single Dense layer with 2 units) and
# write it to disk in the .keras format.
sequential_model = keras.Sequential(layers=[keras.layers.Dense(2)])
sequential_model.save(filepath="saved_model.keras")

# Ingest Data

In [None]:
# `spark` is not created anywhere else in this notebook, so obtain the session
# here; getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.getOrCreate()

# Read combined data from CSV files to Spark dataframe
df_data = spark.read.csv(FILE_PATH_COMBINED, header=True, schema=COMBINED_SCHEMA)

In [None]:
# Sanity check: report the number of rows, then preview the first five.
print("Row count for df_data: {}".format(df_data.count()))
df_data.show(n=5, truncate=True)