### file name: clstm_ppo_model.ipynb
### description: This notebook contains the implementation of the clstm_ppo_model described in the paper "A novel Deep Reinforcement Learning based automated stock trading system using cascaded LSTM networks" by Jie Zou et al.
### author: Damiano Pasquini [pasquini.damiano00@gmail.com]
### dataset citation: Dong, Z., Fan, X., & Peng, Z. (2024). FNSPID: A Comprehensive Financial News Dataset in Time Series. arXiv preprint arXiv:2402.06698.
### license: Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license

In [16]:
# imports
import os, os.path
import pandas as pd

# Directory holding the per-stock CSV files (stock data that is also labelled with sentiment).
dir_path = 'dataset/processed/data_for_lstm'
# Fail fast when the dataset directory is missing, before any cell tries to read from it.
if os.path.exists(dir_path) is False:
    raise ValueError("The specified path does not exist.")

# File names of the 50-stock test set. Element order is significant: build_df
# reads the first `num_stocks` entries of this list.
names_50 = [f"{stem}.csv" for stem in (
    'aal', 'AAPL', 'ABBV', 'AMD', 'amgn', 'AMZN', 'BABA',
    'bhp', 'bidu', 'biib', 'BRK-B', 'C', 'cat', 'cmcsa', 'cmg',
    'cop', 'COST', 'crm', 'CVX', 'dal', 'DIS', 'ebay', 'GE',
    'gild', 'gld', 'GOOG', 'gsk', 'INTC', 'KO', 'mrk', 'MSFT',
    'mu', 'nke', 'nvda', 'orcl', 'pep', 'pypl', 'qcom', 'QQQ',
    'SBUX', 'T', 'tgt', 'tm', 'TSLA', 'TSM', 'uso', 'v', 'WFC',
    'WMT', 'xlf',
)]

# File names of the 25-stock test set.
# NOTE(review): this list currently holds 24 file names, not 25 - confirm which
# ticker is missing.
names_25 = [f"{stem}.csv" for stem in (
    'AAPL', 'ABBV', 'AMZN', 'BABA', 'BRK-B', 'C', 'COST', 'CVX', 'DIS',
    'GE',
    'INTC', 'MSFT', 'nvda', 'pypl', 'QQQ', 'SBUX', 'T', 'TSLA', 'WFC',
    'KO', 'AMD', 'TSM', 'GOOG', 'WMT',
)]

# File names of the 5-stock test set.
names_5 = [f"{stem}.csv" for stem in ('KO', 'AMD', 'TSM', 'GOOG', 'WMT')]

In [14]:
# CUDA availability
import torch

# Report the installed PyTorch build and whether a CUDA device can be used.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only query CUDA/GPU details when a device is actually present.
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

PyTorch Version: 2.6.0+cu126
CUDA Available: True
CUDA Version: 12.6
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [15]:
def build_df(path, num_stocks):
    """
    Build one dataframe from the first ``num_stocks`` CSV files listed in ``names_50``.

    The selected files are read from ``path``, concatenated row-wise (each file keeps
    its own row index) and the result is sorted by the 'Date' column.

    :param path: path to the directory containing the csv files
    :param num_stocks: number of stocks to consider, must be 5, 25 or 50
    :return: a dataframe containing the data from the csv files, sorted by 'Date'
    :raises ValueError: if ``path`` does not exist, is not a directory, is empty or
        contains non-CSV files, or if ``num_stocks`` is not 5, 25 or 50
    """
    if not os.path.exists(path):
        raise ValueError("The specified path does not exist.")
    if not os.path.isdir(path):
        raise ValueError("The specified path is not a directory.")
    # List the directory once and reuse the result for both checks below.
    entries = os.listdir(path)
    if not entries:
        raise ValueError("The specified directory is empty.")
    if not all(f.endswith('.csv') for f in entries):
        raise ValueError("The specified directory contains files that are not CSV files.")
    # Membership test: the previous `not a == 5 or a == 25 or a == 50` parsed as
    # `(not a == 5) or ...` and therefore rejected the valid values 25 and 50.
    if num_stocks not in (5, 25, 50):
        raise ValueError("The number of stocks must be 5, 25 or 50.")
    # NOTE(review): the selection is always a prefix of names_50; the separate
    # names_5 / names_25 lists are not consulted here - confirm this is intended.
    selected = names_50[:num_stocks]
    # Read every file first and concatenate once; growing a frame inside the loop
    # copies all accumulated rows on each iteration.
    frames = [pd.read_csv(os.path.join(path, name)) for name in selected]
    df = pd.concat(frames)
    return df.sort_values(by='Date')

# Load the 5-stock dataset, then show its dimensions followed by the first rows.
data_frame = build_df(dir_path, 5)
print(data_frame.shape, data_frame.head(), sep='\n')

(11864, 10)
                        Date       Open       High        Low      Close  \
0  2010-02-09 00:00:00+00:00  58.240002  58.400002  56.700001  57.209999   
1  2010-02-10 00:00:00+00:00  57.090000  57.290001  56.369999  56.840000   
2  2010-02-11 00:00:00+00:00  56.560001  57.000000  56.230000  56.500000   
3  2010-02-12 00:00:00+00:00  56.220001  56.720001  56.020000  56.480000   
4  2010-02-16 00:00:00+00:00  56.619999  57.000000  56.200001  56.990002   

   Adj close    Volume  Sentiment_gpt  News_flag  Scaled_sentiment  
0  41.448292  18732200            3.0        1.0          0.500025  
1  41.180233   6019100            3.0        0.0          0.500025  
2  40.933891   8491100            3.0        0.0          0.500025  
3  40.919403   8008700            3.0        0.0          0.500025  
4  41.288906   5245900            3.0        0.0          0.500025  


In [None]:
# LSTM PPO model

