Before running this notebook, download **nysi.csv.gz** from https://github.com/crapher/medium/tree/main/12.RLNextTrendNysi/data and copy it into your Colab working directory.

In [1]:
!pip install gymnasium stable_baselines3[extra]



In [2]:
import math
import pandas as pd
import numpy as np
import gymnasium as gym
from gymnasium import spaces

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

from sklearn.metrics import confusion_matrix, classification_report

# Constants
# Divisor applied to the raw series values before the environment clips them to [-1, 1]
SCALE = 1500
# Number of consecutive series values that make up one observation window
OBSERVATION_SIZE = 2

  if not hasattr(tensorboard, "__version__") or LooseVersion(


In [3]:
class NextTrendEnv(gym.Env):
    """Gymnasium environment that asks the agent to guess the direction of a series.

    One episode is a single pass over ``closes``. At tick ``t`` the agent receives the
    ``observation_size`` values ``closes[t - observation_size:t]`` (divided by ``SCALE`` and
    clipped to [-1, 1]) and answers with a discrete action: 0 (mapped to -1, downtrend) or
    1 (uptrend). The label it is scored against is 1 when ``closes[t + 1] > closes[t]`` and
    -1 otherwise. The reward is 0 for a correct guess and -1 for a wrong one.

    Parameters
    ----------
    observation_size : int
        Number of consecutive values in each observation.
    closes : array-like
        The series to replay. It must contain at least ``observation_size + 2`` values.

    Raises
    ------
    ValueError
        If ``observation_size`` is not positive or ``closes`` is too short for a single step.
    """

    def __init__(self, observation_size, closes):

        closes = np.asarray(closes)

        # Fail fast instead of producing an environment that cannot emit a single observation
        if observation_size < 1:
            raise ValueError('observation_size must be a positive integer.')
        if len(closes) < observation_size + 2:
            raise ValueError('closes must contain at least observation_size + 2 values.')

        # Data
        self.__features = closes[:-1]
        self.__targets = closes[1:]

        # Spaces
        self.observation_space = spaces.Box(low=-1, high=1, shape=(observation_size,), dtype=np.float32)
        self.action_space = spaces.Discrete(2)

        # Episode Management
        # The tick starts at the end position so that step() is rejected until reset() is called
        self.__start_tick = observation_size
        self.__end_tick = len(self.__targets)
        self.__current_tick = self.__end_tick

    def reset(self, seed=None, options=None):
        """Rewind to the first tick and return ``(observation, info)`` with an empty info dict."""

        # Let the base class seed its random generator, as the Gymnasium API expects
        super().reset(seed=seed)

        # Reset the current tick pointer and return a new observation
        self.__current_tick = self.__start_tick

        return self.__get_observation(), {}

    def step(self, action):
        """Score ``action`` against the current label and advance one tick.

        Returns ``(observation, reward, terminated, truncated, info)``. ``info`` carries the
        agent's guess (``'agent_target'``) and the label (``'real_target'``), both as -1 or 1.
        ``truncated`` is always False.

        Raises
        ------
        RuntimeError
            If the episode has not been started with reset() or has already terminated.
        """

        # The tick equals the end position both before the first reset and after termination
        if self.__current_tick >= self.__end_tick:
            raise RuntimeError('The environment needs to be reset.')

        # The model returns 0 for downtrend and 1 for uptrend; map 0 to -1 for easier comparison
        prediction = -1 if action == 0 else 1

        # Compute the step reward (-1 if the prediction differs from the target, 0 if it matches)
        step_reward = -1 if prediction != self.__target else 0

        # Generate the custom info dict with the real and predicted values
        info = {
            'agent_target': prediction,
            'real_target': self.__target}

        # Increase the current tick pointer, check if the series is fully processed, and get a new observation
        self.__current_tick += 1
        terminated = self.__current_tick >= self.__end_tick
        truncated = False
        obs = self.__get_observation()

        # Returns the observation, the step reward, the status of the environment, and the custom information
        return obs, step_reward, terminated, truncated, info

    def __get_observation(self):
        """Build the observation for the current tick and refresh the label when one exists."""

        tick = self.__current_tick

        # Scale and clip the window, and cast it to the dtype declared by observation_space.
        # At the terminal tick the slice is still valid, so the final observation is a real
        # member of the observation space rather than None.
        window = self.__features[(tick - self.__start_tick):tick]
        self.__observation = np.clip(window / SCALE, -1, 1).astype(np.float32)

        # A label only exists while there is a following value to compare against.
        # NOTE(review): the window ends at closes[tick - 1] while the label compares
        # closes[tick + 1] with closes[tick] - confirm this one-step gap is intended.
        if tick < self.__end_tick:
            self.__target = 1 if self.__targets[tick] > self.__targets[tick - 1] else -1

        # Return the calculated observation
        return self.__observation

  and should_run_async(code)


In [4]:
# Load the series and split it chronologically by the 'date' column:
#   train      -> up to and including 2020-06-01
#   validation -> after 2020-06-01, up to and including 2022-01-01
#   test       -> after 2022-01-01
df = pd.read_csv('./nysi.csv.gz', compression='gzip')
train = df.loc[df['date'].le('2020-06-01')]
validation = df.loc[df['date'].gt('2020-06-01') & df['date'].le('2022-01-01')]
test = df.loc[df['date'].gt('2022-01-01')]

In [5]:
# Build the vectorized training environment: 4 copies of NextTrendEnv over the training
# values, created with seed 42
env = make_vec_env(
    NextTrendEnv,
    n_envs=4,
    seed=42,
    env_kwargs={
        'observation_size': OBSERVATION_SIZE,
        'closes': train['value'].values,
    },
)

  and should_run_async(code)


In [6]:
# Create a validation environment
eval_env = NextTrendEnv(observation_size=OBSERVATION_SIZE, closes=validation['value'].values)

# Periodically evaluate the policy on the validation series and keep the best model in "./".
# The environment replays a fixed series and the evaluation is deterministic, so every
# evaluation episode is identical: a single episode yields the same mean reward as the
# default of five, at a fifth of the cost.
# NOTE(review): eval_freq is presumably counted in steps of the vectorized training env
# (each one advances all parallel copies) - confirm the resulting cadence is the intended one.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./",
    eval_freq=2 * len(train),
    n_eval_episodes=1,
    deterministic=True,
    render=False,
)

In [7]:
# Train the model
# The seed matches the one given to the training environments so that a fresh
# "Restart & Run All" repeats the same training run (exact repeatability on GPU may still vary).
model = PPO("MlpPolicy", env, seed=42, verbose=0)
model.learn(total_timesteps=1_000_000, callback=eval_callback, progress_bar=True)

Output()

<stable_baselines3.ppo.ppo.PPO at 0x7a0046bc76d0>

In [8]:
# Remove and reload the best model (To be sure it works as expected)
# "best_model" is presumably the checkpoint the evaluation callback saved under its
# best_model_save_path ("./"); the in-memory model from training is discarded first.
del model
model = PPO.load("best_model")

In [9]:
# Create a test environment
# NOTE: this rebinds `env`, which until now referred to the vectorized training environments.
env = NextTrendEnv(observation_size=OBSERVATION_SIZE, closes=test['value'].values)

In [10]:
# Create the required variables for calculation
real = []
predicted = []
terminated = truncated = False

# Predict the test values with the trained model.
# An episode ends when the environment reports either termination or truncation.
obs, _ = env.reset()
while not (terminated or truncated):
    action, _ = model.predict(obs, deterministic=True)
    obs, _, terminated, truncated, info = env.step(action)

    # Save the values to calculate the errors (as plain ints so both lists hold one type)
    real.append(int(info['real_target']))
    predicted.append(int(info['agent_target']))

In [11]:
# Show results
real = np.array(real)
predicted = np.array(predicted)

print(' RESULT TEST '.center(56, '*'))
print('* Confusion Matrix (Top: Predicted - Left: Real)')
# Pin the label order so the matrix is always 2x2 (rows/columns: -1 then 1),
# even if one of the classes never appears in the test run
print(confusion_matrix(real, predicted, labels=[-1, 1]))
print('* Classification Report')
print(classification_report(real, predicted))

********************* RESULT TEST **********************
* Confusion Matrix (Top: Predicted - Left: Real)
[[135  46]
 [ 31 162]]
* Classification Report
              precision    recall  f1-score   support

          -1       0.81      0.75      0.78       181
           1       0.78      0.84      0.81       193

    accuracy                           0.79       374
   macro avg       0.80      0.79      0.79       374
weighted avg       0.80      0.79      0.79       374

