## ZBTCNN RL
**Reinforcement Learning Trading Bot**

Decides between buy/sell/hold given past and future price input

In [1]:
import os
import sys
import random
import time
from collections import deque
import pandas as pd
import numpy as np
from sklearn import preprocessing
from enum import Enum
import stable_baselines3
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.a2c.a2c import A2C
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import json
import joblib
import matplotlib.pyplot as plt
import os

# Move to the top-level git directory: when the current directory has no
# ".git" entry, step two levels up.
entries_here = os.listdir(".")
if ".git" not in entries_here:
    os.chdir("../../")

In [2]:
# Data parameters
SEQ_LEN = 48 # hours; number of rows in each rolling observation window
FUTURE_PERIOD_PREDICT = 1 # hours; how many rows ahead the "target" close is taken from

# Hyperparameter Optimizer parameters (currently disabled)
# MAX_TRIALS = 40 # 10t x 20e ~ 4h

# Model parameters (EPOCHS / BATCH_SIZE are currently disabled)
# EPOCHS = 20
# BATCH_SIZE = 16
# Run name with a Unix-time suffix; reused in the scaler, log and model paths.
NAME = f"RL-SB3-Myenv-{int(time.time())}"

In [14]:
## Import data
# DATA MUST BE FORMATTED USING CSV_FORMATTER.IPYNB

csv_file = "data/formatted/BTCUSDT-1h-data.csv"

# The first line of the file is skipped and the columns are named explicitly.
data = pd.read_csv(
    csv_file,
    skiprows=[0],
    names=["timestamp", "open", "high", "low", "close", "volume", "rsi", "ema"],
)

# Rows are indexed by their timestamp from here on.
data = data.set_index("timestamp")

data.head()

Unnamed: 0_level_0,open,high,low,close,volume,rsi,ema
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1503064800,4304.15,4371.52,4296.04,4356.31,51.563675,52.623958,4327.15156
1503068400,4356.31,4357.37,4302.72,4340.31,24.093449,51.678528,4327.804777
1503072000,4320.52,4340.31,4287.79,4331.71,15.118957,51.167386,4327.995329
1503075600,4302.97,4318.16,4221.05,4293.09,46.533767,48.919621,4326.319858
1503079200,4293.09,4293.09,4193.7,4259.4,74.368943,47.054235,4323.157459


In [15]:
## Add min max bounds to data

# Assumed extreme values per feature family. The two rows built from them are
# added to the data before the scalers are fitted, so the scaling range is
# anchored to these limits.
price_max = 80000.0
volume_max = 60000.0
rsi_max = 100.0

price_min = 2000.0
volume_min = 0.0
rsi_min = 0.0

# One row per bound, indexed "1" (upper) and "0" (lower). The frame is built
# in a single constructor call because DataFrame.append is deprecated and no
# longer exists in pandas 2.x.
max_df = pd.DataFrame(
    [
        {
            "timestamp": "1",
            "open": price_max,
            "high": price_max,
            "low": price_max,
            "close": price_max,
            "volume": volume_max,
            "rsi": rsi_max,
            "ema": price_max,
            "target": price_max,
        },
        {
            "timestamp": "0",
            "open": price_min,
            "high": price_min,
            "low": price_min,
            "close": price_min,
            "volume": volume_min,
            "rsi": rsi_min,
            "ema": price_min,
            "target": price_min,
        },
    ]
).set_index("timestamp")

max_df.head()

  max_df = max_df.append({"timestamp": "1",
  max_df = max_df.append({"timestamp": "0",


Unnamed: 0_level_0,open,high,low,close,volume,rsi,ema,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,80000.0,80000.0,80000.0,80000.0,60000.0,100.0,80000.0,80000.0
0,2000.0,2000.0,2000.0,2000.0,0.0,0.0,2000.0,2000.0


In [16]:
## Formatting data and Scaler Initialization

# The target of each row is the close price FUTURE_PERIOD_PREDICT rows later.
data["target"] = data["close"].shift(-FUTURE_PERIOD_PREDICT)

# Drop rows holding infinities or NaNs; this also removes the trailing rows
# whose shifted target does not exist.
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

# Fit scalers
price_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
volume_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
rsi_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

# The scalers see the data plus the two boundary rows of max_df. A separate
# frame is used so `data` itself never contains the artificial rows
# (pd.concat replaces the removed DataFrame.append).
fit_frame = pd.concat([data, max_df])

price_scaler.fit(fit_frame["close"].to_numpy(dtype=np.float64).reshape(-1, 1))
volume_scaler.fit(fit_frame["volume"].to_numpy(dtype=np.float64).reshape(-1, 1))
rsi_scaler.fit(fit_frame["rsi"].to_numpy(dtype=np.float64).reshape(-1, 1))

# Dump scalers
# makedirs also creates a missing "scalers" parent and tolerates an existing
# directory; any other failure is raised instead of being silently swallowed.
scaler_dir = f"scalers/{NAME}"
os.makedirs(scaler_dir, exist_ok=True)

joblib.dump(price_scaler, f"{scaler_dir}/price_scaler")
joblib.dump(volume_scaler, f"{scaler_dir}/volume_scaler")
joblib.dump(rsi_scaler, f"{scaler_dir}/rsi_scaler")

# Split dataset # Dont split here; do split after shuffle

data.head()

  data = data.append(max_df)


Unnamed: 0_level_0,open,high,low,close,volume,rsi,ema,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1503064800,4304.15,4371.52,4296.04,4356.31,51.563675,52.623958,4327.15156,4340.31
1503068400,4356.31,4357.37,4302.72,4340.31,24.093449,51.678528,4327.804777,4331.71
1503072000,4320.52,4340.31,4287.79,4331.71,15.118957,51.167386,4327.995329,4293.09
1503075600,4302.97,4318.16,4221.05,4293.09,46.533767,48.919621,4326.319858,4259.4
1503079200,4293.09,4293.09,4193.7,4259.4,74.368943,47.054235,4323.157459,4236.89


In [17]:
## Preprocess Data

def preprocess_df_p1(df_p, scalers=None, seq_len=None):
    """Scale every column and cut the frame into rolling observation windows.

    Args:
        df_p: DataFrame whose columns are taken from open/high/low/close/ema/
            target (price scaler), volume (volume scaler) and rsi (rsi
            scaler). The last column is treated as the target and is left out
            of the windows.
        scalers: Optional mapping with "price", "volume" and "rsi" entries,
            each offering ``transform``. Defaults to the module-level
            price_scaler / volume_scaler / rsi_scaler.
        seq_len: Optional number of rows per window. Defaults to SEQ_LEN.

    Returns:
        A list of ``[window, target]`` pairs in row order. ``window`` is an
        array holding the last ``seq_len`` rows (all columns but the last)
        and ``target`` is the last column's value in the newest of those rows.

    Raises:
        ValueError: If a column has no associated scaler.
    """
    if scalers is None:
        scalers = {"price": price_scaler, "volume": volume_scaler, "rsi": rsi_scaler}
    if seq_len is None:
        seq_len = SEQ_LEN

    family_of = {
        "open": "price",
        "high": "price",
        "low": "price",
        "close": "price",
        "ema": "price",
        "target": "price",
        "volume": "volume",
        "rsi": "rsi",
    }
    if any(col not in family_of for col in df_p.columns):
        raise ValueError("Column not recognized and scaler cannot be determined")

    if len(df_p.columns) == 0:
        return []

    # Clean once up front instead of re-cleaning the whole frame per column.
    # Working on a new frame also keeps the caller's frame untouched.
    cleaned = df_p.replace([np.inf, -np.inf], np.nan).dropna()

    scaled = pd.DataFrame(
        {
            col: np.asarray(
                scalers[family_of[col]].transform(
                    cleaned[col].to_numpy().reshape(-1, 1)
                )
            ).ravel()
            for col in cleaned.columns
        },
        index=cleaned.index,
    )

    # Discard rows that became non-finite through scaling.
    scaled = scaled.replace([np.inf, -np.inf], np.nan).dropna()

    sequential_data = []
    window = deque(maxlen=seq_len)

    for row in scaled.to_numpy():
        window.append(list(row[:-1]))
        # Only emit once the window holds a full seq_len rows.
        if len(window) == seq_len:
            sequential_data.append([np.array(window), row[-1]])

    return sequential_data

def preprocess_df_p2(seq_data):
    """Balance rising and falling samples, shuffle them, and split into X / Y.

    Each item of ``seq_data`` is a ``(window, target)`` pair. A sample counts
    as a "buy" when the target is above the close value of the window's last
    row (column index 3) and as a "sell" when it is below; samples where the
    two are equal are dropped. Both groups are truncated to the size of the
    smaller one, merged, and shuffled with ``random.shuffle``.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays holding the windows and the targets.
    """
    rising, falling = [], []

    for window, target in seq_data:
        last_close = window[-1][3]  # close column of the newest row
        if target > last_close:
            rising.append([window, target])
        elif target < last_close:
            falling.append([window, target])

    # Keep the same number of samples from each direction.
    keep = min(len(rising), len(falling))
    local_seq_data = rising[:keep] + falling[:keep]

    random.shuffle(local_seq_data)

    windows = np.array([sample[0] for sample in local_seq_data])
    targets = np.array([sample[1] for sample in local_seq_data])
    return windows, targets
    

# train_x, train_y = preprocess_df(train_data)
# validation_x, validation_y = preprocess_df(validation_data)

# Preprocess and split data here
# Windows are built once over the whole frame and then cut in order: the
# first 95% go to training, the rest to validation.
seq_data_full = preprocess_df_p1(data)

last_5_pct = int(.95 * len(seq_data_full))

seq_data_train, seq_data_val = seq_data_full[:last_5_pct], seq_data_full[last_5_pct:]

# Each part is balanced and shuffled on its own.
train_x, train_y = preprocess_df_p2(seq_data_train)
validation_x, validation_y = preprocess_df_p2(seq_data_val)

In [18]:
## Dataset metrics

def _direction_counts(windows, targets):
    """Pair each window's last close (column 3) with its target and count directions.

    Returns ``(pairs, buys, sells)`` where a buy has close < target and a
    sell has close > target.
    """
    pairs = [(window[-1][3], target) for window, target in zip(windows, targets)]
    buys = sum(1 for close, target in pairs if close < target)
    sells = sum(1 for close, target in pairs if close > target)
    return pairs, buys, sells


close_target_list, train_buy_counter, train_sell_counter = _direction_counts(train_x, train_y)

# close_target_list ends up holding the validation pairs.
close_target_list, val_buy_counter, val_sell_counter = _direction_counts(validation_x, validation_y)

print(f"Train : Validation == {len(train_x)} : {len(validation_x)}")
print(f"Train\t\tBuys : Sells == {train_buy_counter} : {train_sell_counter}")
print(f"Validation\tBuys : Sells == {val_buy_counter} : {val_sell_counter}")

Train : Validation == 35626 : 1882
Train		Buys : Sells == 17813 : 17813
Validation	Buys : Sells == 941 : 941


In [19]:
# Position in this list is the discrete action id used by the environment.
action_str_list = ["BUY", "SELL", "HOLD"]


class TradingEnv(gym.Env):
    """Gym environment replaying pre-windowed price data for a long/flat trader.

    ``data`` is a pair ``(windows, targets)`` such as ``(train_x, train_y)``:
    ``windows[i]`` is the scaled observation for step ``i`` and ``targets[i]``
    the scaled next close price. Both are converted back to raw prices with
    the module-level ``price_scaler`` to update the balance.

    Actions: BUY goes (or stays) long, SELL goes (or stays) flat, HOLD keeps
    the current position. While long, the balance is multiplied by
    ``next_price / curr_price`` each step.

    The reward is the step-to-step change of ``sum(profits) / (std(profits) +
    1e-4)`` computed over all profits of the episode so far.

    The class follows the older gym API: ``reset()`` returns only the
    observation and ``step()`` returns ``(state, reward, done, info)``.
    """

    metadata = {'render.modes': ['human']}

    INITIAL_BAL = 10000
    N_FEATURES = 7  # values per window row (the target column is not included)

    def __init__(self, data):
        # Env Dataset
        # data fmt = (train_x, train_y)
        self.data = data
        self.data_index = 0

        # Observations are scaled windows, so every entry is bounded by [-1, 1].
        obs_shape = (SEQ_LEN, self.N_FEATURES)
        self.action_space = spaces.Discrete(len(action_str_list))
        self.observation_space = spaces.Box(
            low=-np.ones(obs_shape, dtype=np.float64),
            high=np.ones(obs_shape, dtype=np.float64),
            dtype=np.float64,
        )

        self._reset_trading_state()

    def _reset_trading_state(self):
        """Put score, balance, position and reward bookkeeping back to their start values."""
        self.score = 0.0 #optional
        self.balance = self.INITIAL_BAL
        self.position = 0 #long=1;flat=0
        self.last_sharpe = 0
        self.profits = []
        self.total_profit = 0

    def seed(self, seed=None):
        """Accept a seed for API compatibility; the environment uses no randomness."""
        pass

    def _to_price(self, scaled_value):
        """Convert one scaled price back to a raw price as a Python float."""
        raw = price_scaler.inverse_transform(np.array([scaled_value]).reshape(-1, 1))
        return raw.tolist()[0][0]

    def _calc_reward(self, profits):
        """Return the change of the sum/std profit ratio since the last accepted value.

        A non-finite change (inf or NaN) yields 0 and leaves ``last_sharpe``
        untouched, so one bad step cannot poison all later rewards.
        """
        pnl = np.array(profits)
        sharpe = np.float64(np.sum(pnl) / (np.std(pnl) + .0001))
        ret = sharpe - self.last_sharpe
        # `ret != np.nan` is always True, so NaN has to be tested explicitly.
        if not np.isfinite(ret):
            return 0
        self.last_sharpe = sharpe
        return ret

    def step(self, action):
        """Apply ``action`` at the current index and advance by one row.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the window of
            the next row and ``info`` records index, action label, profit,
            reward and cumulative score of the step just taken.
        """
        # Make sure action is within bounds
        assert self.action_space.contains(action)

        last_index = len(self.data[0]) - 1
        # The episode ends one row before the last, so the returned state
        # always exists; `>=` also terminates datasets shorter than two rows.
        done = self.data_index >= last_index - 1

        # Calculate Reward and Balance
        curr_price = self._to_price(self.data[0][self.data_index][-1][3])
        next_price = self._to_price(self.data[1][self.data_index])

        old_balance = self.balance
        if action == action_str_list.index("BUY"):
            self.position = 1 #long
        elif action == action_str_list.index("SELL"):
            self.position = 0 #flat
        # HOLD: continue with previous position

        if self.position == 1:
            self.balance *= next_price / curr_price

        # Profit is the difference in balances; the reward is derived from
        # the whole profit history.
        profit = self.balance - old_balance
        self.profits.append(profit)
        reward = self._calc_reward(self.profits)
        self.score += profit

        # Record step results in info
        info = {
            "index": self.data_index,
            "action": action_str_list[action],
            "profit": profit,
            "reward": reward,
            "score": self.score,
        }

        # Increment data index and set state to next step data
        self.data_index += 1
        state = self.data[0][min(self.data_index, last_index)]

        return state, reward, done, info

    def render(self, mode='human'):
        pass

    def reset(self):
        """Rewind to the first row, clear all bookkeeping, and return the first window."""
        self.data_index = 0
        self._reset_trading_state()
        return self.data[0][self.data_index]

In [20]:
# Set bounds

# (start, stop) row ranges used to slice train_x / train_y when the training
# and test environments are built.
# NOTE(review): the recorded run printed 35626 training rows, so a stop of
# 38000 would be clipped by slicing — confirm the intended test range.
train_bounds = (10000, 25000)
test_bounds = (25000, 38000)

In [21]:
# Auto env generation

def _env_for(bounds):
    """Build a TradingEnv over rows bounds[0]:bounds[1] of train_x / train_y."""
    start, stop = bounds
    return TradingEnv((train_x[start:stop], train_y[start:stop]))


def train_env_factory():
    """Create the environment covering train_bounds."""
    return _env_for(train_bounds)


def test_env_factory():
    """Create the environment covering test_bounds (also sliced from train_x / train_y)."""
    return _env_for(test_bounds)


# DummyVecEnv takes a list of zero-argument constructors.
train_env = DummyVecEnv([train_env_factory])
test_env = DummyVecEnv([test_env_factory])

In [22]:
# Make model

# A2C agent with the "MlpPolicy" policy on the training environment;
# TensorBoard logs are written below ./logs/ under this run's name.
model = A2C("MlpPolicy", train_env, verbose=1, tensorboard_log=f"./logs/{NAME}")

Using cuda device


In [23]:
# Train model

# Trains for 300000 timesteps and rebinds `model` to the value learn() returns.
# NOTE(review): eval_env is passed without an eval_freq — confirm against the
# installed stable-baselines3 version that evaluation on test_env actually
# runs (later releases deprecated and then removed this argument).
model = model.learn(total_timesteps=300000, eval_env=test_env)

Logging to ./logs/RL-SB3-Myenv-1647152968/A2C_1
------------------------------------
| time/                 |          |
|    fps                | 355      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.06    |
|    explained_variance | -0.184   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0245  |
|    value_loss         | 0.133    |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 295      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.04    |
|    explained_variance | 0.00588  |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 0

In [27]:
# Persist the trained model under models/, named after this run.
model.save(f"models/{NAME}")

In [28]:
# Run env with model

eval_bounds = (0,38628)

# Cumulative profit ("score") after every step. The plotting cell adds the
# 10000 starting balance to these values, so the running score is recorded
# here rather than the per-step profit.
profit_list = []

env = TradingEnv((train_x[eval_bounds[0]:eval_bounds[1]], train_y[eval_bounds[0]:eval_bounds[1]]))

state = env.reset()
done = False
while not done:
    action, *_ = model.predict(state)
    state, reward, done, info = env.step(action)
    profit_list.append(info["score"])
    print(info)

# Final step once more, labelled, as the episode summary.
print("info", info)
        
# plt.figure(figsize=(15,6))
# plt.cla()
# env.render_all()
# plt.show()

{'index': 0, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 1, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 2, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 3, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 4, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 5, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 6, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 7, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 8, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 9, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 10, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 11, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 12, 'action': 'SELL', 'profit': 0, 'reward': 0.0, 'score': 0.0}
{'index': 13, 'action': 'SELL', 'profit': 0, 're

In [26]:
# Balance curve (recorded values on top of the 10000 starting balance)
# together with the close price over the evaluated range.
# NOTE(review): the samples were shuffled during preprocessing, so step i of
# the rollout is presumably not the i-th row of data["close"] — confirm
# before reading the two curves as time-aligned.
steps = np.arange(len(profit_list))
balance_curve = np.asarray(profit_list, dtype=np.float64) + 10000

# Trim the price series to the number of recorded steps; plt.plot raises when
# x and y differ in length.
close_curve = data["close"].to_numpy()[eval_bounds[0]:eval_bounds[1]][:len(profit_list)]

plt.plot(steps, balance_curve)
plt.plot(steps[:len(close_curve)], close_curve)

plt.show()

NameError: name 'profit_list' is not defined