In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import argparse
import re
import os
import pickle

from datetime import datetime
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

#tf.compat.v1.disable_eager_execution()

In [2]:
def get_data():
    """Load the price history CSV and return it as a 2-D NumPy array.

    The file is read from the relative path ``data/aapl_msi_sbux.csv``; each
    CSV row becomes one row of the returned array and the header row is
    consumed by pandas as column names.
    """
    price_frame = pd.read_csv("data/aapl_msi_sbux.csv")
    return price_frame.values

In [3]:
class ReplayBuffer:
    """Fixed-capacity circular store of (state, action, reward, next state, done) transitions.

    Once ``max_size`` transitions have been stored, each new transition
    overwrites the oldest slot.
    """

    def __init__(self, obs_dim, act_dim, size):
        """Pre-allocate storage for ``size`` transitions with ``obs_dim``-long states.

        ``act_dim`` is accepted but not used: actions are stored as one scalar
        per transition.
        """
        obs_shape = [size, obs_dim]
        self.obs1_buf = np.zeros(obs_shape, dtype=np.float32)  # states
        self.obs2_buf = np.zeros(obs_shape, dtype=np.float32)  # next states
        self.act_buf = np.zeros(size, dtype=np.uint8)          # action ids
        self.rew_buf = np.zeros(size, dtype=np.float32)        # rewards
        self.done_buf = np.zeros(size, dtype=np.uint8)         # done flags stored as 0/1
        self.ptr = 0          # slot the next transition is written to
        self.size = 0         # number of valid transitions currently held
        self.max_size = size  # capacity; beyond it old transitions are overwritten

    def store(self, obs, act, rew, next_obs, done):
        """Write one transition into the current slot and advance the write pointer."""
        slot = self.ptr
        self.obs1_buf[slot] = obs
        self.act_buf[slot] = act
        self.rew_buf[slot] = rew
        self.obs2_buf[slot] = next_obs
        self.done_buf[slot] = done
        # Wrap around to slot 0 after the last slot: this is what makes the buffer circular.
        self.ptr = (slot + 1) % self.max_size
        # The fill level grows until it reaches capacity and then stays there.
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        """Return ``batch_size`` transitions drawn uniformly, with replacement, from the filled slots.

        The result is a dict with keys ``s``, ``a``, ``r``, ``s2`` and ``d``.
        """
        picks = np.random.randint(0, self.size, size=batch_size)
        return {
            "s": self.obs1_buf[picks],
            "a": self.act_buf[picks],
            "r": self.rew_buf[picks],
            "s2": self.obs2_buf[picks],
            "d": self.done_buf[picks],
        }

In [4]:
def get_scaler(env):
    """
    Build a StandardScaler fitted on states visited by a random policy.

    The environment is reset and then stepped with uniformly random actions
    until it reports ``done`` (or ``env.n_steps`` steps have been taken); the
    scaler is fitted on every state returned by ``env.step``.

    Args:
        env: environment exposing ``reset()``, ``step(action)``, ``n_steps``
            and ``action_space``. Each state is expected to be a 2-D array of
            shape (1, state_dim), which is what the reshape below relies on.

    Returns:
        The fitted ``StandardScaler``.

    Note:
        The environment is left at the end of the sampled episode; callers
        must ``reset()`` it before reuse.
    """
    # Start from the first time step so the function also works on an
    # environment that has already been stepped; on a fresh one this is a no-op.
    env.reset()

    states = []
    for _ in range(env.n_steps):
        action = np.random.choice(env.action_space)
        state, reward, done, info = env.step(action)
        states.append(state)
        if done:
            break

    # Stack the (1, state_dim) observations into a (n_samples, state_dim) matrix.
    states = np.array(states).reshape(-1, state.shape[1])
    scaler = StandardScaler()
    scaler.fit(states)
    return scaler

In [5]:
def make_dir(directory):
    """Create ``directory`` (and any missing parent directories) if it does not exist.

    Calling this on an existing directory is a no-op. Unlike a separate
    exists-check followed by ``os.mkdir``, this cannot fail when the directory
    appears between the check and the creation, and it supports nested paths.

    Raises:
        FileExistsError: if ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

In [6]:
def mlp(input_dim, n_actions, n_hidden_layers=1, n_hidden_dim=64):
    """Build and compile a fully connected network with one output per action.

    The network is ``n_hidden_layers`` ReLU layers of ``n_hidden_dim`` units
    followed by a linear layer of ``n_actions`` units. It is compiled with MSE
    loss and the Adam optimizer, and its summary is printed before returning.

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = Input(shape=(input_dim,))

    # Thread the tensor through the hidden stack, one ReLU layer at a time.
    features = inputs
    for _layer_no in range(n_hidden_layers):
        hidden_layer = Dense(n_hidden_dim, activation="relu")
        features = hidden_layer(features)

    # Linear head: no activation on the output layer.
    outputs = Dense(n_actions)(features)

    network = Model(inputs, outputs)
    network.compile(loss="mse", optimizer="adam")
    network.summary()
    return network

In [7]:
class MultiStockEnv:
    """
    Multi-stock trading environment.

    State: an array of shape (1, 2 * n_stocks + 1) laid out as
        - the number of shares owned for each stock
        - the current price of each stock
        - the cash currently available
    Actions: a categorical variable in [0, 3**n_stocks - 1]. Each value indexes
    ``action_list`` and decodes to one instruction per stock:
        - 0: sell
        - 1: hold
        - 2: buy
    """
    def __init__(self, data, initial_investment=20_000):
        """
        Args:
            data: 2-D array of prices, one row per time step and one column per stock.
            initial_investment: cash available at the start of every episode.
        """
        self.stock_price_history = data
        self.n_steps = self.stock_price_history.shape[0]
        self.n_stocks = self.stock_price_history.shape[1]

        self.initial_investment = initial_investment
        self.cur_step = None
        self.owned_stocks = None
        self.stock_prices = None
        self.cash_in_hands = None

        # Size the action space from this instance's own data rather than from
        # a module-level ``n_stocks`` variable, so the class works regardless
        # of which notebook cells have been run.
        self.action_space = np.arange(3**self.n_stocks)

        # Every combination of per-stock instructions drawn from [0, 1, 2];
        # itertools yields tuples, which are converted to lists.
        self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stocks)))

        self.state_dim = self.n_stocks * 2 + 1 # Length of the state vector
        self.reset()

    def reset(self):
        """Rewind to the first time step with no shares and the initial cash; return that state."""
        self.cur_step = 0
        self.cash_in_hands = self.initial_investment
        self.owned_stocks = np.zeros(self.n_stocks)
        self.stock_prices = self.stock_price_history[self.cur_step]
        return self._get_obs() # Initial state

    def step(self, action):
        """
        Advance one time step, then execute ``action`` at the new prices.

        Returns:
            (state, reward, done, info) where ``reward`` is the change in
            portfolio value across the step, ``done`` is True once the last
            row of the price history has been reached, and ``info`` holds the
            current portfolio value under the key ``"cur_val"``.
        """
        assert action in self.action_space

        # Portfolio value before the prices move.
        prev_val = self._get_val()
        self.cur_step +=1
        self.stock_prices = self.stock_price_history[self.cur_step]

        self._trade(action)

        cur_val = self._get_val()
        reward = cur_val - prev_val
        done = self.cur_step == self.n_steps-1
        info = {"cur_val": cur_val}

        return self._get_obs(), reward, done, info

    def _get_obs(self):
        """Return the state as a (1, state_dim) array: owned shares, prices, cash."""
        price = self.stock_prices.reshape(1, -1)
        own = self.owned_stocks.reshape(1, -1)
        hands = np.array(self.cash_in_hands).reshape(1, -1)
        obs = np.c_[own, price, hands]
        return obs

    def _get_val(self):
        """Return the portfolio value: shares valued at current prices plus cash."""
        return self.owned_stocks.dot(self.stock_prices) + self.cash_in_hands

    def _trade(self, action):
        """
        Execute the per-stock instructions encoded by ``action``.

        Sells happen first, then buys are made round-robin. For example the
        decoded vector [2, 0, 1] means: buy stock 0, sell stock 1, hold stock 2.
        """
        action_vec = np.array(self.action_list[action]).reshape(-1,)
        sell_idx = np.where(action_vec==0)[0]
        buy_idx = np.where(action_vec==2)[0]

        # Sell the whole position of every stock marked "sell".
        for idx in sell_idx:
            stock_count = self.owned_stocks[idx]
            stock_value = self.stock_prices[idx] * stock_count
            self.owned_stocks[idx] = 0
            self.cash_in_hands += stock_value

        # Buy one share at a time, cycling over the stocks marked "buy", until
        # not even the cheapest of them is affordable. The loop only makes
        # progress when prices are strictly positive.
        if len(buy_idx) > 0:
            min_price = min(self.stock_prices[buy_idx])
            while self.cash_in_hands >= min_price:
                for idx in buy_idx:
                    price = self.stock_prices[idx]
                    if self.cash_in_hands >= price:
                        self.owned_stocks[idx] += 1
                        self.cash_in_hands -= price


In [8]:
class DQNAgent:
    """Epsilon-greedy Q-learning agent backed by an MLP and a replay buffer."""

    def __init__(self, state_size, action_size):
        """
        Args:
            state_size: length of the state vector fed to the network.
            action_size: number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayBuffer(state_size, action_size, size=500)
        self.gamma = 0.9 # discount factor for future rewards
        self.epsilon = 1 # exploration rate (starts fully random)
        self.epsilon_min = 0.01 # floor for the exploration rate
        self.epsilon_decay = 0.995 # multiplicative decay applied after each replay
        self.model = mlp(state_size, action_size) # the Q-network

    def update_replay_memory(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store(state, action, reward, next_state, done)

    def act(self, state):
        """Pick an action: random with probability ``epsilon``, otherwise the greedy one."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)

        act_values = self.model.predict(state, verbose=0) # Q-values of every action
        return np.argmax(act_values) # index of the highest-valued action

    def replay(self, batch_size=32):
        """
        Run one training step on a random minibatch, then decay ``epsilon``.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        # Not enough samples yet.
        if self.memory.size < batch_size:
            return

        minibatch = self.memory.sample_batch(batch_size)
        # The minibatch is a dict of arrays, each with ``batch_size`` entries.
        states = minibatch["s"]
        actions = minibatch["a"]
        rewards = minibatch["r"]
        next_states = minibatch["s2"]
        # The buffer stores done flags as uint8 0/1. They must be converted to
        # a boolean mask: indexing with an integer array would select rows 0
        # and 1 instead of the rows whose flag is set.
        done = np.asarray(minibatch["d"]).astype(bool)

        # Q-learning target: r + gamma * max_a' Q(s', a'), with the prediction
        # Q(s, a) and the target both coming from the same network.
        targets = rewards + self.gamma * np.max(self.model.predict(next_states, verbose=0), axis=1)
        # Terminal transitions have no future reward: the target is the reward alone.
        targets[done] = rewards[done]

        # Keras needs targets shaped like the network output (batch x actions),
        # but only the action actually taken should be updated. Start from the
        # current predictions (zero loss on every other action) and overwrite
        # the entry of the taken action with its target.
        target_full = self.model.predict(states, verbose=0)
        target_full[np.arange(batch_size), actions] = targets

        self.model.train_on_batch(states, target_full)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [9]:
def play_one_episode(agent, env, is_train, replay_every=40):
    """
    Run one full episode and return the final portfolio value.

    Relies on two module-level names that must exist before the call:
    ``scaler`` (used to transform every state) and ``batch_size`` (minibatch
    size passed to ``agent.replay``).

    Args:
        agent: object exposing ``act``, ``update_replay_memory`` and ``replay``.
        env: environment exposing ``reset``, ``step`` and ``cur_step``.
        is_train: when truthy, transitions are stored and the agent is trained;
            otherwise the agent only acts.
        replay_every: a training step is run whenever ``env.cur_step`` is a
            multiple of this value. Must be a positive integer.

    Returns:
        ``info["cur_val"]`` from the last step of the episode.
    """
    state = env.reset()
    state = scaler.transform(state) # state has shape (1, D)
    done = False

    while not done:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        next_state = scaler.transform(next_state)
        if is_train:
            agent.update_replay_memory(state, action, reward, next_state, done)
            # Training on every step is slow, so only replay periodically.
            if env.cur_step % replay_every == 0:
                agent.replay(batch_size)
        state = next_state

    return info["cur_val"]
    

In [10]:
# Run configuration.
# Output locations: trained weights / scaler, and per-episode portfolio values.
models_folder, rewards_folder = "models", "rewards"
# Training schedule: number of episodes and replay minibatch size.
num_episodes, batch_size = 2000, 32
# Cash the agent starts every episode with.
initial_investment = 20_000

# The string below is a bare expression whose value is discarded, so the
# argparse code inside it never runs; the trailing semicolon keeps the
# notebook from echoing it.
"""
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--mode", type=bool, required=True, help="either True or False")
args = parser.parse_args()
""";

In [11]:
# Make sure both output folders exist before anything is written to them.
make_dir(models_folder)
make_dir(rewards_folder)

# Load the full price history: one row per time step, one column per stock.
data = get_data()
n_timesteps, n_stocks = data.shape

# Chronological split: first half for training, the remainder for testing.
n_train = n_timesteps // 2
train_data, test_data = data[:n_train], data[n_train:]

In [12]:
# Build the training environment and size the agent from it.
env = MultiStockEnv(train_data, initial_investment)
state_size = env.state_dim
action_size = len(env.action_space)
agent = DQNAgent(state_size, action_size)
# Fit the state scaler on this environment; play_one_episode reads the
# module-level name `scaler`, so it must be assigned before the loop below.
scaler = get_scaler(env)

# Final portfolio value of each training episode.
portfolio_value = []

print("Starting")
for e in range(num_episodes):
    t0 = datetime.now()
    val = play_one_episode(agent, env, is_train=True)
    dt = datetime.now() - t0
    print(f"Episode {e+1}/{num_episodes}: end episode value -> {val} | duration -> {dt}")
    portfolio_value.append(val)

# NOTE: everything below only runs once the loop above has finished; if the
# loop is interrupted, no weights, scaler or episode values are written.
agent.save(f"{models_folder}/dqn.h5")

# Persist the fitted scaler next to the weights.
with open(f"{models_folder}/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Persist the per-episode portfolio values.
np.save(f"{rewards_folder}/Train.npy", portfolio_value)

Starting
Episode 1/2000: end episode value -> 28009.88619999897 | duration -> 0:00:02.964501
Episode 2/2000: end episode value -> 31257.530000001057 | duration -> 0:00:05.398296
Episode 3/2000: end episode value -> 35086.884299999816 | duration -> 0:00:07.785205
Episode 4/2000: end episode value -> 27889.732899999115 | duration -> 0:00:08.391955
Episode 5/2000: end episode value -> 31866.45340000168 | duration -> 0:00:11.020333
Episode 6/2000: end episode value -> 27510.10500000112 | duration -> 0:00:11.226567
Episode 7/2000: end episode value -> 20650.677799998546 | duration -> 0:00:14.738201
Episode 8/2000: end episode value -> 28623.10029999592 | duration -> 0:00:15.521881
Episode 9/2000: end episode value -> 32591.927800000034 | duration -> 0:00:18.370156
Episode 10/2000: end episode value -> 23908.13499999946 | duration -> 0:00:16.697589
Episode 11/2000: end episode value -> 19406.158299999825 | duration -> 0:00:17.453248
Episode 12/2000: end episode value -> 24422.123999999098 | 

KeyboardInterrupt: 

In [None]:

# Evaluate the trained agent on the held-out half of the price history.
# Final portfolio value of each evaluation episode.
portfolio_value = []
env = MultiStockEnv(test_data, initial_investment)
# Evaluation should follow the learned policy, not whatever exploration rate
# training happened to leave behind. A bare `agent.epsilon` expression has no
# effect, so the rate is set explicitly to its floor.
agent.epsilon = agent.epsilon_min
for e in range(num_episodes):
    t0 = datetime.now()
    val = play_one_episode(agent, env, is_train=False)
    dt = datetime.now() - t0
    print(f"Episode {e+1}/{num_episodes}: end episode value -> {val} | duration -> {dt}")
    portfolio_value.append(val)

# Persist the evaluation results alongside the training ones.
np.save(f"{rewards_folder}/Test.npy", portfolio_value)