<a href="https://colab.research.google.com/github/geekpradd/Reinforcement-Learning-Stock-Trader/blob/master/Structure_Discretized_Stock_Trader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# !apt-get install -y xvfb python-opengl > /dev/null 2>&1
# !pip install gym pyvirtualdisplay > /dev/null 2>&1

from google.colab import drive
drive.mount('/content/drive')

import gym
import numpy as np
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
import random
import pickle 
import time

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
class ReplayMemory:
    def __init__(self, max_size):
        self.buffer = [None] * max_size
        self.max_size = max_size
        self.index = 0
        self.size = 0

    def append(self, obj):
        self.buffer[self.index] = obj
        self.size = min(self.size + 1, self.max_size)
        self.index = (self.index + 1) % self.max_size

    def sample(self, batch_size, timesteps):
        indices = random.sample(range(self.size), batch_size)
        final = []
        for i in indices:
            if i == self.index:
                i = random.sample(range(self.size), 1)
            final.append(np.array(buffer[i:min(self.max_size, i+timesteps)]))
        return final

In [0]:
class Model:
    def __init__(self, num_actions):
        self.model = keras.Sequential()
        self.model.add(keras.layers.Dense(256, activation = 'relu', input_shape = (4, )))
        self.model.add(keras.layers.Dropout(0.5))
        self.model.add(keras.layers.Dense(128, activation = 'relu'))
        self.model.add(keras.layers.LSTM(64, initial_states = [np.zeros(()), np.zeros(())]))
        self.model.add(keras.layers.Dense(num_actions))
        self.model.compile(optimizer = keras.optimizers.Adam(), loss = 'mse', metrics = ['accuracy'])

In [0]:
class Agent:
    def __init__(self, agent_params):
        self.discount = agent_params['discount']
        self.epsilon = agent_params['epsilon']
        self.num_actions = agent_params['num_actions']
        self.exp_size = agent_params['exp_size']
        self.timesteps = agent_params['timesteps']
        self.merge = agent_params['merge_threshold']
        self.batch_size = agent_params['batch_size']
        self.last_action = None
        self.last_state = None
        self.exp = ReplayMemory(self.exp_size)
        self.target_model = Model(self.num_actions)
        self.value_model = Model(self.num_actions)
        self.num_updates = 0
        self.num_steps = 0
        self.num_merge = 0
        self.target_model.model.set_weights(self.value_model.model.get_weights())

    def run(self):
        updates = min(self.batch_size, self.exp.size)
        self.num_updates += updates
        batch = self.exp.sample(updates)

        input = [x[0] for x in batch]
        inp = [x[3] for x in batch]
        x_train = np.concatenate(input, axis = 0)
        y_train = self.value_model.model.predict(x_train)
        x_target = np.concatenate(inp, axis = 0)
        y_target = self.target_model.model.predict(x_target)

        for count, memory in enumerate(batch):
            last_state, last_action, reward, state, terminal = memory
            y_train[count][last_action] = reward
            if not terminal:
                y_train[count][last_action] += self.discount*np.amax(y_target[count])

        self.value_model.model.fit(x_train, y_train, verbose = 0, epochs = 1)

        if self.num_updates > self.merge:
            self.merge_model()

    def merge_model(self):
        self.target_model.model.set_weights(self.value_model.model.get_weights())
        self.num_merge += 1
        self.num_updates = 0

    def epsilon_decay(self):
        if self.epsilon > 0.1:
            self.epsilon -= 0.9/1000000

    def choose_action(self, state):
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(self.value_model.model.predict(state)[0])

        self.num_steps += 1
        self.epsilon_decay()
        return action

    def agent_start(self, state):
        self.last_action = self.choose_action(state)
        self.last_state = state
        return self.last_action

    def agent_step(self, reward, state):
        memory = (self.last_state, self.last_action, reward, state, 0)
        self.exp.append(memory)
        self.run()
        self.last_action = self.choose_action(state)
        self.last_state = state
        return self.last_action

    def agent_end(self, reward):
        memory = (self.last_state, self.last_action, reward, np.zeros((1, 4)), 1)
        self.exp.append(memory)
        self.run()
        self.num_games += 1

    def plot(self, states):
        y = np.mean(np.max(self.value_model.model.predict(states), axis = 1))
        return y

In [21]:
agent_params = {
    'discount' :0.99,
    'epsilon' :1.0,
    'num_actions' : 200,
    'exp_size': 600000,
    'timesteps' : 10,
    'merge_threshold' : 320000,
    'batch_size' : 32,
}
agent = Agent(agent_params)
timesteps = []
q_value = []
avg_reward = []

TypeError: ignored

In [4]:
# env = 

SyntaxError: ignored

In [0]:
def train(agent, env):
    timesteps = 4000
    total = 0
    sum = 0
    for _ in range(1, timesteps+1):
        action = agent.agent_start(env.reset().reshape(1, 4))
        observations, reward, done, info = env.step([action])
        total += reward
        sum += reward

        while not done:
            observations, reward, done, info = env.step([agent.agent_step(reward, observations.reshape((1, 128)))])
            total += reward
            sum += reward

        agent.agent_end(reward)

        if agent.num_games % 50 == 0:
            episodes.append(agent.num_games)
            q_value.append(agent.plot(test_states))
            avg_reward.append(total/50)
            total = 0

        if agent.num_games % 100 == 0:
            print('Games = {}, Steps = {}, Reward = {}'.format(_, agent.num_steps, sum/100))
            sum = 0

        if agent.num_games % 400 == 0:
            with open('/content/drive/My Drive/DQN/episodes2.pkl', 'wb') as f:
                pickle.dump(episodes, f)
            with open('/content/drive/My Drive/DQN/q_value2.pkl', 'wb') as f:
                pickle.dump(q_value, f)
            with open('/content/drive/My Drive/DQN/avg_reward2.pkl', 'wb') as f:
                pickle.dump(avg_reward, f)
            with open('/content/drive/My Drive/DQN/test_states2.pkl', 'wb') as f:
                pickle.dump(test_states, f)
            with open('/content/drive/My Drive/DQN/agent2.pkl', 'wb') as f:
                pickle.dump(agent, f)
        elif agent.num_games % 200 == 0:
            with open('/content/drive/My Drive/DQN/episodes3.pkl', 'wb') as f:
                pickle.dump(episodes, f)
            with open('/content/drive/My Drive/DQN/q_value3.pkl', 'wb') as f:
                pickle.dump(q_value, f)
            with open('/content/drive/My Drive/DQN/avg_reward3.pkl', 'wb') as f:
                pickle.dump(avg_reward, f)
            with open('/content/drive/My Drive/DQN/test_states3.pkl', 'wb') as f:
                pickle.dump(test_states, f)
            with open('/content/drive/My Drive/DQN/agent3.pkl', 'wb') as f:
                pickle.dump(agent, f)
