## TensorFlow 2 Tutorial: Reinforcement Learning to Predict Stock Prices

## Step 1: Loading the libraries

In [None]:
!pip install pandas-datareader

In [None]:
import tensorflow as tf

import math
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from pandas_datareader import data as pdr
import yfinance as yf


from tqdm import tqdm_notebook, tqdm
from collections import deque

In [None]:
# Show which TensorFlow version the notebook is running against
print(tf.__version__)

## Step 2: Build the neural network

First, we create an agent to act in our environment. It uses a feed-forward network (FFN) to choose actions and implements the RL reward and policy.

In [None]:
class AI_Trader():
  """Q-learning trading agent whose Q-function is a small feed-forward network.

  The agent keeps a bounded replay memory of
  (state, action, reward, next_state, done) tuples, chooses actions with an
  epsilon-greedy policy, and fits the network one transition at a time.
  """

  def __init__(self, state_size, action_space=3, model_name="AITrader"):  # actions: hold, buy, sell
    """Set up the agent's hyperparameters, memory and network.

    Args:
      state_size: Number of features in one state vector (network input size).
      action_space: Number of discrete actions (network output size).
      model_name: Label stored on the agent; not otherwise used by this class.
    """
    # Set the state, action space and parameters
    self.state_size = state_size
    self.action_space = action_space
    self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
    self.inventory = []               # buy prices of positions still open
    self.model_name = model_name

    self.gamma = 0.95           # discount applied to the next state's best Q-value
    self.epsilon = 1.0          # exploration rate; starts fully random
    self.epsilon_final = 0.01   # decay stops once epsilon is at or below this
    self.epsilon_decay = 0.995  # multiplicative decay applied per batch_train call

    self.model = self.model_builder()

  def model_builder(self):
    """Build and compile the Q-network.

    Returns:
      A compiled Keras Sequential model: Dense 32 -> 64 -> 128 (ReLU) followed
      by a linear layer with one output per action, trained with MSE and Adam.
    """
    # Create a FFN with three Dense layers and the output layer
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(units=32, activation='relu', input_dim=self.state_size))
    model.add(tf.keras.layers.Dense(units=64, activation='relu'))
    model.add(tf.keras.layers.Dense(units=128, activation='relu'))
    model.add(tf.keras.layers.Dense(units=self.action_space, activation='linear'))
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

    return model

  def trade(self, state):
    """Pick an action for `state` with an epsilon-greedy policy.

    With probability `epsilon` a uniformly random action index is returned;
    otherwise the index of the largest predicted Q-value.
    """
    if random.random() <= self.epsilon:
      return random.randrange(self.action_space)

    # verbose=0 keeps Keras from printing a progress bar for every single step
    actions = self.model.predict(state, verbose=0)
    return np.argmax(actions[0])

  def batch_train(self, batch_size):
    """Fit the network on the `batch_size` most recent transitions.

    Each transition is fitted individually, so later targets in the batch are
    computed with weights already updated by earlier ones. After the batch,
    epsilon is decayed once if it is still above `epsilon_final`.

    Args:
      batch_size: Number of most recent memories to replay. If the memory
        holds fewer entries, all of them are used.
    """
    # Take exactly batch_size entries (the previous index range started one
    # position too late and replayed only batch_size - 1 of them).
    start = max(len(self.memory) - batch_size, 0)
    batch = list(self.memory)[start:]

    for state, action, reward, next_state, done in batch:
      # Q-learning target: immediate reward plus the discounted best Q-value of
      # the next state, unless the episode ended on this transition.
      target_q = reward
      if not done:
        next_q = self.model.predict(next_state, verbose=0)[0]
        target_q = reward + self.gamma * np.amax(next_q)

      # Only the taken action's Q-value is moved; the other outputs keep the
      # network's own predictions so they contribute zero loss.
      target = self.model.predict(state, verbose=0)
      target[0][action] = target_q

      self.model.fit(state, target, epochs=1, verbose=0)

    if self.epsilon > self.epsilon_final:
      self.epsilon *= self.epsilon_decay

## Step 3: Data preprocessing

Create some helper functions

In [None]:
# Calculate sigmoid function
def sigmoid(x):
  """Return the logistic sigmoid 1 / (1 + e^-x) of a scalar `x`.

  Uses the algebraically equivalent form e^x / (1 + e^x) for negative inputs
  so that large negative values (e.g. a big price drop) underflow to 0.0
  instead of raising OverflowError in math.exp.
  """
  if x >= 0:
    return 1 / (1 + math.exp(-x))
  exp_x = math.exp(x)
  return exp_x / (1 + exp_x)

# Set the stock price format
def stocks_price_format(n):
  """Format a monetary amount as a string with two decimal places.

  Negative amounts are rendered as "- $ <abs value>", everything else as
  "$ <value>".
  """
  # ".2f" fixes the precision at two decimals; the earlier "2f" spec only set
  # a minimum width and therefore printed six decimals.
  prefix = "- $ " if n < 0 else "$ "
  return "{0}{1:.2f}".format(prefix, abs(n))

### Load the dataset

In [None]:
def dataset_loader(stock_names, start_date='', end_date=''):
  """Download prices through pandas-datareader/yfinance and return the 'Close' column.

  A date range is only applied when BOTH `start_date` and `end_date` are
  non-empty; otherwise the request is made without any date arguments.
  """
  # Route pandas-datareader's Yahoo reader through yfinance
  yf.pdr_override()

  # Build the optional date-range arguments (empty means "no range given")
  date_range = {}
  if start_date and end_date:
    date_range = {'start': start_date, 'end': end_date}

  prices = pdr.get_data_yahoo(stock_names, **date_range)

  return prices['Close']

In [None]:
# Ticker to download; uncomment the AAPL line (and drop the BTC one) to switch
#stock_name = "AAPL"
stock_name = "BTC-USD"
# No dates are passed, so dataset_loader takes its "no date range" branch
data = dataset_loader(stock_name)

print(data)

## Step 4: Train the agent

In [None]:
# Return the state
def state_creator(data, timestep, window_size):
  """Build the state for `timestep` from the last `window_size` prices.

  The window ends at `timestep` (inclusive). When it would start before the
  first price, it is left-padded with copies of `data[0]`. The state is the
  sigmoid of each consecutive price difference inside the window.

  Returns:
    A NumPy array of shape (1, window_size - 1).
  """
  first = timestep - window_size + 1

  if first < 0:
    # Not enough history yet: repeat the first price to fill the gap
    window = [data[0]] * (-first) + list(data[0:timestep+1])
  else:
    window = list(data[first:timestep+1])

  deltas = [sigmoid(window[i+1] - window[i]) for i in range(window_size - 1)]

  return np.array([deltas])

Set the hyperparameters

In [None]:
# Length of one state vector; states are built from window_size + 1 prices
window_size = 10
# Number of passes over the whole price series
episodes = 1000

# Size handed to trader.batch_train, which replays recent memories
batch_size = 32
# Steps per episode; one less than the series length because every step
# also needs the following price to build next_state
data_samples = len(data) - 1

Create the Agent or model

In [None]:
# The network input size equals window_size; action_space keeps its default of 3
trader = AI_Trader(window_size)

In [None]:
# Print the layer-by-layer overview of the agent's Keras model
trader.model.summary()

Create the training loop

In [None]:
# Training loop: every episode walks through the whole price series once.
for episode in range(1, episodes + 1):

  print("Episodio: {}/{}".format(episode, episodes))

  # Start each episode at the first price, with no profit and no open positions
  state = state_creator(data, 0, window_size + 1)
  total_profit = 0
  trader.inventory = []

  for t in tqdm(range(data_samples)):

    action = trader.trade(state)
    next_state = state_creator(data, t+1, window_size + 1)

    # Reward stays 0 unless a sale is made at a profit
    reward = 0

    if action == 1:  # buy
      trader.inventory.append(data[t])
      print("AI Trader compró: ", stocks_price_format(data[t]))

    elif action == 2 and trader.inventory:  # sell (only with an open position)
      # Positions are closed in the order they were opened (FIFO)
      buy_price = trader.inventory.pop(0)
      profit = data[t] - buy_price

      # Losses are tracked in total_profit but never given as negative reward
      reward = max(profit, 0)
      total_profit += profit
      print("AI Trader vendió: ", stocks_price_format(data[t]), " Beneficio: " + stocks_price_format(profit) )

    # The episode ends on the last available step
    done = (t == data_samples - 1)

    trader.memory.append((state, action, reward, next_state, done))
    state = next_state

    if done:
      print("########################")
      print("BENEFICIO TOTAL: {}".format(total_profit))
      print("########################")

    # Train as soon as the memory holds more than one batch worth of steps
    if len(trader.memory) > batch_size:
      trader.batch_train(batch_size)

  # Checkpoint the model every 10 episodes
  if episode % 10 == 0:
    trader.model.save("ai_trader_{}.h5".format(episode))
