<a href="https://colab.research.google.com/github/cxz260/XzUtil/blob/main/RL_stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy pandas yfinance gym

In [9]:
import gym
import numpy as np
import pandas as pd

class StockEnv(gym.Env):
    """Single-asset trading environment driven by a fixed price series.

    The agent holds a cash balance and a number of shares.  At every step it
    picks one of three discrete actions (``BUY``, ``HOLD``, ``SELL``), the
    trade is executed at the price of the new current step, and the reward is
    the change in net worth (cash + shares marked to market) since the
    previous step.

    Args:
        stock_data: Price series.  Anything ``np.asarray`` can turn into a
            one-dimensional float array (list, ndarray, pandas Series, or a
            single-column array/DataFrame).
        window_size: Number of past prices returned as the observation.
        initial_balance: Cash available at the start of every episode.

    Raises:
        ValueError: If the data is not one-dimensional after squeezing, if
            ``window_size`` is smaller than 1, or if the series is too short
            to run even a single step.
    """

    # Discrete action codes understood by ``step``.
    BUY = 0
    HOLD = 1
    SELL = 2

    def __init__(self, stock_data, window_size=5, initial_balance=10000):
        super().__init__()

        # Normalise the input once so that all later indexing is positional
        # and every observation has shape (window_size,).
        prices = np.squeeze(np.asarray(stock_data, dtype=np.float64))
        if prices.ndim != 1:
            raise ValueError(
                "stock_data must be a one-dimensional price series, "
                f"got an array of shape {prices.shape}"
            )
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        # reset() starts at index window_size and the first step() reads
        # index window_size + 1, hence the minimum length.
        if len(prices) < window_size + 2:
            raise ValueError(
                f"stock_data needs at least window_size + 2 = {window_size + 2} "
                f"prices, got {len(prices)}"
            )

        self.stock_data = prices
        self.window_size = window_size
        self.initial_balance = initial_balance

        self.action_space = gym.spaces.Discrete(3)  # Buy, Hold, Sell
        # dtype matches the float64 observations actually returned.
        self.observation_space = gym.spaces.Box(
            low=0, high=np.inf, shape=(window_size,), dtype=np.float64
        )

        self.reset()

    def step(self, action):
        """Advance one time step and execute ``action`` at the new price.

        Args:
            action: ``BUY`` (0) spends as much cash as possible on whole
                shares, ``SELL`` (2) liquidates all shares; any other value
                leaves the portfolio unchanged.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``reward`` is
            the change in net worth and ``info`` is an empty dict.

        Raises:
            IndexError: If called after the end of the price series was
                reached; call ``reset()`` first.
        """
        if self.current_step + 1 >= len(self.stock_data):
            raise IndexError(
                "Episode has finished; call reset() before calling step() again."
            )

        self.current_step += 1
        done = self.current_step >= len(self.stock_data) - 1

        previous_price = self.stock_data[self.current_step - 1]
        price = self.stock_data[self.current_step]
        previous_net_worth = self.balance + previous_price * self.stock_count

        if action == self.BUY:
            # A non-positive price would make the floor division produce
            # inf/NaN holdings, so such a buy is treated as a no-op.
            if price > 0:
                self.stock_count += self.balance // price
                self.balance %= price
        elif action == self.SELL:
            self.balance += price * self.stock_count
            self.stock_count = 0

        net_worth = self.balance + price * self.stock_count
        reward = net_worth - previous_net_worth

        return self._next_observation(), reward, done, {}

    def reset(self):
        """Restore the initial portfolio and return the first observation."""
        self.balance = self.initial_balance
        self.stock_count = 0
        self.current_step = self.window_size
        return self._next_observation()

    def _next_observation(self):
        """Return the ``window_size`` prices preceding ``current_step``.

        NOTE(review): the window ends at ``current_step - 1``, so the price
        at ``current_step`` itself is not part of the observation -- confirm
        this lag is intended.
        """
        start = self.current_step - self.window_size
        # Copy so callers cannot mutate the underlying price series.
        return self.stock_data[start:self.current_step].copy()

    def render(self, mode="human"):
        """Print the current step, cash balance and share count."""
        print(f"Step: {self.current_step}, Balance: {self.balance}, Stock count: {self.stock_count}")


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

class DQNAgent:
    """Deep Q-Network agent with a replay buffer and a target network.

    Args:
        state_size: Length of the flat observation vector.
        action_size: Number of discrete actions.
        hidden_size: Width of the two hidden layers.
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after
            every training call.
        epsilon_min: Lower bound for ``epsilon``.
        memory_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per training call.
        device: Torch device on which both networks live.
    """

    def __init__(self, state_size, action_size, hidden_size=64, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, memory_size=10000, batch_size=32, device="cpu"):
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.device = device

        self.memory = deque(maxlen=self.memory_size)
        self.model = self._build_model().to(self.device)
        self.target_model = self._build_model().to(self.device)
        self.update_target_model()

        # Created once so Adam's running moment estimates persist across
        # replay() calls instead of being reset on every call.
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()

    def _build_model(self):
        """Return a fresh two-hidden-layer MLP mapping states to Q-values."""
        return nn.Sequential(
            nn.Linear(self.state_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.action_size),
        )

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Returns:
            A plain ``int`` in ``[0, action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state_tensor = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return int(np.argmax(q_values.cpu().numpy()))

    def replay(self):
        """Run one batched Q-learning update on a random minibatch.

        Does nothing until the buffer holds at least ``batch_size``
        transitions.  After the update ``epsilon`` is decayed, but never
        below ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.as_tensor(
            np.asarray(states, dtype=np.float32), device=self.device
        )
        next_states = torch.as_tensor(
            np.asarray(next_states, dtype=np.float32), device=self.device
        )
        actions = torch.as_tensor(
            np.asarray(actions, dtype=np.int64), device=self.device
        ).unsqueeze(1)
        rewards = torch.as_tensor(
            np.asarray(rewards, dtype=np.float32), device=self.device
        )
        not_done = 1.0 - torch.as_tensor(
            np.asarray(dones, dtype=np.float32), device=self.device
        )

        # TD target: r for terminal transitions, r + gamma * max_a' Q_target
        # otherwise.  Computed without gradients so it acts as a constant.
        with torch.no_grad():
            next_q = self.target_model(next_states).max(dim=1).values
            targets = rewards + self.gamma * next_q * not_done

        # Q-value of the action that was actually taken in each transition.
        predicted = self.model(states).gather(1, actions).squeeze(1)
        loss = self.criterion(predicted, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so a decay step can never undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)



In [14]:
import yfinance as yf

# Download stock data
ticker = "AAPL"
start_date = "2020-01-01"
end_date = "2021-01-01"
# auto_adjust=False is passed explicitly so the "Adj Close" column is present
# regardless of the installed yfinance version's default (newer releases
# appear to default to auto-adjusted prices without that column -- verify
# against the installed version).
raw_prices = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)
if raw_prices.empty:
    raise RuntimeError(
        f"No price data downloaded for {ticker} between {start_date} and {end_date}"
    )
# Depending on the yfinance version the selection is a Series or a
# single-column DataFrame; flatten both to a 1-D float array.
stock_data = np.asarray(raw_prices["Adj Close"], dtype=float).ravel()
# Drop missing quotes so NaNs cannot propagate into rewards and Q-values.
stock_data = stock_data[~np.isnan(stock_data)]

# Create the environment
window_size = 5
env = StockEnv(stock_data, window_size=window_size)

# Create the agent
state_size = window_size
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size, device="cuda" if torch.cuda.is_available() else "cpu")

# Train the agent
episodes = 200
for e in range(episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        agent.replay()

    # Sync the target network once per episode.
    agent.update_target_model()
    print(f"Episode: {e + 1}/{episodes}, Epsilon: {agent.epsilon:.2f}")

print("Training completed.")


[*********************100%***********************]  1 of 1 completed
Episode: 1/200, Epsilon: 0.34
Episode: 2/200, Epsilon: 0.10
Episode: 3/200, Epsilon: 0.03
Episode: 4/200, Epsilon: 0.01
Episode: 5/200, Epsilon: 0.01
Episode: 6/200, Epsilon: 0.01
Episode: 7/200, Epsilon: 0.01
Episode: 8/200, Epsilon: 0.01
Episode: 9/200, Epsilon: 0.01
Episode: 10/200, Epsilon: 0.01
Episode: 11/200, Epsilon: 0.01
Episode: 12/200, Epsilon: 0.01
Episode: 13/200, Epsilon: 0.01
Episode: 14/200, Epsilon: 0.01
Episode: 15/200, Epsilon: 0.01
Episode: 16/200, Epsilon: 0.01
Episode: 17/200, Epsilon: 0.01
Episode: 18/200, Epsilon: 0.01
Episode: 19/200, Epsilon: 0.01
Episode: 20/200, Epsilon: 0.01
Episode: 21/200, Epsilon: 0.01
Episode: 22/200, Epsilon: 0.01
Episode: 23/200, Epsilon: 0.01
Episode: 24/200, Epsilon: 0.01
Episode: 25/200, Epsilon: 0.01
Episode: 26/200, Epsilon: 0.01
Episode: 27/200, Epsilon: 0.01
Episode: 28/200, Epsilon: 0.01
Episode: 29/200, Epsilon: 0.01
Episode: 30/200, Epsilon: 0.01
Episode: 