# DeepScalper: Dueling DQN con *Action Branching* (skeleton)

In [None]:

import numpy as np
import torch, torch.nn as nn, torch.optim as optim
from collections import deque
import random

# Torch device used by the rest of this file: CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ToyIntradayEnv:
    """Toy single-asset environment driven by a synthetic random-walk price.

    Each step the agent picks a relative limit-price level and an order size
    from two discrete grids. The order is always treated as fully executed
    (a buy of ``size`` units at the limit price); there is no fill model.

    Args:
        T: Number of ticks in the synthetic price path.
        k_price: Number of relative price levels, evenly spaced in [-0.1, 0.1].
        k_size: Number of order sizes, evenly spaced in [0.0, 1.0].
        seed: Seed for the price-path generator. The default reproduces the
            path that was previously hard-coded.
    """

    # Observation length: the current price plus up to 20 preceding prices.
    WINDOW = 21

    def __init__(self, T=2000, k_price=11, k_size=5, seed=42):
        self.T = T
        self.t = 0
        self.k_price = k_price
        self.k_size = k_size
        rng = np.random.default_rng(seed)
        self.p = 100 + np.cumsum(rng.normal(0, 0.02, size=T))
        # The action grids never change, so build them once instead of on
        # every call to step().
        self._price_levels = np.linspace(-0.1, 0.1, k_price)
        self._size_levels = np.linspace(0.0, 1.0, k_size)
        # NOTE: `position` is reset but never updated by step(); inventory is
        # tracked in `inv`. Kept so existing attribute access keeps working.
        self.position = 0.0
        self.cash = 0.0
        self.inv = 0.0

    def reset(self):
        """Rewind to tick 0, clear cash/inventory, and return the first observation."""
        self.t = 0
        self.position = 0.0
        self.cash = 0.0
        self.inv = 0.0
        return self._obs()

    def _obs(self):
        """Return the z-scored trailing price window as a float32 vector.

        The window is left-padded with zeros to ``WINDOW`` entries while fewer
        than ``WINDOW`` prices are available.
        """
        lo = max(0, self.t - (self.WINDOW - 1))
        win = self.p[lo:self.t + 1]
        pad = self.WINDOW - len(win)
        # The epsilon keeps the division finite when the window is constant
        # (e.g. the single-price window at t == 0).
        z = (win - win.mean()) / (win.std() + 1e-6)
        return np.pad(z, (pad, 0)).astype(np.float32)

    def step(self, a_price_idx, a_size_idx):
        """Apply one (price level, size) action and advance one tick.

        Args:
            a_price_idx: Index into the relative price grid.
            a_size_idx: Index into the order-size grid.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is an empty
            dict. The reward is the one-tick mark-to-market PnL of the whole
            inventory plus a small penalty proportional to the distance
            between the limit price and the next price. On the final step the
            inventory is liquidated at the current price and ``cash * 1e-4``
            is added to the reward.
        """
        done = False
        rel = self._price_levels[a_price_idx]
        sz = self._size_levels[a_size_idx]
        mid = self.p[self.t]
        limit_price = mid * (1 + rel)

        # The order is assumed to fill completely at the limit price.
        executed = sz
        self.cash -= executed * limit_price
        self.inv += executed

        # Clamp so the last tick compares the price with itself.
        future = self.p[min(self.t + 1, self.T - 1)]
        pnl = self.inv * (future - mid)
        bonus = -abs(limit_price - future)
        reward = pnl + 0.01 * bonus

        self.t += 1
        if self.t >= self.T - 1:
            done = True
            # End of path: flatten the inventory and fold scaled cash into
            # the terminal reward.
            self.cash += self.inv * self.p[self.t]
            self.inv = 0.0
            reward += self.cash * 1e-4
        return self._obs(), float(reward), done, {}

class DuelingBranchQ(nn.Module):
    """Dueling Q-network with two action branches (price level and size).

    A shared trunk feeds a scalar state-value head and one advantage head per
    branch. The joint action value is additive across branches:

        Q(s, i, j) = V(s) + (Ap[i] - mean(Ap)) + (As[j] - mean(As))

    Args:
        obs_dim: Length of the observation vector.
        n_price: Number of discrete price actions.
        n_size: Number of discrete size actions.
    """

    def __init__(self, obs_dim, n_price, n_size):
        super().__init__()
        self.feat = nn.Sequential(
            nn.Linear(obs_dim, 128), nn.ReLU(),
            nn.Linear(128, 128), nn.ReLU(),
        )
        self.value = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 1))
        self.adv_price = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, n_price))
        self.adv_size = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, n_size))

    def forward(self, x):
        """Evaluate the network on a batch of observations.

        Args:
            x: Tensor of shape ``(batch, obs_dim)``.

        Returns:
            ``(Q, Ap, As)`` where ``Q`` has shape ``(batch, n_price, n_size)``
            and holds the joint action values, and ``Ap`` / ``As`` are the raw
            (un-centred) advantages of shape ``(batch, n_price)`` and
            ``(batch, n_size)``.
        """
        h = self.feat(x)
        V = self.value(h)
        Ap = self.adv_price(h)
        As = self.adv_size(h)
        Ap_centred = Ap - Ap.mean(dim=1, keepdim=True)
        As_centred = As - As.mean(dim=1, keepdim=True)
        # The two branches have different widths, so they cannot be summed
        # element-wise. Put each branch on its own axis and let broadcasting
        # build the full (price, size) table.
        Q = V.unsqueeze(2) + Ap_centred.unsqueeze(2) + As_centred.unsqueeze(1)
        return Q, Ap, As

def _branch_outputs(model, obs):
    """Return ``(V, Ap, As)`` for ``obs`` with one pass through the shared trunk."""
    h = model.feat(obs)
    return model.value(h).squeeze(1), model.adv_price(h), model.adv_size(h)


def _chosen_q(V, Ap, As, a_p, a_s):
    """Return Q(s, a_p, a_s) = V + centred price advantage + centred size advantage."""
    # squeeze(1), not squeeze(): the latter would also drop the batch axis
    # when the batch holds a single transition.
    q_price = Ap.gather(1, a_p.unsqueeze(1)).squeeze(1) - Ap.mean(dim=1)
    q_size = As.gather(1, a_s.unsqueeze(1)).squeeze(1) - As.mean(dim=1)
    return V + q_price + q_size


def train_agent(episodes=10):
    """Train a branching dueling DQN on ``ToyIntradayEnv`` and return the online net.

    Uses epsilon-greedy exploration (epsilon starts at 1.0 and is multiplied by
    0.95 after every episode, floored at 0.05), a 50,000-transition replay
    buffer, one MSE gradient step per environment step once the buffer holds a
    full batch, and a target network that is synced on every second episode.

    Args:
        episodes: Number of full passes through the environment.

    Returns:
        The trained online ``DuelingBranchQ`` network.
    """
    env = ToyIntradayEnv()
    # Take the observation size from the environment rather than hard-coding it.
    obs_dim = env.reset().shape[0]
    n_price, n_size = env.k_price, env.k_size
    net = DuelingBranchQ(obs_dim, n_price, n_size).to(device)
    tgt = DuelingBranchQ(obs_dim, n_price, n_size).to(device)
    tgt.load_state_dict(net.state_dict())
    opt = optim.Adam(net.parameters(), lr=1e-3)
    buf = deque(maxlen=50000)
    gamma = 0.99
    eps = 1.0
    batch = 128
    for ep in range(episodes):
        s = env.reset()
        done = False
        while not done:
            if random.random() < eps:
                a_p = random.randrange(n_price)
                a_s = random.randrange(n_size)
            else:
                # Q is additive across branches, so the greedy joint action is
                # simply the per-branch argmax of the advantages.
                with torch.no_grad():
                    x = torch.as_tensor(s, device=device).unsqueeze(0)
                    _, adv_p, adv_s = _branch_outputs(net, x)
                a_p = adv_p.argmax(dim=1).item()
                a_s = adv_s.argmax(dim=1).item()
            s2, r, done, _ = env.step(a_p, a_s)
            buf.append((s, a_p, a_s, r, s2, done))
            s = s2

            if len(buf) < batch:
                continue

            b_s, b_p, b_sz, b_r, b_s2, b_d = zip(*random.sample(buf, batch))
            S = torch.from_numpy(np.stack(b_s)).to(device)
            S2 = torch.from_numpy(np.stack(b_s2)).to(device)
            P = torch.tensor(b_p, dtype=torch.long, device=device)
            Z = torch.tensor(b_sz, dtype=torch.long, device=device)
            R = torch.tensor(b_r, dtype=torch.float32, device=device)
            D = torch.tensor(b_d, dtype=torch.float32, device=device)

            # Only V and the per-branch advantages are needed here, so the
            # trunk and heads are evaluated exactly once per network.
            v, adv_p, adv_s = _branch_outputs(net, S)
            q_sa = _chosen_q(v, adv_p, adv_s, P, Z)

            with torch.no_grad():
                v2, adv_p2, adv_s2 = _branch_outputs(tgt, S2)
                q_next = _chosen_q(
                    v2, adv_p2, adv_s2,
                    adv_p2.argmax(dim=1), adv_s2.argmax(dim=1),
                )
                # (1 - done) removes the bootstrap term on terminal transitions.
                y = R + (1.0 - D) * gamma * q_next

            loss = (q_sa - y).pow(2).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Sync the target network on every second episode.
        if ep % 2 == 1:
            tgt.load_state_dict(net.state_dict())
        eps = max(0.05, eps * 0.95)
    return net

# Demo driver: run a short 5-episode training pass; the returned network is discarded.
_ = train_agent(episodes=5)
print("Entrenamiento demo completado.")
