# Plantilla de replicaciones según Fischer (2018): critic-only, actor-only, actor-critic

In [None]:

import numpy as np
import torch, torch.nn as nn, torch.optim as optim

# Synthetic market data shared by the three demos in this file.
T = 2000
# Random-walk price path: 100 plus the cumulative sum of N(0, 0.2) increments.
price = 100 + np.cumsum(np.random.normal(0, 0.2, size=T))
# One-step price changes. Prepending price[0] keeps len(ret) == T and
# makes ret[0] == 0.
ret = np.diff(np.r_[price[0], price])
# Bounded momentum indicator in (0, 100): tanh of a 14-step mean of `ret`.
# The mean must be causal. mode="full" truncated to T makes entry t depend
# only on ret[:t + 1] (the first 13 entries see a partial window, still
# divided by 14). mode="same" would centre the window and leak future
# returns -- including ret[t + 1], which the training loops use as the
# reward -- into the feature observed at step t.
window = 14
trailing_mean = np.convolve(ret, np.ones(window) / window, mode="full")[:T]
rsi = 50 + 50 * np.tanh(trailing_mean)
# Feature matrix of shape (T, 2): [price change, indicator scaled to (0, 1)].
X = np.stack([ret, rsi / 100.0], axis=1).astype(np.float32)

# Critic-only demo: one pass of Q-learning with a linear action-value
# function Q(x, a) = x @ W[:, a] over the 2 features and 3 actions.
W = np.zeros((2, 3))
alpha = 0.01
gamma = 0.95
# Exploration rate. With W initialised to zero a purely greedy policy always
# picks action 0, whose reward is 0, so the TD error stays 0 and W would never
# be updated. Epsilon-greedy sampling lets the other actions be tried.
epsilon = 0.1
for t in range(T - 1):
    q = X[t] @ W
    if np.random.rand() < epsilon:
        a = int(np.random.randint(W.shape[1]))  # explore: uniform action
    else:
        a = int(q.argmax())  # exploit: greedy action
    # NOTE(review): only action 1 earns the next price change; actions 0 and 2
    # both yield zero reward -- confirm the intended meaning of action 2.
    r = (price[t + 1] - price[t]) * (1 if a == 1 else 0)
    q_next = X[t + 1] @ W
    # Q-learning TD error: bootstrap from the best action in the next state.
    td = r + gamma * q_next.max() - q[a]
    # Gradient step on the weights of the chosen action only.
    W[:, a] += alpha * td * X[t]
print("Critic-only listo (demo).")

class Policy(nn.Module):
    """Small feed-forward network mapping 2 input features to 3 outputs.

    The architecture is Linear(2, 32) -> Tanh -> Linear(32, 3), stored as
    ``self.net``. The outputs are returned raw (no softmax applied here).
    """

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(2, 32)
        output_layer = nn.Linear(32, 3)
        self.net = nn.Sequential(hidden_layer, nn.Tanh(), output_layer)

    def forward(self, x):
        """Return the 3 raw network outputs for each row of ``x``."""
        scores = self.net(x)
        return scores
# Actor-only demo: REINFORCE with normalised discounted returns, training
# the Policy network for 3 passes over the synthetic series.
pi = Policy()
opt = optim.Adam(pi.parameters(), lr=1e-3)
for ep in range(3):
    logps = []
    rewards = []
    for t in range(T - 1):
        # Batch of one observation -> logits of shape (1, 3).
        logits = pi(torch.tensor(X[t]).unsqueeze(0))
        dist = torch.distributions.Categorical(logits=logits)
        a = dist.sample()
        logps.append(dist.log_prob(a))  # each entry has shape (1,)
        # NOTE(review): only action 1 earns the next price change; actions 0
        # and 2 both yield zero reward -- confirm the intended action set.
        r = (price[t + 1] - price[t]) * (1 if a.item() == 1 else 0)
        rewards.append(r)
    # Discounted return-to-go, accumulated backwards through the episode.
    G = []
    g = 0.0
    for r in reversed(rewards):
        g = r + 0.99 * g
        G.append(g)
    G = torch.tensor(list(reversed(G))).float()
    # Flatten the stacked log-probs from (T-1, 1) to (T-1,). Multiplying the
    # (T-1, 1) tensor by the (T-1,) returns would broadcast to (T-1, T-1) and
    # reduce the loss to sum(logp) * sum(normalised G), which is ~0.
    logp = torch.stack(logps)[:, 0]
    G_norm = (G - G.mean()) / (G.std() + 1e-8)
    L = -(logp * G_norm).sum()
    opt.zero_grad()
    L.backward()
    opt.step()
print("Actor-only listo (demo).")

class A2C(nn.Module):
    """Two-headed network over a shared Linear(2, 64) -> ReLU trunk.

    ``self.pi`` maps the 64 trunk features to 3 outputs and ``self.v`` maps
    them to a single output; ``forward`` returns both.
    """

    def __init__(self):
        super().__init__()
        trunk_layers = [nn.Linear(2, 64), nn.ReLU()]
        self.body = nn.Sequential(*trunk_layers)
        self.pi = nn.Linear(64, 3)
        self.v = nn.Linear(64, 1)

    def forward(self, x):
        """Return ``(pi_out, v_out)`` computed from the shared trunk features."""
        features = self.body(x)
        pi_out = self.pi(features)
        v_out = self.v(features)
        return pi_out, v_out
# Actor-critic demo: one-step advantage actor-critic with an online update
# after every transition, for 3 passes over the synthetic series.
ac = A2C()
opt = optim.Adam(ac.parameters(), lr=1e-3)
for ep in range(3):
    for t in range(T - 1):
        # Batch of one observation -> logits (1, 3) and value (1, 1).
        logits, v = ac(torch.tensor(X[t]).unsqueeze(0))
        dist = torch.distributions.Categorical(logits=logits)
        a = dist.sample()
        # NOTE(review): only action 1 earns the next price change; actions 0
        # and 2 both yield zero reward -- confirm the intended action set.
        # float() keeps the reward a plain Python number so the losses stay
        # in the network's float32 dtype.
        r = float(price[t + 1] - price[t]) * (1 if a.item() == 1 else 0)
        with torch.no_grad():
            _, v2 = ac(torch.tensor(X[t + 1]).unsqueeze(0))
        # Bootstrapped one-step target; it is a constant w.r.t. the parameters
        # because v2 was computed under no_grad and read out with .item().
        target = r + 0.99 * v2.item()
        # TD error used as the advantage estimate (detached via .item()).
        td = target - v.item()
        pi_loss = -dist.log_prob(a) * td
        # The critic regresses on the same TD target the advantage is built
        # from. Fitting it to the immediate reward alone would make the value
        # head estimate r rather than the discounted return.
        v_loss = (v - target) ** 2
        # .sum() collapses the shape-(1,) policy term so the loss is a scalar.
        loss = (pi_loss + 0.5 * v_loss.mean()).sum()
        opt.zero_grad()
        loss.backward()
        opt.step()
print("Actor-Critic listo (demo).")
