# Deep Q-Learning en juegos idealizados de series de tiempo

In [None]:

import numpy as np
import torch, torch.nn as nn, torch.optim as optim
import random

class UnivariateGame:
    """Single-series trading game on a noisy sine-wave price path.

    The observation is a one-element float32 array holding the price at the
    current time index.  Actions: ``1`` sets the position to 1.0, ``2`` sets
    it to 0.0, and any other value leaves the position unchanged.  The
    reward of a step is the position (after applying the action) times the
    price change over that step.
    """

    def __init__(self, T=4000):
        """Build a price path of length ``T`` and start at time 0 with no position.

        The noise comes from NumPy's global random state, so the path is only
        reproducible if the caller seeds ``np.random`` beforehand.
        """
        x = np.linspace(0, 50, T)
        self.price = 100 + 2 * np.sin(x) + np.random.normal(0, 0.2, size=T)
        self.T = T
        self.t = 0
        self.pos = 0.0

    def _obs(self):
        """Return the observation for the current time index as float32."""
        # On the terminal step ``t == T`` is past the end of the path, so the
        # index is clamped and the last price is reported instead.
        i = min(self.t, self.T - 1)
        return np.array([self.price[i]]).astype(np.float32)

    def reset(self):
        """Rewind to time 0, clear the position and return the first observation."""
        self.t = 0
        self.pos = 0.0
        return self._obs()

    def step(self, a):
        """Apply action ``a`` and advance one time step.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is
            the price at the *new* time index (the last price once the
            episode is over), ``reward`` is a float, ``done`` is True once
            the end of the path is reached and ``info`` is an empty dict.
        """
        done = False
        r = 0.0
        p = self.price[self.t]
        if a == 1:
            self.pos = 1.0
        if a == 2:
            self.pos = 0.0
        self.t += 1
        if self.t >= self.T:
            # No further price exists, so the final step carries no reward.
            done = True
        else:
            r = self.pos * (self.price[self.t] - p)
        # Report the state the agent will act on next, not the one it just
        # left; returning the previous price made the first step's
        # observation identical to the one from reset().
        return self._obs(), float(r), done, {}

class BivariateGame:
    """Two-series trading game: a random-walk price plus a noisy lead signal.

    The observation is a two-element float32 array ``[price, signal]`` at
    the current time index.  Actions: ``1`` sets the position to 1.0, ``2``
    sets it to 0.0, and any other value leaves the position unchanged.  The
    reward of a step is the position (after applying the action) times the
    price change over that step.
    """

    def __init__(self, T=4000):
        """Build price and signal paths of length ``T`` from a fixed seed.

        ``signal[t]`` is the next price change ``price[t+1] - price[t]``
        (0 at the last index, where no next price exists) plus Gaussian
        noise with standard deviation 0.2.
        """
        rng = np.random.default_rng(1)
        self.price = 100 + np.cumsum(rng.normal(0, 0.2, size=T))
        # np.diff already yields T-1 forward differences; padding the end
        # with a single 0 gives exactly T entries.  Using ``prepend`` as well
        # produced T+1 entries and made the addition below fail to broadcast.
        fut = np.append(np.diff(self.price), 0.0)
        self.signal = fut + rng.normal(0, 0.2, size=T)
        self.T = T
        self.t = 0
        self.pos = 0.0

    def _obs(self):
        """Return ``[price, signal]`` for the current time index as float32."""
        # On the terminal step ``t == T`` is past the end of the paths, so
        # the index is clamped and the last entries are reported instead.
        i = min(self.t, self.T - 1)
        return np.array([self.price[i], self.signal[i]]).astype(np.float32)

    def reset(self):
        """Rewind to time 0, clear the position and return the first observation."""
        self.t = 0
        self.pos = 0.0
        return self._obs()

    def step(self, a):
        """Apply action ``a`` and advance one time step.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is
            ``[price, signal]`` at the *new* time index (the last entries
            once the episode is over), ``reward`` is a float, ``done`` is
            True once the end of the path is reached and ``info`` is an
            empty dict.
        """
        done = False
        r = 0.0
        p = self.price[self.t]
        if a == 1:
            self.pos = 1.0
        if a == 2:
            self.pos = 0.0
        self.t += 1
        if self.t >= self.T:
            # No further price exists, so the final step carries no reward.
            done = True
        else:
            r = self.pos * (self.price[self.t] - p)
        # Report the state the agent will act on next; the signal at the new
        # index is the one that refers to the upcoming price change.
        return self._obs(), float(r), done, {}

class QNet(nn.Module):
    """Fully connected network producing one Q-value per action.

    Two hidden layers of 64 ReLU units sit between an input of size ``obs``
    and an output of size ``acts``.
    """

    def __init__(self, obs, acts=3):
        super().__init__()
        hidden = 64
        # Layers are created in input-to-output order and stored under
        # ``self.net`` so parameter names stay ``net.0.*``, ``net.2.*``,
        # ``net.4.*``.
        layers = [
            nn.Linear(obs, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, acts),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values for input ``x``."""
        q_values = self.net(x)
        return q_values

def train(env, obs_dim, episodes=10):
    """Train a Q-network on ``env`` with one-step Q-learning and return it.

    Every transition triggers a single Adam update (learning rate 1e-3) on
    the squared difference between the online network's Q-value for the
    action taken and a bootstrapped target computed with a separate target
    network and a discount of 0.99.  No replay buffer is used.  Actions are
    epsilon-greedy over three actions; epsilon starts at 1.0 and is
    multiplied by 0.9 after each episode, with a floor of 0.05.  The target
    network is synchronised with the online network after every
    odd-indexed episode.

    Args:
        env: object exposing ``reset()`` and ``step(action)`` where ``step``
            returns ``(observation, reward, done, info)``.
        obs_dim: size of the observation vector fed to the network.
        episodes: number of passes over the environment.

    Returns:
        The trained online network.
    """

    def to_batch(observation):
        # Add a leading batch dimension of 1 so the network sees a 2-D input.
        return torch.tensor(observation).unsqueeze(0)

    online = QNet(obs_dim).train()
    target = QNet(obs_dim).train()
    target.load_state_dict(online.state_dict())
    optimizer = optim.Adam(online.parameters(), lr=1e-3)
    discount = 0.99
    explore_prob = 1.0

    for episode in range(episodes):
        state = env.reset()
        finished = False
        while not finished:
            # Epsilon-greedy action selection.
            if random.random() < explore_prob:
                action = random.randrange(3)
            else:
                with torch.no_grad():
                    action = online(to_batch(state)).argmax(1).item()

            next_state, reward, finished, _ = env.step(action)

            # Terminal transitions have no bootstrap term.
            if finished:
                td_target = reward + 0
            else:
                with torch.no_grad():
                    best_next = target(to_batch(next_state)).max(dim=1).values.item()
                td_target = reward + discount * best_next

            q_taken = online(to_batch(state))[0, action]
            loss = (q_taken - td_target) ** 2

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = next_state

        if episode % 2 == 1:
            target.load_state_dict(online.state_dict())
        explore_prob = max(0.05, explore_prob * 0.9)

    return online

# Demo run: train a fresh network for five episodes on each game.  The
# returned networks are not used afterwards.
_ = train(env=UnivariateGame(), obs_dim=1, episodes=5)
_ = train(env=BivariateGame(), obs_dim=2, episodes=5)
print("DQN demo entrenado en ambos juegos.")
