# Lee & Zhang (2006) — Q-learning incremental para trading
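MDP cuyo estado combina el retorno previo discretizado (por cuantiles) y la posición actual; las acciones son las posiciones objetivo {-1, 0, 1}. Exploración epsilon-greedy. Recompensa = Δwealth − costos de transacción.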


In [None]:

import numpy as np
import matplotlib.pyplot as plt
# Module-level generator with a fixed seed so exploration draws are repeatable.
rng = np.random.default_rng(seed=6)


In [None]:

def synthetic_returns(n=4000, ar=0.04, sigma=0.012, seed=6):
    """Simulate an AR(1) return series with Gaussian innovations.

    The recursion is ``x[t] = ar * x[t-1] + N(0, sigma)`` with ``x[0] = 0``.

    Args:
        n: Number of samples to produce.
        ar: Autoregressive coefficient applied to the previous value.
        sigma: Standard deviation of each innovation.
        seed: Seed for the private generator used by this call.

    Returns:
        A NumPy array of length ``n``.
    """
    gen = np.random.default_rng(seed)
    series = np.zeros(n)
    prev = 0.0
    for idx in range(1, n):
        # One scalar draw per step keeps the random stream order unchanged.
        prev = ar * prev + gen.normal(0, sigma)
        series[idx] = prev
    return series

# Simulated return series used by the rest of the notebook.
r = synthetic_returns()

# Bucket edges at the 20/40/60/80% quantiles, computed over the whole series.
bins = np.quantile(r, [0.2, 0.4, 0.6, 0.8])


def disc_ret(x):
    """Return the index (0..4) of the quantile bucket of `bins` containing x."""
    bucket = np.digitize(x, bins)
    return int(bucket)

# Action set: target position (short, flat, long).
A = [-1, 0, 1]

gamma = 0.99   # discount factor used in the TD target
alpha = 0.2    # learning rate of the Q update
eps = 0.1      # exploration probability for epsilon-greedy
cost = 0.0002  # transaction cost constant

# Sparse action-value table keyed by (return bucket, position, action).
Q = {}


def getQ(b, pos, a):
    """Look up the stored value for (b, pos, a); unseen entries read as 0.0."""
    key = (b, pos, a)
    return Q[key] if key in Q else 0.0


def setQ(b, pos, a, val):
    """Store `val` as the value for (b, pos, a), overwriting any prior entry."""
    key = (b, pos, a)
    Q[key] = val


In [None]:

def step(pos, a, ret, cost=0.0002):
    """Apply action `a` for one period and return the new position and P&L.

    Args:
        pos: Position held before acting.
        a: Position chosen for this period; it becomes the next position.
        ret: Return realised over the period.
        cost: Flat charge deducted whenever the position changes
            (``a != pos``). Defaults to 0.0002, the value that was
            previously hard-coded in this function.

    Returns:
        ``(pos_next, pnl)`` where ``pos_next == a`` and
        ``pnl = a * ret`` minus the charge, if any.
    """
    # The charge is flat per change of position, not scaled by its size.
    fee = cost if a != pos else 0.0
    pnl = a * ret - fee
    return a, pnl

# Tabular Q-learning over the return series `r`.
#   state  : (bucket of the previous return, current position)
#   action : next position, drawn from A
#   reward : one-period P&L returned by step()
N_EPISODES = 6  # number of passes over the series
for ep in range(N_EPISODES):
    pos = 0  # every episode starts flat
    # NOTE(review): the upper bound len(r) - 1 leaves the final return unused
    # for training; r[t + 1] is never read, so range(1, len(r)) looks equally
    # valid — confirm intent before changing.
    for t in range(1, len(r) - 1):
        b = disc_ret(r[t - 1])
        # Epsilon-greedy: explore with probability eps, otherwise exploit.
        if rng.random() < eps:
            # rng.choice yields a NumPy integer; cast so positions and
            # Q-table keys stay plain Python ints.
            a = int(rng.choice(A))
        else:
            qs = [getQ(b, pos, cand) for cand in A]
            a = A[int(np.argmax(qs))]  # ties resolve to the first action in A
        pos_next, rew = step(pos, a, r[t])
        b_next = disc_ret(r[t])
        # One-step TD update toward rew + gamma * max over next actions.
        q_sa = getQ(b, pos, a)
        q_next = max(getQ(b_next, pos_next, ap) for ap in A)
        td = rew + gamma * q_next - q_sa
        setQ(b, pos, a, q_sa + alpha * td)
        pos = pos_next

# Backtest greedy
# Follow the greedy policy implied by the Q-table (no exploration) and
# compound the per-period P&L into a wealth curve starting at 1.0.
# This runs over the same series `r` that the training loop iterates over,
# so the resulting curve is an in-sample evaluation.
pos = 0
wealth = [1.0]
for t in range(1, len(r)):
    b = disc_ret(r[t - 1])
    qs = [getQ(b, pos, a) for a in A]
    a = A[int(np.argmax(qs))]
    # Reuse step() so the P&L and cost rule match the one used in training.
    pos, pnl = step(pos, a, r[t])
    wealth.append(wealth[-1] * (1 + pnl))

plt.figure()
plt.plot(wealth)
plt.title('Wealth (Q-learning greedy)')
plt.show()
