# Lee (2002) — Programación Dinámica para trading discreto
Value Iteration sobre un MDP discretizado cuyo estado es (bin del retorno rezagado, posición). Acciones {-1, 0, 1} (corto, neutral, largo). Recompensa = Δwealth − costo por cambiar de posición.


In [None]:

import numpy as np
import matplotlib.pyplot as plt
# Module-level random generator with a fixed seed. synthetic_returns builds
# its own generator from its `seed` argument, so it does not consume this one.
rng = np.random.default_rng(seed=2)


## Entorno y discretización

In [None]:

def synthetic_returns(n=2000, ar=0.03, sigma=0.01, seed=2):
    """Simulate an AR(1) return series driven by Gaussian shocks.

    The series starts at ``r[0] = 0`` and then follows
    ``r[t] = ar * r[t-1] + eps_t`` where ``eps_t`` is drawn from a normal
    distribution with mean 0 and standard deviation ``sigma``.

    Args:
        n: Number of observations to generate.
        ar: Autoregressive coefficient applied to the previous return.
        sigma: Standard deviation of each shock.
        seed: Seed for the private random generator, so repeated calls with
            the same arguments return the same series.

    Returns:
        A 1-D float array of length ``n``.
    """
    generator = np.random.default_rng(seed)
    series = np.zeros(n)
    previous = 0.0  # r[0] is left at zero, matching the zero-initialised array
    for step in range(1, n):
        # One scalar draw per step keeps the random stream order fixed.
        previous = ar * previous + generator.normal(0, sigma)
        series[step] = previous
    return series

# Simulated return series shared by the rest of the notebook.
r = synthetic_returns()

# Interior quintile cut points of r: four edges split returns into five buckets.
quantile_levels = [0.2, 0.4, 0.6, 0.8]
bins = np.quantile(r, quantile_levels)

def disc_ret(x):
    """Return the bucket index of return ``x`` relative to the global ``bins``.

    The index comes from ``np.digitize`` and is converted to a plain ``int``
    so it can be used inside hashable state tuples.
    """
    bucket = np.digitize(x, bins)
    return int(bucket)

A = [-1, 0, 1]  # action set: short, flat, long
gamma = 0.98    # discount factor applied to the next-state value
cost = 0.0003   # amount subtracted from the reward whenever the position changes


## Estados: (bin_ret_{t-1}, pos)

In [None]:

# Collect the lagged-return state observed at each time step. Despite the
# name, this only records discretized previous returns; it does not estimate
# any transition probabilities.
def build_transitions(r):
    """Return the lagged return bucket seen at every step ``t >= 1``.

    Args:
        r: Sequence of returns.

    Returns:
        A list with one 1-tuple ``(disc_ret(r[t-1]),)`` per ``t`` in
        ``1 .. len(r) - 1``, in time order (duplicates included).
    """
    # r[:-1] walks exactly the lagged values r[0] .. r[len(r) - 2].
    return [(disc_ret(prev_ret),) for prev_ret in r[:-1]]

# Distinct lagged-return buckets that actually occur in the data.
states = list(set(build_transitions(r)))
# Positions the agent can hold: short, flat, long.
pos_space = [-1, 0, 1]
# Full state space: every observed return bucket paired with every position.
S = [(bucket, held) for (bucket,) in states for held in pos_space]
# Map each state tuple to its row in the value vector.
S_index = dict(zip(S, range(len(S))))


## Value Iteration

In [None]:

def reward(pos, pos_next, ret):
    """One-step reward for moving from ``pos`` to ``pos_next``.

    The reward is the return earned by the new position, ``pos_next * ret``,
    minus the global ``cost`` when the position changes. The charge is flat:
    it does not depend on how far the position moves.

    Args:
        pos: Position held before acting.
        pos_next: Position held after acting.
        ret: Return realised over the step.

    Returns:
        The change in wealth net of the switching charge.
    """
    if pos_next != pos:
        fee = cost
    else:
        fee = 0.0
    return pos_next * ret - fee

# Value iteration over the state space S, using the empirical distribution of
# (next return, next return bucket) conditional on the current lagged bucket.

# The discretization of r never changes between sweeps, so bin every return
# once and group the observed successors by lagged bucket up front instead of
# re-scanning (and re-digitizing) the whole series for each state and action.
ret_bins = [disc_ret(x) for x in r]
samples_by_bin = {}
for t in range(1, len(r)):
    # Key: bucket of r[t-1]; value: (realised return r[t], bucket of r[t]).
    samples_by_bin.setdefault(ret_bins[t - 1], []).append((r[t], ret_bins[t]))

V = np.zeros(len(S))
# Greedy action per state, stored as a value from A (not an index into A);
# initialised to 0, i.e. flat.
pi = {s: 0 for s in S}

# NOTE(review): with gamma = 0.98, 50 sweeps shrink the initial error only by
# about gamma**50 ~= 0.36, so the sweep cap rather than the 1e-6 tolerance may
# be what ends this loop. Consider raising the cap if a converged V is needed.
for it in range(50):
    V_new = np.zeros_like(V)
    for s in S:
        i = S_index[s]
        b, pos = s
        samples = samples_by_bin.get(b, [])
        vals = []
        for a in A:
            # Deterministic position dynamics: the action is the next position.
            ret_exp = 0.0
            for r_t, b_next in samples:
                ret_exp += reward(pos, a, r_t) + gamma * V[S_index[(b_next, a)]]
            # max(1, ...) keeps a bucket with no samples at value 0.
            vals.append(ret_exp / max(1, len(samples)))
        best = int(np.argmax(vals))
        V_new[i] = vals[best]
        pi[s] = A[best]
    delta = np.max(np.abs(V_new - V))
    # Store the latest sweep before testing convergence so V stays in step
    # with the policy pi that was derived during this same sweep.
    V = V_new
    if delta < 1e-6:
        break

print("VI done. States:", len(S))


## Backtest con la política óptima encontrada

In [None]:

# Replay the greedy policy over the same series r that was used to fit it
# (an in-sample evaluation).
pos = 0          # start flat
wealth = [1.0]   # wealth path, starting at 1
for t in range(1, len(r)):
    b = disc_ret(r[t - 1])
    a = pi[(b, pos)]
    # The switching charge applies only when the position changes.
    switch_fee = 0.0 if a == pos else cost
    pnl = a * r[t] - switch_fee
    # Compound the step's net return onto the latest wealth level.
    wealth.append(wealth[-1] * (1 + pnl))
    pos = a

import matplotlib.pyplot as plt
# Plot the wealth path produced by the backtest.
plt.figure()
plt.plot(wealth)
plt.title('Wealth (Value Iteration policy)')
plt.show()
