# Neuneier (1996) — Q-Learning para asignación de activos (2 monedas)

Baseline educativo inspirado en Neuneier (1996): Q-Learning para decidir si mantener **DM** o **USD** bajo costos de transacción.


**Contenido:**
1) Generación de una serie artificial de tipo de cambio: tendencia alcista con ruido y caídas bruscas (*crashes*) cuya probabilidad crece con el nivel.
2) Definición del MDP con estado = (bin del tipo de cambio, posición actual: DM o USD).
3) Q-Learning con exploración Boltzmann.
4) Backtest y gráfica de wealth.

**Advertencia:** versión simplificada con fines docentes; no reproduce todos los detalles del artículo original.

## 1) Imports

In [None]:

import numpy as np
import matplotlib.pyplot as plt
rng = np.random.default_rng(123)


## 2) Serie artificial de tipo de cambio USD/DM

In [None]:

def artificial_fx(n=3000, low=1.0, high=2.0, trend=0.002, crash_prob_base=0.001, crash_scale=0.15, seed=123):
    """Simulate an artificial exchange-rate path with upward drift and crashes.

    The path starts at the midpoint of ``[low, high]``.  Each step multiplies
    the previous level by ``1 + trend + eps`` (``eps`` is Gaussian noise with
    standard deviation 0.003) and clips the result to ``[low, high]``.  With a
    probability that grows linearly once the level is above the midpoint, the
    new value is additionally cut by a random fraction between
    ``0.5 * crash_scale`` and ``crash_scale`` (never below ``low``).

    Args:
        n: Number of samples to generate; ``0`` yields an empty array.
        low: Lower bound of the rate.
        high: Upper bound of the rate; must be strictly greater than ``low``.
        trend: Deterministic per-step relative drift.
        crash_prob_base: Crash probability at or below the midpoint.
        crash_scale: Maximum relative size of a crash.
        seed: Seed passed to ``np.random.default_rng``.

    Returns:
        A float ``numpy.ndarray`` of length ``n``.

    Raises:
        ValueError: If ``n`` is negative or ``low`` is not below ``high``.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    if not low < high:
        raise ValueError("low must be strictly less than high")

    rng = np.random.default_rng(seed)
    x = np.empty(n, dtype=float)
    if n == 0:
        # Nothing to simulate; writing x[0] would raise IndexError.
        return x

    # Loop invariants, hoisted out of the simulation loop.
    mid = (low + high) / 2
    span = high - low

    x[0] = mid
    for t in range(1, n):
        level = x[t-1]
        # Higher level => higher crash probability (only above the midpoint).
        crash_prob = crash_prob_base + 0.02 * max(0.0, (level - mid) / span)
        # The order of the random draws is kept fixed so that a given seed
        # always reproduces the same path.
        eps = rng.normal(0, 0.003)
        x[t] = np.clip(level * (1 + trend + eps), low, high)
        if rng.random() < crash_prob:
            x[t] = max(low, x[t] * (1 - crash_scale * rng.uniform(0.5, 1.0)))
    return x

# Simulate the default artificial path and show it on a fresh figure.
fx = artificial_fx()
plt.figure()
plt.plot(fx)
# Label the current axes in one call instead of three pyplot helpers.
plt.gca().set(title='Artificial FX (USD in DM)', xlabel='t', ylabel='rate')
plt.show()


## 3) MDP y Q-Learning
Estado = (bin_fx, pos) con pos∈{0: DM, 1: USD}. Acciones = {0: mantener, 1: cambiar}. Recompensa = variación de la riqueza (Δwealth) en un paso, neta de costos de transacción.

In [None]:

def discretize(x, bins):
    """Return ``np.digitize(x, bins) - 1`` for a scalar or array ``x``.

    With ``bins`` holding increasing edges, a value inside
    ``[bins[i], bins[i+1])`` maps to ``i``; values below ``bins[0]`` map to
    ``-1`` and values at or above the last edge map to ``len(bins) - 1``.
    """
    one_based = np.digitize(x, bins)
    return one_based - 1

class QLearnerFX:
    """Tabular Q-learning agent that chooses between holding DM and USD.

    The state is ``(fx_bin, pos)``: ``fx_bin`` is the index of the
    equal-width bin containing the current rate and ``pos`` is the currency
    held (0 = DM, 1 = USD).  Actions are 0 = keep the position and
    1 = switch currency.  The reward is the one-step change in wealth,
    measured in DM, after transaction costs.
    """

    def __init__(self, fx, n_bins=10, cost_fixed=0.0005, cost_prop=0.0005,
                 tau=0.5, gamma=0.99, alpha=0.2, seed=123):
        """Set up the bin edges, hyper-parameters and a zero Q-table.

        Args:
            fx: Non-empty 1-D sequence of exchange rates (USD priced in DM).
            n_bins: Number of equal-width bins between ``min(fx)`` and
                ``max(fx)``.
            cost_fixed: Switching cost, charged as a fraction of wealth.
            cost_prop: Proportional switching cost, also a fraction of wealth.
            tau: Boltzmann temperature used for exploration.
            gamma: Discount factor.
            alpha: Learning rate.
            seed: Seed for the exploration random generator used by
                ``train``; ``None`` draws fresh entropy on every construction.

        Raises:
            ValueError: If ``fx`` is empty or not 1-D, or ``n_bins < 1``.
        """
        fx = np.asarray(fx, dtype=float)
        if fx.ndim != 1 or fx.size == 0:
            raise ValueError("fx must be a non-empty 1-D sequence of rates")
        if n_bins < 1:
            raise ValueError("n_bins must be at least 1")
        self.fx = fx
        self.n = len(fx)
        # Interior edges only: n_bins - 1 edges separate n_bins bins.
        self.bins = np.linspace(fx.min(), fx.max(), n_bins+1)[1:-1]
        self.n_bins = n_bins
        self.cost_fixed = cost_fixed
        self.cost_prop = cost_prop
        self.tau = tau
        self.gamma = gamma
        self.alpha = alpha
        # Dedicated generator so that training runs are reproducible.
        self.rng = np.random.default_rng(seed)
        # Q[bin_fx, pos, action]
        self.Q = np.zeros((n_bins, 2, 2), dtype=float)

    def _state_bins(self):
        """Return the rate-bin index (0 .. n_bins - 1) for every time step."""
        # self.bins holds only interior edges, so np.digitize already yields
        # zero-based indices.  Subtracting 1 would send the lowest bin to -1,
        # which silently wraps around to the last row of Q.
        return np.digitize(self.fx, self.bins)

    def boltzmann(self, q):
        """Return softmax action probabilities for the Q-values ``q``."""
        z = q / max(1e-8, self.tau)  # guard against a zero temperature
        z = z - z.max()              # shift for numerical stability
        p = np.exp(z)
        return p / p.sum()

    def step_reward(self, pos, action, w, t):
        """Apply ``action`` at time ``t`` and advance wealth to ``t + 1``.

        Args:
            pos: Current position (0 = DM, 1 = USD).
            action: 0 = stay, 1 = switch.
            w: Current wealth in DM.
            t: Time index; ``t + 1`` must be a valid index into ``fx``.

        Returns:
            Tuple ``(new_pos, w_tp1, reward)`` with ``reward = w_tp1 - w``.
        """
        rate_t, rate_tp1 = self.fx[t], self.fx[t+1]
        new_pos = pos
        cost = 0.0
        if action == 1:  # switch
            # NOTE(review): both cost terms scale with wealth, so cost_fixed
            # acts as a second proportional rate rather than a flat fee --
            # confirm this is the intended cost model.
            cost = self.cost_fixed * w + self.cost_prop * w
            new_pos = 1 - pos

        if new_pos == 0:
            # Holding DM: wealth only changes through costs.
            w_tp1 = w - cost
        else:
            # Holding USD: DM-valued wealth follows the rate ratio.
            growth = rate_tp1 / rate_t
            w_tp1 = w * growth - cost

        return new_pos, w_tp1, w_tp1 - w

    def train(self, epochs=5, w0=1.0):
        """Run ``epochs`` passes of Q-learning over the whole series.

        Every epoch restarts with wealth ``w0`` in DM; actions are sampled
        from the Boltzmann distribution over the current Q-values.
        """
        bin_of = self._state_bins()  # computed once per call, not per step
        for _ in range(epochs):
            w = w0
            pos = 0  # start in DM
            for t in range(self.n - 1):
                s_bin = bin_of[t]
                p = self.boltzmann(self.Q[s_bin, pos])
                a = self.rng.choice(2, p=p)
                new_pos, w_tp1, r = self.step_reward(pos, a, w, t)
                # One-step Q-learning target from the greedy next-state value.
                s_bin_next = bin_of[t + 1]
                td_target = r + self.gamma * np.max(self.Q[s_bin_next, new_pos])
                td_error = td_target - self.Q[s_bin, pos, a]
                self.Q[s_bin, pos, a] += self.alpha * td_error
                w = w_tp1
                pos = new_pos

    def policy(self):
        """Return the greedy action for every ``(bin_fx, pos)`` state."""
        return np.argmax(self.Q, axis=-1)

    def backtest(self, w0=1.0):
        """Follow the greedy policy over the series, starting in DM.

        Returns:
            Tuple ``(wealth, actions)``: ``wealth`` has one entry per time
            step (starting with ``w0``), ``actions`` one entry per transition.
        """
        bin_of = self._state_bins()
        w = w0
        pos = 0
        wealth = [w]
        actions = []
        for t in range(self.n - 1):
            a = np.argmax(self.Q[bin_of[t], pos])
            actions.append(a)
            pos, w, _ = self.step_reward(pos, a, w, t)
            wealth.append(w)
        return np.array(wealth), np.array(actions)

# Train the learner on the simulated series, then replay its greedy policy.
ql = QLearnerFX(fx)
ql.train(epochs=6)
wealth, acts = ql.backtest()

# Plot the wealth trajectory produced by the greedy backtest.
plt.figure()
plt.plot(wealth)
plt.gca().set(title='Wealth (QL policy)', xlabel='t', ylabel='wealth (DM)')
plt.show()


## 4) Notas y próximos pasos
- Añadir capital continuo y discretización por cuantiles.
- Incluir *risk aversion* vía utilidad log.
- Validar contra DP en entorno artificial pequeño.
