# TD learning with function approximation

In [None]:
import numpy as np
from abc import ABC, abstractmethod
import matplotlib.pyplot as plt

In [None]:
# Hyper-parameters of the experiment; they are passed positionally to the
# Sarsa / QLearning constructors in the run loop below.
# Number of states; next states are sampled from range(N_STATES).
N_STATES = 7
# Stored by the agents as `max_k` but not otherwise used in this file.
# NOTE(review): presumably the number of features K (the hard-coded feature
# vectors have 15 entries) — confirm.
MAX_K = 15
# Discount factor applied to the bootstrapped next-step value.
GAMMA = 0.99
# Action labels; the order must match [PROB_A, PROB_B] when sampling.
ACTIONS = ['A', 'B']
# Constant reward fed to every update.
REWARD = 0
# Policy: agent selects action A with probability PROB_A
# and action B with probability PROB_B
PROB_A = 1/7
PROB_B = 6/7
ALPHA = 0.01 # step size
# Number of weight updates performed in a single run.
TIME_STEPS = 500
N_RUNS = 100 # Mitigate the effect of noise by averaging over N_RUNS runs

In [None]:
class TDLearning(ABC):
    """Base class for TD control with a linear action-value approximation.

    The action value is modelled as ``q(s, a) = phi(s, a) . w``. Subclasses
    supply the weight update rule through ``_update_w``; ``game`` runs the
    interaction loop and records ``||w||`` after every update.

    The feature matrices and the initial weight vector are hard-coded for a
    problem with 7 states and 15 features, regardless of the ``n_states`` and
    ``max_k`` arguments. Likewise the action-B transition probability is the
    literal 1/6, which only yields a valid distribution for 7 states.
    """

    time_steps: int
    n_states: int
    reward: int
    actions: list[str]
    # A and B are the possible actions
    # transition probability matrices for actions A and B
    # (row = current state, column = next state)
    p_A: np.ndarray
    p_B: np.ndarray
    # Feature matrices for actions A and B: row s is the feature vector
    # phi(s, a) of the pair (s, a), one column per feature.
    phi_A: np.ndarray
    phi_B: np.ndarray
    # policy: probabilities of selecting action A / action B
    prob_A: float
    prob_B: float
    # parameter vector
    w: np.ndarray
    # remaining parameters
    alpha: float
    gamma: float
    max_k: int

    def __init__(self, time_steps, n_states, reward, actions, prob_A, prob_B, alpha, gamma, max_k) -> None:
        """Store the hyper-parameters and build the fixed MDP description.

        Args:
            time_steps: Number of updates performed by one call to ``game``.
            n_states: Number of states (states are ``0 .. n_states - 1``).
            reward: Constant reward passed to every update.
            actions: Action labels, in the same order as
                ``[prob_A, prob_B]``; expected to be ``['A', 'B']``.
            prob_A: Probability that the policy selects action ``'A'``.
            prob_B: Probability that the policy selects action ``'B'``.
            alpha: Step size.
            gamma: Discount factor.
            max_k: Stored as ``self.max_k``; not otherwise used by this class.
        """
        self.time_steps = time_steps
        self.n_states = n_states
        self.reward = reward
        self.actions = actions
        # Action A always leads to the last state.
        self.p_A = np.zeros((n_states, n_states))
        self.p_A[:, -1] = 1
        # Action B leads to any state but the last, each with probability 1/6.
        self.p_B = np.zeros((n_states, n_states))
        self.p_B[:, :-1] = 1/6
        self.phi_A = np.array([[2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                               [0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                               [0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                               [0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0]])
        self.phi_B = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
                               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
        self.prob_A = prob_A
        self.prob_B = prob_B
        # Float dtype from the start: the updates add float increments, so an
        # integer array would change dtype after the first step anyway.
        self.w = np.array([1, 1, 1, 1, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1], dtype=float)
        self.alpha = alpha
        self.gamma = gamma
        self.max_k = max_k

    def _get_parameter_vector_norm(self):
        """Return the Euclidean norm of the current weight vector."""
        return np.linalg.norm(self.w)

    def _get_feature_vector(self, state, action):
        """Return the feature vector phi(state, action).

        Raises:
            ValueError: If ``action`` is neither ``'A'`` nor ``'B'``.
        """
        if action == 'A':
            return self.phi_A[state]
        if action == 'B':
            return self.phi_B[state]
        raise ValueError(f"Unknown action: {action!r}")

    def _get_transition_matrix(self, action):
        """Return the state-transition matrix of ``action``.

        Raises:
            ValueError: If ``action`` is neither ``'A'`` nor ``'B'``.
        """
        if action == 'A':
            return self.p_A
        if action == 'B':
            return self.p_B
        raise ValueError(f"Unknown action: {action!r}")

    def _q_function(self, state, action):
        """Return the approximate action value phi(state, action) . w."""
        return np.matmul(self._get_feature_vector(state, action), self.w)

    def _choose_action(self) -> str:
        """Sample an action from the fixed policy (prob_A, prob_B)."""
        return np.random.choice(self.actions, p=[self.prob_A, self.prob_B])

    @abstractmethod
    def _update_w(self, cur_state, cur_action, next_state, next_action, reward):
        """Update ``self.w`` from one transition (s, a, r, s', a')."""

    def game(self) -> tuple[np.ndarray, list[float]]:
        """Run ``self.time_steps`` updates under the fixed policy.

        Returns:
            A pair ``(time_steps, norms)`` where ``time_steps`` is
            ``np.arange(self.time_steps)`` and ``norms[t]`` is ``||w||`` right
            after the update performed at step ``t``.
        """
        time_steps = np.arange(self.time_steps)
        norms = []
        cur_state = np.random.choice(range(self.n_states))
        cur_action = self._choose_action()
        for time_step in range(self.time_steps):
            print(f"\n------ TIME STEP {time_step} ------")
            transition = self._get_transition_matrix(cur_action)
            next_state = np.random.choice(range(self.n_states), p=transition[cur_state])
            next_action = self._choose_action()
            reward = self.reward
            self._update_w(cur_state, cur_action, next_state, next_action, reward)
            # Advance BOTH the state and the action. Without moving the action
            # forward, the action sampled before the loop would be executed
            # at every step and the policy would never actually be followed.
            cur_state = next_state
            cur_action = next_action
            norms.append(self._get_parameter_vector_norm())
        return time_steps, norms

class Sarsa(TDLearning):
    """TD control that bootstraps from the action actually sampled next."""

    def _update_w(self, cur_state, cur_action, next_state, next_action, reward):
        """Apply one SARSA step to ``self.w``.

        w <- w + alpha * phi(s, a) * (r + gamma * q(s', a') - q(s, a))
        """
        scaled_features = self.alpha * self._get_feature_vector(cur_state, cur_action)
        bootstrap_target = reward + self.gamma * self._q_function(next_state, next_action)
        td_error = bootstrap_target - self._q_function(cur_state, cur_action)
        self.w = self.w + scaled_features * td_error

class QLearning(TDLearning):
    """TD control that bootstraps from the greedy next-state action value."""

    def _update_w(self, cur_state, cur_action, next_state, next_action, reward):
        """Apply one Q-learning step to ``self.w``.

        w <- w + alpha * phi(s, a) * (r + gamma * max_a' q(s', a') - q(s, a))

        ``next_action`` is accepted for interface compatibility but unused:
        the target takes the maximum over actions 'A' and 'B'.
        """
        scaled_features = self.alpha * self._get_feature_vector(cur_state, cur_action)
        next_values = [self._q_function(next_state, candidate) for candidate in ('A', 'B')]
        greedy_target = reward + self.gamma * np.max(next_values)
        td_error = greedy_target - self._q_function(cur_state, cur_action)
        self.w = self.w + scaled_features * td_error


In [None]:
# One row per independent run, one column per time step; each cell holds the
# parameter-vector norm recorded after that step's update.
sarsa_multiple_runs_norms = np.zeros((N_RUNS, TIME_STEPS))
qlearning_multiple_runs_norms = np.zeros((N_RUNS, TIME_STEPS))
# Both agents are built from the same hyper-parameters.
agent_args = (TIME_STEPS, N_STATES, REWARD, ACTIONS, PROB_A, PROB_B, ALPHA, GAMMA, MAX_K)
for run in range(N_RUNS):
    print(f"\n\n===== RUN {run} =====")
    # Fresh agents every run so the weights restart from their initial value.
    sarsa = Sarsa(*agent_args)
    qlearning = QLearning(*agent_args)
    sarsa_time_steps, sarsa_run_norms = sarsa.game()
    sarsa_multiple_runs_norms[run] = sarsa_run_norms
    qlearning_time_steps, qlearning_run_norms = qlearning.game()
    qlearning_multiple_runs_norms[run] = qlearning_run_norms

In [None]:
def plot_norms(time_steps, sarsa_avg_norms, qlearning_avg_norms):
    """Plot both norm curves against the time steps and show the figure.

    Args:
        time_steps: x-axis values shared by the two curves.
        sarsa_avg_norms: y-values drawn with the label 'Sarsa'.
        qlearning_avg_norms: y-values drawn with the label 'Q-learning'.
    """
    curves = (
        ('Sarsa', sarsa_avg_norms),
        ('Q-learning', qlearning_avg_norms),
    )
    for curve_label, curve_values in curves:
        plt.plot(time_steps, curve_values, label=curve_label)
    plt.xlabel('Time steps')
    plt.ylabel('Parameter vector norm ||w||')
    plt.legend()
    plt.show()

In [None]:
# Average over the run axis (axis 0) to get one mean norm per time step,
# then draw the two averaged curves on the same figure.
sarsa_avg_norms = sarsa_multiple_runs_norms.mean(axis=0)
qlearning_avg_norms = qlearning_multiple_runs_norms.mean(axis=0)
plot_norms(
    time_steps=sarsa_time_steps,
    sarsa_avg_norms=sarsa_avg_norms,
    qlearning_avg_norms=qlearning_avg_norms,
)

* SARSA: The norm of the parameter vector for SARSA seems to decrease initially and then stabilizes relatively quickly. This could indicate that the SARSA algorithm is converging to a solution where the updates to the weights become smaller over time, suggesting that the algorithm is stabilizing its policy.

* Q-learning: In contrast, the parameter vector norm for Q-learning increases consistently over time. This trend suggests that the weights are growing without stabilizing, which could be a sign of divergence or of an algorithm that is still actively learning and adjusting its policy. Most likely, Q-learning did not converge.

* Stability: SARSA seems to be more stable in this scenario compared to Q-learning. This could be due to SARSA being an on-policy algorithm, meaning it learns the value of the policy it follows, possibly leading to more conservative updates.

* Exploration vs. Exploitation: The increasing trend in the Q-learning norm could be a result of its off-policy nature: it learns the value of the greedy (best possible) policy while following a different behavior policy. This mismatch might cause larger updates, especially if the environment has many states and actions to explore.

* Potential Overfitting or Overshooting: The increasing norm in Q-learning might also indicate overfitting or overshooting of the value function approximation. It's possible that the greedy max over next-state action values in Q-learning's update target is leading to overestimation of the action values.

* Learning Speed: The plot does not necessarily reflect the speed of learning in terms of how quickly each algorithm finds a good policy, but rather the magnitude of the updates they make to their parameter vectors.

* Algorithm Suitability: Depending on the specific environment and the reward structure, one algorithm may be more suitable than the other. This plot might suggest that SARSA is more appropriate for this particular MDP if stability is preferred.
