In [None]:
import numpy as np
import matplotlib.pyplot as plt

### Q1.

```
# Pseudocode for Monte Carlo with Exploring Starts

Init:
    pi(s);         chosen from A(s) for all states s. 
    Q(s,a);        chosen from R for all state-action pairs (s,a).
    Returns(s,a);  dictionary of mean returns (single value).
    Counts(s,a);   List to keep track of state-actions encountered.
    
While True(for each episode):
    Choose (s,a) pair so that all such pairs are picked with a non-zero probability every episode. 
    Generate episode from (S0,A0) under pi.
    Init G=0
    for t in range(T-1, -1, -1):
        G = discount*G + R(t+1)
        Unless (St,At) appears earlier in the episode (i.e. among (S0,A0) ... (St-1,At-1)):
            Counts(St,At) += 1
            Returns(St,At) = Returns(St,At) + [G - Returns(St,At)]/Counts(St,At)
            Q(St,At) = Returns(St,At)
            pi(St) = argmax(a, Q(St,a))
```

* This incremental update is equivalent to averaging over all returns: the running mean `Returns(St,At)` is weighted by `(n-1)/n` and the current return `G` by `1/n`, where `n` is the number of times the pair has been visited so far (counted once per episode for First-Visit MC, or on every occurrence for Every-Visit MC).
* This is exactly what we do during averaging.

In [2]:
"""
Monte Carlo base class.
"""
class MonteCarlo():
    """First-visit Monte Carlo estimation of state values under a fixed policy.

    States are used directly as indices into ``state_values`` and ``returns``,
    so they must be integers in ``range(len(states))``.

    Attributes:
        state_values: 1-D array holding the mean observed return per state.
        returns: one list per state with every first-visit return seen so far.
        policy: callable mapping a state to an action.
        gamma: discount factor applied to future rewards.
    """

    def __init__(self, states, actions, policy, gamma):
        """Allocate the value table and the per-state return lists.

        Args:
            states: sized collection of states; only its length is used.
            actions: accepted for interface compatibility; not used here.
            policy: callable ``policy(state) -> action``.
            gamma: discount factor.
        """
        self.state_values = np.zeros(shape=(len(states)))
        self.returns = [[] for _ in range(len(states))]
        # Note carried over from the blackjack task: hit is +1 and stick is -1.
        self.policy = policy
        self.gamma = gamma

    def run_episode(self, model=None, horizon=None, start_state=0):
        """Generate one episode and apply a first-visit Monte Carlo update.

        Args:
            model: object exposing ``play(state, action) -> (next_state, reward)``.
            horizon: number of transitions to simulate.
            start_state: state the episode starts from.

        Raises:
            ValueError: if ``model`` or ``horizon`` is not supplied.
        """
        if model is None or horizon is None:
            raise ValueError("run_episode needs a model and a horizon to generate an episode")

        states, actions, rewards = self.gen_episode(model, horizon, start_state)

        # Earliest time step at which each state occurs. The final state is
        # excluded because no reward follows it within this episode.
        first_visit = {}
        for t, state in enumerate(states[:len(rewards)]):
            first_visit.setdefault(state, t)

        G = 0
        # Walk the episode backwards so G is the discounted return from step t.
        for t in range(len(rewards) - 1, -1, -1):
            G = self.gamma * G + rewards[t]
            state = states[t]
            if first_visit[state] == t:
                self.returns[state].append(G)
                self.state_values[state] = sum(self.returns[state]) / len(self.returns[state])

    def gen_episode(self, model, horizon, start_state=0):
        """Roll out ``horizon`` transitions from ``start_state`` following the policy.

        Args:
            model: object exposing ``play(state, action) -> (next_state, reward)``.
            horizon: number of transitions to simulate.
            start_state: first state of the episode.

        Returns:
            ``(states, actions, rewards)`` where ``states`` and ``actions`` have
            ``horizon + 1`` entries and ``rewards[t]`` is the reward received
            after taking ``actions[t]`` in ``states[t]``.
        """
        states = [start_state]
        actions = [self.policy(start_state)]
        rewards = []

        for _ in range(horizon):
            next_state, reward = model.play(states[-1], actions[-1])
            rewards.append(reward)
            states.append(next_state)
            # Choose the next action as per the policy.
            actions.append(self.policy(next_state))

        return states, actions, rewards
        
        

In [None]:
"""
Blackjack code used for Q4.
"""
class BlackjackGame():
    """Single-player blackjack against a dealer, drawing from an infinite deck.

    Actions: hit is 1 and stick is -1. An ace is dealt as 11 and demoted to 1
    whenever counting it as 11 would bust the hand.

    NOTE(review): the original ``play`` stopped right after drawing the hit
    card. The hit/stick resolution below follows the usual blackjack rules
    (dealer draws until reaching 17 or more; rewards +1 / 0 / -1) — confirm
    this matches the assignment's intent.
    """

    def __init__(self):
        """
        Let hit be a 1
        and stick be -1
        """
        # Default policy indexed by player total: hit below 17, stick from 17 up.
        self.policy = np.zeros(22)
        self.policy[:17] = 1
        self.policy[17:] = -1

        self.player_total = 0
        self.dealer_total = 0
        self.dealer_cards = (0, 0)
        self.player_usable_ace = False
        self.usable_ace_counter = 0
        # Set by play(): True once the last call ended the game.
        self.game_over = False

    def deal_card(self):
        """Draw one card value: 11 for an ace, 10 for any face card, else the pip value."""
        # There are 13 unique card faces in total.
        card = np.random.randint(1, 14)
        if card == 1:
            return 11
        return min(card, 10)

    @staticmethod
    def _add_card(total, soft_aces, card):
        """Add ``card`` to a hand, demoting aces from 11 to 1 while the hand is bust."""
        total += card
        if card == 11:
            soft_aces += 1
        while total > 21 and soft_aces > 0:
            total -= 10
            soft_aces -= 1
        return total, soft_aces

    def play(self, initial_state, action):
        """Apply one player action and return the resulting state and reward.

        Args:
            initial_state: ``(current total, dealer card viewable, has a usable ace)``.
                A total of 0 means the player has not been dealt yet; a dealer
                card of 0 means the dealer has not been dealt yet.
            action: 1 to hit, -1 to stick.

        Returns:
            ``(new_state, reward)`` with ``new_state`` in the same format as
            ``initial_state``. The reward is 0 after a non-busting hit, -1 on a
            bust or loss, 0 on a draw and +1 on a win. ``self.game_over`` tells
            whether this call ended the game.

        Raises:
            ValueError: if ``action`` is neither 1 nor -1.
        """
        self.player_total, self.deal_card_viewable, self.player_usable_ace = initial_state
        self.player_usable_ace = bool(self.player_usable_ace)
        self.usable_ace_counter = 1 if self.player_usable_ace else 0
        self.game_over = False

        # Dealer: the visible card comes from the state, the hidden one is drawn now.
        if self.deal_card_viewable == 0:
            self.deal_card_viewable = self.deal_card()
        self.dealer_cards = (self.deal_card_viewable, self.deal_card())
        self.dealer_total, dealer_soft_aces = 0, 0
        for card in self.dealer_cards:
            self.dealer_total, dealer_soft_aces = self._add_card(
                self.dealer_total, dealer_soft_aces, card)

        # Player: deal the opening two cards when starting from an empty hand.
        if self.player_total == 0:
            for _ in range(2):
                self.player_total, self.usable_ace_counter = self._add_card(
                    self.player_total, self.usable_ace_counter, self.deal_card())

        if action == 1:
            new_card = self.deal_card()
            self.player_total, self.usable_ace_counter = self._add_card(
                self.player_total, self.usable_ace_counter, new_card)
            if self.player_total > 21:
                reward = -1
                self.game_over = True
            else:
                reward = 0
        elif action == -1:
            # Dealer's turn: draw until reaching at least 17.
            while self.dealer_total < 17:
                self.dealer_total, dealer_soft_aces = self._add_card(
                    self.dealer_total, dealer_soft_aces, self.deal_card())
            if self.dealer_total > 21 or self.player_total > self.dealer_total:
                reward = 1
            elif self.player_total == self.dealer_total:
                reward = 0
            else:
                reward = -1
            self.game_over = True
        else:
            raise ValueError("action must be 1 (hit) or -1 (stick), got %r" % (action,))

        self.player_usable_ace = self.usable_ace_counter > 0
        new_state = (self.player_total, self.deal_card_viewable, self.player_usable_ace)
        return new_state, reward
            

In [None]:
"""
SARSA and Q-Learning
"""
def Q_learning(terminals, states_size, actions_size, policy, model, start,
               episodes=500, alpha=0.1, gamma=1):
    """Tabular Q-learning (off-policy TD control).

    Assume states are numbered 0 to ``states_size - 1``.
    Directions are 0, 1, 2, 3 for up, left, down, right.

    Args:
        terminals: collection of terminal states; an episode ends on reaching one.
        states_size: number of states (rows of the Q table).
        actions_size: number of actions (columns of the Q table).
        policy: callable ``policy(Q_values, state) -> action`` used to behave.
        model: callable ``model(state, action) -> (next_state, reward)``.
        start: state every episode starts from.
        episodes: number of episodes to run.
        alpha: step size.
        gamma: discount factor.

    Returns:
        ``(Q_values, accumulated_rewards)`` where ``Q_values`` has shape
        ``(states_size, actions_size)`` and ``accumulated_rewards`` has shape
        ``(episodes, 1)`` holding the sum of rewards collected in each episode.
    """
    Q_values = np.zeros(shape=(states_size, actions_size))
    accumulated_rewards = np.zeros(shape=(episodes, 1))

    for episode in range(episodes):
        s = start

        while s not in terminals:
            a = policy(Q_values, s)
            s_next, r = model(s, a)
            # Bootstrap from the greedy value of the next state, regardless of
            # which action the behaviour policy will actually take there.
            td_target = r + gamma * np.max(Q_values[s_next])
            Q_values[s, a] += alpha * (td_target - Q_values[s, a])
            accumulated_rewards[episode] += r
            s = s_next

    return Q_values, accumulated_rewards
            
        
def SARSA(terminals, states_size, actions_size, policy, model, start,
               episodes=500, alpha=0.1, gamma=1):
    """Tabular SARSA (on-policy TD control).

    Assume states are numbered 0 to ``states_size - 1``.
    Directions are 0, 1, 2, 3 for up, left, down, right.

    Args:
        terminals: collection of terminal states; an episode ends on reaching one.
        states_size: number of states (rows of the Q table).
        actions_size: number of actions (columns of the Q table).
        policy: callable ``policy(Q_values, state) -> action``.
        model: callable ``model(state, action) -> (next_state, reward)``.
        start: state every episode starts from.
        episodes: number of episodes to run.
        alpha: step size.
        gamma: discount factor.

    Returns:
        ``(Q_values, accumulated_rewards)`` where ``Q_values`` has shape
        ``(states_size, actions_size)`` and ``accumulated_rewards`` has shape
        ``(episodes, 1)`` holding the sum of rewards collected in each episode.
    """
    Q_values = np.zeros(shape=(states_size, actions_size))
    accumulated_rewards = np.zeros(shape=(episodes, 1))

    for episode in range(episodes):
        s = start
        a = policy(Q_values, s)

        while s not in terminals:
            s_next, r = model(s, a)
            # On-policy: bootstrap from the action the policy actually picks
            # next, and then really take that action on the following step.
            a_next = policy(Q_values, s_next)
            td_target = r + gamma * Q_values[s_next, a_next]
            Q_values[s, a] += alpha * (td_target - Q_values[s, a])
            accumulated_rewards[episode] += r
            s = s_next
            a = a_next

    return Q_values, accumulated_rewards

In [None]:
"""
Code used in Q7.
"""
import random

class cliffWalking():
    """Cliff-walking grid world on a 4 x 12 grid with states numbered 0..47 row by row.

    Actions are 0, 1, 2, 3 for up, left, down, right. States 37..46 are the
    cliff (reward -100); every other transition gives reward -1. States 37..47
    are listed as terminal.

    ``model`` and ``policy`` are static, so they can be passed around as plain
    functions (``cliffWalking.model``) or called on an instance.
    """

    def __init__(self):
        """Expose the problem dimensions and the terminal states as attributes."""
        self.states_size = 48
        self.action_size = 4
        self.terminals = [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]

    @staticmethod
    def model(state, action):
        """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

        Moves that would leave the grid keep the agent where it is. An unknown
        action is reported on stdout and leaves the state unchanged.
        """
        if action == 0:  # up
            if state - 12 >= 0:
                state -= 12

        elif action == 2:  # down
            if state + 12 < 48:
                state += 12

        elif action == 1:  # left, blocked in the first column
            if state % 12 != 0:
                state -= 1

        elif action == 3:  # right, blocked in the last column
            if state % 12 != 11:
                state += 1

        else:
            print("weird action:", action)

        # Landing on the cliff (states 37..46) is heavily penalised.
        reward = -100 if 36 < state < 47 else -1

        return state, reward

    @staticmethod
    def policy(Q_vals, state, epsilon=0.1):
        """
        The epsilon greedy policy.
        Q_vals: 2D numpy array
        state: int 0 to 47
        epsilon: probability of picking a uniformly random action.

        Returns the chosen action index.
        """
        # Strict comparison so that epsilon=0 is purely greedy.
        if random.random() < epsilon:
            return random.randint(0, len(Q_vals[state]) - 1)

        return np.argmax(Q_vals[state])