In [None]:
import numpy as np
import matplotlib.pyplot as plt

### Q1.

```
# Pseudocode for Monte Carlo with Exploring Starts

Init:
    pi(s);         chosen from A(s) for all states s. 
    Q(s,a);        chosen from R for all state-action pairs (s,a).
    Returns(s,a);  dictionary of mean returns (single value).
    Counts(s,a);   List to keep track of state-actions encountered.
    
While True(for each episode):
    Choose (s,a) pair so that all such pairs are picked with a non-zero probability every episode. 
    Generate episode from (S0,A0) under pi.
    Init G=0
    for t in range(T-1, -1, -1):
        G = discount*G + R(t+1)
        Unless (St,At) appears in (S0,A0), ..., (St-1,At-1):
            Counts(St,At) += 1
            Returns(St,At) = Returns(St,At) + [G - Returns(St,At)]/Counts(St,At)
            Q(St,At) = Returns(St,At)
            pi(St) = argmax over a of Q(St,a)
```

* This incremental update is equivalent to averaging over all returns: the previous mean `Returns(St,At)` is weighted by `(n-1)/n` and the current return `G` by `1/n`, where `n` is the number of times the pair has been visited (once per episode for First-Visit MC, every occurrence for Every-Visit MC).
* This is exactly what we do during averaging.

In [2]:
"""
Monte Carlo base class.
"""
class MonteCarlo():
    """First-visit Monte Carlo estimation of state values under a fixed policy.

    States are used directly as integer indices into ``state_values`` and
    ``returns``, so they must be integers in ``range(len(states))``.
    """

    def __init__(self, states, actions, policy, gamma):
        """Create empty value and return tables.

        Args:
            states: Sized collection of states; only its length is used.
            actions: Collection of actions. Accepted for interface
                compatibility; state-value prediction does not need it.
            policy: Callable mapping a state to the action to take.
            gamma: Discount factor applied to future rewards.
        """
        self.state_values = np.zeros(shape=(len(states)))
        # One list of observed first-visit returns per state.
        self.returns = [[] for _ in range(len(states))]
        self.policy = policy
        self.gamma = gamma

    def run_episode(self, model=None, horizon=None, start_state=0):
        """Generate one episode and update the first-visit value estimates.

        Args:
            model: Environment exposing ``play(state, action)`` that returns
                ``(next_state, reward)``.
            horizon: Number of steps to simulate.
            start_state: State index the episode starts from.

        Raises:
            ValueError: If ``model`` or ``horizon`` is not supplied.
        """
        if model is None or horizon is None:
            raise ValueError("run_episode requires a model and a horizon")

        states, actions, rewards = self.gen_episode(model, horizon, start_state)

        # Earliest time step at which each state occurs in this episode.
        first_visit = {}
        for t, state in enumerate(states):
            first_visit.setdefault(state, t)

        G = 0
        # Walk the episode backwards so G accumulates the discounted return.
        for t in reversed(range(len(states))):
            G = self.gamma * G + rewards[t]
            state = states[t]
            # First-visit MC: only the earliest occurrence contributes.
            if first_visit[state] == t:
                self.returns[state].append(G)
                self.state_values[state] = sum(self.returns[state]) / len(self.returns[state])

    def gen_episode(self, model, horizon, start_state=0):
        """Roll out the policy on ``model`` for ``horizon`` steps.

        Args:
            model: Environment exposing ``play(state, action)`` that returns
                ``(next_state, reward)``.
            horizon: Number of steps to simulate.
            start_state: State the rollout starts from.

        Returns:
            Tuple ``(states, actions, rewards)`` of equal-length lists, where
            ``rewards[t]`` is the reward received after taking ``actions[t]``
            in ``states[t]``.
        """
        states = []
        actions = []
        rewards = []

        state = start_state
        for _ in range(horizon):
            action = self.policy(state)  # Choose action as per policy
            next_state, reward = model.play(state, action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state

        return states, actions, rewards
        
        

In [None]:
"""
Blackjack code used for Q4.
"""
class BlackjackGame():
    def __init__(self):
        """
        Let hit be a 1
        and stick be -1
        """
        self.policy = np.zeros(22)
        for i in range(0, 17):
            self.policy[i] = 1
        for i in range(17, 22):
            self.policy[i] = -1
            
        self.player_total = 0
        self.dealer_total = 0
        self.dealer_cards = (0, 0)
        self.player_usable_ace = False
    
    def deal_card(self):
        """
        Draw one card uniformly at random and return its blackjack value.

        Returns:
            11 for an ace, the face value for cards 2 through 10, and 10 for
            the three remaining faces (drawn as 11, 12 and 13).
        """
        # There are 13 unique card faces in total
        card = np.random.randint(1, 14)
        if card == 1:
            return 11
        # Cap at 10 so the faces above ten all count as ten.
        return min(card, 10)
    
    def play(self, state, action):
#         State format: (current total, dealer card viewable, has a usable ace)
        if action=='hit':
            