In [None]:


import gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict

if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [None]:
# Single Blackjack environment instance; later cells pass it to the
# prediction functions, and winning_prob reads it as a module-level global.
env = BlackjackEnv()

In [None]:
def mc_prediction(policy, env, num_episodes, discount_factor=1.0):
    """
    First-visit Monte Carlo prediction. Estimates the value function
    of a given policy by averaging sampled returns.

    Args:
        policy: A function that maps an observation to the action to take;
            its result is passed directly to env.step().
        env: OpenAI gym environment. Must provide reset() -> observation and
            step(action) -> (observation, reward, done, info).
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.

    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float. Only states that
        were visited in at least one episode have an entry.
    """

    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final value function
    V = defaultdict(float)

    for i_episode in range(1, 1 + num_episodes):
        if i_episode % 10000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Follow the policy until the episode terminates, recording the
        # state each action was taken in and the reward it produced.
        # Observations are converted to tuples once, here, so that the same
        # hashable key is used for recording and for every later lookup
        # (list observations would otherwise never match their tuple key).
        episode = []
        observation = env.reset()
        is_done = False
        while not is_done:
            action = policy(observation)
            next_observation, reward, is_done, _ = env.step(action)
            episode.append((tuple(observation), reward))
            observation = next_observation

        # Walk the episode backwards so the discounted return from every
        # time step is obtained in a single pass: G_t = r_t + gamma * G_{t+1}.
        # Because earlier steps are processed last, the value left in the
        # dict for each state is the return following its FIRST visit.
        first_visit_return = {}
        g = 0.0
        for state, reward in reversed(episode):
            g = reward + discount_factor * g
            first_visit_return[state] = g

        # Fold this episode's first-visit returns into the running averages.
        for state, state_return in first_visit_return.items():
            returns_count[state] += 1
            returns_sum[state] += state_return
            V[state] = returns_sum[state] / returns_count[state]
    return V


In [None]:
def mc_every_prediction(policy, env, num_episodes, discount_factor=1.0):
    """
    Every-visit Monte Carlo prediction. Estimates the value function
    of a given policy by averaging the sampled return that follows
    every occurrence of a state, not just the first one per episode.

    Args:
        policy: A function that maps an observation to the action to take;
            its result is passed directly to env.step().
        env: OpenAI gym environment. Must provide reset() -> observation and
            step(action) -> (observation, reward, done, info).
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.

    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float. Only states that
        were visited in at least one episode have an entry.
    """

    # Running sum and count of returns per state, used to form the average.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final value function
    V = defaultdict(float)

    for i_episode in range(1, 1 + num_episodes):
        if i_episode % 10000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Follow the policy until the episode terminates, recording the
        # state each action was taken in and the reward it produced.
        # Observations are converted to tuples once, here, so that the same
        # hashable key is used for recording and for every later lookup
        # (list observations would otherwise never match their tuple key
        # and all of their visits would be silently dropped).
        episode = []
        observation = env.reset()
        is_done = False
        while not is_done:
            action = policy(observation)
            next_observation, reward, is_done, _ = env.step(action)
            episode.append((tuple(observation), reward))
            observation = next_observation

        # Walk the episode backwards so the discounted return from every
        # time step is obtained in a single pass: G_t = r_t + gamma * G_{t+1}.
        # Every visit contributes its own return to the state's statistics.
        g = 0.0
        for state, reward in reversed(episode):
            g = reward + discount_factor * g
            returns_count[state] += 1
            returns_sum[state] += g

        # Refresh the estimate of each state touched by this episode.
        for state in {visited for visited, _ in episode}:
            V[state] = returns_sum[state] / returns_count[state]
    return V


In [None]:
def sample_policy(observation):
    """
    A policy that sticks (action 0) once the player score is 17 or more
    and hits (action 1) otherwise.

    Args:
        observation: A 3-item sequence (player score, dealer score,
            usable ace). Only the player score is used.

    Returns:
        0 to stick, 1 to hit.
    """
    player_total, _dealer_total, _has_usable_ace = observation
    if player_total >= 17:
        return 0
    return 1

In [None]:
# Quick 10,000-episode variant, currently disabled:
# V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
# plotting.plot_value_function(V_10k, title="Sample_p 10,000 Steps")

# Estimate the value function of sample_policy with mc_prediction over
# 500,000 episodes and plot it. The "Steps" in the title refers to the
# number of sampled episodes.
V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="Sample_p 500,000 Steps")

In [None]:
def my_policy(observation):
    """
    A policy that hits (action 1) while the player score is below 18,
    keeps hitting up to a score of 19 when holding a usable ace, and
    sticks (action 0) otherwise.

    Args:
        observation: A 3-item sequence (player score, dealer score,
            usable ace). The dealer score is not used.

    Returns:
        1 to hit, 0 to stick.
    """
    player_total, _dealer_total, has_usable_ace = observation
    # Below 18 the policy always draws another card.
    if player_total < 18:
        return 1
    # With a usable ace the policy also draws on 18 and 19.
    if has_usable_ace and player_total < 20:
        return 1
    return 0


In [None]:

# Evaluate my_policy with each prediction method over 10,000 episodes and
# plot the resulting value function. V_10k is rebound by every run, so
# after this cell it holds only the last (td_prediction) estimate.
V_10k = mc_prediction(my_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="mc_prediction 10,000 Steps")

# Longer 500,000-episode run, currently disabled:
# V_500k = mc_prediction(my_policy, env, num_episodes=500000)
# plotting.plot_value_function(V_500k, title="My_p18  500,000 Steps")

V_10k = mc_every_prediction(my_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="mc_every_prediction 10,000 Steps")

# NOTE(review): td_prediction is not defined in any cell visible here;
# confirm it is defined before this cell runs, otherwise this raises NameError.
V_10k = td_prediction(my_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="td_prediction 10,000 Steps")

In [None]:
def winning_prob(policy, num_episodes):
    """
    Plays num_episodes games on the module-level `env` following `policy`
    and tallies the outcomes.

    After each episode the outcome is read from env.get_player_reward():
    a positive value counts as a player win (and exactly 1.5 additionally
    counts as a blackjack), zero counts as a draw, anything else counts
    as a dealer win.

    Args:
        policy: A function that maps an observation to the action to take.
        num_episodes: Number of episodes to play. Must be non-zero, since
            the outcome counts are divided by it.

    Returns:
        A tuple (total_reward, player win fraction, draw fraction,
        dealer win fraction, blackjack count). Note the last item is a
        raw count, not a fraction.
    """
    reward_total = 0
    wins = 0
    losses = 0
    draws = 0
    naturals = 0

    for episode_no in range(1, num_episodes + 1):
        # Lightweight progress indicator, overwritten in place.
        if episode_no % 10000 == 0:
            print("\rEpisode {}/{}.".format(episode_no, num_episodes), end="")
            sys.stdout.flush()

        # Play one full game; the per-step reward is ignored because the
        # final outcome is queried from the environment afterwards.
        state = env.reset()
        finished = False
        while not finished:
            state, _step_reward, finished, _info = env.step(policy(state))

        outcome = env.get_player_reward()
        reward_total += outcome

        if outcome > 0:
            wins += 1
            if outcome == 1.5:
                naturals += 1
        elif outcome == 0:
            draws += 1
        else:
            losses += 1

    win_rate = wins / num_episodes
    draw_rate = draws / num_episodes
    loss_rate = losses / num_episodes
    return reward_total, win_rate, draw_rate, loss_rate, naturals

# Column legend for the result tuples printed below.
print("total_reward, player/num_episodes, flat/num_episodes, dealer/num_episodes, black_jack_count")
# Each line plays 10,000 fresh episodes and prints the raw tuple returned
# by winning_prob, not a single percentage.
print('The percentage of wining with my_policy is ' + str(winning_prob(my_policy, 10000)))
print('The percentage of wining with policy hit smaller than 17 is ' + str(winning_prob(sample_policy, 10000)))
# NOTE(review): `policy` is not defined in any cell visible here; confirm
# it is defined before this cell runs, otherwise this raises NameError.
print('The percentage of wining with RL policy is ' + str(winning_prob(policy, 10000)))
