<a href="https://colab.research.google.com/github/dchui1/659-project/blob/master/Q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import gym
import random
import math
import numpy as np

class RiverSwim:
    """Six-state RiverSwim chain (states 0..5) as a continuing task.

    Action 0 leaves the position unchanged. Action 1 attempts to swim up the
    river (towards state 5) and succeeds, stays put, or is pushed back down
    according to the probabilities stored on the instance. Reward depends
    only on the state reached: 1.0 in state 5, 0.005 in state 0, else 0.0.
    """

    def __init__(self):
        # Step budget per episode; read by the training loop, not used here.
        self.STEPS_LIMIT = 20000
        self.pos = 0
        # Outcome probabilities for action 1 in the middle of the chain.
        self.swimRightStay = 0.6
        self.swimRightUp = 0.35
        self.swimRightDown = 0.05
        # Probability of moving up from the first state when taking action 1.
        self.S1swimRightUp = 0.6
        # Probability of being pushed down from the last state on action 1.
        self.SNswimRightDown = 0.4

    def reset(self):
        """Put the agent back in state 0 and return that state."""
        self.n = 0  # not read anywhere in this class
        self.pos = 0
        return self.pos

    def step(self, a):
        """Apply action ``a`` and return ``(state, reward, terminated, action)``.

        ``terminated`` is always ``False``: this is a continuing task, so the
        environment never sends a termination signal.
        """
        # Action 0 does nothing; only action 1 can change the position.
        if a == 1:
            # One uniform draw in [0, 1) decides the outcome of swimming up.
            flip = random.random()
            if self.pos <= 0:  # first state in chain
                # Move up with probability S1swimRightUp. (The previous test,
                # `flip > S1swimRightUp`, moved up with the complementary
                # probability 1 - S1swimRightUp.)
                if flip < self.S1swimRightUp:
                    self.pos = self.pos + 1
            elif self.pos >= 5:  # end of chain
                if flip <= self.SNswimRightDown:
                    self.pos = self.pos - 1
            else:  # middle of chain
                if flip <= self.swimRightDown:
                    self.pos = self.pos - 1
                elif flip > self.swimRightDown + self.swimRightStay:
                    # Remaining mass 1 - (down + stay) equals swimRightUp.
                    self.pos = self.pos + 1
        # Make sure the next state we return is between 0 and 5.
        self.pos = np.clip(self.pos, 0, 5)

        return (self.pos, self.rewardFunction(self.pos), False, a)

    def rewardFunction(self, x):
        """Return the deterministic reward for being in state ``x``."""
        if x >= 5:
            return 1.0
        if x <= 0:
            return 5.0/1000.0
        return 0.0

    def numObservations(self):
        """Return 1: the observation is a single number, the position (0..5)."""
        return 1

    def numActions(self):
        """Return 2: action 0 is stay, action 1 is swim up the river."""
        return 2
    # Daniel was here
    


#### Imports and Definitions

In [0]:
class Q:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Args:
        num_states: number of rows in the action-value table.
        num_acts: number of actions (columns of the table).
        alpha: step size applied to the TD error.
        gamma: discount used for non-terminal transitions.
        epsilon: probability of picking a uniformly random action.

    The defaults reproduce the previously hard-coded configuration
    (6 states, 2 actions, alpha=0.01, gamma=0.99, epsilon=0.0).
    """

    def __init__(self, num_states=6, num_acts=2, alpha=0.01, gamma=0.99,
                 epsilon=0.0):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

        self.num_acts = num_acts

        # Action-value table indexed as Q[state, action], initialised to 0.
        self.Q = np.zeros((num_states, self.num_acts))
        self.next_action = 0

    def policy(self, S):
        """Return an epsilon-greedy action for state ``S``."""
        if random.random() < self.epsilon:
            return random.randint(0, self.num_acts - 1)
        return self.maxAction(S)

    def maxAction(self, s):
        """Return a greedy action for state ``s``, breaking ties at random."""
        return self.breakTie(self.Q[s, :])

    def getAction(self, Obs):
        """Return the action stored by the last ``start``/``update`` call.

        ``Obs`` is ignored.
        """
        return self.next_action

    def learn(self, s, sp, r, a, gamma):
        """Apply one Q-learning update for the transition (s, a, r, sp).

        ``gamma`` is the discount for this transition; ``update`` passes 0
        when the episode has terminated so the bootstrap term vanishes.
        """
        ap = self.maxAction(sp)
        Q_p = self.Q[sp, ap]

        tde = (r + gamma * Q_p) - self.Q[s, a]
        self.Q[s, a] = self.Q[s, a] + self.alpha*tde

    def update(self, S, Sp, r, a, done):
        """Learn from a transition and, if not done, pick the next action."""
        if done:
            self.learn(S, Sp, r, a, 0)
        else:
            # The next action is chosen before the table is updated.
            self.next_action = self.policy(Sp)
            self.learn(S, Sp, r, a, self.gamma)

    def start(self, obs):
        """Choose, store and return the first action of an episode."""
        self.next_action = self.policy(obs)
        return self.next_action

    def breakTie(self, act_vals):
        """Return a uniformly random index among the maxima of ``act_vals``.

        Raises:
            ValueError: if no entry equals the maximum (e.g. NaN values).
        """
        indexes = np.where(act_vals == np.max(act_vals))[0]
        if len(indexes) < 1:
            # Fail with a clear message instead of printing debug output and
            # then crashing inside np.random.choice on an empty array.
            raise ValueError(
                "no maximal action found for action values %r" % (act_vals,)
            )
        return np.random.choice(indexes)


#### Q-learning Agent with No Bonus

In [0]:
def q_learning(env, num_episodes, q):
    """Run ``num_episodes`` episodes of agent ``q`` on ``env``; return ``q.Q``.

    Args:
        env: environment exposing ``reset()``, ``step(a)`` returning
            ``(next_state, reward, done, _)``, and ``STEPS_LIMIT``.
        num_episodes: number of episodes to run.
        q: agent exposing ``start(s)``, ``policy(s)``,
            ``update(s, sp, r, a, done)`` and the table ``Q``.

    Each episode ends when the environment reports ``done`` or after
    ``env.STEPS_LIMIT`` steps, whichever comes first. The final step of a
    truncated episode is passed to the agent with ``done=True``.
    """
    for _ in range(num_episodes):
        s = env.reset()
        a = q.start(s)

        last_step = env.STEPS_LIMIT - 1
        for step in range(env.STEPS_LIMIT):
            sp, r, done, _info = env.step(a)
            done = done or step == last_step
            q.update(s, sp, r, a, done)
            if done:
                # Stop stepping once the episode is over; previously the loop
                # kept acting in the environment after a terminal transition.
                break
            s = sp
            # Select the next action from the freshly updated table.
            a = q.policy(s)
    return q.Q

In [0]:
# Train the tabular Q-learning agent on RiverSwim for a single episode
# (one episode is env.STEPS_LIMIT steps, since the task never terminates).
num_episodes = 1
env = RiverSwim()
Q_object = Q()  # agent instance; its .Q attribute is the NumPy action-value table
Q_array = q_learning(env, num_episodes, Q_object) 
# Display the learned action-value table as the cell output.
Q_array

In [0]:
# Sanity-check the result: print the table's shape and show its type.
print(Q_array.shape)
type(Q_array)

#### Q-learning Agent with Bonus Updated Tabularly

In [0]:
class B:
    """Per-(state, action) running mean and sample variance of observed values.

    ``self.B[s, a, 0]`` holds the running mean and ``self.B[s, a, 1]`` the
    running sample variance; ``self.count[s, a]`` is the number of values
    folded in so far for that pair.

    Args:
        num_states: number of states (first table dimension).
        num_acts: number of actions (second table dimension).
    """

    def __init__(self, num_states=6, num_acts=2):
        # The sizes used to be read from undefined names; they are now
        # explicit parameters whose defaults match the Q agent's table.
        self.B = np.zeros((num_states, num_acts, 2))
        # One counter per (s, a): a single shared counter would mix the
        # sample sizes of unrelated pairs.
        self.count = np.zeros((num_states, num_acts), dtype=int)

    def stat_parameters(self, s, a):
        """Return ``(mean, variance)`` currently stored for pair (s, a)."""
        return (self.B[s, a, 0], self.B[s, a, 1])

    def update_stats(self, s, a, val=0.0):
        """Fold ``val`` into the running statistics of pair (s, a).

        ``val`` defaults to 0 for exploration bonuses.
        """
        self.count[s, a] += 1
        n = self.count[s, a]
        old_m, old_var = self.stat_parameters(s, a)
        delta = val - old_m
        self.B[s, a, 0] = old_m + delta / n
        if n < 2:
            # The sample variance of a single observation is taken as 0.
            self.B[s, a, 1] = 0.0
        else:
            # Sample-variance recurrence:
            #   s_n^2 = s_{n-1}^2 + (x_n - m_{n-1})^2 / n - s_{n-1}^2 / (n - 1)
            # `**` is exponentiation; `^` is bitwise XOR and fails on floats.
            new_var = old_var + delta ** 2 / n - old_var / (n - 1)
            # Clamp tiny negative values caused by floating-point rounding.
            self.B[s, a, 1] = max(new_var, 0.0)

    def sample(self, s, a):
        """Draw from a normal with the stored mean and variance of (s, a)."""
        mean, var = self.stat_parameters(s, a)
        # np.random.normal expects the standard deviation, not the variance.
        return np.random.normal(mean, np.sqrt(var))



In [0]:
# Notes:
# windy gridworld -> stochastic world.. maybe ignore stochasticity at first
# Try mountain car? This is a continuous-state domain
# river swim: states have far enough variance... How is this determined?