<a href="https://colab.research.google.com/github/garfield-gray/MachineLearning/blob/main/Reinforcement_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

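# Simple Tram Problem

Blocks are numbered 1 to N and we start at block 1. From block s we can either **walk** to block s+1 (reward -1) or take the **tram** to block 2s (reward -2); the tram fails with probability 0.1, in which case we pay the cost but stay at block s. The goal is to reach block N with the highest expected total reward. We model this as an MDP and solve it with value iteration.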

In [8]:
class TransportationMDP:
    """MDP for the tram problem on blocks numbered 1..N.

    From block ``s`` the agent may 'walk' to ``s + 1`` (reward -1) or take the
    'tram' to ``2 * s`` (reward -2). The tram fails with probability
    ``fail_prob``; on failure the reward is still paid and the agent stays at
    ``s``. Block ``N`` is the only final state.
    """

    def __init__(self, N, fail_prob=0.1):
        """Create the MDP.

        Args:
            N: number of states (blocks are numbered 1..N).
            fail_prob: probability that a tram ride fails; defaults to the
                value that used to be hard-coded (0.1).

        Raises:
            ValueError: if ``fail_prob`` is not a probability in [0, 1].
        """
        if not 0.0 <= fail_prob <= 1.0:
            raise ValueError('fail_prob must be in [0, 1], got {!r}'.format(fail_prob))
        self.N = N
        self.fail_prob = fail_prob

    def startstate(self):
        """Return the initial state (block 1)."""
        return 1

    def is_final(self, state):
        """Return True when ``state`` is the destination block N."""
        return state == self.N

    def actions(self, state):
        """Return the actions that keep the agent within blocks 1..N."""
        result = []
        if state + 1 <= self.N:
            result.append('walk')
        if state * 2 <= self.N:
            result.append('tram')
        return result

    def succProbReward(self, state, action):
        """Return the list of (newState, prob, reward) triples for (state, action).

        With state = s, action = a and newState = s':
        prob = T(s, a, s') and reward = Reward(s, a, s').

        Raises:
            ValueError: if ``action`` is neither 'walk' nor 'tram'. Previously
                any unknown action was silently treated as a tram ride.
        """
        if action == 'walk':
            return [(state + 1, 1.0, -1.0)]
        if action == 'tram':
            return [
                (state * 2, 1 - self.fail_prob, -2.0),  # tram succeeds
                (state, self.fail_prob, -2.0),          # tram fails: pay, stay put
            ]
        raise ValueError('unknown action: {!r}'.format(action))

    def discount(self):
        """Return the discount factor gamma (no discounting)."""
        return 1.0

    def states(self):
        """Return all states, 1..N inclusive."""
        return range(1, self.N + 1)




In [9]:
# Sanity-check the model on a 10-block instance by probing state 5.
mdp = TransportationMDP(N=10)
# Not the last expression of the cell, so this result is computed but not displayed.
mdp.actions(5)
# Tram outcomes from state 5 as (newState, prob, reward) triples.
mdp.succProbReward(5, 'tram')

[(10, 0.9, -2.0), (5, 0.1, -2.0)]

In [10]:
import os

def value_iteration(mdp, tol=1e-10, interactive=True):
    """Run value iteration on ``mdp`` and return the values and greedy policy.

    Every sweep applies the Bellman optimality backup to all states:

        V(s) <- max_a sum_s' T(s, a, s') * [R(s, a, s') + gamma * V(s')]

    and iteration stops once the largest per-state change in a sweep is
    below ``tol``.

    Args:
        mdp: object providing ``states()``, ``actions(state)``,
            ``is_final(state)``, ``succProbReward(state, action)`` (a list of
            (newState, prob, reward) triples) and ``discount()``.
        tol: convergence threshold on the largest change of any state value
            between two consecutive sweeps.
        interactive: when True (the default, which keeps the original
            step-through behaviour) the screen is cleared, the current table
            is printed after every non-converged sweep and the function waits
            for Enter. Pass False to run unattended, e.g. under
            "Restart & Run All".

    Returns:
        A ``(V, pi)`` tuple: ``V`` maps each state to its estimated optimal
        value and ``pi`` maps each state to its greedy action ('final' for
        final states).
    """
    # Materialise once so states() may be any iterable, including a generator.
    states = list(mdp.states())
    V = {state: 0.0 for state in states}  # state -> estimate of optimal value

    def Q(state, action):
        # Reads the enclosing V, which is rebound after every sweep.
        return sum(prob * (reward + mdp.discount() * V[newState])
                   for newState, prob, reward in mdp.succProbReward(state, action))

    def greedy_policy():
        # One-step lookahead on the current V. Ties between equal Q-values are
        # broken by the (Q, action) tuple comparison, i.e. by the action itself.
        pi = {}
        for state in states:
            if mdp.is_final(state):
                pi[state] = 'final'
            else:
                pi[state] = max((Q(state, action), action) for action in mdp.actions(state))[1]
        return pi

    while True:
        # compute the new values (newV) from the old ones (V)
        newV = {}
        for state in states:
            if mdp.is_final(state):
                newV[state] = 0.0
            else:
                newV[state] = max(Q(state, action) for action in mdp.actions(state))

        # check convergence; default=0.0 covers an MDP with no states
        delta = max((abs(V[state] - newV[state]) for state in states), default=0.0)
        V = newV
        if delta < tol:
            break

        if interactive:
            pi = greedy_policy()
            os.system('clear')
            print('{:15} {:15} {:15}'.format('State', 'Optimal Value', 'Optimal Action'))
            for state in states:
                print('{:15} {:15} {:15}'.format(state, V[state], pi[state]))
            input()

    return V, greedy_policy()



In [11]:
# Solve the 10-block tram problem. value_iteration prints its table after each
# sweep and then waits on input(), so press Enter to advance to the next sweep.
mdp = TransportationMDP(N=10)
value_iteration(mdp)

State           Optimal Value   Optimal Action 
              1            -1.0 walk           
              2            -1.0 walk           
              3            -1.0 walk           
              4            -1.0 walk           
              5            -1.0 walk           
              6            -1.0 walk           
              7            -1.0 walk           
              8            -1.0 walk           
              9            -1.0 walk           
             10             0.0 final          

State           Optimal Value   Optimal Action 
              1            -2.0 walk           
              2            -2.0 walk           
              3            -2.0 walk           
              4            -2.0 walk           
              5            -2.0 tram           
              6            -2.0 walk           
              7            -2.0 walk           
              8            -2.0 walk           
              9            -1.0 walk   

In [None]:









# Tuples compare element by element, so min() picks the pair with the smallest first component.
min((3, 'walk'), (2, 'tram'), (4, 'walk'), (6, 'tram'), (10, 'walk'))

(2, 'tram')

In [3]:
# Ties on the first component fall through to the second: 'walk' > 'tram', so
# (10, 'walk') beats (10, 'tram'). The policy read-out in value_iteration relies
# on this when it takes max() over (Q-value, action) pairs.
max((3, 'walk'), (10, 'tram'), (4, 'walk'), (6, 'tram'), (10, 'walk'))

(10, 'walk')

# Volcano

# who knows?!