<a href="https://colab.research.google.com/github/garfield-gray/MachineLearning/blob/main/Reinforcement_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Tram Problem

In [None]:
class TransportationMDP(object):
    """Tram problem modelled as an MDP over the locations ``1..N``.

    From location ``s`` the agent can

    * ``'walk'`` to ``s + 1`` with reward -1, or
    * take the ``'tram'``, which reaches ``2 * s`` with probability
      ``1 - failProb`` and otherwise leaves the agent at ``s``; the tram
      reward is -2 in both cases.

    Location ``N`` is the only final state.
    """

    def __init__(self, N, failProb=0.5):
        """Create the MDP.

        Args:
            N: number of states (locations are numbered ``1..N``).
            failProb: probability that a tram ride fails and the agent stays
                where it is. Defaults to 0.5, the value that used to be
                hard-coded in ``succProbReward``.
        """
        # N = number of states
        self.N = N
        self.failProb = failProb

    def startstate(self):
        """Return the start location, which is always 1."""
        return 1

    def is_final(self, state):
        """Return True when ``state`` is the last location ``N``."""
        return state == self.N

    def actions(self, state):
        """Return the actions whose destination does not go past ``N``."""
        result = []
        if state+1<=self.N:
            result.append('walk')
        if state*2<=self.N:
            result.append('tram')
        return result

    def succProbReward(self, state, action):
        """Return the list of ``(newState, prob, reward)`` triples for an action.

        state = s, action = a, newState = s',
        prob = T(s, a, s'), reward = Reward(s, a, s').
        Any action other than ``'walk'`` is treated as the tram.
        """
        result = []
        if action=='walk':
            result.append((state+1, 1.0, -1.0))
        else:
            failProb = self.failProb
            # Successful ride first, failed ride (agent stays put) second.
            result.append((state*2, 1-failProb, -2.0))
            result.append((state,   failProb, -2.0))
        return result

    def discount(self):
        """Return the discount factor gamma (1.0, i.e. no discounting)."""
        return 1.0

    def states(self):
        """Return all locations ``1..N`` as a range."""
        return range(1, self.N+1)




In [None]:
# Build a 10-location instance of the tram MDP and probe location 5.
mdp = TransportationMDP(N=10)
mdp.actions(5)  # not the cell's last expression, so the notebook does not display its value
mdp.succProbReward(5, 'tram')  # displayed below: list of (newState, prob, reward) triples

[(10, 0.5, -2.0), (5, 0.5, -2.0)]

In [None]:
import os

def value_iteration(mdp, epsilon=1e-10, verbose=True, pause=True):
    """Run value iteration on ``mdp`` and return ``(V, pi)``.

    ``mdp`` must provide ``states()``, ``is_final(state)``, ``actions(state)``,
    ``succProbReward(state, action)`` (a list of ``(newState, prob, reward)``
    triples) and ``discount()``.

    Args:
        mdp: the MDP to solve.
        epsilon: stop once no state's value changes by ``epsilon`` or more
            between two sweeps.
        verbose: after every non-converged sweep, print the per-state
            ``(Q, action)`` candidates and a value/policy table.
        pause: when ``verbose`` is set, wait for Enter (``input()``) after each
            printed table. Pass ``False`` for non-interactive runs.

    Returns:
        ``(V, pi)`` where ``V`` maps state -> estimated optimal value and ``pi``
        maps state -> greedy action (``'final'`` for final states).

    Raises:
        ValueError: if a non-final state has no available action.
    """
    # initialize
    V = {state: 0.0 for state in mdp.states()}  # state -> estimate of optimal value function

    def Q(state, action):
        # Q(s, a) = sum_s' T(s, a, s') * [R(s, a, s') + gamma * V(s')]
        # Reads the enclosing V, so it always uses the latest estimate.
        return sum(prob*(reward+mdp.discount()*V[newState]) for newState, prob, reward in mdp.succProbReward(state, action))

    def greedy_policy(show):
        # Greedy action per state under the current V; ties on Q are broken by
        # tuple comparison, i.e. by the larger action value.
        policy = {}
        for state in mdp.states():
            if mdp.is_final(state):
                policy[state] = 'final'
            else:
                candidates = [(Q(state, action), action) for action in mdp.actions(state)]
                if show:
                    print(candidates)
                policy[state] = max(candidates)[1]
        return policy

    while True:
        # compute one synchronous sweep
        newV = {}
        for state in mdp.states():
            # V*(s) = max_a sum_s' T(s, a, s') * [R(s, a, s') + gamma * V*(s')]
            if mdp.is_final(state):
                newV[state] = 0.0
            else:
                newV[state] = max(Q(state, action) for action in mdp.actions(state))

        # check convergence (default keeps an MDP with no states from crashing)
        if max((abs(V[state]-newV[state]) for state in mdp.states()), default=0.0) < epsilon:
            break
        V = newV

        if verbose:
            pi = greedy_policy(show=True)
            # print
            os.system('clear')
            print('{:15} {:15} {:15}'.format('State', 'Optimal Value', 'Optimal Action'))
            for state in mdp.states():
                print('{:15} {:15} {:15}'.format(state, V[state], pi[state]))
            if pause:
                input()

    # Compute the final policy silently so callers can use the result.
    return V, greedy_policy(show=False)



In [None]:
# Solve the 10-location tram MDP. value_iteration prints the (Q, action)
# candidates and a value/policy table after every sweep, then blocks on
# input() until Enter is pressed.
mdp = TransportationMDP(N=10)
value_iteration(mdp)

[(-2.0, 'walk'), (-3.0, 'tram')]
[(-2.0, 'walk'), (-3.0, 'tram')]
[(-2.0, 'walk'), (-3.0, 'tram')]
[(-2.0, 'walk'), (-3.0, 'tram')]
[(-2.0, 'walk'), (-2.5, 'tram')]
[(-2.0, 'walk')]
[(-2.0, 'walk')]
[(-2.0, 'walk')]
[(-1.0, 'walk')]
State           Optimal Value   Optimal Action 
              1            -1.0 walk           
              2            -1.0 walk           
              3            -1.0 walk           
              4            -1.0 walk           
              5            -1.0 walk           
              6            -1.0 walk           
              7            -1.0 walk           
              8            -1.0 walk           
              9            -1.0 walk           
             10             0.0 final          

[(-3.0, 'walk'), (-4.0, 'tram')]
[(-3.0, 'walk'), (-4.0, 'tram')]
[(-3.0, 'walk'), (-4.0, 'tram')]
[(-3.0, 'walk'), (-4.0, 'tram')]
[(-3.0, 'walk'), (-3.0, 'tram')]
[(-3.0, 'walk')]
[(-3.0, 'walk')]
[(-2.0, 'walk')]
[(-1.0, 'walk')]
State

In [None]:



































# Tuples compare element by element, so min picks the pair with the smallest leading number.
min((3, 'walk'), (2, 'tram'), (4, 'walk'), (6, 'tram'), (10, 'walk'))

(2, 'tram')

In [None]:
# Two pairs tie on 10; the tie is broken by the string, and 'walk' > 'tram'.
# value_iteration relies on the same ordering when it takes max over (Q, action) pairs.
max((3, 'walk'), (10, 'tram'), (4, 'walk'), (6, 'tram'), (10, 'walk'))

(10, 'walk')

# gym

In [1]:
import gym

In [2]:

import gym

# Random-policy rollout on CartPole: render each frame and take 1000 random steps.
env = gym.make('CartPole-v0')
env.reset()
try:
    for _ in range(1000):
        env.render()
        outcome = env.step(env.action_space.sample())  # take a random action
        # Older Gym returns (obs, reward, done, info); newer Gym returns
        # (obs, reward, terminated, truncated, info). Everything between the
        # reward and the trailing info dict is an "episode over" flag.
        if any(outcome[2:-1]):
            # Gym documents stepping a finished episode as undefined
            # behaviour, so start a new episode before the next step.
            env.reset()
finally:
    # Release the render window even if a step or render call raises.
    env.close()


  logger.warn(
  deprecation(
  deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):
  logger.warn(


# torch

In [None]:
import torch

In [None]:
from torch import optim
# Build an Adam optimizer over a single throwaway scalar parameter; the
# following cells only use it to look at the optimizer's docstring.
a = optim.Adam(lr=0.01, params=[torch.tensor(1.0, requires_grad=True)])

In [None]:
# Keep the optimizer's docstring (it starts with "Implements Adam algorithm.") for display.
d = a.__doc__

In [None]:
# A bare expression echoes the string's repr, so newlines show up as "\n"; print(d) would render them.
d

"Implements Adam algorithm.\n\n    .. math::\n       \\begin{aligned}\n            &\\rule{110mm}{0.4pt}                                                                 \\\\\n            &\\textbf{input}      : \\gamma \\text{ (lr)}, \\beta_1, \\beta_2\n                \\text{ (betas)},\\theta_0 \\text{ (params)},f(\\theta) \\text{ (objective)}          \\\\\n            &\\hspace{13mm}      \\lambda \\text{ (weight decay)},  \\: \\textit{amsgrad},\n                \\:\\textit{maximize}                                                              \\\\\n            &\\textbf{initialize} :  m_0 \\leftarrow 0 \\text{ ( first moment)},\n                v_0\\leftarrow 0 \\text{ (second moment)},\\: \\widehat{v_0}^{max}\\leftarrow 0\\\\[-1.ex]\n            &\\rule{110mm}{0.4pt}                                                                 \\\\\n            &\\textbf{for} \\: t=1 \\: \\textbf{to} \\: \\ldots \\: \\textbf{do}                         \\\\\n\n            &\\hspace{5mm}\\textbf