환경 설정
```
!pip install tqdm numpy scikit-learn pyglet setuptools && \
!pip install gym asciinema pandas tabulate tornado==5.* PyBullet && \
!pip install git+https://github.com/pybox2d/pybox2d#egg=Box2D && \
!pip install git+https://github.com/mimoralea/gym-bandits#egg=gym-bandits && \
!pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk && \
!pip install git+https://github.com/mimoralea/gym-aima#egg=gym-aima && \
!pip install gym[atari]
```

# Recap Point
- MDP를 튜플-딕셔너리로 표현하고 해석 가능한지

# BanditWalk Env
Transition tuple: (transition probability, next state, reward, done)

In [1]:
P = {
    0: {
        0: [(1.0, 0, 0.0, True)],
        1: [(1.0, 0, 0.0, True)]
    },
    1: {
        0: [(1.0, 0, 0.0, True)],
        1: [(1.0, 2, 1.0, True)],
    },
    2: {
        0: [(1.0, 2, 0.0, True)],
        1: [(1.0, 2, 0.0, True)]
    }
}

Or using gym module

In [2]:
import gym, gym_walk
P = gym.make('BanditWalk-v0').env.P

In [3]:
P

{0: {0: [(1.0, 0, 0.0, True), (0.0, 0, 0.0, True), (0.0, 0, 0.0, True)],
  1: [(1.0, 0, 0.0, True), (0.0, 0, 0.0, True), (0.0, 0, 0.0, True)]},
 1: {0: [(1.0, 0, 0.0, True), (0.0, 1, 0.0, False), (0.0, 2, 1.0, True)],
  1: [(1.0, 2, 1.0, True), (0.0, 1, 0.0, False), (0.0, 0, 0.0, True)]},
 2: {0: [(1.0, 2, 0.0, True), (0.0, 2, 0.0, True), (0.0, 2, 0.0, True)],
  1: [(1.0, 2, 0.0, True), (0.0, 2, 0.0, True), (0.0, 2, 0.0, True)]}}

# BanditSlipperyWalk Env

In [4]:
P = {
    0: {
        0: [(1.0, 0, 0.0, True)],
        1: [(1.0, 0, 0.0, True)]
    },
    1: {
        0: [(0.8, 0, 0.0, True), (0.2, 2, 1.0, True)],
        1: [(0.8, 2, 1.0, True), (0.2, 0, 0.0, True)]
    },
    2: {
        0: [(1.0, 2, 0.0, True)],
        1: [(1.0, 2, 0.0, True)]
    }
}

In [5]:
P = gym.make('BanditSlipperyWalk-v0').env.P

In [6]:
P

{0: {0: [(0.8, 0, 0.0, True), (0.0, 0, 0.0, True), (0.2, 0, 0.0, True)],
  1: [(0.8, 0, 0.0, True), (0.0, 0, 0.0, True), (0.2, 0, 0.0, True)]},
 1: {0: [(0.8, 0, 0.0, True), (0.0, 1, 0.0, False), (0.2, 2, 1.0, True)],
  1: [(0.8, 2, 1.0, True), (0.0, 1, 0.0, False), (0.2, 0, 0.0, True)]},
 2: {0: [(0.8, 2, 0.0, True), (0.0, 2, 0.0, True), (0.2, 2, 0.0, True)],
  1: [(0.8, 2, 0.0, True), (0.0, 2, 0.0, True), (0.2, 2, 0.0, True)]}}

# Walk Tree

In [7]:
P = {
    0: {
        0: [(1.0, 0, 0.0, True)],
        1: [(1.0, 0, 0.0, True)]
    },
    1: {
        0: [(1.0, 0, 0.0, True)],
        1: [(1.0, 2, 0.0, False)],
    },
    2: {
        0: [(1.0, 1, 0.0, False)],
        1: [(1.0, 3, 0.0, False)]
    },
    3: {
        0: [(1.0, 2, 0.0, False)],
        1: [(1.0, 4, 1.0, True)]
    },
    4: {
        0: [(1.0, 4, 0.0, True)],
        1: [(1.0, 4, 0.0, True)]
    }
}

P

{0: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 0, 0.0, True)]},
 1: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 2, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)], 1: [(1.0, 3, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)], 1: [(1.0, 4, 1.0, True)]},
 4: {0: [(1.0, 4, 0.0, True)], 1: [(1.0, 4, 0.0, True)]}}

In [8]:
P = gym.make('WalkThree-v0').env.P
P

{0: {0: [(1.0, 0, 0.0, True), (0.0, 0, 0.0, True), (0.0, 0, 0.0, True)],
  1: [(1.0, 0, 0.0, True), (0.0, 0, 0.0, True), (0.0, 0, 0.0, True)]},
 1: {0: [(1.0, 0, 0.0, True), (0.0, 1, 0.0, False), (0.0, 2, 0.0, False)],
  1: [(1.0, 2, 0.0, False), (0.0, 1, 0.0, False), (0.0, 0, 0.0, True)]},
 2: {0: [(1.0, 1, 0.0, False), (0.0, 2, 0.0, False), (0.0, 3, 0.0, False)],
  1: [(1.0, 3, 0.0, False), (0.0, 2, 0.0, False), (0.0, 1, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False), (0.0, 3, 0.0, False), (0.0, 4, 1.0, True)],
  1: [(1.0, 4, 1.0, True), (0.0, 3, 0.0, False), (0.0, 2, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, True), (0.0, 4, 0.0, True), (0.0, 4, 0.0, True)],
  1: [(1.0, 4, 0.0, True), (0.0, 4, 0.0, True), (0.0, 4, 0.0, True)]}}

# Slippery Walk Tree

In [9]:
P = {
    0: {
        0: [(1.0, 0, 0.0, True)],
        1: [(1.0, 0, 0.0, True)]
    },
    1: {
        0: [(0.5000000000000001, 0, 0.0, True),
            (0.3333333333333333, 1, 0.0, False),
            (0.16666666666666666, 2, 0.0, False)
        ],
        1: [(0.5000000000000001, 2, 0.0, False),
            (0.3333333333333333, 1, 0.0, False),
            (0.16666666666666666, 0, 0.0, True)
        ]
    },
    2: {
        0: [(0.5000000000000001, 1, 0.0, False),
            (0.3333333333333333, 2, 0.0, False),
            (0.16666666666666666, 3, 0.0, False)
        ],
        1: [(0.5000000000000001, 3, 0.0, False),
            (0.3333333333333333, 2, 0.0, False),
            (0.16666666666666666, 1, 0.0, False)
        ]
    },
    3: {
        0: [(0.5000000000000001, 2, 0.0, False),
            (0.3333333333333333, 3, 0.0, False),
            (0.16666666666666666, 4, 1.0, True)
        ],
        1: [(0.5000000000000001, 4, 1.0, True),
            (0.3333333333333333, 3, 0.0, False),
            (0.16666666666666666, 2, 0.0, False)
        ]
    },
    4: {
        0: [(1.0, 4, 0.0, True)],
        1: [(1.0, 4, 0.0, True)]
    }
}

P

{0: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 0, 0.0, True)]},
 1: {0: [(0.5000000000000001, 0, 0.0, True),
   (0.3333333333333333, 1, 0.0, False),
   (0.16666666666666666, 2, 0.0, False)],
  1: [(0.5000000000000001, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.16666666666666666, 0, 0.0, True)]},
 2: {0: [(0.5000000000000001, 1, 0.0, False),
   (0.3333333333333333, 2, 0.0, False),
   (0.16666666666666666, 3, 0.0, False)],
  1: [(0.5000000000000001, 3, 0.0, False),
   (0.3333333333333333, 2, 0.0, False),
   (0.16666666666666666, 1, 0.0, False)]},
 3: {0: [(0.5000000000000001, 2, 0.0, False),
   (0.3333333333333333, 3, 0.0, False),
   (0.16666666666666666, 4, 1.0, True)],
  1: [(0.5000000000000001, 4, 1.0, True),
   (0.3333333333333333, 3, 0.0, False),
   (0.16666666666666666, 2, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, True)], 1: [(1.0, 4, 0.0, True)]}}

In [10]:
P = gym.make('SlipperyWalkThree-v0').env.P
P

{0: {0: [(0.5000000000000001, 0, 0.0, True),
   (0.3333333333333333, 0, 0.0, True),
   (0.16666666666666666, 0, 0.0, True)],
  1: [(0.5000000000000001, 0, 0.0, True),
   (0.3333333333333333, 0, 0.0, True),
   (0.16666666666666666, 0, 0.0, True)]},
 1: {0: [(0.5000000000000001, 0, 0.0, True),
   (0.3333333333333333, 1, 0.0, False),
   (0.16666666666666666, 2, 0.0, False)],
  1: [(0.5000000000000001, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.16666666666666666, 0, 0.0, True)]},
 2: {0: [(0.5000000000000001, 1, 0.0, False),
   (0.3333333333333333, 2, 0.0, False),
   (0.16666666666666666, 3, 0.0, False)],
  1: [(0.5000000000000001, 3, 0.0, False),
   (0.3333333333333333, 2, 0.0, False),
   (0.16666666666666666, 1, 0.0, False)]},
 3: {0: [(0.5000000000000001, 2, 0.0, False),
   (0.3333333333333333, 3, 0.0, False),
   (0.16666666666666666, 4, 1.0, True)],
  1: [(0.5000000000000001, 4, 1.0, True),
   (0.3333333333333333, 3, 0.0, False),
   (0.16666666666666666, 2, 0.0, Fals

# Random Walk

In [11]:
P = {
    0: {
        0: [(1.0, 0, 0.0, True)],
        1: [(1.0, 0, 0.0, True)]
    },
    1: {
        0: [(0.5, 0, 0.0, True), (0.5, 2, 0.0, False)],
        1: [(0.5, 2, 0.0, False), (0.5, 0, 0.0, True)]
    },
    2: {
        0: [(0.5, 1, 0.0, False), (0.5, 3, 0.0, False)],
        1: [(0.5, 3, 0.0, False), (0.5, 1, 0.0, False)]
    },
    3: {
        0: [(0.5, 2, 0.0, False), (0.5, 4, 0.0, False)],
        1: [(0.5, 4, 0.0, False), (0.5, 2, 0.0, False)]
    },
    4: {
        0: [(0.5, 3, 0.0, False), (0.5, 5, 0.0, False)],
        1: [(0.5, 5, 0.0, False), (0.5, 3, 0.0, False)]
    },
    5: {
        0: [(0.5, 4, 0.0, False), (0.5, 6, 1.0, True)],
        1: [(0.5, 6, 1.0, True), (0.5, 4, 0.0, False)]
    },
    6: {
        0: [(1.0, 6, 0.0, True)],
        1: [(1.0, 6, 0.0, True)]
    }
}

P

{0: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 0, 0.0, True)]},
 1: {0: [(0.5, 0, 0.0, True), (0.5, 2, 0.0, False)],
  1: [(0.5, 2, 0.0, False), (0.5, 0, 0.0, True)]},
 2: {0: [(0.5, 1, 0.0, False), (0.5, 3, 0.0, False)],
  1: [(0.5, 3, 0.0, False), (0.5, 1, 0.0, False)]},
 3: {0: [(0.5, 2, 0.0, False), (0.5, 4, 0.0, False)],
  1: [(0.5, 4, 0.0, False), (0.5, 2, 0.0, False)]},
 4: {0: [(0.5, 3, 0.0, False), (0.5, 5, 0.0, False)],
  1: [(0.5, 5, 0.0, False), (0.5, 3, 0.0, False)]},
 5: {0: [(0.5, 4, 0.0, False), (0.5, 6, 1.0, True)],
  1: [(0.5, 6, 1.0, True), (0.5, 4, 0.0, False)]},
 6: {0: [(1.0, 6, 0.0, True)], 1: [(1.0, 6, 0.0, True)]}}

In [12]:
P = gym.make('RandomWalk-v0').env.P
P

{0: {0: [(0.5, 0, 0.0, True), (0.0, 0, 0.0, True), (0.5, 0, 0.0, True)],
  1: [(0.5, 0, 0.0, True), (0.0, 0, 0.0, True), (0.5, 0, 0.0, True)]},
 1: {0: [(0.5, 0, 0.0, True), (0.0, 1, 0.0, False), (0.5, 2, 0.0, False)],
  1: [(0.5, 2, 0.0, False), (0.0, 1, 0.0, False), (0.5, 0, 0.0, True)]},
 2: {0: [(0.5, 1, 0.0, False), (0.0, 2, 0.0, False), (0.5, 3, 0.0, False)],
  1: [(0.5, 3, 0.0, False), (0.0, 2, 0.0, False), (0.5, 1, 0.0, False)]},
 3: {0: [(0.5, 2, 0.0, False), (0.0, 3, 0.0, False), (0.5, 4, 0.0, False)],
  1: [(0.5, 4, 0.0, False), (0.0, 3, 0.0, False), (0.5, 2, 0.0, False)]},
 4: {0: [(0.5, 3, 0.0, False), (0.0, 4, 0.0, False), (0.5, 5, 0.0, False)],
  1: [(0.5, 5, 0.0, False), (0.0, 4, 0.0, False), (0.5, 3, 0.0, False)]},
 5: {0: [(0.5, 4, 0.0, False), (0.0, 5, 0.0, False), (0.5, 6, 1.0, True)],
  1: [(0.5, 6, 1.0, True), (0.0, 5, 0.0, False), (0.5, 4, 0.0, False)]},
 6: {0: [(0.5, 6, 0.0, True), (0.0, 6, 0.0, True), (0.5, 6, 0.0, True)],
  1: [(0.5, 6, 0.0, True), (0.0, 6, 0

# GridWorld 

(Russll and Norvig의 Artificial Intelligence - Modern Approach책 참고)

In [13]:
P = {
    0: {
        0: [(0.9, 0, -0.04, False),
            (0.1, 4, -0.04, False)
        ],
        1: [(0.1, 0, -0.04, False), (0.8, 4, -0.04, False), (0.1, 1, -0.04, False)],
        2: [(0.1, 4, -0.04, False), (0.8, 1, -0.04, False), (0.1, 0, -0.04, False)],
        3: [(0.1, 1, -0.04, False), (0.8, 0, -0.04, False), (0.1, 0, -0.04, False)]
    },
    1: {
        0: [(0.2, 1, -0.04, False),
            (0.8, 0, -0.04, False)
        ],
        1: [(0.1, 0, -0.04, False), (0.8, 1, -0.04, False), (0.1, 2, -0.04, False)],
        2: [(0.1, 1, -0.04, False), (0.8, 2, -0.04, False), (0.1, 1, -0.04, False)],
        3: [(0.1, 2, -0.04, False), (0.8, 1, -0.04, False), (0.1, 0, -0.04, False)]
    },
    2: {
        0: [(0.1, 2, -0.04, False),
            (0.8, 1, -0.04, False),
            (0.1, 6, -0.04, False)
        ],
        1: [(0.1, 1, -0.04, False), (0.8, 6, -0.04, False), (0.1, 3, 0.96, True)],
        2: [(0.1, 6, -0.04, False), (0.8, 3, 0.96, True), (0.1, 2, -0.04, False)],
        3: [(0.1, 3, 0.96, True), (0.8, 2, -0.04, False), (0.1, 1, -0.04, False)]
    },
    3: {
        0: [(1.0, 3, 0, True)],
        1: [(1.0, 3, 0, True)],
        2: [(1.0, 3, 0, True)],
        3: [(1.0, 3, 0, True)]
    },
    4: {
        0: [(0.1, 0, -0.04, False),
            (0.8, 4, -0.04, False),
            (0.1, 8, -0.04, False)
        ],
        1: [(0.2, 4, -0.04, False), (0.8, 8, -0.04, False)],
        2: [(0.1, 8, -0.04, False), (0.8, 4, -0.04, False), (0.1, 0, -0.04, False)],
        3: [(0.2, 4, -0.04, False), (0.8, 0, -0.04, False)]
    },
    5: {
        0: [(1.0, 5, 0, True)],
        1: [(1.0, 5, 0, True)],
        2: [(1.0, 5, 0, True)],
        3: [(1.0, 5, 0, True)]
    },
    6: {
        0: [(0.1, 2, -0.04, False),
            (0.8, 6, -0.04, False),
            (0.1, 10, -0.04, False)
        ],
        1: [(0.1, 6, -0.04, False), (0.8, 10, -0.04, False), (0.1, 7, -1.04, True)],
        2: [(0.1, 10, -0.04, False), (0.8, 7, -1.04, True), (0.1, 2, -0.04, False)],
        3: [(0.1, 7, -1.04, True), (0.8, 2, -0.04, False), (0.1, 6, -0.04, False)]
    },
    7: {
        0: [(1.0, 7, 0, True)],
        1: [(1.0, 7, 0, True)],
        2: [(1.0, 7, 0, True)],
        3: [(1.0, 7, 0, True)]
    },
    8: {
        0: [(0.1, 4, -0.04, False),
            (0.9, 8, -0.04, False)
        ],
        1: [(0.9, 8, -0.04, False), (0.1, 9, -0.04, False)],
        2: [(0.1, 8, -0.04, False), (0.8, 9, -0.04, False), (0.1, 4, -0.04, False)],
        3: [(0.1, 9, -0.04, False), (0.8, 4, -0.04, False), (0.1, 8, -0.04, False)]
    },
    9: {
        0: [(0.2, 9, -0.04, False),
            (0.8, 8, -0.04, False)
        ],
        1: [(0.1, 8, -0.04, False), (0.8, 9, -0.04, False), (0.1, 10, -0.04, False)],
        2: [(0.2, 9, -0.04, False), (0.8, 10, -0.04, False)],
        3: [(0.1, 10, -0.04, False),
            (0.8, 9, -0.04, False),
            (0.1, 8, -0.04, False)
        ]
    },
    10: {
        0: [(0.1, 6, -0.04, False),
            (0.8, 9, -0.04, False),
            (0.1, 10, -0.04, False)
        ],
        1: [(0.1, 9, -0.04, False),
            (0.8, 10, -0.04, False),
            (0.1, 11, -0.04, False)
        ],
        2: [(0.1, 10, -0.04, False),
            (0.8, 11, -0.04, False),
            (0.1, 6, -0.04, False)
        ],
        3: [(0.1, 11, -0.04, False),
            (0.8, 6, -0.04, False),
            (0.1, 9, -0.04, False)
        ]
    },
    11: {
        0: [(0.1, 7, -1.04, True),
            (0.8, 10, -0.04, False),
            (0.1, 11, -0.04, False)
        ],
        1: [(0.1, 10, -0.04, False),
            (0.9, 11, -0.04, False)
        ],
        2: [(0.9, 11, -0.04, False), (0.1, 7, -1.04, True)],
        3: [(0.1, 11, -0.04, False),
            (0.8, 7, -1.04, True),
            (0.1, 10, -0.04, False)
        ]
    }
}

P

{0: {0: [(0.9, 0, -0.04, False), (0.1, 4, -0.04, False)],
  1: [(0.1, 0, -0.04, False), (0.8, 4, -0.04, False), (0.1, 1, -0.04, False)],
  2: [(0.1, 4, -0.04, False), (0.8, 1, -0.04, False), (0.1, 0, -0.04, False)],
  3: [(0.1, 1, -0.04, False), (0.8, 0, -0.04, False), (0.1, 0, -0.04, False)]},
 1: {0: [(0.2, 1, -0.04, False), (0.8, 0, -0.04, False)],
  1: [(0.1, 0, -0.04, False), (0.8, 1, -0.04, False), (0.1, 2, -0.04, False)],
  2: [(0.1, 1, -0.04, False), (0.8, 2, -0.04, False), (0.1, 1, -0.04, False)],
  3: [(0.1, 2, -0.04, False), (0.8, 1, -0.04, False), (0.1, 0, -0.04, False)]},
 2: {0: [(0.1, 2, -0.04, False),
   (0.8, 1, -0.04, False),
   (0.1, 6, -0.04, False)],
  1: [(0.1, 1, -0.04, False), (0.8, 6, -0.04, False), (0.1, 3, 0.96, True)],
  2: [(0.1, 6, -0.04, False), (0.8, 3, 0.96, True), (0.1, 2, -0.04, False)],
  3: [(0.1, 3, 0.96, True), (0.8, 2, -0.04, False), (0.1, 1, -0.04, False)]},
 3: {0: [(1.0, 3, 0, True)],
  1: [(1.0, 3, 0, True)],
  2: [(1.0, 3, 0, True)],
  3: [(

In [17]:
P = gym.make('RussellNorvigGridWorld-v0').env.P
P

UnregisteredEnv: No registered env with id: RussellNorvigGridWorld-v0

# FronzenLake Env

In [None]:
P = {
    0: {
        0: [(0.66666666666, 0, 0.0, False),
            (0.33333333333, 4, 0.0, False)
           ],
        1: [