# Markovian Game of Thorns

![GOT Display](gotdisplay.jpg)

In [1]:
# Markov Decision Process Example
import numpy as np
import copy
import pprint

# --- MDP definition ---------------------------------------------------------
# State values. The two terminal states carry a fixed payoff and are never
# updated below (they have no entry in P); the other states start at 0.
V = {
    "dragonstone": 0,
    "whiteharbor": 0,
    "winterfell": 0,
    "alive-terminal": 1,
    "dead-terminal": -1,
}

# Immediate reward R[from_state][action] collected when taking `action`.
R = {
    "from_dragonstone": {"land": -0.02, "sea": -0.05, "dragon": -0.1},
    "from_whiteharbor": {"land": -0.01},
    "from_winterfell": {"land": -0.01},
}

# Action values Q[from_state][action]. Deep copy so Q never shares its inner
# dicts with R (a shallow copy would alias them).
Q = copy.deepcopy(R)

# Transition probabilities P[from_state][action][to_state].
P = {
    "from_dragonstone": {
        "land": {"to_winterfell": 0.5, "to_dead-terminal": 0.5},
        "sea": {"to_whiteharbor": 0.1, "to_dead-terminal": 0.9},
        "dragon": {"to_winterfell": 0.95, "to_dead-terminal": 0.05},
    },
    "from_whiteharbor": {"land": {"to_winterfell": 0.6, "to_dead-terminal": 0.4}},
    "from_winterfell": {"land": {"to_alive-terminal": 0.9, "to_dead-terminal": 0.1}},
}

gamma = 0.9  # discount factor applied to successor-state values

# Initial policy; every entry is overwritten by the greedy action below.
Policy = {"from_dragonstone": "land", "from_whiteharbor": "land", "from_winterfell": "land"}

# Keys of P/R are "from_<state>" and successor keys are "to_<state>"; the
# prefixes are stripped to index V, which is keyed by the bare state name.
FROM_PREFIX = "from_"
TO_PREFIX = "to_"

MAX_SWEEPS = 1000   # safety cap on the number of sweeps
TOLERANCE = 1e-9    # stop once no state value moved by more than this

# --- Solution by Value Iteration ---------------------------------------------
for sweep in range(MAX_SWEEPS):
    largest_change = 0.0
    for from_location in P:
        state = from_location[len(FROM_PREFIX):]

        # Action-value update: start from the immediate reward, then add the
        # discounted, probability-weighted value of every successor state.
        action_values = copy.copy(R[from_location])
        for action, transitions in P[from_location].items():
            for to_location, probability in transitions.items():
                action_values[action] = action_values[action] + \
                    gamma * probability * V[to_location[len(TO_PREFIX):]]
        Q[from_location] = action_values

        # Greedy policy improvement, then back the best action value up into
        # V *after* Q has been refreshed so V and Q describe the same sweep.
        Policy[from_location] = max(action_values, key=action_values.get)
        new_value = action_values[Policy[from_location]]
        largest_change = max(largest_change, abs(new_value - V[state]))
        V[state] = new_value

    if largest_change < TOLERANCE:
        break  # converged: a further sweep would not change V


print('State Value ', V)
print('Action Value', Q)
print('Learned Policy', Policy)

State Value  {'whiteharbor': 0.013400000000000023, 'alive-terminal': 1, 'dragonstone': 0.4620500000000001, 'dead-terminal': -1, 'winterfell': 0.7100000000000001}
Action Value {'from_dragonstone': {'land': -0.15049999999999997, 'sea': -0.8587940000000001, 'dragon': 0.4620500000000001}, 'from_whiteharbor': {'land': 0.013400000000000023}, 'from_winterfell': {'land': 0.7100000000000001}}
Learned Policy {'from_dragonstone': 'dragon', 'from_whiteharbor': 'land', 'from_winterfell': 'land'}


## State Values

In [4]:
# Pretty-print the state-value table V, one state per line.
pprint.pprint(V)

{'alive-terminal': 1,
 'dead-terminal': -1,
 'dragonstone': 0.4620500000000001,
 'whiteharbor': 0.013400000000000023,
 'winterfell': 0.7100000000000001}


## Action Values

In [5]:
# Pretty-print the nested action-value table Q (outer key: "from_<state>", inner key: action).
pprint.pprint(Q)

{'from_dragonstone': {'dragon': 0.4620500000000001,
                      'land': -0.15049999999999997,
                      'sea': -0.8587940000000001},
 'from_whiteharbor': {'land': 0.013400000000000023},
 'from_winterfell': {'land': 0.7100000000000001}}


## Learned Policy

In [7]:
# Pretty-print the Policy mapping: the action selected for each "from_<state>" key.
pprint.pprint(Policy)

{'from_dragonstone': 'dragon',
 'from_whiteharbor': 'land',
 'from_winterfell': 'land'}


## Summary Video

In [9]:
# HTML stays imported in case a cell outside this view still relies on it.
from IPython.display import HTML, YouTubeVideo

# Embed the YouTube summary video. YouTubeVideo builds the <iframe> itself,
# which avoids the "Consider using IPython.display.IFrame instead" warning
# that HTML('<iframe ...>') raises. Extra keyword arguments are forwarded as
# URL query parameters of the embed link (rel, controls, showinfo).
YouTubeVideo("Kllu_rlyUn8", width=560, height=315, rel=0, controls=0, showinfo=0)
