In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# These four values feed the matplotlib rcParams and the inline figure
# format below; fig_format is also checked by the pandas section.
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96

# matplotlib defaults / format
# Best-effort: any failure (e.g. matplotlib or IPython not importable) is
# swallowed so the rest of the setup cell still runs.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  # NOTE(review): newer IPython releases expose this helper from
  # matplotlib_inline.backend_inline instead -- confirm against the
  # installed IPython version; a failure here is silently ignored.
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Best-effort as well: skipped silently when plotly cannot be imported.
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# enable pandas latex repr when targeting pdfs
# With fig_format set to 'retina' above, the 'pdf' branch is not taken in
# this notebook; pandas is still imported as a side effect when available.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies
# Builds a {source file path: last-modified timestamp} mapping for every
# module currently loaded in this kernel and prints it to stdout as a single
# JSON object.
# NOTE(review): the consumer of this JSON line is not visible in this file --
# presumably the tool that generated this setup cell; confirm before changing
# the output format.
kernel_deps = dict()
# Iterate over a snapshot (list(...)) because sys.modules may change while we
# touch module attributes.
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  # Modules without a __file__ (or with an empty one) are skipped.
  path = getattr(module, "__file__", None)
  if not path:
    continue
  # Drop the trailing "c"/"o" so a compiled-file path is recorded as ".py".
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  # Only record files that actually exist on disk.
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# NOTE(review): the condition is a non-empty string literal, so it is always
# true and the chdir always runs. The path is an absolute, machine-specific
# directory; os.chdir raises if it does not exist on the current machine.
if r'/Users/davoodwadi/MLCourse/davoodwadi.github.io/code':
  os.chdir(r'/Users/davoodwadi/MLCourse/davoodwadi.github.io/code')

# reset state
# IPython line magic (not plain Python): clears the interactive namespace, so
# every name bound above in this cell (os, sys, json, plt, pd, kernel_deps, ...)
# is gone afterwards. Code that follows must import what it needs itself.
# NOTE(review): without -f the magic may ask for confirmation in an
# interactive session -- confirm this is intended.
%reset

def ojs_define(**kwargs):
  """Publish keyword arguments to the page as an ``ojs-define`` script tag.

  Each keyword becomes a ``{"name": ..., "value": ...}`` entry in a list
  stored under the top-level key ``"contents"``. The JSON-encoded result is
  displayed as an HTML ``<script type="ojs-define">`` element, tagged with
  ``ojs_define=True`` display metadata.

  pandas Series and DataFrame values (including subclasses) are converted to
  a plain ``{column: [values, ...]}`` dict first. When pandas is not
  installed, values are passed through unchanged.

  Args:
    **kwargs: name/value pairs to publish; values must be JSON serializable
      after the pandas conversion.

  Raises:
    TypeError: if a value cannot be serialized by ``json.dumps``.
  """
  # Imported locally so the function keeps working after the interactive
  # namespace has been cleared.
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # Only a failed import should trigger the fallback; a bare except would
    # also swallow KeyboardInterrupt / SystemExit.
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(value):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return value
    # isinstance (rather than an exact type match) so Series/DataFrame
    # subclasses are converted too instead of failing later in json.dumps.
    if isinstance(value, pd.Series):
      value = pd.DataFrame(value)
    if isinstance(value, pd.DataFrame):
      # After transposing, the "split" layout lists the original column
      # labels under "index" and each column's values as one row of "data".
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
# Step 1: Policy Evaluation
def policy_evaluation(policy, states, actions, rewards, transitions, discount_factor, theta):
    """Iteratively estimate the state-value function of a fixed policy.

    Values start at zero and are updated in place, state by state, so a
    state updated earlier in a sweep is already visible to the states that
    follow it. Sweeps repeat until the largest per-state change in a sweep
    drops below ``theta``.

    Args:
        policy: mapping ``policy[s][a]`` -> weight of action ``a`` in state ``s``.
        states: iterable of states (iterated once per sweep).
        actions: iterable of actions (iterated once per state).
        rewards: not used by this function; kept for a uniform signature.
        transitions: mapping ``transitions[s][a]`` -> iterable of
            ``(next_state, reward, prob)`` triples.
        discount_factor: multiplier applied to the value of the next state.
        theta: convergence threshold on the largest value change per sweep.

    Returns:
        dict mapping each state to its estimated value under ``policy``.
    """
    def backup(state, action):
        # Expected one-step return of taking `action` in `state`, using the
        # current (possibly partially updated) value estimates.
        return sum(
            prob * (reward + discount_factor * values[next_state])
            for next_state, reward, prob in transitions[state][action]
        )

    values = {state: 0 for state in states}

    converged = False
    while not converged:
        largest_change = 0

        for state in states:
            previous = values[state]

            # Policy-weighted average of the per-action expected returns.
            updated = 0
            for action in actions:
                action_return = backup(state, action)
                updated += policy[state][action] * action_return

            values[state] = updated
            largest_change = max(largest_change, abs(previous - updated))

        # Stop once no state moved by theta or more during the sweep.
        converged = largest_change < theta

    return values

# Step 2: Policy Improvement
def policy_improvement(states, actions, rewards, transitions, discount_factor, V):
    """Build the deterministic policy that is greedy with respect to ``V``.

    For every state the expected one-step return of each action is computed
    from ``transitions`` and ``V``; the best action gets weight 1 and all
    other actions weight 0. Ties are resolved in favour of the action that
    comes first in ``actions``.

    Uses only built-ins: the previous implementation called ``np.argmax``
    although numpy is never imported, which raised ``NameError`` on a fresh
    kernel.

    Args:
        states: iterable of states.
        actions: iterable of hashable actions (re-iterated for every state).
        rewards: not used by this function; kept for a uniform signature.
        transitions: mapping ``transitions[s][a]`` -> iterable of
            ``(next_state, reward, prob)`` triples.
        discount_factor: multiplier applied to the value of the next state.
        V: mapping from state to its current value estimate.

    Returns:
        dict ``new_policy[s][a]`` with 1 for the greedy action of ``s`` and
        0 for every other action.

    Raises:
        ValueError: if ``actions`` is empty while ``states`` is not (there is
            no action to pick).
    """
    # Start with all-zero weights; exactly one entry per state is set to 1.
    new_policy = {s: {a: 0 for a in actions} for s in states}

    for s in states:
        # Expected return of each action from state s under the estimates V.
        action_values = {
            a: sum(
                prob * (reward + discount_factor * V[next_state])
                for next_state, reward, prob in transitions[s][a]
            )
            for a in actions
        }

        # max() returns the first maximal element, so ties go to the action
        # listed first -- the same tie-breaking as an argmax over a list.
        best_action = max(actions, key=action_values.__getitem__)
        new_policy[s][best_action] = 1

    return new_policy

# Step 3: Policy Iteration
def policy_iteration(states, actions, rewards, transitions, discount_factor, theta):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from the uniform policy (every action equally weighted in every
    state), evaluates it with ``policy_evaluation``, derives a new policy
    with ``policy_improvement``, and repeats until the improved policy
    compares equal to the one that was just evaluated.

    Args:
        states: iterable of states.
        actions: sized collection of actions (``len(actions)`` is used for
            the initial uniform weights).
        rewards: forwarded unchanged to the evaluation/improvement steps.
        transitions: forwarded unchanged to the evaluation/improvement steps.
        discount_factor: forwarded unchanged to the evaluation/improvement steps.
        theta: convergence threshold forwarded to ``policy_evaluation``.

    Returns:
        dict ``policy[s][a]`` -> weight; the policy that was evaluated in the
        final round (equal to the improved policy of that round).
    """
    # Uniform starting point: each action gets weight 1 / number of actions.
    current = {
        state: {action: 1 / len(actions) for action in actions}
        for state in states
    }

    while True:
        # Evaluate the current policy, then act greedily on those values.
        values = policy_evaluation(
            current, states, actions, rewards, transitions, discount_factor, theta
        )
        improved = policy_improvement(
            states, actions, rewards, transitions, discount_factor, values
        )

        # A policy that no longer changes under improvement is the fixed point.
        if current == improved:
            return current

        current = improved