In [66]:
import numpy as np
#Enter the sum of digits in your enrollment number as seed


def environment(seed):
  """Draw the MDP parameters reproducibly from ``seed``.

  Seeds NumPy's legacy global random state with ``seed`` and then draws, in
  this order, ``f`` and ``m`` from U(0.3, 0.4), ``s`` from U(0.2, 0.3) and
  ``gamma`` from U(0.9, 0.99).

  Parameters
  ----------
  seed : int
      Value handed to ``np.random.seed``. This argument used to be ignored
      (26 was hard-coded); passing 26 reproduces those earlier draws.

  Returns
  -------
  tuple of numpy.ndarray
      ``(f, m, s, gamma)``, each a length-1 float array.
  """
  # Honour the caller's seed instead of a hard-coded constant.
  np.random.seed(seed)
  f = np.random.uniform(0.3,0.4,1)
  m = np.random.uniform(0.3,0.4,1)
  s = np.random.uniform(0.2,0.3,1)
  gamma = np.random.uniform(0.9,0.99,1)
  return f, m, s, gamma


# Sum of the digits of the enrollment number (see the instruction at the top
# of this cell). 26 is the value the parameters were originally drawn with.
seed = 26

f,m,s,gamma = environment(seed)
print('Transition probability parameter f:', f)
print('Transition probability parameter m:', m)
print('Transition probability parameter s:', s)
print('Discounting factor gamma:', gamma)


Transition probability parameter f: [0.3307935]
Transition probability parameter m: [0.35193915]
Transition probability parameter s: [0.27682977]
Discounting factor gamma: [0.97102987]


In [67]:
import pandas as pd

In [68]:
# environment() returns gamma as a length-1 array; keep the discount factor as a plain float.
g = float(gamma[0])

In [69]:
# States, in the index order used by the arrays below: 0 = Standing, 1 = Moving, 2 = Falling

# Actions, along the first array axis: 0 = slow action (Black), 1 = fast action (Blue)

In [70]:
# Transition probabilities, indexed as prob[action][state][next_state]
# (the order in which value_iteration and q_value read it).
# prob[1][2] is all zeros, so that row does not sum to 1 and every expectation
# taken over it evaluates to 0.
# NOTE(review): presumably this marks the second action as unavailable in the
# third state — confirm, because a Q-value of 0 can still win a comparison.
prob = np.array([[[0,1,0],
                  [0,1,0],
                  [f[0],0,1-f[0]]],
                 [[0,1-s[0],s[0]],
                  [0,1-m[0],m[0]],
                  [0,0,0]]])

In [71]:
# Rewards, indexed as reward[action][state][next_state], entry-for-entry
# aligned with prob. Entries whose transition probability is 0 never
# contribute to an expectation, so their value is irrelevant.
reward = np.array([[[0.,1.,0.],
                    [0.,1.,0.],
                    [1.,0.,-1.]],
                   [[0.,2.,-1.],
                    [0.,2.,-1.],
                    [0.,0.,0.]]])

In [72]:
prob

array([[[0.        , 1.        , 0.        ],
        [0.        , 1.        , 0.        ],
        [0.3307935 , 0.        , 0.6692065 ]],

       [[0.        , 0.72317023, 0.27682977],
        [0.        , 0.64806085, 0.35193915],
        [0.        , 0.        , 0.        ]]])

In [82]:
reward


array([[[ 0.,  1.,  0.],
        [ 0.,  1.,  0.],
        [ 1.,  0., -1.]],

       [[ 0.,  2., -1.],
        [ 0.,  2., -1.],
        [ 0.,  0.,  0.]]])

# 1 Optimal value function

In [78]:
def value_iteration(prob,reward,v,g):
    """Apply one synchronous Bellman optimality backup.

    For every state ``s`` this computes, for each action ``a``,
    ``sum_k prob[a][s][k] * (reward[a][s][k] + g * v[k])`` and keeps the
    largest of those action values.

    Parameters
    ----------
    prob, reward : array-like, indexed [action][state][next_state]
        Transition probabilities and the matching rewards.
    v : sequence of float
        Current value estimate, one entry per state.
    g : float
        Discount factor.

    Returns
    -------
    list
        The updated value estimate, one entry per state.
    """
    n_actions = len(prob)
    n_states = len(prob[0])
    v_new = []
    for state in range(n_states):
        action_values = []
        for action in range(n_actions):
            # Accumulate in next-state index order so the floating-point
            # result matches the hand-unrolled version bit for bit.
            total = 0
            for nxt in range(len(prob[action][state])):
                total += prob[action][state][nxt]*(reward[action][state][nxt]+g*v[nxt])
            action_values.append(total)
        # max() keeps the first action on ties.
        v_new.append(max(action_values))
    return v_new


# Safety net: the loop below stops only when two successive sweeps are
# bit-identical, so cap the number of sweeps instead of risking an endless loop.
MAX_SWEEPS = 100_000

# The function is defined above the loop so this cell also runs on a fresh kernel.
v = np.ones(prob.shape[1])
v_n = np.zeros(prob.shape[1])
p = 0
while not np.array_equal(v, v_n):
    if p >= MAX_SWEEPS:
        raise RuntimeError(f"value iteration did not converge within {MAX_SWEEPS} sweeps")
    v = v_n
    p += 1
    v_n = value_iteration(prob,reward,v,g)

In [79]:
# v_n holds the converged state values, not a policy, so label it as such.
print("Optimal value function: ", v_n)
print(f"Number of times iteration will be done {p}")

Optimal policy:  [34.51830802077721, 34.51830802077721, 30.696241914446126]
Number of times iteration will be done 1131


# 2 Optimal action value function

In [32]:
def q_value(prob,reward,g,v):
    """Compute the action-value table for a given state-value estimate.

    ``q[a][s] = sum_k prob[a][s][k] * (reward[a][s][k] + g * v[k])``

    Works for any number of actions and states; the sizes are taken from the
    inputs rather than being fixed at 2 actions and 3 states.

    Parameters
    ----------
    prob, reward : array-like, shape (n_actions, n_states, n_next_states)
        Transition probabilities and the matching rewards.
    g : float
        Discount factor.
    v : array-like, shape (n_next_states,)
        State values used to bootstrap.

    Returns
    -------
    numpy.ndarray, shape (n_actions, n_states)
        Action values, one row per action and one column per state.
    """
    prob = np.asarray(prob, dtype=float)
    reward = np.asarray(reward, dtype=float)
    v = np.asarray(v, dtype=float)
    # g * v broadcasts along the last (next-state) axis, which is then summed out.
    return np.sum(prob * (reward + g * v), axis=-1)

In [18]:
# Action values for every (action, state) pair, bootstrapped from the value-iteration result v_n.
data = q_value(prob,reward,g,v_n)

In [19]:
# Column labels for the Q-table: one column per state.
cols = ['Q(s1,a)','Q(s2,a)','Q(s3,a)']

In [20]:
# Tabulate the action values: one row per action (Black, Blue), one column per state.
df = pd.DataFrame(data = data, index = ['Black','Blue'], columns = cols)

In [86]:
df

Unnamed: 0,"Q(s1,a)","Q(s2,a)","Q(s3,a)"
Black,34.518308,34.518308,30.696242
Blue,33.660409,33.156325,0.0


# 3 Optimal policy

In [87]:
def optimal_policy(q):
    """Pick the greedy action label for every state from a two-row Q-table.

    Parameters
    ----------
    q : sequence of two sequences
        ``q[0][i]`` is the value of 'Black' in state ``i`` and ``q[1][i]``
        the value of 'Blue'. Any number of states is accepted.

    Returns
    -------
    list of str
        'Black' where its value is strictly larger, otherwise 'Blue'
        (so ties go to 'Blue').
    """
    black_values, blue_values = q[0], q[1]
    return ['Black' if black > blue else 'Blue'
            for black, blue in zip(black_values, blue_values)]

In [88]:
# Greedy policy read off the Q-table in `data`: one action label per state.
op = optimal_policy(data)
print("Optimal policy: ", op)

Optimal policy:  ['Black', 'Black', 'Black']


# 4 Optimal value function (solved as a linear system)

In [89]:
# Closed-form policy evaluation: under a fixed policy the state values satisfy
#   V = R_pi + g * P_pi @ V,   i.e.   (I - g * P_pi) V = R_pi.
# The policy is taken from `op` instead of assuming action 0 ('Black') in every
# state, so the cell stays correct if the greedy policy changes.
action_index = {'Black': 0, 'Blue': 1}
chosen_actions = [action_index[label] for label in op]
state_ids = np.arange(len(chosen_actions))

# v is P_pi: row s is the transition row of the action chosen in state s.
# (The name `v` is kept so later cells that read it keep working.)
v = prob[chosen_actions, state_ids, :]
# rhs is R_pi: the expected one-step reward in each state under the policy.
rhs = (v * reward[chosen_actions, state_ids, :]).sum(axis=1)
lhs = np.identity(len(chosen_actions))-g*v
x = np.linalg.solve(lhs,rhs)

In [90]:
x

array([34.51830802, 34.51830802, 30.69624191])

In [91]:
# x is the value vector from the linear solve, not a policy, so label it as such.
print("Optimal value function: ", x)

Optimal policy:  [34.51830802 34.51830802 30.69624191]
