# Stimare Q function ottimale
## 1. Quanto tempo ci mette per convergere a Q*, al variare di tau
## 2. Distanza tra Q* e Q_t 
## 3. distanza tra Q_0 e Q_0 appresa tramite i valori di tau
## 4. cercare di integrare i bound trovati

### Estrarre policy da Q
### Confrontare due diverse Q valutando diverse metriche 
### Calcolare Q*
### Tentativo di implementazione di un curriculum
### Plot functions

In [1]:
import numpy as np
from TMDP import TMDP
from river_swim import River

from algorithms import *


In [2]:
# Test with tau=0.9
nS = 8
gamma = 0.9
river = River(nS, gamma, 5, 1000)
tau = 0.9
xi = np.ones(river.nS)*1/river.nS
tmdp = TMDP(river, xi, tau, gamma)
tmdp_0 = TMDP(river, xi, 0, gamma)

In [3]:
s = tmdp.reset()
M = 10000
Q = np.zeros((tmdp.nS, tmdp.nA))
ret = 0
a = eps_greedy(s, Q, 1., tmdp.allowed_actions[s.item()])
Q = Q_learning(tmdp, s, a, Q, M)

print("\nQ function with tau: {}\n".format(tmdp.tau))
print(Q)
print(get_policy(Q))

s = tmdp_0.reset()
M_0 = 10000000
Q = np.zeros((tmdp_0.nS, tmdp_0.nA))
ret = 0
a = eps_greedy(s, Q, 1., tmdp_0.allowed_actions[s.item()])
Q = Q_learning(tmdp_0, s, a, Q, M_0)


print("Q function with tau: {}".format(tmdp_0.tau))
print(Q)
print(get_policy(Q))

r_s_a = compute_r_s_a(tmdp_0.nS, tmdp_0.nA, tmdp_0.P_mat, tmdp_0.reward)
q_star = bellman_optimal_q(tmdp_0.nS, tmdp_0.nA, tmdp_0.P_mat, tmdp_0.reward, 1, gamma)


print("\n Q function with bellman operator")
print(q_star)
print(get_policy(q_star))


Q function with tau: 0.9

[[104.8042924  152.52889372]
 [108.03522441 152.96243176]
 [153.71736149 120.40499111]
 [150.89024271  95.43266164]
 [ 97.87452594 151.51808776]
 [150.79477754 107.3520062 ]
 [150.34058906 115.51313149]
 [100.8315816  246.76945879]]
[1, 1, 0, 0, 1, 0, 0, 1]
Q function with tau: 0
[[50.         43.57353625]
 [45.         38.0638219 ]
 [40.5        35.74364593]
 [36.45       29.79083485]
 [32.805      29.47719167]
 [29.52302679  0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]]
[0, 0, 0, 0, 0, 0, 0, 0]

 Q function with bellman operator
[[ 64.33978241  66.4264707 ]
 [ 59.78382363  92.64863548]
 [ 83.38377194 137.08716883]
 [123.37845195 203.98538599]
 [183.58684739 303.06311755]
 [272.75680579 449.4704471 ]
 [404.52340239 665.78575189]
 [599.2071767  985.43074649]]
[1, 1, 1, 1, 1, 1, 1, 1]


In [4]:
print(compute_delta_q(q_star, Q))
np.linalg.norm(np.abs(Q-q_star), np.inf)

985.430746492539


1584.6379231962903

In [5]:

""" xi = np.ones(river.nS)*1/river.nS
gamma = 0.9
river = River(gamma)

for tau in taus:
    tmdp = TMDP(river, xi, tau, gamma)
    s = tmdp.reset()
    M = 10000
    Q = np.zeros((tmdp.nS, tmdp.nA))
    ret = 0
    a = eps_greedy(s, Q, 1., tmdp.allowed_actions[s.item()])
    Q = Q_learning(tmdp, s, a, Q, M)
    Qs.append({"tau":tau, "Q_function":Q, "env":tmdp})

for i in range(len(taus)):
    print("Tau:", Qs[i]['tau'])
    print(Qs[i]['Q_function']) """

' xi = np.ones(river.nS)*1/river.nS\ngamma = 0.9\nriver = River(gamma)\n\nfor tau in taus:\n    tmdp = TMDP(river, xi, tau, gamma)\n    s = tmdp.reset()\n    M = 10000\n    Q = np.zeros((tmdp.nS, tmdp.nA))\n    ret = 0\n    a = eps_greedy(s, Q, 1., tmdp.allowed_actions[s.item()])\n    Q = Q_learning(tmdp, s, a, Q, M)\n    Qs.append({"tau":tau, "Q_function":Q, "env":tmdp})\n\nfor i in range(len(taus)):\n    print("Tau:", Qs[i][\'tau\'])\n    print(Qs[i][\'Q_function\']) '

In [6]:
""" pi = get_policy(Qs[-2]['Q_function'])
pi_prime = get_policy(Qs[-1]['Q_function'])

print(pi)
print(pi_prime) """

" pi = get_policy(Qs[-2]['Q_function'])\npi_prime = get_policy(Qs[-1]['Q_function'])\n\nprint(pi)\nprint(pi_prime) "

In [7]:
taus = [1 - i*0.1 for i in range(10)]
taus.append(0)

# Curriculul for decreasing values of tau
for tau in taus:
    tmdp = TMDP(river, xi, tau, gamma)
    if tau == 1:
       Q = np.zeros((tmdp.nS, tmdp.nA))
    
    s = tmdp.reset()
    M = 20000
    ret = 0
    a = eps_greedy(s, Q, 1., tmdp.allowed_actions[s.item()])
    Q = Q_learning(tmdp, s, a, Q, M)


print("Q function learned with transfer learning:\n", Q)

pi = get_policy(Q)
print(pi)


Q function learned with transfer learning:
 [[  50.           52.05508102]
 [  45.32535716   67.44292293]
 [  61.00966625  100.32531515]
 [  89.98939402  163.74387092]
 [ 139.44737939  261.51897406]
 [ 211.03520929  474.29833638]
 [ 321.49348673  704.94195898]
 [ 446.08854619 1038.11073228]]
[1, 1, 1, 1, 1, 1, 1, 1]
