In [1]:
import numpy as np
import csv
from itertools import product
from functools import partial

In [2]:
def parse_csv(filepath):
    """Read the evaluation data set stored in the CSV file at ``filepath``.

    Rows are consumed strictly in this order:

    1. ``m``            - first field of the row, parsed as ``int``
    2. ``num_actions``  - first field of the row, parsed as ``int``
    3. ``k``            - first field of the row, parsed as ``int``
    4. the flattened behaviour-policy weights; the row must hold exactly
       ``num_actions * (k + 1)**m`` values
    5. ``n``            - number of episode rows that follow
    6. ``n`` rows, each converted to a 1-D float array (one per episode)
    7. one more row of floats, returned as ``policy_test_expected``

    Returns
    -------
    tuple
        ``(m, num_actions, k, theta_b, episodes, policy_test_expected)`` where
        ``theta_b`` has shape ``((k + 1)**m, num_actions)``, ``episodes`` is a
        list of float arrays and ``policy_test_expected`` is a float array.

    Raises
    ------
    StopIteration
        If the file ends before all expected rows have been read.
    ValueError
        If a field is not numeric or the weight row has the wrong length.
    """
    # newline='' is what the csv module asks for when handed a file object.
    with open(filepath, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        m = int(next(csv_reader)[0])
        num_actions = int(next(csv_reader)[0])
        k = int(next(csv_reader)[0])
        state_rep_size = (k+1)**m

        # The weights are stored action-major; transpose so that rows index the
        # state representation and columns index the actions.
        # NOTE: the builtin ``float`` replaces the removed ``np.float`` alias
        # (same float64 dtype).
        theta_b = np.array(next(csv_reader), dtype=float).reshape(num_actions, state_rep_size).T

        n = int(next(csv_reader)[0])
        episodes = [np.array(next(csv_reader), dtype=float) for _ in range(n)]

        policy_test_expected = np.array(next(csv_reader), dtype=float)

    return m, num_actions, k, theta_b, episodes, policy_test_expected

def softmax(x):
    """Return the softmax of ``x``, normalised along the first axis.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the mathematical result unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the quotient from becoming ``nan``) for large inputs.

    Parameters
    ----------
    x : array_like
        Logits. A 1-D input yields a single probability vector; for a 2-D
        input each column is normalised independently.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries along axis 0 sum to 1.
        An empty input is returned as an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise (and np.max would reject an empty array).
        return np.exp(x)
    exp_shifted = np.exp(x - np.max(x, axis=0))
    return exp_shifted / np.sum(exp_shifted, axis=0)

def pi(theta,c,s):
    """Action probabilities of a linear softmax policy on cosine features.

    The state ``s`` is projected onto the rows of ``c`` with ``np.dot``, the
    projections are mapped through ``cos(pi * .)`` and flattened into a feature
    vector, and the product of that vector with ``theta`` is passed through
    ``softmax``.

    Parameters
    ----------
    theta : policy weights, multiplied from the left by the feature vector.
    c     : coefficient array combined with the state via ``np.dot(c, s)``.
    s     : the state (scalar or vector, whatever ``np.dot(c, s)`` accepts).

    Returns
    -------
    The result of ``softmax`` applied to the per-action scores.
    """
    projections = np.dot(c, s)
    features = np.cos(np.pi * projections).flatten()
    action_scores = np.dot(features, theta)
    return softmax(action_scores)

def PDIS(H, pi_e, pi_b, gamma=1):
    """Per-decision importance-sampling estimate of the return for one episode.

    Parameters
    ----------
    H : numpy.ndarray
        Flat history laid out as ``state, action, reward`` triples; every third
        entry starting at 0 / 1 / 2 is read as state / action / reward.
    pi_e, pi_b : callable
        Map a state to an indexable of action probabilities for the evaluation
        and behaviour policy respectively.
    gamma : number, optional
        Discount factor; step ``t`` is weighted by ``gamma**t``.

    Returns
    -------
    Sum over steps of ``gamma**t * prod_{j<=t}(pi_e/pi_b) * reward_t``.
    """
    num_steps = int(H.size/3)
    states = H[0::3]
    actions = H[1::3].astype(int)
    rewards = H[2::3]

    # Pair every state with the action taken in it, limited to complete triples.
    visited = list(zip(states[:num_steps], actions[:num_steps]))
    target_probs = np.array([pi_e(state)[action] for state, action in visited])
    behaviour_probs = np.array([pi_b(state)[action] for state, action in visited])

    # Running product of likelihood ratios up to and including each step.
    cumulative_ratio = np.cumprod(target_probs / behaviour_probs)
    discounts = gamma**np.arange(num_steps)
    return np.sum(discounts * cumulative_ratio * rewards)

## Test policy function

In [3]:
# Load the data set and build the behaviour policy from its weights.
m, num_actions, k, theta_b, episodes, policy_test_expected = parse_csv('data.csv')
# Every coefficient vector in {0..k}^m, one per row. product() varies the last
# coordinate fastest; flipping the columns makes the first one vary fastest.
c = np.flip(list(product(range(k+1), repeat=m)), axis=1)
pi_b = partial(pi,theta_b,c)

# Probability the behaviour policy assigns to each action actually taken in the
# first episode; this is compared with the reference values from the file.
H_test = episodes[0]
S_test = H_test[0::3]
A_test = H_test[1::3].astype(int)
policy_test_actual = np.array([pi_b(S_test[i])[A_test[i]] for i in range(S_test.size)])

print(f'num state features (m): {m}')
print(f'num actions: {num_actions}')
print(f'fourier basis order (k): {k}')
print(f'theta_b: \n{theta_b}')
print(f'num episodes: {len(episodes)}')
print(f'first episode: {episodes[0]}')
print(f'policy expected: {policy_test_expected}')
print(f'policy actual: {policy_test_actual}')

# Turn the visual comparison into a real check so a regression fails the run.
# The reference values appear to be rounded to six decimals, hence atol=1e-6.
np.testing.assert_allclose(policy_test_actual, policy_test_expected, rtol=0, atol=1e-6)

num state features (m): 1
num actions: 2
fourier basis order (k): 1
theta_b: 
[[ 0.01  1.  ]
 [-0.01  1.  ]]
num episodes: 200000
first episode: [ 0.419908   1.         0.992628   0.366283   0.        10.
  0.0622811  1.         2.98566    0.327772   1.         1.69524
  0.612293   0.         6.10134  ]
policy expected: [0.775818 0.197512 0.878759 0.819091 0.345012]
policy actual: [0.77581796 0.19751227 0.87875879 0.81909146 0.34501192]


## Test PDIS

In [4]:
# Evaluation policy: all-ones weights. The shape is taken from theta_b so the
# cell keeps working for any m, k and number of actions in the data file
# (for the 2x2 case this is the same array as np.ones(4).reshape([2,2])).
theta_e = np.ones(theta_b.shape)
pi_e = partial(pi,theta_e,c)

# PDIS estimate for pi_e per episode, next to the plain sum of rewards that
# was observed in each episode.
PDIS_array = [PDIS(episode, pi_e, pi_b) for episode in episodes]
b_array = [np.sum(episode[2::3]) for episode in episodes]

print(f'first J(pi_e): {PDIS_array[0]}')
print(f'first J(pi_b): {b_array[0]}')
print(f'mean  J(pi_e): {np.mean(PDIS_array)}')
print(f'mean  J(pi_b): {np.mean(b_array)}')
print(f'std   J(pi_e): {np.std(PDIS_array)}')
print(f'std   J(pi_b): {np.std(b_array)}')

first J(pi_e): 25.69744378193233
first J(pi_b): 21.774867999999998
mean  J(pi_e): 4.113746017365581
mean  J(pi_b): 1.0031260507163062
std   J(pi_e): 19.755544874488958
std   J(pi_b): 6.834558113981664
