In [3]:
import cvxpy as cp
import numpy as np
import math

# IMPORTANT!!

## Note: the sparse-feedback and end-of-trajectory (EOT) sections define classes and functions with the same names. Run the two sections sequentially, top to bottom: each section's definitions overwrite those of the section before it.

## 1) experiments with end of trajectory feedback setting

In [4]:
class GridWorld:
    """Grid world with slippery moves whose trajectories are rated with one of ``k`` labels.

    Positions are ``(x, y)`` tuples on a ``size`` x ``size`` grid that holds
    one impassable wall cell, a terminal goal cell, a terminal danger cell
    and a set of collectible coins.  Actions 0/1 change ``x`` and actions
    2/3 change ``y``.  ``reset`` must be called before ``step``: it is the
    method that creates ``pos``, ``t`` and ``collected``.
    """

    # Reward multiplier indexed by the number of coins collected so far.
    # NOTE(review): only four entries, so collecting more than three coins
    # raises IndexError -- confirm whether larger coin sets must be supported.
    _COIN_WEIGHTS = (0.1, 1.0, 2.0, 3.0)
    # Upper edge of the scaled-reward range that is cut into k equal bins.
    _SCALED_REWARD_MAX = 32.92

    def __init__(self,k,d,size=8,danger=[7,1],goal=[4,5],wall=[2,5],coins=[(1,6),(4,2),(5,5)], horizon=50, noise=0.1):
        """Store the layout; the list defaults are only read, never mutated.

        Args:
            k: number of ordinal feedback labels (bins).
            d: feature dimension; stored but not otherwise used by this class.
            size: side length of the square grid.
            danger, goal, wall: ``(x, y)`` cells, converted to tuples.
            coins: iterable of ``(x, y)`` coin cells.
            horizon: maximum number of steps per episode.
            noise: label-noise level used by ``get_feedback_and_features``.
        """
        self.noise = noise
        self.k=k
        self.d=d
        self.size = size
        self.horizon = horizon
        self.goal = tuple(goal)
        self.danger = tuple(danger)
        self.wall = tuple(wall)
        self._init_coins = tuple(map(tuple, coins))
        self.coins = set(self._init_coins)
        self.collected_coins = set()
        self.done = 0

    def reset(self):
        """Start a new episode at ``(0, 7)`` with all coins restored; return the start cell."""
        self.done = 0
        self.pos = (0,7)
        self.t = 0
        self.collected = 0
        self.collected_coins = set()
        self.coins = set(self._init_coins)
        return self.pos

    def step(self, intended_action):
        """Apply one (possibly slipped) move and return ``(pos, done)``.

        The intended action is executed with probability 0.91; each of the
        other three actions replaces it with probability 0.03.  Moves are
        clipped at the grid border and a move into the wall cell leaves the
        position unchanged.  The episode ends at the goal, at the danger
        cell, or once ``horizon`` steps have been taken.
        """
        probs = np.full(4, 0.03)
        probs[intended_action] = 0.91
        action = np.random.choice(4, p=probs)
        x, y = self.pos
        if action == 0:
            x = max(0, x-1)            # up
        elif action == 1:
            x = min(self.size-1, x+1)  # down
        elif action == 2:
            y = max(0, y-1)            # left
        elif action == 3:
            y = min(self.size-1, y+1)  # right
        if (x, y) != self.wall:
            self.pos = (x, y)
        if self.pos in self.coins:
            self.collected += 1
            self.collected_coins.add(self.pos)
            self.coins.remove(self.pos)
        self.t += 1
        self.done = ((self.t >= self.horizon) or (self.pos==self.goal) or (self.pos==self.danger))
        return self.pos, self.done

    def _noise_free_label(self):
        """Quantise the reward of the current state into a label in ``0..k-1``.

        The reward is zero at the danger cell; otherwise it combines a coin
        and goal bonus with a penalty that grows linearly in ``t``.  Twice
        that value is binned on ``k`` equal-width bins over
        ``[0, _SCALED_REWARD_MAX]``.  Values at or above the top edge get
        ``k-1`` and values below zero get ``0``.
        """
        true_reward = (1-(self.pos==self.danger))*(self._COIN_WEIGHTS[self.collected]*(self.collected + 1.32*(self.pos==self.goal)) - 5*(self.t-14)/self.horizon + 5*36/50)
        scaled_reward = 2*true_reward
        edges = np.linspace(0, self._SCALED_REWARD_MAX, self.k+1)
        # Index i with edges[i] <= scaled_reward < edges[i+1].  The clip sends
        # out-of-range rewards to the nearest bin; without it a negative
        # reward (possible when horizon > 50) would be rated as the top label.
        bin_index = np.searchsorted(edges, scaled_reward, side="right") - 1
        return int(np.clip(bin_index, 0, self.k-1))

    def get_feedback_and_features(self):
        """Return ``(noisy_label, features)`` for the current state.

        The noise-free label is kept with probability
        ``1 - noise + noise/k``; the remaining mass is spread evenly over
        the other labels.  Exactly one ``np.random.choice`` draw is made.
        """
        feedback = self._noise_free_label()

        # label noise
        p_correct = 1-self.noise+self.noise/self.k
        rem = 1-p_correct
        # With a single label there is nothing to spread the remainder over.
        rem_distributed = rem / (self.k - 1) if self.k > 1 else 0.0
        probs = np.full(self.k, rem_distributed)
        probs[feedback] = p_correct

        feedback_given = np.random.choice(np.arange(self.k), p=probs)
        return feedback_given,self._features()

    def true_return(self):
        """Return the noise-free label of the current state (no randomness)."""
        return self._noise_free_label()

    def _features(self):
        """return trajectory features phi(tau)

        Layout: ``[dist_to_goal, dist_to_danger, at_goal, at_danger]``
        followed by one collected-indicator per initial coin, as floats.
        Distances are Manhattan distances from the current position.
        """
        x, y = self.pos
        xg, yg = self.goal
        xd, yd = self.danger
        dist_to_goal = abs(x-xg) + abs(y-yg)
        dist_to_danger = abs(x-xd) + abs(y-yd)
        at_danger = int(self.pos == self.danger)
        at_goal = int(self.pos == self.goal and (at_danger==0))
        coin_indicator = [int(c in self.collected_coins) for c in self._init_coins]
        return np.array([dist_to_goal, dist_to_danger, at_goal, at_danger] + coin_indicator, dtype=float)
        

def softmax(logits):
    """Return the softmax of ``logits``, shifted by the maximum for numerical stability."""
    shifted = logits - np.max(logits)
    weights = np.exp(shifted)
    return weights / weights.sum()

class Policy:
    """Tabular softmax policy: one row of action logits per grid cell."""

    def __init__(self, grid_size, action_dim):
        """Create a uniform policy (all logits equal to one)."""
        self.grid_size = grid_size
        self.action_dim = action_dim
        self.state_dim = grid_size * grid_size
        self.theta = np.ones((self.state_dim, self.action_dim))

    def state_index(self, state):
        """Flatten an ``(x, y)`` cell into its row index in ``theta``."""
        row, col = state[0], state[1]
        return row * self.grid_size + col

    def act(self, state):
        """Sample an action for ``state``; return ``(action, action_probabilities)``."""
        probs = softmax(self.theta[self.state_index(state)])
        action = np.random.choice(len(probs), p=probs)
        return action, probs

    def grad_log_prob(self, state, action):
        """Return (state_index, grad_row) with grad_row shape (action_dim,)
           grad_row[j] = 1{j==action} - pi(j|s)"""
        s_idx = self.state_index(state)
        # Start from -pi(.|s) and add the indicator of the chosen action.
        grad_row = np.negative(softmax(self.theta[s_idx]))
        grad_row[action] += 1.0
        return s_idx, grad_row
    


class RewardModel:
    """Multinomial-logistic model of the label distribution P(y | phi).

    ``W`` has shape ``(k, d)``; the logits of a feature vector ``phi`` are
    ``W @ phi``.  ``C`` scales the optimism bonus in ``optimistic_reward``.
    """

    def __init__(self, k,d,C=0.0):
        self.k = k
        self.d = d
        self.W = np.zeros((k,d))
        self.C = C

    def estimate_W(self, X, Y, reg=1e-3):
        """Fit ``W`` by regularised maximum likelihood; store and return it.

        Args:
            X: 2-D array with one feature vector per row.
            Y: labels, one per row of ``X``; each must lie in ``0..k-1``.
            reg: weight of the squared Frobenius-norm penalty on ``W``.

        The objective is the mean softmax cross-entropy plus the penalty,
        solved with cvxpy's MOSEK backend.
        """
        n, _ = X.shape
        W = cp.Variable((self.k, self.d))  # optimization variable, not vectorized, shape(k,d)
        loss = 0
        for i in range(n):
            phi = X[i]
            yi = int(Y[i])
            logits = W @ phi
            ## yi cannot be more than (k-1), so if assigning deterministic rewards without crafting W*, be careful
            loss += -(logits[yi] - cp.log_sum_exp(logits))

        loss = loss/n + reg*cp.norm(W, "fro")**2
        prob = cp.Problem(cp.Minimize(loss))
        prob.solve(solver=cp.MOSEK)

        self.W = W.value
        return self.W

    def reward_probabilities(self, phi):
        """estimate P(y|tau)"""
        logits = self.W @ phi
        logits-= np.max(logits)
        exp_logits = np.exp(logits)
        return exp_logits/np.sum(exp_logits)

    def reward_estimate(self,phi):
        """returns the average, expected reward given the reward probabilities"""
        # Evaluate the softmax once instead of once per label.
        probs = self.reward_probabilities(phi)
        return np.sum(np.arange(self.k) * probs)

    def optimistic_reward(self, phi, n_samples):
        """
        optimism term included

        Returns the expected label plus ``C / sqrt(max(n_samples, 1))``,
        capped at the largest label ``k - 1``.
        """
        base = self.reward_estimate(phi)
        n = max(n_samples, 1)
        bonus = self.C / np.sqrt(n)
        optimistic = base + bonus
        return min(optimistic, self.k - 1)


###----------Training loop-------------###
def train(N=20,m=50,k=6,eta=0.1,epsilon=0.1,grid_size=8,danger=[7,1],goal=[0,7], wall=[2,5] ,horizon=50,coins=None,seed=0,noise=0.1):
    """Train a tabular softmax policy from end-of-trajectory ordinal feedback.

    Each outer iteration ``n`` runs a block of policy-gradient updates
    against the current reward model, then rolls out one more trajectory,
    asks the environment for its noisy label and refits the reward model
    on all labels collected so far.

    Args:
        N: number of outer iterations (one labelled trajectory each).
        m: trajectories sampled per policy-gradient update.
        k: number of feedback labels.
        eta: policy-gradient step size.
        epsilon: unused.
        grid_size, danger, goal, wall, horizon, coins, noise: forwarded to
            ``GridWorld``; ``coins=None`` selects the three default coins.
            NOTE(review): the default ``goal`` here is ``[0, 7]``, which is
            also the cell ``GridWorld.reset`` starts from -- confirm intent.
        seed: seed for the global numpy random generator.

    Returns:
        ``(policy, reward_model, avg_true_rewards, avg_est_rewards, W,
        steps, queries)``.  The two lists hold, per outer iteration, the
        mean noise-free label and the mean optimistic estimate over the
        last batch of rollouts.  ``steps`` counts policy-gradient updates
        and ``queries`` counts the labels used to fit the reward model.
    """
    np.random.seed(seed)
    queries = 0
    steps = 0
    if coins is None:
        coins=[(1,6),(4,2),(5,5)]
    print("hi")
    d = 4+len(coins)
    env = GridWorld(k,d,size=grid_size, danger=danger, goal=goal, wall=wall, coins=coins, horizon=horizon, noise=noise)
    policy = Policy(grid_size=grid_size, action_dim=4)

    ## initialize weights w_0
    reward_model = RewardModel(k, d, C=10.0)
    reward_model.W = np.zeros((k,d))

    all_data_X, all_data_Y = [], []
    avg_true_rewards,avg_coins,avg_est_rewards = [], [], []
    for n in range(N):
        print(n)
        avg_true_reward_this_iter = 0
        avg_est_reward_this_iter = 0
        # 10 gradient updates per iteration for the first 100 iterations, 100
        # afterwards.  A plain else covers n == 100, which the former
        # "n<100 / n>100" pair left on the stale value.
        G_range = 10 if n < 100 else 100

        for g in range(G_range):
            steps+=1
            rollout_trajectories = []
            avg_est_reward_this_iter = 0
            rewards = []
            for i in range(m): ## sample trajectories under current policy pi to approximate the theoretical expectation
                s = env.reset()
                traj = {"states": [], "actions": [], "steps":0, "coins":0}
                done = False

                while not done:
                    a, _ = policy.act(s)
                    traj["states"].append(s)
                    traj["actions"].append(a)
                    s, done = env.step(a)

                traj["steps"] = env.t  # actual length; the episode may end before the horizon
                traj["coins"] = env.collected
                # Only phi feeds the update, but the label draw is kept because
                # it advances the global random stream.
                y, phi = env.get_feedback_and_features()
                phi = np.array(phi, dtype=float)
                rollout_trajectories.append((traj, phi, y))
                rewards.append(env.true_return())

            ## now with these m rollouts, approximate the expectation of estimated reward under policy pi
            grad_theta = np.zeros_like(policy.theta)
            n_samples = max(len(all_data_X), 1)
            R_hats = [reward_model.optimistic_reward(phi, n_samples) for _, phi, _ in rollout_trajectories]
            b = float(np.mean(R_hats))  # baseline

            for (traj,phi_tau,y), r_hat in zip(rollout_trajectories,R_hats):
                avg_est_reward_this_iter+=r_hat/len(rollout_trajectories)
                advantage = r_hat-b

                # REINFORCE: every visited (state, action) is credited with the
                # baseline-corrected trajectory reward.
                for state,action in zip(traj["states"],traj["actions"]):
                    s_idx, grad_row = policy.grad_log_prob(state, action)
                    grad_theta[s_idx] += grad_row*advantage

            grad_theta = grad_theta/len(rollout_trajectories)
            policy.theta += eta*grad_theta
            avg_true_reward_this_iter = np.mean(rewards)

        # Roll out one trajectory with the updated policy and query its label.
        done = False
        s = env.reset()
        while not done:
            a,_ = policy.act(s)
            s,done = env.step(a)
        y,phi = env.get_feedback_and_features()
        queries += 1
        all_data_X.append(phi)
        all_data_Y.append(y)

        # update estimate of weight matrix W (one solve per iteration; a second
        # solve on the same data would only repeat the work)
        reward_model.W = reward_model.estimate_W(np.array(all_data_X),np.array(all_data_Y), reg = 1e-3)

        # coins are logged from the last rollout of the last gradient update
        traj,phi,y = rollout_trajectories[-1]
        coins_this_iter = traj["coins"]

        ## storing some info
        avg_est_rewards.append(avg_est_reward_this_iter)
        avg_true_rewards.append(avg_true_reward_this_iter)
        avg_coins.append(coins_this_iter)

        print(f"Iter {n:02d}: avg_estimated_reward={avg_est_rewards[-1]:.3f}, avg_true_reward={avg_true_rewards[-1]:.3f},coins_this_episode={avg_coins[-1]:.2f}")

    return policy, reward_model, avg_true_rewards, avg_est_rewards, reward_model.W,steps,queries




In [None]:
import numpy as np
import matplotlib.pyplot as plt

# End-of-trajectory feedback, label noise 0.1: one learning curve per seed.
N_EPISODES = 200
seeds = [0, 1, 2, 3, 4]

all_curves_01 = []

for seed in seeds:
    policy, rm, avg_true, avg_est, W, steps, queries = train(
        N=N_EPISODES,
        m=20,
        k=6,
        eta=0.5,
        grid_size=8,
        danger=(7, 1),
        goal=(4, 5),
        wall=(2, 5),
        coins=[(1, 6), (4, 2), (5, 5)],
        seed=seed,
        noise=0.1
    )
    all_curves_01.append(np.array(avg_true, dtype=float))

# rows are seeds, columns are outer iterations
all_curves_01 = np.stack(all_curves_01, axis=0)

mean_curve_01 = all_curves_01.mean(axis=0)
std_curve_01  = all_curves_01.std(axis=0)

x_01 = np.arange(N_EPISODES)

# Plot against x_01: a bare `x` is not defined until a later cell, so using
# it here raised NameError on a fresh kernel.
plt.figure(figsize=(7,4))
plt.plot(x_01, mean_curve_01, label="mean avg_true over seeds")
plt.fill_between(x_01, mean_curve_01 - std_curve_01, mean_curve_01 + std_curve_01,
                 alpha=0.2, label="±1 std over seeds")
plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("avg feedback per episode averaged over seeds")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# End-of-trajectory feedback, label noise 0.5: one learning curve per seed.
N_EPISODES = 200
seeds = [0, 1, 2, 3, 4]

all_curves_05 = []
for seed in seeds:
    policy, rm, avg_true, avg_est, W, steps, queries = train(
        N=N_EPISODES, m=20, k=6, eta=0.5,
        grid_size=8, danger=(7, 1), goal=(4, 5), wall=(2, 5),
        coins=[(1, 6), (4, 2), (5, 5)],
        seed=seed, noise=0.5,
    )
    all_curves_05.append(np.array(avg_true, dtype=float))

# rows are seeds, columns are outer iterations
all_curves_05 = np.stack(all_curves_05, axis=0)
mean_curve_05 = np.mean(all_curves_05, axis=0)
std_curve_05 = np.std(all_curves_05, axis=0)
x_05 = np.arange(N_EPISODES)

# mean curve with a one-standard-deviation band across seeds
plt.figure(figsize=(7,4))
plt.plot(x_05, mean_curve_05, label="mean avg_true over seeds")
plt.fill_between(
    x_05,
    mean_curve_05 - std_curve_05,
    mean_curve_05 + std_curve_05,
    alpha=0.2,
    label="±1 std over seeds",
)
plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("avg feedback per episode averaged over seeds")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# End-of-trajectory feedback, label noise 0.8: one learning curve per seed.
N_EPISODES = 200
seeds = [0, 1, 2, 3, 4]

all_curves_08 = []
for seed in seeds:
    policy, rm, avg_true, avg_est, W, steps, queries = train(
        N=N_EPISODES, m=20, k=6, eta=0.5,
        grid_size=8, danger=(7, 1), goal=(4, 5), wall=(2, 5),
        coins=[(1, 6), (4, 2), (5, 5)],
        seed=seed, noise=0.8,
    )
    all_curves_08.append(np.array(avg_true, dtype=float))

# rows are seeds, columns are outer iterations
all_curves_08 = np.stack(all_curves_08, axis=0)
mean_curve_08 = np.mean(all_curves_08, axis=0)
std_curve_08 = np.std(all_curves_08, axis=0)
x_08 = np.arange(N_EPISODES)

# mean curve with a one-standard-deviation band across seeds
plt.figure(figsize=(7,4))
plt.plot(x_08, mean_curve_08, label="mean avg_true over seeds")
plt.fill_between(
    x_08,
    mean_curve_08 - std_curve_08,
    mean_curve_08 + std_curve_08,
    alpha=0.2,
    label="±1 std over seeds",
)
plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("avg feedback per episode averaged over seeds")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-noise-level statistics across seeds (stacking an already stacked
# array leaves it unchanged, so this cell can be re-run).
all_curves_01 = np.stack(all_curves_01, axis=0)
mean_01 = np.mean(all_curves_01, axis=0)
std_01 = np.std(all_curves_01, axis=0)

all_curves_05 = np.stack(all_curves_05, axis=0)
mean_05 = np.mean(all_curves_05, axis=0)
std_05 = np.std(all_curves_05, axis=0)

all_curves_08 = np.stack(all_curves_08, axis=0)
mean_08 = np.mean(all_curves_08, axis=0)
std_08 = np.std(all_curves_08, axis=0)

N_EPISODES = len(mean_01)
x = np.arange(N_EPISODES)

# One figure, three curves, each with its one-standard-deviation band.
plt.figure(figsize=(7, 4))

plt.plot(x, mean_01, label="noise = 0.1")
plt.fill_between(x, y1=mean_01 - std_01, y2=mean_01 + std_01, alpha=0.15)

plt.plot(x, mean_05, label="noise = 0.5")
plt.fill_between(x, y1=mean_05 - std_05, y2=mean_05 + std_05, alpha=0.15)

plt.plot(x, mean_08, label="noise = 0.8")
plt.fill_between(x, y1=mean_08 - std_08, y2=mean_08 + std_08, alpha=0.15)

plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("Average feedback per episode (mean ± 1 std over seeds)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
## optionally, save these for comparing algorithms later
# import numpy as np

# np.save('mean_01.npy', mean_01)
# np.save('std_01.npy', std_01)
# np.save('mean_05.npy', mean_05)
# np.save('std_05.npy', std_05)
# np.save('mean_08.npy', mean_08)
# np.save('std_08.npy', std_08)


## 2) experiments with sparse feedback setting

In [None]:
class GridWorld:
    """Grid world with slippery moves whose trajectories are rated with one of ``k`` labels.

    Positions are ``(x, y)`` tuples on a ``size`` x ``size`` grid that holds
    one impassable wall cell, a terminal goal cell, a terminal danger cell
    and a set of collectible coins.  Actions 0/1 change ``x`` and actions
    2/3 change ``y``.  ``reset`` must be called before ``step``: it is the
    method that creates ``pos``, ``t`` and ``collected``.
    """

    # Reward multiplier indexed by the number of coins collected so far.
    # NOTE(review): only four entries, so collecting more than three coins
    # raises IndexError -- confirm whether larger coin sets must be supported.
    _COIN_WEIGHTS = (0.1, 1.0, 2.0, 3.0)
    # Upper edge of the scaled-reward range that is cut into k equal bins.
    _SCALED_REWARD_MAX = 32.92

    def __init__(self,k,d,size=8,danger=[7,1],goal=[4,5],wall=[2,5],
                 coins=[(1,6),(4,2),(5,5)], horizon=50, noise=0.1):
        """Store the layout; the list defaults are only read, never mutated.

        Args:
            k: number of ordinal feedback labels (bins).
            d: feature dimension; stored but not otherwise used by this class.
            size: side length of the square grid.
            danger, goal, wall: ``(x, y)`` cells, converted to tuples.
            coins: iterable of ``(x, y)`` coin cells.
            horizon: maximum number of steps per episode.
            noise: label-noise level used by ``get_feedback_and_features``.
        """
        self.noise = noise
        self.k = k
        self.d = d
        self.size = size
        self.horizon = horizon
        self.goal = tuple(goal)
        self.danger = tuple(danger)
        self.wall = tuple(wall)
        self._init_coins = tuple(map(tuple, coins))
        self.coins = set(self._init_coins)
        self.collected_coins = set()
        self.done = 0

    def reset(self):
        """Start a new episode at ``(0, 7)`` with all coins restored; return the start cell."""
        self.done = 0
        self.pos = (0,7)
        self.t = 0
        self.collected = 0
        self.collected_coins = set()
        self.coins = set(self._init_coins)
        return self.pos

    def step(self, intended_action):
        """Apply one (possibly slipped) move and return ``(pos, done)``.

        The intended action is executed with probability 0.91; each of the
        other three actions replaces it with probability 0.03.  Moves are
        clipped at the grid border and a move into the wall cell leaves the
        position unchanged.  The episode ends at the goal, at the danger
        cell, or once ``horizon`` steps have been taken.
        """
        probs = np.full(4, 0.03)
        probs[intended_action] = 0.91
        action = np.random.choice(4, p=probs)
        x, y = self.pos
        if action == 0:
            x = max(0, x-1)            # up
        elif action == 1:
            x = min(self.size-1, x+1)  # down
        elif action == 2:
            y = max(0, y-1)            # left
        elif action == 3:
            y = min(self.size-1, y+1)  # right
        if (x, y) != self.wall:
            self.pos = (x, y)
        if self.pos in self.coins:
            self.collected += 1
            self.collected_coins.add(self.pos)
            self.coins.remove(self.pos)
        self.t += 1
        self.done = ((self.t >= self.horizon) or
                     (self.pos==self.goal) or
                     (self.pos==self.danger))
        return self.pos, self.done

    def _noise_free_label(self):
        """Quantise the reward of the current state into a label in ``0..k-1``.

        The reward is zero at the danger cell; otherwise it combines a coin
        and goal bonus with a penalty that grows linearly in ``t``.  Twice
        that value is binned on ``k`` equal-width bins over
        ``[0, _SCALED_REWARD_MAX]``.  Values at or above the top edge get
        ``k-1`` and values below zero get ``0``.
        """
        true_reward = (1-(self.pos==self.danger))*(
            self._COIN_WEIGHTS[self.collected]*(self.collected + 1.32*(self.pos==self.goal))
            - 5*(self.t-14)/self.horizon + 5*36/50
        )
        scaled_reward = 2*true_reward
        edges = np.linspace(0, self._SCALED_REWARD_MAX, self.k+1)
        # Index i with edges[i] <= scaled_reward < edges[i+1].  The clip sends
        # out-of-range rewards to the nearest bin; without it a negative
        # reward (possible when horizon > 50) would be rated as the top label.
        bin_index = np.searchsorted(edges, scaled_reward, side="right") - 1
        return int(np.clip(bin_index, 0, self.k-1))

    def get_feedback_and_features(self):
        """Return ``(noisy_label, features)`` for the current state.

        The noise-free label is kept with probability
        ``1 - noise + noise/k``; the remaining mass is spread evenly over
        the other labels.  Exactly one ``np.random.choice`` draw is made.
        """
        feedback = self._noise_free_label()

        # label noise
        p_correct = 1 - self.noise + self.noise/self.k
        rem = 1 - p_correct
        # With a single label there is nothing to spread the remainder over.
        rem_distributed = rem / (self.k - 1) if self.k > 1 else 0.0
        probs = np.full(self.k, rem_distributed)
        probs[feedback] = p_correct

        feedback_given = np.random.choice(np.arange(self.k), p=probs)
        return feedback_given, self._features()

    def true_return(self):
        """Return the noise-free label of the current state (no randomness)."""
        return self._noise_free_label()

    def _features(self):
        """return trajectory features phi(tau)

        Layout: ``[dist_to_goal, dist_to_danger, at_goal, at_danger]``
        followed by one collected-indicator per initial coin, as floats.
        Distances are Manhattan distances from the current position.
        """
        x, y = self.pos
        xg, yg = self.goal
        xd, yd = self.danger
        dist_to_goal = abs(x-xg) + abs(y-yg)
        dist_to_danger = abs(x-xd) + abs(y-yd)
        at_danger = int(self.pos == self.danger)
        at_goal = int(self.pos == self.goal and (at_danger==0))
        coin_indicator = [int(c in self.collected_coins) for c in self._init_coins]
        return np.array([dist_to_goal, dist_to_danger, at_goal, at_danger] + coin_indicator,
                        dtype=float)
        

def softmax(logits):
    """Softmax over ``logits``; the maximum is subtracted first so ``exp`` cannot overflow."""
    peak = np.max(logits)
    unnormalised = np.exp(logits - peak)
    total = np.sum(unnormalised)
    return unnormalised / total

class Policy:
    """Tabular softmax policy with one logit row per grid cell."""

    def __init__(self, grid_size, action_dim):
        """Initialise every logit to one, i.e. a uniform policy in each cell."""
        self.grid_size = grid_size
        self.action_dim = action_dim
        self.state_dim = grid_size * grid_size
        self.theta = np.ones((self.state_dim, self.action_dim))

    def state_index(self, state):
        """Map an ``(x, y)`` cell to its row in ``theta`` (row-major order)."""
        row, col = state[0], state[1]
        return row * self.grid_size + col

    def act(self, state):
        """Draw an action from pi(.|state); return it with the probabilities used."""
        probs = softmax(self.theta[self.state_index(state)])
        action = np.random.choice(len(probs), p=probs)
        return action, probs

    def grad_log_prob(self, state, action):
        """Return (state_index, grad_row) with grad_row shape (action_dim,)
           grad_row[j] = 1{j==action} - pi(j|s)"""
        s_idx = self.state_index(state)
        # Begin with -pi(.|s), then add the indicator of the action taken.
        grad_row = np.negative(softmax(self.theta[s_idx]))
        grad_row[action] += 1.0
        return s_idx, grad_row
    


class RewardModel:
    """Softmax (multinomial-logistic) model of the label distribution P(y | phi).

    ``W`` is a ``(k, d)`` matrix and ``C`` scales the optimism bonus.
    """

    def __init__(self, k,d,C=0.0):
        self.k, self.d = k, d
        self.C = C
        self.W = np.zeros((k,d))

    def estimate_W(self, X, Y, reg=1e-3):
        """Fit ``W`` on rows of ``X`` with labels ``Y``; store and return it.

        Minimises the mean softmax cross-entropy plus ``reg`` times the
        squared Frobenius norm of ``W`` with cvxpy's MOSEK backend.
        """
        n, d = X.shape
        W = cp.Variable((self.k, self.d))  # optimization variable, shape (k,d)

        def sample_loss(phi, label):
            # negative log-likelihood of one labelled feature vector
            logits = W @ phi
            return -(logits[label] - cp.log_sum_exp(logits))

        loss = sum(sample_loss(X[i], int(Y[i])) for i in range(n))
        objective = loss/n + reg*cp.norm(W, "fro")**2
        prob = cp.Problem(cp.Minimize(objective))
        prob.solve(solver=cp.MOSEK)

        self.W = W.value
        return self.W

    def reward_probabilities(self, phi):
        """estimate P(y|tau)"""
        scores = self.W @ phi
        # subtract the maximum so the exponentials cannot overflow
        weights = np.exp(scores - np.max(scores))
        return weights/np.sum(weights)

    def reward_estimate(self,phi):
        """returns E[y | phi]"""
        labels = np.arange(self.k)
        return (labels * self.reward_probabilities(phi)).sum()

    def optimistic_reward(self, phi, n_samples):
        """
        optimism term included

        Expected label plus ``C / sqrt(max(n_samples, 1))``, capped at ``k - 1``.
        """
        bonus = self.C / np.sqrt(max(n_samples, 1))
        return min(self.reward_estimate(phi) + bonus, self.k - 1)


###----------Training loop with model-based multi-feedback-------------###
def train(
    N=20,
    m=50,
    k=6,
    eta=0.1,
    epsilon=0.1,         # unused, kept for API compatibility
    grid_size=8,
    danger=[7,1],
    goal=[4,5],
    wall=[2,5],
    horizon=50,
    coins=None,
    seed=0,
    noise=0.1,
    feedback_every=10,   # NEW: how often to query feedback within a trajectory
):
    """Train a tabular softmax policy with feedback queried inside trajectories.

    During each rollout the reward model's optimistic prediction is read
    every ``feedback_every`` steps and at the end of the episode; the change
    in prediction since the previous read is that step's reward.  The policy
    is updated with REINFORCE on the resulting rewards-to-go.  After the
    gradient updates of an outer iteration, one more trajectory is rolled
    out, its noisy end-of-trajectory label is added to the data set and the
    reward model is refitted.

    Args:
        N: number of outer iterations (one labelled trajectory each).
        m: trajectories sampled per policy-gradient update.
        k: number of feedback labels.
        eta: policy-gradient step size.
        epsilon: unused.
        grid_size, danger, goal, wall, horizon, coins, noise: forwarded to
            ``GridWorld``; ``coins=None`` selects the three default coins.
        seed: seed for the global numpy random generator.
        feedback_every: query interval in environment steps; must be non-zero.

    Returns:
        ``(policy, reward_model, avg_true_rewards, avg_est_rewards, W,
        steps, queries)``.  The two lists hold, per outer iteration, the
        mean noise-free label and the mean optimistic estimate over the
        last batch of rollouts.  ``steps`` counts environment steps and
        ``queries`` counts calls to ``get_feedback_and_features``.
    """
    np.random.seed(seed)
    queries = 0
    steps = 0
    if coins is None:
        coins=[(1,6),(4,2),(5,5)]
    print("hi")
    d = 4+len(coins)

    env = GridWorld(k,d,size=grid_size, danger=danger, goal=goal,
                    wall=wall, coins=coins, horizon=horizon, noise=noise)
    policy = Policy(grid_size=grid_size, action_dim=4)

    ## initialize weights w_0
    reward_model = RewardModel(k, d, C=10.0)
    reward_model.W = np.zeros((k,d))

    all_data_X, all_data_Y = [], []
    avg_true_rewards, avg_coins, avg_est_rewards = [], [], []

    for n in range(N):              # outer loop over episodes
        print(n)

        # 10 gradient updates per iteration for the first 100 iterations, 100 afterwards
        G_range = 10 if n < 100 else 100

        # Number of labelled samples behind the reward model (for optimism).
        # The data set only grows at the end of an outer iteration, so this
        # is constant for all rollouts below.
        n_samples = max(len(all_data_X), 1)

        for g in range(G_range):    # inner gradient iterations
            rollout_trajectories = []
            # per-g stats
            rewards = []
            est = []
            for i in range(m):  # sample m trajectories under current policy
                s = env.reset()
                traj = {"states": [], "actions": [], "steps": 0,
                        "coins": 0, "step_rewards": []}
                done = False
                prev_y_hat = 0.0     # predicted cumulative label so far

                while not done:
                    a, _ = policy.act(s)
                    traj["states"].append(s)
                    traj["actions"].append(a)
                    s, done = env.step(a)
                    steps += 1

                    step_r = 0.0

                    # MULTI-FEEDBACK: query at intervals or at episode end
                    if (env.t % feedback_every == 0) or done:
                        queries += 1
                        # The noisy label itself is not used here, but the
                        # draw advances the global random stream.
                        y, phi = env.get_feedback_and_features()

                        # model-based predicted cumulative rating
                        y_hat = reward_model.optimistic_reward(phi, n_samples)

                        # incremental reward = change in predicted rating
                        step_r = y_hat - prev_y_hat
                        prev_y_hat = y_hat

                    traj["step_rewards"].append(step_r)

                traj["steps"] = env.t
                traj["coins"] = env.collected

                # The final step always triggers a query (done is True), so
                # prev_y_hat already is the estimate at the terminal features.
                est.append(prev_y_hat)
                rewards.append(env.true_return())

                # reward-to-go for this trajectory (REINFORCE), accumulated
                # from the end and reversed once
                returns = []
                G_t = 0.0
                for r_step in reversed(traj["step_rewards"]):
                    G_t = r_step + G_t
                    returns.append(G_t)
                returns.reverse()

                rollout_trajectories.append((traj, returns))

            # ----- policy gradient update for this g -----
            grad_theta = np.zeros_like(policy.theta)

            # baseline from all returns across all m trajectories
            all_returns = [ret for _, rets in rollout_trajectories for ret in rets]
            b = float(np.mean(all_returns)) if all_returns else 0.0

            for traj, returns in rollout_trajectories:
                for state, action, G_t in zip(traj["states"],
                                              traj["actions"],
                                              returns):
                    s_idx, grad_row = policy.grad_log_prob(state, action)
                    grad_theta[s_idx] += grad_row * (G_t - b)

            grad_theta = grad_theta / len(rollout_trajectories)
            policy.theta += eta * grad_theta

        # statistics of the last batch of rollouts of this outer iteration
        avg_true_reward_this_iter = np.mean(rewards)
        avg_est_reward_this_iter = np.mean(est)

        # ===== end of outer episode n: collect ONE label for reward model =====
        s = env.reset()
        done = False
        while not done:
            a, _ = policy.act(s)
            s, done = env.step(a)
            steps += 1
        queries += 1
        y_final, phi_final = env.get_feedback_and_features()
        phi_final = np.array(phi_final, dtype=float)

        all_data_X.append(phi_final)
        all_data_Y.append(y_final)

        # fit reward model on all collected episode-level samples
        reward_model.W = reward_model.estimate_W(
            np.array(all_data_X), np.array(all_data_Y), reg=1e-3
        )

        # take last trajectory of the last g for coins logging
        last_traj, _ = rollout_trajectories[-1]
        coins_this_iter = last_traj["coins"]

        # store info for plotting
        avg_est_rewards.append(avg_est_reward_this_iter)
        avg_true_rewards.append(avg_true_reward_this_iter)
        avg_coins.append(coins_this_iter)

        print(
            f"Iter {n:02d}: "
            f"avg_estimated_label={avg_est_rewards[-1]:.3f}, "
            f"avg_true_label={avg_true_rewards[-1]:.3f}, "
            f"coins_this_episode={avg_coins[-1]:.2f}"
        )

    return policy, reward_model, avg_true_rewards, avg_est_rewards, reward_model.W, steps, queries


# Single example run of the sparse-feedback trainer (seed 1, label noise 0.8).
# feedback_every is the interval, in environment steps, between feedback
# queries inside a trajectory; sweep over it to compare settings.
trained_policy, trained_reward_model, avg_true, avg_est, W, steps, queries = train(
    # optimisation
    N=200, m=20, k=6, eta=0.5, epsilon=1e-2,
    # environment layout
    grid_size=8,
    danger=[7,1], goal=[4,5], wall=[2,5],
    coins=[(1,6),(4,2),(5,5)],
    # randomness and feedback schedule
    seed=1,
    noise=0.8,
    feedback_every=10,
)
print("Training complete.")


In [None]:
import numpy as np
import matplotlib.pyplot as plt


# ---- sweep configuration ----
N_EPISODES = 200
seeds = [0, 1, 2, 3, 4]             # seeds to average over
feedback_every_list = [1, 10, 20]   # how often to query within a trajectory
noise_levels = [0.1, 0.5, 0.8]      # label noise in env

M = 20
K = 6
ETA = 0.5
GRID_SIZE = 8
DANGER = [7, 1]
GOAL   = [4, 5]
WALL   = [2, 5]
COINS  = [(1, 6), (4, 2), (5, 5)]

# ---- run every (feedback_every, noise) pair over all seeds ----
# results[(feedback_every, noise)] -> {"mean": ..., "std": ...} across seeds
results = {}

for fb_every in feedback_every_list:
    for noise in noise_levels:
        print(f"\n=== feedback_every={fb_every}, noise={noise} ===")
        curves = []

        for seed in seeds:
            print(f"  Seed {seed}")
            policy, rm, avg_true, avg_est, W, steps, queries = train(
                N=N_EPISODES, m=M, k=K, eta=ETA,
                grid_size=GRID_SIZE, danger=DANGER, goal=GOAL, wall=WALL,
                coins=COINS,
                seed=seed, noise=noise, feedback_every=fb_every,
            )
            curves.append(np.array(avg_true, dtype=float))

        curves = np.stack(curves, axis=0)   # shape: [num_seeds, N_EPISODES]
        mean_curve = np.mean(curves, axis=0)
        std_curve = np.std(curves, axis=0)
        results[(fb_every, noise)] = {"mean": mean_curve, "std": std_curve}

x = np.arange(N_EPISODES)

# ---- one figure per noise level, one curve per query interval ----
for noise in noise_levels:
    plt.figure(figsize=(7, 4))
    for fb_every in feedback_every_list:
        key = (fb_every, noise)
        mean_curve, std_curve = results[key]["mean"], results[key]["std"]

        label = f"fb_every={fb_every}"
        plt.plot(x, mean_curve, label=label)
        plt.fill_between(x, mean_curve - std_curve, mean_curve + std_curve, alpha=0.2)

    plt.xlabel("Episode (outer iter n)")
    plt.ylabel("Average feedback per episode (avg_true)")
    plt.title(f"avg feedback per episode (noise={noise})")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

# ---- one figure per query interval, one curve per noise level ----
for fb_every in feedback_every_list:
    plt.figure(figsize=(7, 4))
    for noise in noise_levels:
        key = (fb_every, noise)
        mean_curve, std_curve = results[key]["mean"], results[key]["std"]

        label = f"noise={noise}"
        plt.plot(x, mean_curve, label=label)
        plt.fill_between(x, mean_curve - std_curve, mean_curve + std_curve, alpha=0.2)

    plt.xlabel("Episode (outer iter n)")
    plt.ylabel("Average feedback per episode (avg_true)")
    plt.title(f"avg feedback per episode (feedback_every={fb_every})")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# # --- load end-of-trajectory curves ---
# mean_01 = np.load('mean_01.npy')
# std_01  = np.load('std_01.npy')
# mean_05 = np.load('mean_05.npy')
# std_05  = np.load('std_05.npy')
# mean_08 = np.load('mean_08.npy')
# std_08  = np.load('std_08.npy')

# x axis shared by all curves; mean_01 etc. come from the end-of-trajectory section
episodes = np.arange(len(mean_01))

def get_fb20(noise):
    """Return the (mean, std) curves of the sparse run with feedback_every=20 at ``noise``."""
    entry = results[(20, noise)]
    return entry["mean"], entry["std"]

mean_fb20_01, std_fb20_01 = get_fb20(0.1)
mean_fb20_05, std_fb20_05 = get_fb20(0.5)
mean_fb20_08, std_fb20_08 = get_fb20(0.8)

plt.figure(figsize=(7, 4))

# For each noise level: solid line for end-only feedback, dashed line in the
# same colour for the sparse setting, each with a one-standard-deviation band.

# noise = 0.1
line01 = plt.plot(episodes, mean_01, label='end-only, noise = 0.1')[0]
c01 = line01.get_color()
plt.fill_between(episodes, mean_01 - std_01, mean_01 + std_01, color=c01, alpha=0.2)
plt.plot(episodes, mean_fb20_01, linestyle='--', color=c01, label='sparse fb=20, noise = 0.1')
plt.fill_between(episodes, mean_fb20_01 - std_fb20_01, mean_fb20_01 + std_fb20_01,
                 color=c01, alpha=0.15)

# noise = 0.5
line05 = plt.plot(episodes, mean_05, label='end-only, noise = 0.5')[0]
c05 = line05.get_color()
plt.fill_between(episodes, mean_05 - std_05, mean_05 + std_05, color=c05, alpha=0.2)
plt.plot(episodes, mean_fb20_05, linestyle='--', color=c05, label='sparse fb=20, noise = 0.5')
plt.fill_between(episodes, mean_fb20_05 - std_fb20_05, mean_fb20_05 + std_fb20_05,
                 color=c05, alpha=0.15)

# noise = 0.8
line08 = plt.plot(episodes, mean_08, label='end-only, noise = 0.8')[0]
c08 = line08.get_color()
plt.fill_between(episodes, mean_08 - std_08, mean_08 + std_08, color=c08, alpha=0.2)
plt.plot(episodes, mean_fb20_08, linestyle='--', color=c08, label='sparse fb=20, noise = 0.8')
plt.fill_between(episodes, mean_fb20_08 - std_fb20_08, mean_fb20_08 + std_fb20_08,
                 color=c08, alpha=0.15)

plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("Average feedback per episode: end-of-trajectory vs sparse (fb_every=20)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt


# mean_XX / std_XX (end-of-trajectory curves) and `results` (sparse-feedback
# curves) are read from the kernel namespace; this cell does not define them.
episodes = np.arange(len(mean_01))


def get_fb10(noise):
    """Return the (mean, std) curves stored in `results` for fb_every=10 at `noise`."""
    entry = results[(10, noise)]
    return entry["mean"], entry["std"]


# One row per noise level:
# (noise, end-only mean, end-only std, sparse mean, sparse std).
comparisons = [
    (0.1, mean_01, std_01, *get_fb10(0.1)),
    (0.5, mean_05, std_05, *get_fb10(0.5)),
    (0.8, mean_08, std_08, *get_fb10(0.8)),
]

plt.figure(figsize=(7, 4))

for level, eot_mean, eot_std, sparse_mean, sparse_std in comparisons:
    # Solid line + band: end-of-trajectory feedback. Its auto-assigned colour
    # is reused for the sparse curve so each noise level shares one colour.
    (eot_line,) = plt.plot(episodes, eot_mean, label=f'end-only, noise = {level}')
    shade = eot_line.get_color()
    plt.fill_between(episodes, eot_mean - eot_std, eot_mean + eot_std,
                     alpha=0.2, color=shade)

    # Dashed line + lighter band: sparse feedback every 10 steps. The legend
    # text names fb=10 so it matches the data pulled by get_fb10.
    plt.plot(episodes, sparse_mean, '--', color=shade,
             label=f'sparse fb=10, noise = {level}')
    plt.fill_between(episodes, sparse_mean - sparse_std, sparse_mean + sparse_std,
                     alpha=0.15, color=shade)

plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("Average feedback per episode: end-of-trajectory vs sparse (fb_every=10)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt


# mean_XX / std_XX (end-of-trajectory curves) and `results` (sparse-feedback
# curves) are read from the kernel namespace; this cell does not define them.
episodes = np.arange(len(mean_01))


def get_fb1(noise):
    """Return the (mean, std) curves stored in `results` for fb_every=1 at `noise`."""
    entry = results[(1, noise)]
    return entry["mean"], entry["std"]


# One row per noise level:
# (noise, end-only mean, end-only std, sparse mean, sparse std).
comparisons = [
    (0.1, mean_01, std_01, *get_fb1(0.1)),
    (0.5, mean_05, std_05, *get_fb1(0.5)),
    (0.8, mean_08, std_08, *get_fb1(0.8)),
]

plt.figure(figsize=(7, 4))

for level, eot_mean, eot_std, sparse_mean, sparse_std in comparisons:
    # Solid line + band: end-of-trajectory feedback. Its auto-assigned colour
    # is reused for the sparse curve so each noise level shares one colour.
    (eot_line,) = plt.plot(episodes, eot_mean, label=f'end-only, noise = {level}')
    shade = eot_line.get_color()
    plt.fill_between(episodes, eot_mean - eot_std, eot_mean + eot_std,
                     alpha=0.2, color=shade)

    # Dashed line + lighter band: feedback at every step (fb_every=1). The
    # legend text names fb=1 so it matches the data pulled by get_fb1.
    plt.plot(episodes, sparse_mean, '--', color=shade,
             label=f'sparse fb=1, noise = {level}')
    plt.fill_between(episodes, sparse_mean - sparse_std, sparse_mean + sparse_std,
                     alpha=0.15, color=shade)

plt.xlabel("Episode")
plt.ylabel("Average feedback per episode")
plt.title("Average feedback per episode: end-of-trajectory vs sparse (fb_every=1)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
