# Reward Landscape Visualization

Visualize how reward changes with gripper position, grasping state, and lift height.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from ipywidgets import interact, FloatSlider, IntSlider

%matplotlib inline
plt.style.use('dark_background')

## Reward Function Parameters

In [None]:
# Baseline reward settings -- edit the values below to try other shapings.

# Dense reaching term: (1 - tanh(REACHING_COEFF * dist)) * REACHING_WEIGHT
REACHING_COEFF = 3.0
REACHING_WEIGHT = 0.3

# Bonuses that are only paid while the cube is grasped
GRASP_REWARD = 0.5
LIFT_REWARD = 1.0  # scales the height above the table (height is capped at 1.0)

# Success threshold and the bonus paid when a grasped cube reaches it
SUCCESS_HEIGHT = 0.04  # 4cm above table
SUCCESS_REWARD = 200.0

# Constant cost subtracted on every step
TIME_PENALTY = 0.5

In [None]:
def compute_reward(dist, is_grasping, height_above_table,
                   reaching_weight=REACHING_WEIGHT,
                   reaching_coeff=REACHING_COEFF,
                   grasp_reward=GRASP_REWARD,
                   lift_reward=LIFT_REWARD,
                   success_reward=SUCCESS_REWARD,
                   time_penalty=TIME_PENALTY,
                   success_height=SUCCESS_HEIGHT):
    """Compute the shaped reward for a single step.

    The reward is the sum of a dense reaching term, grasp/lift bonuses that
    are only paid while grasping, a success bonus, and a constant time penalty.

    Args:
        dist: Scalar distance between gripper and cube (metres).
        is_grasping: Truthy when the cube is currently grasped.
        height_above_table: Scalar height of the cube above the table (metres).
        reaching_weight: Scale of the reaching term; also its value at dist=0.
        reaching_coeff: Steepness of the tanh fall-off with distance.
        grasp_reward: Flat bonus added on every grasping step.
        lift_reward: Multiplier applied to the height (capped at 1.0) while
            grasping.
        success_reward: Bonus added on every step where the cube is grasped
            and at or above ``success_height``.
        time_penalty: Constant amount subtracted on every step.
        success_height: Height threshold (metres) for the success bonus.
            Defaults to the module-level ``SUCCESS_HEIGHT``.

    Returns:
        The scalar reward for this step.
    """
    # Reaching reward: equals reaching_weight at dist=0 and decays toward 0
    reward = (1 - np.tanh(reaching_coeff * dist)) * reaching_weight

    # Grasp, lift and success terms are all gated on holding the cube
    if is_grasping:
        reward += grasp_reward
        # Only the upper end is capped; heights are passed through otherwise
        reward += min(height_above_table, 1.0) * lift_reward
        if height_above_table >= success_height:
            reward += success_reward

    # Time penalty
    return reward - time_penalty

## 1. Reaching Reward vs Distance (comparing coefficients)

In [None]:
# Distance sweep shared by both panels (metres)
distances = np.linspace(0, 1.0, 100)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Raw tanh curves for different coefficients
ax = axes[0]
for coeff in [2, 3, 5, 10]:
    reach_reward = 1 - np.tanh(coeff * distances)
    ax.plot(distances, reach_reward, label=f'coeff={coeff}', linewidth=2)

# Draw the reference line BEFORE building the legend: legend() only collects
# artists that already exist, so adding it afterwards drops its label.
ax.axvline(x=0.3, color='yellow', linestyle='--', alpha=0.5, label='typical reach dist')

ax.set_xlabel('Distance to cube (m)', fontsize=12)
ax.set_ylabel('Reaching reward (before weight)', fontsize=12)
ax.set_title('Reaching Reward Gradient\n(lower coeff = gentler gradient)', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

# Right: Net reward (reaching - time_penalty) for different coefficients
ax = axes[1]
for coeff in [2, 3, 5, 10]:
    reach_reward = (1 - np.tanh(coeff * distances)) * REACHING_WEIGHT
    net_reward = reach_reward - TIME_PENALTY
    ax.plot(distances, net_reward, label=f'coeff={coeff}', linewidth=2)

# Zero line: where the reaching term exactly cancels the time penalty
ax.axhline(y=0, color='white', linestyle='-', alpha=0.3)
ax.set_xlabel('Distance to cube (m)', fontsize=12)
ax.set_ylabel('Net reward per step', fontsize=12)
ax.set_title(f'Net Reward (reach×{REACHING_WEIGHT} - penalty {TIME_PENALTY})\n(closer to 0 = better signal)', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Reward Heatmap: Distance vs Height (when grasping)

In [None]:
# Create grid: distance in metres along x, height in metres along y
distances = np.linspace(0, 0.5, 50)
heights = np.linspace(0, 0.1, 50)
D, H = np.meshgrid(distances, heights)

# Evaluate the scalar reward function at every grid point, for both
# the grasping and the non-grasping state
rewards_grasping = np.zeros_like(D)
rewards_not_grasping = np.zeros_like(D)

for idx in np.ndindex(D.shape):
    rewards_grasping[idx] = compute_reward(D[idx], True, H[idx])
    rewards_not_grasping[idx] = compute_reward(D[idx], False, H[idx])

# Derive the threshold line and label from the constants so the figure
# stays correct when SUCCESS_HEIGHT / SUCCESS_REWARD are edited.
success_height_cm = SUCCESS_HEIGHT * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Not grasping
ax = axes[0]
im = ax.contourf(D, H * 100, rewards_not_grasping, levels=20, cmap='RdYlGn')
plt.colorbar(im, ax=ax, label='Reward/step')
ax.set_xlabel('Distance to cube (m)', fontsize=12)
ax.set_ylabel('Height above table (cm)', fontsize=12)
ax.set_title('Reward Landscape: NOT Grasping', fontsize=14)
ax.axhline(y=success_height_cm, color='cyan', linestyle='--', linewidth=2, label='Success height')
ax.legend()

# Right: Grasping
ax = axes[1]
# Mask out the success region: its bonus would otherwise dominate the
# colour scale and hide the reaching/lift gradient below the threshold
rewards_grasping_masked = np.where(H >= SUCCESS_HEIGHT, np.nan, rewards_grasping)
im = ax.contourf(D, H * 100, rewards_grasping_masked, levels=20, cmap='RdYlGn')
plt.colorbar(im, ax=ax, label='Reward/step')
ax.set_xlabel('Distance to cube (m)', fontsize=12)
ax.set_ylabel('Height above table (cm)', fontsize=12)
ax.set_title('Reward Landscape: Grasping (before success bonus)', fontsize=14)
ax.axhline(y=success_height_cm, color='cyan', linestyle='--', linewidth=2,
           label=f'Success height (+{SUCCESS_REWARD:g}!)')
ax.legend()

plt.tight_layout()
plt.show()

## 3. Cumulative Episode Reward: Different Behaviors

In [None]:
# Episode horizon: number of timesteps simulated per scripted episode
max_steps = 500
# Timestep indices 0 .. max_steps - 1
steps = np.arange(max_steps)

def simulate_episode(behavior, success_step=None):
    """Simulate the cumulative reward trace of one scripted episode.

    Args:
        behavior: One of 'random_far', 'hover_near', 'grasp_hold' or
            'success'.
        success_step: Timestep at which the 'success' behavior reaches the
            success height. Required for 'success', ignored otherwise. The
            first 30% of that span is the approach, the next 30% holds the
            grasp, and the remaining 40% lifts the cube.

    Returns:
        A list with the cumulative reward after each simulated step. It has
        ``max_steps`` entries unless the episode ends early on success.

    Raises:
        ValueError: If ``behavior`` is not recognised, or if it is 'success'
            and ``success_step`` was not given.
    """
    # Behaviors that hold one fixed (dist, is_grasping, height) state throughout
    static_states = {
        'random_far': (0.5, False, 0),     # Far from cube, never grasps
        'hover_near': (0.05, False, 0),    # Near cube but never grasps
        'grasp_hold': (0.0, True, 0.01),   # Grasps but never lifts
    }
    if behavior != 'success' and behavior not in static_states:
        raise ValueError(f'unknown behavior: {behavior!r}')
    if behavior == 'success' and success_step is None:
        raise ValueError("behavior 'success' requires success_step")

    grasp_height = 0.01  # height held while grasping, before the lift starts
    cumulative = 0
    rewards = []

    for t in range(max_steps):
        done = False
        if behavior != 'success':
            dist, grasping, height = static_states[behavior]
        elif t < success_step * 0.3:  # Approaching
            progress = t / (success_step * 0.3)
            dist, grasping, height = 0.5 * (1 - progress), False, 0
        elif t < success_step * 0.6:  # Grasping
            dist, grasping, height = 0.0, True, grasp_height
        elif t < success_step:  # Lifting
            lift_progress = (t - success_step * 0.6) / (success_step * 0.4)
            # Ramp toward, but not past, the success height: lift_progress is
            # below 1 here, so the success bonus is not paid during the lift
            # and is collected only on the terminal step.
            height = grasp_height + lift_progress * (SUCCESS_HEIGHT - grasp_height)
            dist, grasping = 0.0, True
        else:  # Success!
            dist, grasping, height = 0.0, True, SUCCESS_HEIGHT + 0.01
            done = True  # Episode ends on success

        r = compute_reward(dist=dist, is_grasping=grasping, height_above_table=height)
        cumulative += r
        rewards.append(cumulative)
        if done:
            break

    return rewards

# Compare cumulative-reward traces of the scripted behaviors on one axis
fig, ax = plt.subplots(figsize=(12, 6))

# (behavior key, legend text, line colour) for episodes that never succeed
behaviors = [
    ('random_far', 'Random (far from cube)', 'red'),
    ('hover_near', 'Hover near cube (no grasp)', 'orange'),
    ('grasp_hold', 'Grasp and hold (no lift)', 'yellow'),
]

# Solid lines: non-succeeding behaviors; the legend shows the final total
for behavior, label, color in behaviors:
    rewards = simulate_episode(behavior)
    final_total = rewards[-1]
    # plot(y) uses 0..len(y)-1 as x, i.e. the timestep index
    ax.plot(rewards, color=color, linewidth=2,
            label=f'{label}: {final_total:.0f}')

# Dashed lines: successful episodes finishing at different timesteps
for success_step, color in [(100, 'lime'), (200, 'green'), (400, 'darkgreen')]:
    rewards = simulate_episode('success', success_step=success_step)
    final_total = rewards[-1]
    ax.plot(rewards, color=color, linewidth=2, linestyle='--',
            label=f'Success @ step {success_step}: {final_total:.0f}')

# Axis decoration; the legend sits outside the axes on the right
ax.set_title('Episode Reward Accumulation: Different Behaviors', fontsize=14)
ax.set_xlabel('Timestep', fontsize=12)
ax.set_ylabel('Cumulative Reward', fontsize=12)
ax.axhline(y=0, color='white', linestyle='-', alpha=0.3)
ax.grid(True, alpha=0.3)
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1))

plt.tight_layout()
plt.show()

## 4. Interactive Parameter Tuning

In [None]:
def plot_reward_landscape(reaching_weight, reaching_coeff, grasp_reward, 
                          lift_reward, success_reward, time_penalty):
    """Draw three diagnostic panels for one set of reward parameters.

    Panels: (1) per-step reward vs distance, with and without the grasp
    bonus; (2) per-step reward vs lift height while grasping at dist=0;
    (3) total episode reward for a few scripted outcomes.

    Args:
        reaching_weight: Scale of the reaching term.
        reaching_coeff: Steepness of the tanh fall-off with distance.
        grasp_reward: Flat bonus per grasping step.
        lift_reward: Multiplier on the height above the table while grasping.
        success_reward: Bonus for a grasped cube at or above the success height.
        time_penalty: Constant amount subtracted on every step.
    """

    def step_reward(dist, is_grasping, height):
        # Per-step reward under the parameters passed to this plot
        return compute_reward(dist, is_grasping, height, reaching_weight,
                              reaching_coeff, grasp_reward, lift_reward,
                              success_reward, time_penalty)

    def success_total(n_steps):
        # Simplified success episode: hold the grasp at 1 cm for n_steps - 1
        # steps, then one step above the success height. The success bonus is
        # therefore counted exactly once rather than on every step.
        hold = step_reward(0, True, 0.01)
        return (n_steps - 1) * hold + step_reward(0, True, SUCCESS_HEIGHT + 0.01)

    episode_steps = 500  # horizon used for the non-succeeding outcomes

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # 1. Reaching gradient
    ax = axes[0]
    distances = np.linspace(0, 1.0, 100)
    reach = (1 - np.tanh(reaching_coeff * distances)) * reaching_weight
    net = reach - time_penalty
    ax.plot(distances, net, 'cyan', linewidth=2, label='Not grasping')
    ax.plot(distances, net + grasp_reward, 'lime', linewidth=2, label='Grasping')
    ax.axhline(y=0, color='white', linestyle='--', alpha=0.5)
    ax.set_xlabel('Distance (m)')
    ax.set_ylabel('Reward/step')
    ax.set_title('Reward vs Distance')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 2. Lift progression (includes the jump from the success bonus)
    ax = axes[1]
    heights = np.linspace(0, 0.1, 100)
    lift_rewards = [step_reward(0, True, h) for h in heights]
    ax.plot(heights * 100, lift_rewards, 'lime', linewidth=2)
    ax.axvline(x=SUCCESS_HEIGHT * 100, color='cyan', linestyle='--', label='Success threshold')
    ax.set_xlabel('Height above table (cm)')
    ax.set_ylabel('Reward/step')
    ax.set_title('Reward vs Lift Height (grasping)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # 3. Episode outcomes
    ax = axes[2]
    outcomes = {
        f'Far\n({episode_steps} steps)': step_reward(0.5, False, 0) * episode_steps,
        f'Near\n({episode_steps} steps)': step_reward(0.05, False, 0) * episode_steps,
        f'Grasp+Hold\n({episode_steps} steps)': step_reward(0, True, 0.01) * episode_steps,
        'Success\n@ step 100': success_total(100),
        'Success\n@ step 300': success_total(300),
    }
    colors = ['red', 'orange', 'yellow', 'lime', 'green']
    bars = ax.bar(list(outcomes.keys()), list(outcomes.values()), color=colors)
    ax.axhline(y=0, color='white', linestyle='-', alpha=0.3)
    ax.set_ylabel('Total Episode Reward')
    ax.set_title('Episode Outcomes')

    # Add value labels just outside each bar: above positive bars, below
    # negative ones, so the text never overlaps the bar itself
    label_pad = 5
    for bar, val in zip(bars, outcomes.values()):
        is_positive = val >= 0
        ax.text(bar.get_x() + bar.get_width() / 2,
                val + label_pad if is_positive else val - label_pad,
                f'{val:.0f}', ha='center',
                va='bottom' if is_positive else 'top', fontsize=10)

    plt.tight_layout()
    plt.show()

# Interactive sliders. Each slider starts at the matching default constant, so
# editing the default parameters carries over to the widget's initial state.
# The trailing semicolon suppresses the notebook's echo of the return value.
interact(plot_reward_landscape,
         reaching_weight=FloatSlider(value=REACHING_WEIGHT, min=0.0, max=1.0, step=0.1, description='reach_wt'),
         reaching_coeff=FloatSlider(value=REACHING_COEFF, min=1.0, max=10.0, step=0.5, description='reach_coeff'),
         grasp_reward=FloatSlider(value=GRASP_REWARD, min=0.0, max=2.0, step=0.1, description='grasp'),
         lift_reward=FloatSlider(value=LIFT_REWARD, min=0.0, max=3.0, step=0.1, description='lift'),
         success_reward=FloatSlider(value=SUCCESS_REWARD, min=0.0, max=500.0, step=10.0, description='success'),
         time_penalty=FloatSlider(value=TIME_PENALTY, min=0.0, max=1.0, step=0.1, description='penalty'));

## 5. 3D Surface: Reward vs (Distance, Height)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Create grid: distance (m) along x, height (m) along y
distances = np.linspace(0, 0.5, 30)
heights = np.linspace(0, 0.06, 30)
D, H = np.meshgrid(distances, heights)

# Compute rewards (grasping case). success_reward=0 removes the bonus so the
# reaching/lift gradient is not flattened by a large step at the threshold.
rewards = np.zeros_like(D)
for i in range(D.shape[0]):
    for j in range(D.shape[1]):
        rewards[i, j] = compute_reward(D[i, j], True, H[i, j], success_reward=0)

success_height_cm = SUCCESS_HEIGHT * 100

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

surf = ax.plot_surface(D, H * 100, rewards, cmap='viridis', alpha=0.8)
ax.set_xlabel('Distance to cube (m)')
ax.set_ylabel('Height above table (cm)')
ax.set_zlabel('Reward/step')
ax.set_title('3D Reward Surface (Grasping, excl. success bonus)\n'
             f'Goal: Agent should move toward origin (dist=0) and up (height={success_height_cm:g}cm+)')

# Add success threshold plane. A vertical plane at the success height needs
# a grid that varies in distance AND reward; holding both y and z constant
# would collapse the surface to a single line.
reward_span = np.linspace(rewards.min(), rewards.max(), len(heights))
plane_dist, plane_reward = np.meshgrid(distances, reward_span)
ax.plot_surface(plane_dist, np.full_like(plane_dist, success_height_cm), plane_reward,
                alpha=0.2, color='cyan')

plt.colorbar(surf, ax=ax, shrink=0.5, label='Reward/step')
plt.tight_layout()
plt.show()

## Summary: Current Reward Structure

With the current parameters:
- `reaching_weight=0.3`, `reaching_coeff=3.0`
- `grasp_reward=0.5`, `lift_reward=1.0`
- `success_reward=200.0`, `time_penalty=0.5`

**Per-step rewards:**
| State | Reward/step |
|-------|-------------|
| Far from cube | ~ -0.5 |
| Near cube (no grasp) | ~ -0.2 |
| Grasping (not lifting) | ~ +0.3 |
| Lifting (1–4 cm, below success height) | ~ +0.31 to +0.34 |
| Success | +200 bonus |

**Episode totals:**
| Behavior | Total Reward |
|----------|-------------|
| Random far (500 steps) | ~ -250 |
| Hover near (500 steps) | ~ -100 |
| Grasp+hold (500 steps) | ~ +150 |
| Success @ 100 steps | ~ +280 |
| Success @ 300 steps | ~ +260 |