In [3]:
import sys
sys.path.append("../src")

import numpy as np
from PDGenv import PDGEnv
from agents import make_agents
from loop import run_episode

# 1) Create environment
# The author annotates size=10 as yielding 100 agents (presumably a 10x10
# lattice -- confirm against PDGEnv in ../src).
env = PDGEnv(size=10)  # 100 agents

# 2) Create agents (choose parameters)
# NOTE(review): the meaning of A, beta and epsilon is defined by make_agents
# in ../src/agents.py and is not visible from this notebook; p_init is
# presumably the initial value of each agent's p -- confirm.
A = 0.5
beta = 1.0
epsilon = 0.2
agents = make_agents(env.n_agents, A=A, beta=beta, epsilon=epsilon, p_init=0.5)

# 3) Run 1 episode
# Episode length and seed are named so that the labels printed below cannot
# drift away from the values actually passed to run_episode.
tmax = 25
seed = 123
actions_hist, rewards_hist, p_hist = run_episode(env, agents, tmax=tmax, seed=seed)

# 4) Print sanity info
# The histories are indexed by timestep first. The last entry is labelled
# t = tmax - 1, which assumes run_episode records exactly tmax entries per
# history -- TODO confirm.
last_t = tmax - 1
print("Mean reward at t=0:", rewards_hist[0].mean())
print(f"Mean reward at t={last_t}:", rewards_hist[-1].mean())
print("Mean p at t=0:", p_hist[0].mean())
print(f"Mean p at t={last_t}:", p_hist[-1].mean())

# Show the actions of the first 5 agents for the first 3 timesteps
print("Actions (t=0..2, agents 0..4):")
print(actions_hist[:3, :5])


Mean reward at t=0: 2.355
Mean reward at t=24: 2.17
Mean p at t=0: 0.46796011426346934
Mean p at t=24: 0.3896721130871995
Actions (t=0..2, agents 0..4):
[[0 1 1 1 1]
 [0 1 0 1 1]
 [0 1 0 1 1]]


In [14]:
import numpy as np

# Sanity checks on the episode histories and on the environment's payoffs.
# Each check prints the observed values first (so they stay visible in the
# output) and then asserts the expectation, so that a violation stops
# "Run All" instead of passing silently.
#
# NOTE(review): checks [3]-[5] call env.step on the same `env` that was used
# for the episode. If step() keeps internal state, later cells may see a
# modified environment -- confirm, or use a fresh PDGEnv for these probes.
print("===== SANITY CHECK START =====")

# 1) p bounds
# p is presumably a probability, so every recorded value must lie in [0, 1].
print("\n[1] Check p bounds")
print("p min:", p_hist.min())
print("p max:", p_hist.max())
assert 0.0 <= p_hist.min() and p_hist.max() <= 1.0, "p left the [0, 1] interval"

# 2) reward bounds
# Assumes payoffs between 0 and 5, as implied by the expected values used in
# checks [3]-[5] below (5, 3, 1 and a neighbour mean of 2.25).
print("\n[2] Check reward bounds")
print("reward min:", rewards_hist.min())
print("reward max:", rewards_hist.max())
assert 0.0 <= rewards_hist.min() and rewards_hist.max() <= 5.0, (
    "reward outside the expected [0, 5] range"
)

# 3) All-Cooperate test
# Action 1 is used as "cooperate" here; every agent should receive 3.
print("\n[3] All-Cooperate (all C) test")
actions_all_C = np.ones(env.n_agents, dtype=int)
rewards_all_C = env.step(actions_all_C)
print("Mean reward (expected 3):", rewards_all_C.mean())
print("Unique rewards:", np.unique(rewards_all_C))
assert np.allclose(rewards_all_C, 3.0), "all-C rewards should all equal 3"

# 4) All-Defect test
# Action 0 is used as "defect" here; every agent should receive 1.
print("\n[4] All-Defect (all D) test")
actions_all_D = np.zeros(env.n_agents, dtype=int)
rewards_all_D = env.step(actions_all_D)
print("Mean reward (expected 1):", rewards_all_D.mean())
print("Unique rewards:", np.unique(rewards_all_D))
assert np.allclose(rewards_all_D, 1.0), "all-D rewards should all equal 1"

# 5) One defector among cooperators
print("\n[5] One defector test")
actions_one_D = np.ones(env.n_agents, dtype=int)
defector = 0
actions_one_D[defector] = 0

rewards_one_D = env.step(actions_one_D)
print("Defector reward (expected 5):", rewards_one_D[defector])
assert np.isclose(rewards_one_D[defector], 5.0), "lone defector should receive 5"

# Each neighbour of the defector is expected to receive 2.25. This is
# consistent with a mean over four neighbours, (3 + 3 + 3 + 0) / 4, but the
# averaging rule lives in PDGEnv and is not visible here.
neighbors = env.neighbors[defector]
print("Neighbors of defector:", neighbors)
print("Neighbors rewards (expected ~2.25):", rewards_one_D[neighbors])
print("Neighbors mean reward:", rewards_one_D[neighbors].mean())
assert np.allclose(rewards_one_D[neighbors], 2.25), (
    "neighbours of the lone defector should each receive 2.25"
)

print("\n===== SANITY CHECK END =====")


===== SANITY CHECK START =====

[1] Check p bounds
p min: 1.8749981389863832e-47
p max: 1.0

[2] Check reward bounds
reward min: 0.0
reward max: 5.0

[3] All-Cooperate (all C) test
Mean reward (expected 3): 3.0
Unique rewards: [3.]

[4] All-Defect (all D) test
Mean reward (expected 1): 1.0
Unique rewards: [1.]

[5] One defector test
Defector reward (expected 5): 5.0
Neighbors of defector: [90, 10, 9, 1]
Neighbors rewards (expected ~2.25): [2.25 2.25 2.25 2.25]
Neighbors mean reward: 2.25

===== SANITY CHECK END =====
