In [1]:
# Lost Cities V1E
# Finished 3 grid searches
# Using best determined configuration, long training

In [3]:
# import V1E_main as main
from V1E_main import *

In [5]:
# ALL PARAMETERS
# Module-level configuration for the training run. The values are plain
# globals so that other cells can read them directly.

# --- Network architecture ---
nn_layer_1 = 64
nn_layer_2 = 32
nn_layer_2_dropout = 0.20

# --- Optimisation and replay ---
learning_rate = 0.001
replay_size = 20000
num_episodes = 200_000
batch_size = 64
batch_cnt = 3       # minibatches drawn per training round
train_every = 3     # a training round runs every `train_every` episodes

# --- Reward scaling ---
step_booster = 8.0
episode_booster = 0.10

# Fixed parameters (not in grid): epsilon-greedy exploration schedule
fixed_params = dict(
    epsilon=0.30,
    epsilon_min=0.05,
    epsilon_decay=0.9999,
)

# Step-reward rules to enable.
# Simply comment out any functions not to be included.
step_functions = [
    'lower_val_avail',
    'too_few_pts',
    'blocked_7',
    'exp_small_deck',
    'exp_was_live',
    'good_exp',
    'bad_X',
    'bad_bigger_val',
    'good_low_val',
    'draw_to_tgt',
    # 'had_X',
    'next_value',
    # 'bad_center',
    # 'smart_opp_center'
]

In [11]:
# Training loop: epsilon-greedy self-play with card + draw selection from the policy
def _sample_index(logits):
    """Sample one index from the categorical distribution softmax(logits)."""
    probs = torch.softmax(logits, dim=0)
    return torch.distributions.Categorical(probs).sample().item()


def _report_stuck_state(play_cnt, state, env, actions, draws, valid_draws):
    """Print a diagnostic dump of a game that will not terminate, then abort via SystemExit."""
    player = state['current_player']
    print(play_cnt, actions, draws, valid_draws)
    print(f"\n--- STUCK STATE at play {play_cnt} ---")
    print(f"Deck size: {state['deck_size']}")
    print(f"Player hand: {state['hands'][player]}")
    print("Expeditions:")
    for color in env.expeditions[player]:
        print(f"  {color}: {env.expeditions[player][color]}")
    print("Center piles:")
    for color in env.center_piles:
        print(f"  {color}: {env.center_piles[color]}")
    print(f"Available actions: {actions}")
    print(f"Available draws: {draws}")
    print("----------------------\n")
    raise SystemExit("STOP")


def _actor_critic_update(model, optimizer, minibatch):
    """Run one optimisation step on a minibatch of (features, card_idx, draw_idx, reward) tuples."""
    states_b, actions_b, draws_b, rewards_b = zip(*minibatch)

    # Stack the per-sample feature arrays into one array before building the tensor
    states_t = torch.tensor(np.array(states_b), dtype=torch.float32)
    actions_t = torch.tensor(actions_b, dtype=torch.long)
    draws_t = torch.tensor(draws_b, dtype=torch.long)
    rewards_t = torch.tensor(rewards_b, dtype=torch.float32)

    card_logits_b, draw_logits_b, values_b = model(states_t)
    rows = torch.arange(states_t.shape[0])

    # Log-probability of the card / draw that was actually taken.
    # NOTE(review): this softmax spans all card logits, whereas the rollout
    # samples from only the first len(actions) logits - confirm this is intended.
    log_card_probs_b = torch.log(torch.softmax(card_logits_b, dim=1) + 1e-8)
    selected_log_card_probs = log_card_probs_b[rows, actions_t]
    log_draw_probs_b = torch.log(torch.softmax(draw_logits_b, dim=1) + 1e-8)
    selected_log_draw_probs = log_draw_probs_b[rows, draws_t]

    advantages = rewards_t - values_b.squeeze(1)

    # The critic regresses the value estimate onto the observed reward.
    critic_loss = advantages.pow(2).mean()

    # The advantage only weights the policy gradient; it is detached so the
    # actor losses do not back-propagate into the value head.
    policy_advantages = advantages.detach()
    actor_loss_card = -(selected_log_card_probs * policy_advantages).mean()
    actor_loss_draw = -(selected_log_draw_probs * policy_advantages).mean()

    total_loss = critic_loss + actor_loss_card + actor_loss_draw

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()


def train_model(fv, reward_params, nn_layer_1, nn_layer_2_dropout,
                learning_rate, replay_size, step_booster, episode_booster,
                step_functions, file_name):
    """Train an ActorCritic model by self-play on LostCitiesEnv.

    Each episode plays one game with epsilon-greedy card and draw selection.
    When the game is over, every play is written to the replay buffer with the
    reward ``episode_booster * final_score_of_that_player + step_reward``.
    Every ``train_every`` episodes, ``batch_cnt`` minibatches of ``batch_size``
    samples are drawn from the buffer and used for an actor-critic update.

    Parameters
    ----------
    fv : run label; not used inside this function (kept for call compatibility).
    reward_params : passed through to ``compute_step_reward_grid``.
    nn_layer_1, nn_layer_2_dropout : forwarded to the ``ActorCritic`` constructor.
    learning_rate : learning rate of the Adam optimiser.
    replay_size : maximum length of the replay buffer.
    step_booster : scales the step reward in the per-episode ``mean_rewards``
        statistic.
    episode_booster : scales the final score in the reward stored for training.
    step_functions : passed through to ``compute_step_reward_grid``.
    file_name : CSV path to which ``all_rewards`` is written.

    Module-level names read: ``epsilon`` (also annealed in place),
    ``epsilon_min``, ``epsilon_decay``, ``batch_size``, ``train_every``,
    ``batch_cnt``, ``num_episodes``, ``nn_layer_2``, ``state_size``,
    ``card_cnt``, ``color_cnt`` and ``draw_to_index``.

    Side effects: rebinds the globals ``all_rewards`` (two scores per episode,
    P1 then P2) and ``mean_rewards``, prints progress, and writes ``file_name``.

    Returns
    -------
    The trained model, so the caller can save or evaluate it.
    """
    global epsilon, epsilon_min, epsilon_decay
    global batch_size, train_every, batch_cnt
    global all_rewards
    global mean_rewards
    all_rewards = []
    mean_rewards = []

    env = LostCitiesEnv()
    model = ActorCritic(state_size=state_size, action_size=card_cnt, draw_size=color_cnt + 1,
                        nn_layer_1=nn_layer_1, nn_layer_2=nn_layer_2,
                        nn_layer_2_dropout=nn_layer_2_dropout)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    replay_buffer = deque(maxlen=replay_size)
    rule_counter = defaultdict(int)

    for episode in range(1, num_episodes + 1):
        state = env.reset()
        done = False
        mean_reward = 0.0
        play_cnt = 0
        plays_p1 = []
        plays_p2 = []

        while not done:
            play_cnt += 1
            features_np = extract_features(state)
            features = torch.tensor(features_np, dtype=torch.float32)

            # Rollout only: the training step recomputes the outputs from the
            # stored features, so no autograd graph is needed here.
            with torch.no_grad():
                card_logits, draw_logits, _ = model(features)

            # Remember who is moving before env.step advances the game
            current_player = state['current_player']
            actions, draws = env.get_legal_actions(current_player)

            # A draw is usable if it is the deck or a non-empty center pile
            valid_draws = [d for d in draws
                           if d == 'deck' or (d in env.center_piles and env.center_piles[d])]
            if not valid_draws:
                print("No valid draws—forcing episode end.")
                break

            # Safety valve for games that never finish: trace plays
            # 10001..10020, then dump the state and abort the run.
            if 10001 <= play_cnt <= 10020:
                print(f"Legal actions: {actions}")
                print(f"Action indices: {list(range(len(actions)))}")
            if play_cnt >= 10020:
                _report_stuck_state(play_cnt, state, env, actions, draws, valid_draws)

            if not actions:
                print(f"No legal actions for player {current_player}. Ending episode early.")
                break

            # Card choice: random with probability epsilon, otherwise sampled
            # from the policy restricted to the first len(actions) logits.
            if random.random() < epsilon:
                card_idx = random.randint(0, len(actions) - 1)
            else:
                card_idx = _sample_index(card_logits[:len(actions)])
            chosen_action = actions[card_idx]

            # After a discard to the center, that colour's pile may not be
            # drawn from; fall back to the deck if nothing else remains.
            discard_color = chosen_action[1][0] if chosen_action[0] == 'center' else None
            valid_draws = [d for d in valid_draws if d != discard_color] or ['deck']

            # Draw choice: random with probability epsilon, otherwise sampled
            # from the draw logits that correspond to the valid draws.
            if random.random() < epsilon:
                chosen_draw = random.choice(valid_draws)
            else:
                logit_positions = [draw_to_index[d] for d in valid_draws]
                chosen_draw = valid_draws[_sample_index(draw_logits[logit_positions])]

            # Shaped intermediate reward (evaluated on the state before the move)
            step_reward = compute_step_reward_grid(state, chosen_action, chosen_draw, env,
                                                   step_functions, reward_params, rule_counter)
            chosen_draw_idx = draw_to_index[chosen_draw]

            next_state, _, done = env.step(chosen_action, chosen_draw)

            # Plays are held back until the final scores are known.
            # NOTE(review): the stored step_reward is NOT multiplied by
            # step_booster; the booster only affects the mean_rewards
            # statistic below. Confirm whether it should scale the training reward.
            play = (features_np, card_idx, chosen_draw_idx, step_reward)
            if current_player == 'P1':
                plays_p1.append(play)
            else:
                plays_p2.append(play)

            state = next_state
            mean_reward += step_booster * step_reward

        # The scores are computed after the loop so that they also exist when
        # the episode was forced to end early; otherwise the bookkeeping below
        # would raise NameError or reuse the previous episode's scores.
        reward_p1 = env.compute_score('P1')
        reward_p2 = env.compute_score('P2')

        # Rare (about one episode in a million) trace of the P1 reward composition
        ddebug = random.random() < 1e-6
        for p1cnt, (feat, c_idx, d_idx, s_reward) in enumerate(plays_p1, start=1):
            total_reward = episode_booster * reward_p1 + s_reward
            replay_buffer.append((feat, c_idx, d_idx, total_reward))
            if ddebug:
                print(f"P1 {p1cnt} - {episode_booster} * {reward_p1} + {s_reward} = {total_reward}")
        for feat, c_idx, d_idx, s_reward in plays_p2:
            replay_buffer.append((feat, c_idx, d_idx, episode_booster * reward_p2 + s_reward))

        # Average boosted step reward per play (both players pooled)
        mean_reward = mean_reward / play_cnt

        # Annealing
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        if play_cnt > 200:
            print(f"Plays: {play_cnt} in episode {episode}")

        # Train
        if episode % train_every == 0 and len(replay_buffer) >= batch_size:
            for _ in range(batch_cnt):
                _actor_critic_update(model, optimizer, random.sample(replay_buffer, batch_size))

        all_rewards.append(reward_p1)
        all_rewards.append(reward_p2)
        mean_rewards.append(mean_reward)

        if episode % 5000 == 0:
            print("\n=== Step Rule Firing Counts ===")
            for rule, count in sorted(rule_counter.items(), key=lambda x: -x[1]):
                print(f"{rule:<30}: {count}")
            pd.Series(all_rewards).to_csv(file_name, index=False, header=False)

        if episode % 500 == 0:
            # all_rewards holds two scores per episode, so the last 2000
            # entries cover (up to) the last 1000 episodes.
            avg_score = np.mean(all_rewards[-2000:])
            print(f"Episode {episode}, Average Reward Last {min(episode, 1000)}: {avg_score:.2f}, eps={epsilon:.4f}")

    # Final write so that episodes after the last 5000-boundary are not lost
    pd.Series(all_rewards).to_csv(file_name, index=False, header=False)

    return model

In [None]:
from itertools import product
import datetime
import os

# Run-series identifier; it is embedded in the config and reward-CSV file names
running = 100

# Hyperparameter grid. Every key is a keyword argument of train_model and every
# value is the list of candidates to try. Each list currently has one entry, so
# the Cartesian product below yields a single run.
grid = {
    "nn_layer_1": [64],
    "nn_layer_2_dropout": [0.1],
    "learning_rate": [0.001],
    "replay_size": [20000],
    "step_booster": [8.0],
    "episode_booster": [0.1],
    "step_functions": [
        # Enabled step-reward rules; each rule is listed exactly once
        ['good_exp', 'draw_to_tgt', 'too_few_pts', 'good_low_val', 'next_value',
         'bad_bigger_val', 'bad_X', 'lower_val_avail', 'exp_was_live',
         'exp_small_deck', 'bad_center']
    ]
}

# This has all step rewards to explore and evaluate.
# Only the first value of each list is used by the run loop below.
reward_grid = {
    "good_exp":        [0.3],
    "good_exp_1":      [1.5],
    "draw_to_tgt":     [1.5],
    "too_few_pts":     [-0.3],
    "bad_X":           [-1.5], # was -2.0
    "bad_center":      [-1.5], # was -2.0
    "good_low_val":    [1.5],  # was +2.0
    "bad_bigger_val":  [-1.5], # was -2.0
    "next_value":      [0.3],
    "blocked_7":       [-0.3],
    "lower_val_avail": [-1.5]
}

# Create Cartesian product of all combinations
keys, values = zip(*grid.items())
combinations = list(product(*values))

print(f"Total runs: {len(combinations)}")

# Main loop: one training run per hyperparameter combination
for idx, combo in enumerate(combinations):
    params = dict(zip(keys, combo))

    # Run label = position in the grid + wall-clock time (HHMMSS) at launch
    fv = f"grid_{idx:02d}_{datetime.datetime.now().strftime('%H%M%S')}"
    file_name = f"all_rewards.{running}.{fv}.csv"

    # Add file_name to params so it can be passed to train_model
    # (it is therefore also part of the saved configuration)
    params['file_name'] = file_name

    # Save config file
    save_config_txt(running, fv, params, params['step_functions'])

    # Reset the exploration schedule; train_model reads and anneals these
    # module-level variables
    epsilon = fixed_params['epsilon']
    epsilon_min = fixed_params['epsilon_min']
    epsilon_decay = fixed_params['epsilon_decay']

    # Collapse the reward grid to a single value per rule and train
    reward_params = {k: v[0] for k, v in reward_grid.items()}
    train_model(fv, reward_params, **params)

    # Optional: log progress
    print(f"Finished run {idx+1}/{len(combinations)} → {file_name}")

Total runs: 1
Configuration saved to config_100_grid_00_141038.txt
Episode 500, Average Reward Last 1000: -0.14, eps=0.2440
Episode 1000, Average Reward Last 1000: 0.03, eps=0.2385
Episode 1500, Average Reward Last 1000: -0.19, eps=0.2328
Episode 2000, Average Reward Last 1000: -0.44, eps=0.2272
Episode 2500, Average Reward Last 1000: -0.41, eps=0.2215
Episode 3000, Average Reward Last 1000: -0.62, eps=0.2160
Episode 3500, Average Reward Last 1000: -0.74, eps=0.2107
Episode 4000, Average Reward Last 1000: -0.55, eps=0.2047
Episode 4500, Average Reward Last 1000: -0.40, eps=0.1977

=== Step Rule Firing Counts ===
good_exp_1                    : 27718
draw_to_tgt                   : 21729
too_few_pts                   : 18669
good_exp                      : 17693
good_low_val                  : 15089
bad_center                    : 11363
next_value                    : 9835
lower_val_avail               : 4186
bad_bigger_val                : 4186
bad_X                         : 2404
Epis

In [80]:
# torch.save(model.state_dict(), 'lc_model_'+fv+'.pt')

In [87]:
# model.load_state_dict(torch.load('lc_model_v1B_2.pt'))
# model.eval()  # Important: sets model to evaluation mode (no dropout etc.)

ActorCritic(
  (fc1): Linear(in_features=43, out_features=96, bias=True)
  (fc2): Linear(in_features=96, out_features=32, bias=True)
  (dropout): Dropout(p=0.15, inplace=False)
  (policy_action_head): Linear(in_features=32, out_features=18, bias=True)
  (policy_draw_head): Linear(in_features=32, out_features=4, bias=True)
  (value_head): Linear(in_features=32, out_features=1, bias=True)
)