In [4]:
import gymnasium as gym
import openai
import re
from minigrid.wrappers import FlatObsWrapper
from openai import OpenAI
import os
import pickle
import minigrid
import pprint
import json
import dotenv
import numpy as np
from dotenv import load_dotenv
from minigrid.core.world_object import Door, Key, Goal, Wall, Lava

  from pkg_resources import resource_stream, resource_exists


In [6]:
def describe_environment(env):
    grid = env.unwrapped.grid
    width, height = grid.width, grid.height
    description = [f"Environment: {env.spec.id}, {width}x{height} grid"]

    # Get all directions
    directions = ['right', 'down', 'left', 'up']
    for i, direction in enumerate(directions):
        description.append(f"Direction {i}: {direction}")

    for action in range(env.action_space.n):
        description.append(f"Action {action}: {env.unwrapped.actions(action).name}")

    # Store object positions as tuples
    object_positions = []
    wall_positions = []
    
    for x in range(width):
        for y in range(height):
            obj = grid.get(x, y)
            if obj is not None:
                obj_desc = ''
                if isinstance(obj, Wall):
                    wall_desc = f"{x, y}"
                    wall_positions.append(wall_desc)
                elif isinstance(obj, Door):
                    obj_desc += f"Door at {x, y}, is locked: True"
                    object_positions.append(obj_desc)
                    door_pos = [x, y]
                elif isinstance(obj, Key):
                    obj_desc += f"Key at {x, y}"
                    object_positions.append(obj_desc)
                    key_pos = [x, y]
                elif isinstance(obj, Goal):
                    obj_desc += f"Goal at {x, y}"
                    object_positions.append(obj_desc)
                    goal_pos = [x, y]
    agent_pos = list(int(x) for x in env.unwrapped.agent_pos)
    agent_dir = int(env.unwrapped.agent_dir)
    description.append(f"Agent starts at {tuple(agent_pos)}; facing direction {agent_dir}")
    description.append(f"Mission: {env.unwrapped.mission}")
    description.append(f"Object positions: {object_positions}")
    description.append(f"Wall positions: {wall_positions}")
    
    return "\n".join(description), agent_pos, door_pos, key_pos, goal_pos

# Example usage
seed = 9
env = gym.make("MiniGrid-DoorKey-8x8-v0")
# env = gym.make("MiniGrid-DoorKey-8x8-v0", render_mode="rgb_array")
env = FlatObsWrapper(env)  # Flatten the observation space
env = gym.wrappers.RecordEpisodeStatistics(env)
# env = gym.wrappers.RecordVideo(env, "videos/test")
# env.reset(seed=seed)
#test_env, agent_pos, door_pos, key_pos, goal_pos = describe_environment(env)
#
#print(agent_pos, door_pos, key_pos, goal_pos)

In [20]:
def get_trajectory_from_llm(env_description):
    """
    Get a trajectory from LLM for solving the environment
    
    Args:
        env_description (str): Description of the environment
        model (str): Model to use for generation
        temperature (float, optional): Temperature for generation
    
    Returns:
        str: Generated trajectory response
    """
    # Load environment variables
    load_dotenv()

    # Initialize OpenAI client
    client = OpenAI(
        api_key=os.getenv('OPENAI_API_KEY'),
        base_url=os.getenv('OPENAI_BASE_URL')
    )

    # Define prompts
    system_prompt = 'You are a helpful AI assistant who is expert at ReinforcementLearning'
    user_prompt = f'''I have this environment: {env_description}. 
    Please generate ONLY ONE trajectory to solve this environment in this exact format:
    {{"action": <action_number>}}
    Where action_number is an integer.
    I need ONLY the trajectory in that format. Do not ADD any other words.'''

    try:
        # Create chat completion
        completion = client.chat.completions.create(
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        
        response = completion.choices[0].message.content
        return response

    except Exception as e:
        print(f"Error with OpenAI API: {e}")
        return None


response = get_trajectory_from_llm(test_env)
print(response)

{"action": 2}
{"action": 3}
{"action": 2}
{"action": 2}
{"action": 1}
{"action": 1}
{"action": 1}
{"action": 0}
{"action": 2}
{"action": 5}
{"action": 2}
{"action": 0}
{"action": 2}
{"action": 2}
{"action": 1}
{"action": 1}
{"action": 2}


In [20]:
def parse_json(response):
    """
    Parse JSON response from LLM into list of trajectory dictionaries
    
    Args:
        response (str): JSON response string from LLM
        
    Returns:
        list: List of trajectory dictionaries with action/observation pairs
    """

    # Handle empty or None response
    if not response:
        return []
    
    # Strip code block markers if they exist
    response = response.strip()
    if response.startswith('```'):
        # Remove first line containing ```json or similar
        response = '\n'.join(response.split('\n')[1:])
    if response.endswith('```'):
        # Remove last line containing ```
        response = '\n'.join(response.split('\n')[:-1])
    if response.startswith('['):
        # Remove first line containing [ or similar
        response = '\n'.join(response.split('\n')[1:])
    if response.endswith(']'):
        # Remove last line containing ]
        response = '\n'.join(response.split('\n')[:-1])

    # Extract just the JSON part from the response
    # Convert lines into list of dictionaries
    trajectories = []
    for line in response.split('\n'):
        line = line.strip()
        if line.startswith('{') and line.endswith('}'):
            try:
                trajectory = json.loads(line)
                trajectories.append(trajectory)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
                continue

    return trajectories


trajectories = parse_json(response)

print(trajectories)

# Create a dictionary with metadata and trajectories
data_to_save = {
    "seed": seed,
    "trajectories": trajectories
}

# Save to JSON file with proper formatting
output_path = "trajectories.json"
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data_to_save, f, indent=2, ensure_ascii=False)

print(f"Saved trajectories to {output_path}")


NameError: name 'response' is not defined

In [None]:
def load_trajectory_from_json(json_file):
    """Load trajectory data for a specific seed from JSON file"""
    with open(json_file, 'r') as f:
        data = json.load(f)
    trajectories = data['trajectories']
    seed = data['seed']
    return seed, trajectories

# Example usage
#seed, trajectories = load_trajectory_from_json('/trajectory_seed_746.json')

print(seed)
print(trajectories)

FileNotFoundError: [Errno 2] No such file or directory: 'human_expert_data/trajectory_seed_746.json'

In [None]:
def simulate_environment(env, seed, trajectory):
    """
    Simulate environment with given trajectory and collect state-action pairs
    
    Args:
        env: Gymnasium environment
        seed (int): Seed for environment reproducibility
        trajectory (list): List of trajectory dictionaries with actions
    
    Returns:
        list: List of state-action pair dictionaries
    """
    # Recreate the environment with seed
    env.reset(seed=seed)

    # State-action pair
    state_action_pairs = []

    # Extract actions from trajectory data
    actions = [t['action'] for t in trajectory]
    
    # Store state-action pairs
    for step_idx, action in enumerate(actions):
        next_obs, reward, terminated, truncated, _ = env.step(action)
        
        # Store as dictionary
        pair = {
            'state': next_obs,
            'action': action
        }
        state_action_pairs.append(pair)

        if terminated or truncated:
            break

    return state_action_pairs

# Example usage
state_action_pairs = simulate_environment(env, seed, trajectories)
print(state_action_pairs)

NameError: name 'trajectories' is not defined

In [25]:
def save_trajectory_pkl(seed, state_action_pairs, trajectory_index):
    # Create trajectory_data_2 directory if it doesn't exist
    os.makedirs('trajectory_data_2', exist_ok=True)

    # Generate filename with trajectory index and seed (matching human_expert_data_2 format)
    output_path = os.path.join('trajectory_data_2', f'trajectory_{trajectory_index}_seed_{seed}.pkl')

    # Save to pickle file
    with open(output_path, 'wb') as f:
        pickle.dump(state_action_pairs, f)
    
    print(f"Saved trajectories to {output_path}")

# Usage example:
save_trajectory_pkl(seed, state_action_pairs, trajectory_index=0)

# # To load the data later:
# def load_trajectory(seed):
#     """Load trajectory data for a specific seed"""
#     filepath = os.path.join('trajectory_data_2', f'trajectories_seed{seed}.pkl')
#     with open(filepath, 'rb') as f:
#         data = pickle.load(f)
#     return data

# check = load_trajectory(seed=seed)

# print(check)


NameError: name 'state_action_pairs' is not defined

In [None]:
def validate_trajectory(env, trajectory_data, goal_position, seed):
    """
    Validates if a sequence of actions reaches the goal position.
    
    Args:
        env: Gymnasium environment
        actions: List of actions to execute
        goal_position: Target position to reach [x, y]
        seed: Random seed for environment (optional)
    
    Returns:
        bool: True if trajectory reaches goal, False otherwise
    """

    # Extract actions from trajecotry data
    actions = [t['action'] for t in trajectory_data]

    # Execute actions and track progress
    for step_idx, action in enumerate(actions):
        next_obs, reward, terminated, truncated, next_info = env.step(action)
        agent_pos = list(int(x) for x in env.unwrapped.agent_pos)
        agent_dir = env.unwrapped.agent_dir
        print(f"Step {step_idx + 1}: Action {action}: {env.unwrapped.actions(action).name}")
        print(f"Position: {agent_pos}, Direction: {agent_dir}")
        
        # Check if reached goal successfully
        if terminated and reward > 0 and agent_pos == goal_position:
            return True
            
        # Check if trajectory failed
        if truncated:
            return False
    
    # If exit the loop without reaching the goal
    return False

#Load trajectories from JSON file
with open('trajectories.json', 'r') as f:
    data = json.load(f)
    trajectory_data = data['trajectories']

print(trajectory_data)

#Reset environment with seed if provided
env.reset(seed=seed)
success = validate_trajectory(env, trajectory_data, goal_pos, seed=1)
print(f"Trajectory {'succeeded' if success else 'failed'} to reach goal")

[{'action': 2, 'observation': 'Agent moves forward to (1, 5), facing up'}, {'action': 1, 'observation': 'Agent turns right, now facing right'}, {'action': 2, 'observation': 'Agent moves forward to (2, 5), facing right'}, {'action': 2, 'observation': 'Agent moves forward to (3, 5), facing right'}, {'action': 0, 'observation': 'Agent turns left, now facing up'}, {'action': 2, 'observation': 'Agent moves forward to (3, 4), facing up'}, {'action': 2, 'observation': 'Agent moves forward to (3, 3), facing up'}, {'action': 2, 'observation': 'Agent moves forward to (3, 2), facing up, cannot move further'}, {'action': 1, 'observation': 'Agent turns right, now facing right'}, {'action': 2, 'observation': 'Agent moves forward to (4, 2), facing right'}, {'action': 0, 'observation': 'Agent turns left, now facing up'}, {'action': 2, 'observation': 'Agent moves forward to (4, 1), facing up'}, {'action': 3, 'observation': 'Agent picks up the key at (4, 1)'}, {'action': 1, 'observation': 'Agent turns r

In [None]:
def simulate_and_save_trajectory(env, data_dir):
    for file in os.listdir(data_dir):
        if file.endswith(".json"):
            seed, trajectory = load_trajectory_from_json(os.path.join(data_dir, file))
            trajectory_index = int(file.split('_')[1])
            state_action_pairs = simulate_environment(env, seed, trajectory)
            save_trajectory_pkl(seed, state_action_pairs, trajectory_index)
            print(f"Generated and saved state-actions pairs for seed {seed}")

simulate_and_save_trajectory(env, "example file")

Saved trajectories to trajectory_data_2\trajectory_0_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_10_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_11_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_12_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_13_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_14_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_15_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_16_seed_0.pkl
Generated and saved state-actions pairs for seed 0
Saved trajectories to trajectory_data_2\trajectory_17_seed_0.pkl
Generate

In [None]:
def generate_and_save_multiple_trajectories(env, num_seeds, start_seed):
    """
    Generate and save multiple trajectories to PKL files
    
    Args:
        env: Gymnasium environment
        num_seeds (int): Number of different seeds to generate
        start_seed (int): Starting seed number
    """
    for seed_idx in range(num_seeds):
        current_seed = start_seed + seed_idx
        
        # Reset environment with new seed
        env.reset(seed=current_seed)
        env_desc = describe_environment(env)
        
        # Get trajectory from LLM and get the state-action pairs
        response = get_trajectory_from_llm(env_desc)
        trajectories = parse_json(response)
        state_action_pairs = simulate_environment(env, current_seed, trajectories)
        
        # Save to PKL file
        save_trajectory_pkl(current_seed, state_action_pairs)
        print(f"Generated and saved state-actions pairs for seed {current_seed}")

# Generate trajectories for 10 different seeds
generate_and_save_multiple_trajectories(env, num_seeds=5, start_seed=6)



Saved trajectories to trajectory_data\trajectories_seed6.pkl
Generated and saved state-actions pairs for seed 6
Saved trajectories to trajectory_data\trajectories_seed7.pkl
Generated and saved state-actions pairs for seed 7
Saved trajectories to trajectory_data\trajectories_seed8.pkl
Generated and saved state-actions pairs for seed 8
Saved trajectories to trajectory_data\trajectories_seed9.pkl
Generated and saved state-actions pairs for seed 9
Saved trajectories to trajectory_data\trajectories_seed10.pkl
Generated and saved state-actions pairs for seed 10

Total trajectories saved: 11
Seed 0: 17 steps
Seed 1: 20 steps
Seed 10: 1 steps
Seed 2: 26 steps
Seed 3: 22 steps
Seed 4: 16 steps
Seed 5: 14 steps
Seed 6: 17 steps
Seed 7: 18 steps
Seed 8: 10 steps
Seed 9: 24 steps


In [13]:
# Verify the saved file
import glob
saved_files = glob.glob('trajectory_data/trajectories_seed*.pkl')
print(f"\nTotal trajectories saved: {len(saved_files)}")
for file in saved_files:
    seed = file.split('seed')[-1].split('.')[0]
    data = load_trajectory(int(seed))
    print(f"Seed {seed}: {data}")


Total trajectories saved: 190


NameError: name 'load_trajectory' is not defined