In [1]:
import gymnasium as gym
import skyscraper
from sklearn.neural_network import MLPRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [2]:
trajectories = np.loadtxt("powered_flight.txt", dtype=int)  # Reads as integers
print(trajectories)

[[18 17  1  0 17 13]
 [ 4 61  1  0  4 57]
 [12 20  2  0 11 23]
 ...
 [11 19  1  0 12 16]
 [ 9  8  1  0 10  5]
 [ 5 27  2  0  4 30]]


In [3]:
#Transistion function

def transition_function(trajectories):
    transition_dict = {}

    for step in trajectories:
        state = (step[0], step[1])  # Current state (assuming 2D state)
        action = step[2]  # Action taken
        new_state = (step[4], step[5])  # Next state (assuming 2D state)

        # Store multiple transitions per (state, action) pair
        if (state, action) not in transition_dict:
            transition_dict[(state, action)] = []
        
        transition_dict[(state, action)].append(new_state)

    return transition_dict

print(transition_function(trajectories))

{((18, 17), 1): [(17, 13)], ((4, 61), 1): [(4, 57)], ((12, 20), 2): [(11, 23)], ((17, 10), 2): [(19, 12)], ((8, 15), 1): [(9, 12)], ((3, 38), 1): [(3, 35)], ((18, 17), 2): [(17, 21)], ((5, 56), 1): [(5, 52)], ((4, 40), 2): [(2, 43)], ((3, 59), 2): [(3, 64)], ((3, 57), 2): [(4, 61)], ((2, 54), 1): [(3, 52)], ((10, 14), 2): [(11, 17)], ((19, 58), 2): [(21, 61)], ((2, 56), 1): [(2, 53)], ((8, 11), 2): [(8, 15)], ((4, 54), 2): [(3, 57)], ((11, 18), 1): [(12, 15)], ((6, 55), 1): [(5, 52)], ((6, 49), 1): [(4, 45)], ((4, 61), 2): [(5, 64)], ((3, 61), 1): [(3, 58)], ((7, 64), 2): [(9, 56)], ((10, 25), 1): [(9, 22)], ((1, 55), 2): [(2, 59)], ((4, 57), 2): [(4, 61)], ((2, 48), 2): [(2, 53)], ((7, 14), 2): [(7, 18)], ((4, 20), 2): [(3, 23)], ((12, 60), 2): [(15, 63)], ((9, 26), 1): [(8, 23)], ((14, 21), 2): [(12, 23)], ((22, 14), 1): [(23, 12)], ((4, 43), 2): [(3, 46)], ((5, 7), 2): [(6, 10)], ((2, 28), 2): [(4, 32)], ((2, 43), 1): [(3, 41)], ((2, 54), 2): [(3, 58)], ((8, 55), 2): [(6, 59)], ((11

In [4]:
def reward_function(trajecotries):
    reward_dict = {}

    for step in trajectories:
        state = (step[0], step[1])  # Current state (assuming 2D state)
        action = step[2]  # Action taken
        reward = step[3]

        # Store multiple transitions per (state, action) pair
        if (state, action) not in reward_dict:
            reward_dict[(state, action)] = []
        
        reward_dict[(state, action)].append(reward)

    return reward_dict

print(reward_function(trajectories))

{((18, 17), 1): [0], ((4, 61), 1): [0], ((12, 20), 2): [0], ((17, 10), 2): [0], ((8, 15), 1): [0], ((3, 38), 1): [0], ((18, 17), 2): [0], ((5, 56), 1): [0], ((4, 40), 2): [0], ((3, 59), 2): [0], ((3, 57), 2): [0], ((2, 54), 1): [0], ((10, 14), 2): [0], ((19, 58), 2): [0], ((2, 56), 1): [0], ((8, 11), 2): [0], ((4, 54), 2): [0], ((11, 18), 1): [0], ((6, 55), 1): [0], ((6, 49), 1): [0], ((4, 61), 2): [0], ((3, 61), 1): [0], ((7, 64), 2): [0], ((10, 25), 1): [0], ((1, 55), 2): [0], ((4, 57), 2): [0], ((2, 48), 2): [0], ((7, 14), 2): [0], ((4, 20), 2): [0], ((12, 60), 2): [0], ((9, 26), 1): [0], ((14, 21), 2): [0], ((22, 14), 1): [0], ((4, 43), 2): [0], ((5, 7), 2): [0], ((2, 28), 2): [0], ((2, 43), 1): [0], ((2, 54), 2): [0], ((8, 55), 2): [0], ((11, 18), 2): [0], ((16, 18), 2): [0], ((14, 59), 2): [0], ((13, 10), 2): [0], ((14, 6), 2): [0], ((6, 56), 2): [0], ((7, 58), 2): [0], ((17, 13), 2): [0], ((5, 26), 2): [0], ((2, 14), 2): [0], ((4, 23), 2): [0], ((6, 63), 1): [0], ((10, 19), 2): 

In [5]:
env = gym.make('skyscraper/GridWorld-v0')
env = env.unwrapped 

In [6]:
def eps_greedy(Q, state, action_size, epsilon=0.1):
        """Epsilon-greedy action selection."""
        if np.random.rand() < epsilon:
            return np.random.choice(action_size)  # Explore
        return np.argmax(Q[state, i] for i in range(action_size))  # Exploit

def q_learning(state, action, reward, next_state, Q, alpha, gamma, action_size):
    """Q-learning update rule."""
    best_next_action = np.argmax(Q[next_state, i] for i in range(action_size))
    Q[state, action] += alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])
    

In [7]:
def dyna_q(env, recorded_trajectories, alpha=0.1, gamma=0.95, epsilon=0.1, planning_steps=10, episodes=500):
    """
    Implements the Dyna-Q reinforcement learning algorithm.
    
    Parameters:
        env: The environment (assumed to follow OpenAI Gym-like API).
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability.
        planning_steps: Number of simulated updates per real step.
        episodes: Number of episodes to train.

    Returns:
        Q-table (state-action values) and optimal policy.
    """
    state_size = env.height * env.width
    action_size = env.action_space.n
    states = []
    # Initialize Q-table
    Q = defaultdict(float)
    observed_state = []
    for x, y, _, _, a, b   in recorded_trajectories:
        if ([x, y] not in observed_state): observed_state.append([x,y])
        if ([a, b] not in observed_state): observed_state.append([a,b])
    recorded_trajectories = recorded_trajectories
    # Model: Dictionary storing transitions {(state, action): (reward, next_state)}
    transition_model = transition_function()
    reward_model = reward_function()

    # Training loop
    for episode in range(episodes):
        observation, info  = env.reset()
        state = tuple(observation.get("agent").get("pos"))
        if(state not in observed_state): observed_state.append(state)
        action = eps_greedy(Q, state, action_size)
        new_observation, reward, terminated, _, info = env.step(action)
        new_state = tuple(new_observation["agent"]["pos"])
        Q[state, action] = q_learning(state, action, reward, new_state, Q, alpha, gamma, action_size) 
        if(new_state not in observed_state): observed_state.append(new_state)
        state=new_state

        for imaginary_step in range(100):
            print(observed_state)
            imag_state = observed_state[np.random.choice(len(observed_state))]
            imag_action = np.random.choice(action_size)
            imag_next_state = transition_function(imag_state, imag_action)
            imag_reward = reward_function(imag_state, imag_action)
            Q[imag_state, imag_action] += q_learning(imag_state, imag_action, imag_reward, imag_next_state, Q=Q, alpha=alpha, action_size=action_size, gamma=gamma)

    # Derive optimal policy from Q-table
    optimal_policy = np.argmax(Q, axis=1)
    return Q, optimal_policy

dyna_q(env=env, recorded_trajectories=trajectories)

TypeError: transition_function() missing 1 required positional argument: 'trajectories'