In [321]:
import gymnasium as gym
import skyscraper
from sklearn.neural_network import MLPRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# ex5


In [322]:
trajectories = np.loadtxt("powered_flight.txt", dtype=int)  # Reads as integers
print(trajectories)


[[18 17  1  0 17 13]
 [ 4 61  1  0  4 57]
 [12 20  2  0 11 23]
 ...
 [11 19  1  0 12 16]
 [ 9  8  1  0 10  5]
 [ 5 27  2  0  4 30]]


In [323]:
#Transistion function

def transition_function(trajectories):
    transition_dict = {}

    for step in trajectories:
        state = (step[0], step[1])  # Current state (assuming 2D state)
        action = step[2]  # Action taken
        new_state = (step[4], step[5])  # Next state (assuming 2D state)

        # Store multiple transitions per (state, action) pair
        if (state, action) not in transition_dict:
            transition_dict[(state, action)] = []
        
        transition_dict[(state, action)].append(new_state)

    return transition_dict

print(transition_function(trajectories))

{((np.int64(18), np.int64(17)), np.int64(1)): [(np.int64(17), np.int64(13))], ((np.int64(4), np.int64(61)), np.int64(1)): [(np.int64(4), np.int64(57))], ((np.int64(12), np.int64(20)), np.int64(2)): [(np.int64(11), np.int64(23))], ((np.int64(17), np.int64(10)), np.int64(2)): [(np.int64(19), np.int64(12))], ((np.int64(8), np.int64(15)), np.int64(1)): [(np.int64(9), np.int64(12))], ((np.int64(3), np.int64(38)), np.int64(1)): [(np.int64(3), np.int64(35))], ((np.int64(18), np.int64(17)), np.int64(2)): [(np.int64(17), np.int64(21))], ((np.int64(5), np.int64(56)), np.int64(1)): [(np.int64(5), np.int64(52))], ((np.int64(4), np.int64(40)), np.int64(2)): [(np.int64(2), np.int64(43))], ((np.int64(3), np.int64(59)), np.int64(2)): [(np.int64(3), np.int64(64))], ((np.int64(3), np.int64(57)), np.int64(2)): [(np.int64(4), np.int64(61))], ((np.int64(2), np.int64(54)), np.int64(1)): [(np.int64(3), np.int64(52))], ((np.int64(10), np.int64(14)), np.int64(2)): [(np.int64(11), np.int64(17))], ((np.int64(19)

In [324]:
def reward_function(trajecotries):
    reward_dict = {}

    for step in trajectories:
        state = (step[0], step[1])  # Current state (assuming 2D state)
        action = step[2]  # Action taken
        reward = step[3]

        # Store multiple transitions per (state, action) pair
        if (state, action) not in reward_dict:
            reward_dict[(state, action)] = []
        
        reward_dict[(state, action)].append(reward)

    return reward_dict

print(reward_function(trajectories))

{((np.int64(18), np.int64(17)), np.int64(1)): [np.int64(0)], ((np.int64(4), np.int64(61)), np.int64(1)): [np.int64(0)], ((np.int64(12), np.int64(20)), np.int64(2)): [np.int64(0)], ((np.int64(17), np.int64(10)), np.int64(2)): [np.int64(0)], ((np.int64(8), np.int64(15)), np.int64(1)): [np.int64(0)], ((np.int64(3), np.int64(38)), np.int64(1)): [np.int64(0)], ((np.int64(18), np.int64(17)), np.int64(2)): [np.int64(0)], ((np.int64(5), np.int64(56)), np.int64(1)): [np.int64(0)], ((np.int64(4), np.int64(40)), np.int64(2)): [np.int64(0)], ((np.int64(3), np.int64(59)), np.int64(2)): [np.int64(0)], ((np.int64(3), np.int64(57)), np.int64(2)): [np.int64(0)], ((np.int64(2), np.int64(54)), np.int64(1)): [np.int64(0)], ((np.int64(10), np.int64(14)), np.int64(2)): [np.int64(0)], ((np.int64(19), np.int64(58)), np.int64(2)): [np.int64(0)], ((np.int64(2), np.int64(56)), np.int64(1)): [np.int64(0)], ((np.int64(8), np.int64(11)), np.int64(2)): [np.int64(0)], ((np.int64(4), np.int64(54)), np.int64(2)): [np.i

In [325]:
def create_model(trajectories):
    X = trajectories[:, :3]  # (state1, state2, action)
    y = trajectories[:, 3:]  # (new_state1, new_state2)

    print(y)
    
    # Ensure input is numerical
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y)

    # Train model
    nn = MLPRegressor(hidden_layer_sizes=(64, 64), activation='relu', solver='adam', max_iter=5000)
    nn.fit(X_scaled, y_scaled)

    return nn, scaler_X, scaler_y

# Create model
model, scaler_X, scaler_y = create_model(trajectories)

# Prepare input for prediction
test_input = np.array([[4, 61, 1]])  # Flattened input (state1, state2, action)
test_input_scaled = scaler_X.transform(test_input)  # Scale input

# Make prediction
prediction_scaled = model.predict(test_input_scaled)  # Predict in scaled space
prediction = scaler_y.inverse_transform(prediction_scaled)  # Convert back

# Round to 2 decimal places
prediction = np.round(prediction, decimals=2)

print(prediction)  # Predicted new state with fewer decimals



[[ 0 17 13]
 [ 0  4 57]
 [ 0 11 23]
 ...
 [ 0 12 16]
 [ 0 10  5]
 [ 0  4 30]]
[[ 0.    4.31 57.93]]


In [326]:
env = gym.make('skyscraper/GridWorld-v0')

In [327]:
def q_learning(s, a, r, new_s, ):
   ...

In [328]:
def Dyna(env, model, policy, beta, gamma, n, w):
    model = model
    w = w
    for k in range(1, 100):
        #take action
        #collect reward and new state
        #w(k+1) = QLearning(s, a, r, s´, ...)
        #model(k+1)=update_model(s, a, r, s´,...)

        for m in range(1, 100):
            #select random a and s
            r, new_s = model.predict(state, a)
            #w(k+1) = QLearning(s, a, r, s´, ...)


    optimal_policy = ...
    return optimal_policy

In [329]:
skyline = np.loadtxt("skyscraper/envs/skyline.txt", dtype=int)  # Reads as integers
array_skyline = []
for i, line in enumerate(skyline):
    for j, sky in enumerate(line):
        if(sky==1):
            array_skyline.append((i, j))

print(array_skyline)

[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (0, 10), (0, 11), (0, 12), (0, 13), (0, 14), (0, 15), (0, 16), (0, 17), (0, 18), (0, 19), (0, 20), (0, 21), (0, 22), (0, 23), (0, 24), (0, 25), (0, 26), (0, 27), (0, 28), (0, 29), (0, 30), (0, 31), (0, 32), (0, 33), (0, 34), (0, 35), (0, 36), (0, 37), (0, 38), (0, 39), (0, 40), (0, 41), (0, 42), (0, 43), (0, 44), (0, 45), (0, 46), (0, 47), (0, 48), (0, 49), (0, 50), (0, 51), (0, 52), (0, 53), (0, 54), (0, 55), (0, 56), (0, 57), (0, 58), (0, 59), (0, 60), (0, 61), (0, 62), (0, 63), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (1, 17), (1, 18), (1, 19), (1, 20), (1, 21), (1, 22), (1, 23), (1, 24), (1, 25), (1, 26), (1, 27), (1, 28), (1, 29), (1, 30), (1, 31), (1, 32), (1, 33), (1, 34), (1, 35), (1, 36), (1, 37), (1, 38), (1, 39), (1, 40), (1, 41), (1, 42), (1, 43), (1, 44), (1, 45), (1, 46), (1, 47), (1, 48), (1

In [330]:
observation, info = env.reset()
print(observation)
env.step(0)

{'agent': {'pos': array([13,  5])}}


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


({'agent': {'pos': array([13,  5])}},
 0,
 False,
 False,
 {'distance': np.int64(49), 'steps': 1})

In [331]:
env.step(1)

({'agent': {'pos': array([14.,  8.])}},
 0,
 False,
 False,
 {'distance': np.float64(46.0), 'steps': 2})

In [332]:
env.step(0)

({'agent': {'pos': array([13,  5])}},
 0,
 False,
 False,
 {'distance': np.int64(49), 'steps': 3})

#Chat kode

In [333]:
import random
def choose_action(state, action_size, Q, epsilon):
        """Epsilon-greedy action selection."""
        if np.random.rand() < epsilon:
            return np.random.choice(action_size)  # Explore
        return np.argmax(Q[state, i] for i in range(action_size))  # Exploit

def learn(state, action, reward, next_state, Q, alpha, gamma, action_size):
    """Q-learning update rule."""
    best_next_action = np.argmax(Q[next_state, i] for i in range(action_size))
    Q[state, action] += alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])

def update_model(state, action, reward, next_state, model):
    """Store the transition in the model."""
    model[(state, action)] = (reward, next_state)

def planning(model, states, planning_steps):
    """Simulate experience using the model."""
    for _ in range(planning_steps):

        # Sample a random (state, action) pair from the model
        (s, a) = random.choice(states)
        r, s_next = model.predict(s, a)

        # Update Q-table using the simulated experience
        learn(s, a, r, s_next)

In [335]:
def dyna_q(env, model, alpha=0.1, gamma=0.95, epsilon=0.1, planning_steps=10, episodes=500):
    """
    Implements the Dyna-Q reinforcement learning algorithm.
    
    Parameters:
        env: The environment (assumed to follow OpenAI Gym-like API).
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability.
        planning_steps: Number of simulated updates per real step.
        episodes: Number of episodes to train.

    Returns:
        Q-table (state-action values) and optimal policy.
    """
    state_size = env.observation_space.get("agent").get("pos").high
    action_size = env.action_space.n
    states = array_skyline

    # Initialize Q-table
    Q = defaultdict(float)
    print(len(Q))

    # Model: Dictionary storing transitions {(state, action): (reward, next_state)}
    model = model

    # Training loop
    for episode in range(episodes):
        observation, info  = env.reset()
        state = tuple(observation.get("agent").get("pos"))
        done = False

        while not done:
            action = choose_action(state, action_size, Q, epsilon)
            observation, reward, done, _, info = env.step(action)
            next_state = tuple(observation.get("agent").get("pos"))
            print(next_state)

            # Real experience update
            learn(state, action, reward, next_state, Q, alpha, gamma, action_size)

            # Update model
            update_model(state, action, reward, next_state, model)

            # Simulated experience (planning)
            planning(model, states, planning_steps)

            state = next_state

    # Derive optimal policy from Q-table
    optimal_policy = np.argmax(Q, axis=1)
    return Q, optimal_policy

dyna_q(env=env, model=create_model(trajectories))

[[ 0 17 13]
 [ 0  4 57]
 [ 0 11 23]
 ...
 [ 0 12 16]
 [ 0 10  5]
 [ 0  4 30]]
0
(np.int64(13), np.int64(5))


TypeError: 'tuple' object does not support item assignment