In [21]:
import gymnasium as gym
import skyscraper
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
import random
import itertools

# ex5


In [22]:
# Load the recorded transitions as integers and subtract one from columns
# 0, 1, 4 and 5 (the current and next state coordinates as unpacked by the
# helper functions below); the action and reward columns are left untouched.
# NOTE(review): presumably this converts 1-based coordinates to 0-based — confirm.
trajectories = np.subtract(
    np.loadtxt("powered_flight.txt", dtype=int),
    [1, 1, 0, 0, 1, 1],
)


In [None]:
# Transition function

def transition_function(trajectories):
    """Group the observed next states by (state, action) pair.

    Parameters:
    - trajectories: iterable of rows indexed as
      (x, y, action, reward, next_x, next_y).

    Returns:
    - dict mapping ((x, y), action) to the list of (next_x, next_y) tuples
      seen for that pair, in the order they occur (duplicates kept).
    """
    successors = {}

    for row in trajectories:
        key = ((row[0], row[1]), row[2])
        # setdefault creates the list the first time a pair is encountered.
        successors.setdefault(key, []).append((row[4], row[5]))

    return successors

# Print the full ((x, y), action) -> [next states] mapping built from the recorded data.
print(transition_function(trajectories))

In [None]:
def reward_function(trajecotries):
    """Group the observed rewards by (state, action) pair.

    Parameters:
    - trajecotries: iterable of rows indexed as
      (x, y, action, reward, next_x, next_y).
      NOTE: the parameter name is misspelled; it is kept as-is so existing
      keyword callers keep working.

    Returns:
    - dict mapping ((x, y), action) to the list of rewards seen for that
      pair, in the order they occur (duplicates kept).
    """
    reward_dict = {}

    # Iterate over the argument itself. The previous version looped over the
    # module-level `trajectories` and silently ignored whatever was passed in.
    for step in trajecotries:
        state = (step[0], step[1])  # Current state (assuming 2D state)
        action = step[2]  # Action taken
        reward = step[3]

        # Store every observed reward per (state, action) pair
        reward_dict.setdefault((state, action), []).append(reward)

    return reward_dict

# Print the full ((x, y), action) -> [rewards] mapping built from the recorded data.
print(reward_function(trajectories))

In [25]:
# Two independent instances of the same registered environment:
# `env` is the object returned by gym.make, `env2` is a second instance
# with its wrappers stripped via `.unwrapped`.
ENV_ID = 'skyscraper/GridWorld-v0'
env = gym.make(ENV_ID)
env2 = gym.make(ENV_ID).unwrapped

In [None]:
# Smoke test: reset the environment, take a single step with action 1,
# and print the observation before and after the step.
obs, info = env.reset()
print(obs)
obs, _, _, _, _=env.step(1)
print(obs)

# Chat code (presumably: the cells below were produced with a chat assistant — TODO confirm)

In [27]:
class TransitionModel(nn.Module):
    """Next-state classifier: maps (x, y, action) to one score per grid cell.

    The output has one entry per cell of the grid; `label[i]` is the (x, y)
    cell that output index `i` stands for. Cells are enumerated with the
    second coordinate varying fastest.

    Parameters:
    - input_dim: number of input features (default 3 for x, y, action).
    - grid_shape: (size of first coordinate, size of second coordinate).
      Defaults to (32, 64), the grid that was previously hard-coded.
    """

    def __init__(self, input_dim=3, grid_shape=(32, 64)):  # (x, y, a) → (x', y')
        super().__init__()
        first_size, second_size = grid_shape
        # Output index -> (x, y) cell. Built without shadowing the builtin `map`.
        self.label = list(itertools.product(range(first_size), range(second_size)))
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            # Sized from the label list so the two can never get out of sync.
            nn.Linear(64, len(self.label))
        )

    def forward(self, state_action):
        """Return the raw per-cell scores for the given (x, y, action) tensor."""
        return self.fc(state_action)

class RewardModel(nn.Module):
    """Small feed-forward regressor from (x, y, action) to a reward estimate.

    Parameters:
    - input_dim: number of input features (default 3).
    - output_dim: number of output values (default 1).
    """

    def __init__(self, input_dim=3, output_dim=1):  # (x, y, a) → r
        super().__init__()
        hidden_width = 64
        layers = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, state_action):
        """Run the input through the network and return its output."""
        reward_estimate = self.fc(state_action)
        return reward_estimate



In [28]:
def train_model_from_data(recorded_trajectories, transition_model, reward_model, epochs=30, verbose=True):
    """
    Train the transition and reward models using recorded trajectories.

    Both models are updated one sample at a time (one optimizer step per
    sample per epoch) with Adam at a learning rate of 0.001. The transition
    model is trained as a classifier with cross-entropy against the index of
    the observed next state in `transition_model.label`; the reward model is
    trained with mean squared error.

    Parameters:
    - recorded_trajectories: Iterable of (x, y, a, r, x´, y´) rows. May be empty.
    - transition_model: Neural network for predicting next state; must expose
      a `label` list mapping output index -> (x, y).
    - reward_model: Neural network for predicting reward.
    - epochs: Number of passes over `recorded_trajectories`.
    - verbose: When True (default), print the last sample's losses every
      10th epoch and a completion message.

    Raises:
    - ValueError: if a next state is not present in `transition_model.label`.
    """
    loss_fn = nn.MSELoss()  # Mean Squared Error for the reward model
    optimizer_T = optim.Adam(transition_model.parameters(), lr=0.001)
    optimizer_R = optim.Adam(reward_model.parameters(), lr=0.001)

    # predict_model() switches both networks to eval mode; switch back before fitting.
    transition_model.train()
    reward_model.train()

    # Build the reverse lookup once so the per-sample class index is O(1)
    # instead of a linear scan over every grid cell.
    class_index = {cell: idx for idx, cell in enumerate(transition_model.label)}

    for epoch in range(epochs):
        # Reset so an empty data set cannot hit an unbound name when logging.
        loss_T = loss_R = None

        for state_x, state_y, action, reward, next_state_x, next_state_y in recorded_trajectories:
            next_cell = (next_state_x, next_state_y)
            try:
                target_index = class_index[next_cell]
            except (KeyError, TypeError):
                # Fall back to the equality-based list search for unhashable or
                # identity-hashed coordinate types; raises ValueError if absent.
                target_index = transition_model.label.index(next_cell)

            # Convert to PyTorch tensors
            next_state = torch.tensor(target_index, dtype=torch.long)
            state_action = torch.tensor((state_x, state_y, action), dtype=torch.float)  # (x, y, a)
            reward_value = torch.tensor(reward, dtype=torch.float)

            # Train transition model
            optimizer_T.zero_grad()
            pred_next_states = transition_model(state_action)
            loss_T = nn.functional.cross_entropy(pred_next_states, next_state)
            loss_T.backward()
            optimizer_T.step()

            # Train reward model
            optimizer_R.zero_grad()
            pred_rewards = reward_model(state_action)
            # Give the target the prediction's shape explicitly rather than
            # relying on MSELoss broadcasting a 0-d target (which warns).
            loss_R = loss_fn(pred_rewards, reward_value.expand_as(pred_rewards))
            loss_R.backward()
            optimizer_R.step()

        if verbose and epoch % 10 == 0 and loss_T is not None:
            print(f"Epoch {epoch}: Loss_T = {loss_T.item()}, Loss_R = {loss_R.item()}")

    if verbose:
        print("Model training completed!")



In [29]:
def eps_greedy(Q, state, action_size, epsilon=0.1):
    """Epsilon-greedy action selection.

    Parameters:
    - Q: table where `Q[state][i]` is the value of action `i` in `state`.
    - state: key/index into `Q` (e.g. an (x, y) tuple for a 3-D array).
    - action_size: number of available actions.
    - epsilon: probability of picking a uniformly random action.

    Returns:
    - index of the chosen action.
    """
    if np.random.rand() < epsilon:
        return np.random.choice(action_size)  # Explore
    # Materialise the values first: np.argmax on a bare generator wraps it in
    # a 0-d object array and always answers 0.
    action_values = [Q[state][i] for i in range(action_size)]
    return int(np.argmax(action_values))  # Exploit

def q_learning(state, action, reward, next_state, Q, alpha, gamma, action_size):
    """Q-learning update rule (updates `Q` in place).

    Applies
        Q[s][a] += alpha * (reward + gamma * max_a' Q[s'][a'] - Q[s][a])
    where the maximum is taken over the first `action_size` actions.

    Parameters:
    - state, next_state: two-element (x, y) sequences indexing `Q`.
    - action: index of the action taken in `state`.
    - reward: scalar reward; a Python/NumPy number or a one-element tensor.
    - Q: table indexed as Q[x][y][action].
    - alpha: learning rate.
    - gamma: discount factor.
    - action_size: number of available actions.
    """
    next_values = Q[next_state[0]][next_state[1]]
    # Take the maximum over real values: np.argmax on a generator always
    # returned 0, so the bootstrap used action 0 regardless of its value.
    best_next_value = max(next_values[i] for i in range(action_size))
    # Coerce so one-element tensors from the learned reward model work too.
    td_target = float(reward) + gamma * best_next_value
    Q[state[0]][state[1]][action] += alpha * (td_target - Q[state[0]][state[1]][action])

def update_model(state, action, reward, next_state, transition_model, reward_model):
    """Fit both models on one observed transition, using two training epochs.

    `state` and `next_state` are two-element (x, y) sequences; they are
    flattened into a single (x, y, action, reward, next_x, next_y) row.
    """
    x, y = state[0], state[1]
    next_x, next_y = next_state[0], next_state[1]
    single_transition = [(x, y, action, reward, next_x, next_y)]
    train_model_from_data(single_transition, transition_model, reward_model, epochs=2)

def predict_model(state, action, transition_model, reward_model):
    """Query the learned models for the outcome of taking `action` in `state`.

    Both models are switched to eval mode (and left there) and evaluated
    without gradient tracking.

    Returns:
    - (reward, new_state): the reward model's raw output tensor, and the
      entry of `transition_model.label` with the highest predicted score.
    """
    with torch.no_grad():
        model_input = torch.tensor((state[0], state[1], action), dtype=torch.float)
        transition_model.eval()
        reward_model.eval()
        cell_scores = transition_model(model_input)
        best_cell = torch.argmax(cell_scores)
        predicted_state = transition_model.label[best_cell]
        predicted_reward = reward_model(model_input)

    return predicted_reward, predicted_state

In [None]:
def _grid_position(observation):
    """Return the agent position stored in `observation` as an (int, int) tuple."""
    pos = observation["agent"]["pos"]
    return (int(pos[0]), int(pos[1]))


def _remember_state(cell, ordered_states, seen_states):
    """Record `cell` once: append to the sampling list and add to the lookup set."""
    if cell not in seen_states:
        seen_states.add(cell)
        ordered_states.append(cell)


def dyna_q(env, recorded_trajectories, alpha=0.1, gamma=0.95, epsilon=0.1, planning_steps=10, episodes=500, goal_state=(14, 54)):
    """
    Implements the Dyna-Q reinforcement learning algorithm with learned
    neural-network transition and reward models.

    The models are first fitted on `recorded_trajectories`. Then, for each
    episode, the agent acts epsilon-greedily in `env`; every real step
    performs one Q-learning update, one incremental model update, and
    `planning_steps` simulated Q-learning updates from randomly chosen
    previously observed states and random actions.

    Parameters:
        env: The environment (gymnasium API). Its unwrapped form must expose
            `height` and `width`; observations must contain ["agent"]["pos"].
        recorded_trajectories: Iterable of (x, y, a, r, x', y') rows used to
            pre-train the models and seed the set of observed states.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability.
        planning_steps: Number of simulated updates per real step.
        episodes: Number of episodes to train.
        goal_state: (x, y) cell that also ends an episode when reached, in
            addition to the environment's terminated/truncated flags. Pass
            None to rely on the environment's flags only.

    Returns:
        Q-table of shape (height, width, n_actions) and the greedy policy
        (argmax over the action axis, shape (height, width)).
    """
    # Read the sizes from the environment that was passed in, not a global.
    base_env = env.unwrapped
    action_size = base_env.action_space.n
    # Initialize Q-table with one slot per available action.
    Q = np.zeros((base_env.height, base_env.width, action_size), dtype='float64')

    # States seen so far: a list for uniform sampling plus a set for O(1)
    # membership. Everything is normalised to tuples of ints so entries from
    # the recordings and from the environment compare equal.
    observed_state = []
    seen_states = set()
    for x, y, _, _, a, b in recorded_trajectories:
        _remember_state((int(x), int(y)), observed_state, seen_states)
        _remember_state((int(a), int(b)), observed_state, seen_states)

    # Model: neural networks predicting next state and reward from (x, y, a).
    transition_model = TransitionModel()
    reward_model = RewardModel()
    train_model_from_data(recorded_trajectories=recorded_trajectories, transition_model=transition_model, reward_model=reward_model)

    if goal_state is not None:
        goal_state = (int(goal_state[0]), int(goal_state[1]))

    # Training loop
    counter = 0
    for _episode in range(episodes):
        # Reset once per episode; resetting on every step would pin the agent
        # to the start state forever.
        observation, _ = env.reset()
        state = _grid_position(observation)
        _remember_state(state, observed_state, seen_states)
        done = False

        while not done:
            counter += 1
            action = eps_greedy(Q, state, action_size, epsilon)
            new_observation, reward, terminated, truncated, _ = env.step(action)
            new_state = _grid_position(new_observation)
            q_learning(state, action, reward, new_state, Q, alpha, gamma, action_size)
            update_model(state, action, reward, new_state, transition_model, reward_model)
            _remember_state(new_state, observed_state, seen_states)
            state = new_state

            if state == goal_state:
                print("done")
                done = True
            elif terminated or truncated:
                done = True

            # Planning: replay imagined transitions from the learned models.
            for imaginary_step in range(planning_steps):
                imag_state = observed_state[np.random.choice(len(observed_state))]
                imag_action = np.random.choice(action_size)
                imag_reward, imag_new_state = predict_model(imag_state, imag_action, transition_model, reward_model)
                # The reward model returns a one-element tensor; pass a float.
                q_learning(imag_state, imag_action, float(imag_reward), imag_new_state, Q=Q, alpha=alpha, action_size=action_size, gamma=gamma)

            # Progress report every 100 real steps.
            if counter % 100 == 0:
                print(counter)

    # Derive the greedy policy from the Q-table (argmax over the action axis).
    optimal_policy = np.argmax(Q, axis=-1)
    return Q, optimal_policy

# Run Dyna-Q on the live environment; the recorded trajectories are used to pre-train the models.
Q, optimal_policy = dyna_q(env=env, recorded_trajectories=trajectories)