In [909]:
import gymnasium as gym
import skyscraper
from sklearn.neural_network import MLPRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
import random

# ex5


In [910]:
trajectories = np.loadtxt("powered_flight.txt", dtype=int)  # Reads as integers
print(trajectories)


[[18 17  1  0 17 13]
 [ 4 61  1  0  4 57]
 [12 20  2  0 11 23]
 ...
 [11 19  1  0 12 16]
 [ 9  8  1  0 10  5]
 [ 5 27  2  0  4 30]]


In [911]:
def sample_batch(trajectories, batch_size):
    """Samples a batch from the recorded trajectories."""
    indices = np.random.choice(len(trajectories), batch_size, replace=False)
    batch = trajectories[indices]  # Select batch using indices
    return batch

In [912]:
#Transistion function

def transition_function(trajectories):
    transition_dict = {}

    for step in trajectories:
        state = (step[0], step[1])  # Current state (assuming 2D state)
        action = step[2]  # Action taken
        new_state = (step[4], step[5])  # Next state (assuming 2D state)

        # Store multiple transitions per (state, action) pair
        if (state, action) not in transition_dict:
            transition_dict[(state, action)] = []
        
        transition_dict[(state, action)].append(new_state)

    return transition_dict

print(transition_function(trajectories))

{((np.int64(18), np.int64(17)), np.int64(1)): [(np.int64(17), np.int64(13))], ((np.int64(4), np.int64(61)), np.int64(1)): [(np.int64(4), np.int64(57))], ((np.int64(12), np.int64(20)), np.int64(2)): [(np.int64(11), np.int64(23))], ((np.int64(17), np.int64(10)), np.int64(2)): [(np.int64(19), np.int64(12))], ((np.int64(8), np.int64(15)), np.int64(1)): [(np.int64(9), np.int64(12))], ((np.int64(3), np.int64(38)), np.int64(1)): [(np.int64(3), np.int64(35))], ((np.int64(18), np.int64(17)), np.int64(2)): [(np.int64(17), np.int64(21))], ((np.int64(5), np.int64(56)), np.int64(1)): [(np.int64(5), np.int64(52))], ((np.int64(4), np.int64(40)), np.int64(2)): [(np.int64(2), np.int64(43))], ((np.int64(3), np.int64(59)), np.int64(2)): [(np.int64(3), np.int64(64))], ((np.int64(3), np.int64(57)), np.int64(2)): [(np.int64(4), np.int64(61))], ((np.int64(2), np.int64(54)), np.int64(1)): [(np.int64(3), np.int64(52))], ((np.int64(10), np.int64(14)), np.int64(2)): [(np.int64(11), np.int64(17))], ((np.int64(19)

In [913]:
def reward_function(trajecotries):
    reward_dict = {}

    for step in trajectories:
        state = (step[0], step[1])  # Current state (assuming 2D state)
        action = step[2]  # Action taken
        reward = step[3]

        # Store multiple transitions per (state, action) pair
        if (state, action) not in reward_dict:
            reward_dict[(state, action)] = []
        
        reward_dict[(state, action)].append(reward)

    return reward_dict

print(reward_function(trajectories))

{((np.int64(18), np.int64(17)), np.int64(1)): [np.int64(0)], ((np.int64(4), np.int64(61)), np.int64(1)): [np.int64(0)], ((np.int64(12), np.int64(20)), np.int64(2)): [np.int64(0)], ((np.int64(17), np.int64(10)), np.int64(2)): [np.int64(0)], ((np.int64(8), np.int64(15)), np.int64(1)): [np.int64(0)], ((np.int64(3), np.int64(38)), np.int64(1)): [np.int64(0)], ((np.int64(18), np.int64(17)), np.int64(2)): [np.int64(0)], ((np.int64(5), np.int64(56)), np.int64(1)): [np.int64(0)], ((np.int64(4), np.int64(40)), np.int64(2)): [np.int64(0)], ((np.int64(3), np.int64(59)), np.int64(2)): [np.int64(0)], ((np.int64(3), np.int64(57)), np.int64(2)): [np.int64(0)], ((np.int64(2), np.int64(54)), np.int64(1)): [np.int64(0)], ((np.int64(10), np.int64(14)), np.int64(2)): [np.int64(0)], ((np.int64(19), np.int64(58)), np.int64(2)): [np.int64(0)], ((np.int64(2), np.int64(56)), np.int64(1)): [np.int64(0)], ((np.int64(8), np.int64(11)), np.int64(2)): [np.int64(0)], ((np.int64(4), np.int64(54)), np.int64(2)): [np.i

In [914]:
def create_model(trajectories):
    X = trajectories[:, :3]  # (state1, state2, action)
    y = trajectories[:, 3:]  # (new_state1, new_state2)

    print(y)
    
    # Ensure input is numerical
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y)

    # Train model
    nn = MLPRegressor(hidden_layer_sizes=(64, 64), activation='relu', solver='adam', max_iter=5000)
    nn.fit(X_scaled, y_scaled)

    return nn, scaler_X, scaler_y

# Create model
model, scaler_X, scaler_y = create_model(trajectories)

# Prepare input for prediction
test_input = np.array([[4, 61, 1]])  # Flattened input (state1, state2, action)
test_input_scaled = scaler_X.transform(test_input)  # Scale input

# Make prediction
prediction_scaled = model.predict(test_input_scaled)  # Predict in scaled space
prediction = scaler_y.inverse_transform(prediction_scaled)  # Convert back

# Round to 2 decimal places
prediction = np.round(prediction, decimals=2)

print(prediction)  # Predicted new state with fewer decimals



[[ 0 17 13]
 [ 0  4 57]
 [ 0 11 23]
 ...
 [ 0 12 16]
 [ 0 10  5]
 [ 0  4 30]]
[[ 0.    4.09 57.82]]


In [915]:
env = gym.make('skyscraper/GridWorld-v0')
env = env.unwrapped 

#Chat kode

In [916]:
class TransitionModel(nn.Module):
    def __init__(self, input_dim=3, output_dim=2):  # (x, y, a) → (x', y')
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, state_action):
        return self.fc(state_action)

class RewardModel(nn.Module):
    def __init__(self, input_dim=3, output_dim=1):  # (x, y, a) → r
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, state_action):
        return self.fc(state_action)



In [917]:
def train_model_from_data(recorded_trajectories, transition_model, reward_model, epochs=1000, batch_size=32):
    """
    Train the transition and reward models using recorded trajectories.
    
    Parameters:
    - recorded_trajectories: List of (s, a, r, s´) tuples.
    - transition_model: Neural network for predicting next state.
    - reward_model: Neural network for predicting reward.
    - epochs: Number of training iterations.
    - batch_size: Number of samples per training step.
    """
    loss_fn = nn.MSELoss()  # Mean Squared Error for both models
    optimizer_T = optim.Adam(transition_model.parameters(), lr=0.001)
    optimizer_R = optim.Adam(reward_model.parameters(), lr=0.001)
    
    for epoch in range(epochs):
        # Sample a batch of experiences
        batch = sample_batch(recorded_trajectories, batch_size)

        # Extract (x, y), a, (x', y'), and r
        states = batch[:, :2]  # (x, y)
        actions = batch[:, 2:3]  # (a) - Keep as column
        rewards = batch[:, 3:4]  # (r)
        next_states = batch[:, 4:]  # (x', y')

        # Convert to PyTorch tensors
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.float32)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)

        state_action = torch.cat((states, actions), dim=1)  # (x, y, a)

        # Train transition model
        optimizer_T.zero_grad()
        pred_next_states = transition_model(state_action)
        loss_T = loss_fn(pred_next_states, next_states)
        loss_T.backward()
        optimizer_T.step()

        # Train reward model
        optimizer_R.zero_grad()
        pred_rewards = reward_model(state_action)
        loss_R = loss_fn(pred_rewards, rewards)
        loss_R.backward()
        optimizer_R.step()

        if epoch % 100 == 0:
            print(f"Epoch {epoch}: Loss_T = {loss_T.item()}, Loss_R = {loss_R.item()}")

    print("Model training completed!")



In [918]:
def eps_greedy(Q, state, action_size, epsilon=0.1):
        """Epsilon-greedy action selection."""
        if np.random.rand() < epsilon:
            return np.random.choice(action_size)  # Explore
        return np.argmax(Q[state, i] for i in range(action_size))  # Exploit

def q_learning(state, action, reward, next_state, Q, alpha, gamma, action_size):
    """Q-learning update rule."""
    best_next_action = np.argmax(Q[next_state, i] for i in range(action_size))
    Q[state, action] += alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])

def update_model(state, action, reward, next_state, transition_model, reward_model, recorded_trajectories):
    #new_trajectory = np.concatenate([state, [action], [reward], next_state]).reshape(1, -1)
    #print(recorded_trajectories.shape)
    #np.append(arr=recorded_trajectories, values=new_trajectory, axis=0)
    #train_model_from_data(trajectories, transition_model, reward_model)
    ...
    

def predict_model(state, action, transition_model, reward_model):
     new_state = transition_model([state, action])
     reward = reward_model(state, action)
     return reward, new_state

In [919]:
def dyna_q(env, recorded_trajectories, alpha=0.1, gamma=0.95, epsilon=0.1, planning_steps=10, episodes=500):
    """
    Implements the Dyna-Q reinforcement learning algorithm.
    
    Parameters:
        env: The environment (assumed to follow OpenAI Gym-like API).
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability.
        planning_steps: Number of simulated updates per real step.
        episodes: Number of episodes to train.

    Returns:
        Q-table (state-action values) and optimal policy.
    """
    state_size = env.height * env.width
    action_size = env.action_space.n
    states = []
    # Initialize Q-table
    Q = np.zeros((env.height, env.width, 2))  
    observed_state = []
    for x, y, _, _, a, b   in recorded_trajectories:
        if ([x, y] not in observed_state): observed_state.append([x,y])
        if ([a, b] not in observed_state): observed_state.append([a,b])
    recorded_trajectories = recorded_trajectories
    # Model: Dictionary storing transitions {(state, action): (reward, next_state)}
    transition_model = TransitionModel()
    reward_model = RewardModel()
    train_model_from_data(recorded_trajectories=recorded_trajectories, transition_model=transition_model, reward_model=reward_model)

    # Training loop
    for episode in range(episodes):
        observation, info  = env.reset()
        state = tuple(observation.get("agent").get("pos"))
        if(state not in observed_state): observed_state.append(state)
        action = eps_greedy(Q, state, action_size)
        new_observation, reward, terminated, _, info = env.step(action)
        new_state = tuple(new_observation["agent"]["pos"])
        q_learning(state, action, reward, new_state, Q, alpha, gamma, action_size) 
        update_model(state, action, reward, new_state, transition_model, reward_model, recorded_trajectories)
        if(new_state not in observed_state): observed_state.append(new_state)
        state=new_state

        for imaginary_step in range(100):
            print(observed_state)
            imag_state = observed_state[np.random.choice(len(observed_state))]
            imag_action = np.random.choice(action_size)
            imag_reward, imag_new_state = predict_model(imag_state, imag_action, transition_model, reward_model)
            Q[imag_state, imag_action] += q_learning(imag_state, imag_action, imag_reward, imag_new_state, Q=Q, alpha=alpha, action_size=action_size, gamma=gamma)

    # Derive optimal policy from Q-table
    optimal_policy = np.argmax(Q, axis=1)
    return Q, optimal_policy

dyna_q(env=env, recorded_trajectories=trajectories)

Epoch 0: Loss_T = 1263.0784912109375, Loss_R = 1.7575722932815552
Epoch 100: Loss_T = 34.2315673828125, Loss_R = 0.0021147404331713915
Epoch 200: Loss_T = 7.097359657287598, Loss_R = 0.0013571748277172446
Epoch 300: Loss_T = 11.244699478149414, Loss_R = 0.000860168132930994
Epoch 400: Loss_T = 13.739879608154297, Loss_R = 0.0009906874038279057
Epoch 500: Loss_T = 11.716620445251465, Loss_R = 0.0008339530322700739
Epoch 600: Loss_T = 7.413710594177246, Loss_R = 0.03441735729575157
Epoch 700: Loss_T = 11.324153900146484, Loss_R = 0.007233027834445238
Epoch 800: Loss_T = 14.089414596557617, Loss_R = 0.0050805117934942245
Epoch 900: Loss_T = 9.779200553894043, Loss_R = 0.0008391179144382477
Model training completed!
[[np.int64(18), np.int64(17)], [np.int64(17), np.int64(13)], [np.int64(4), np.int64(61)], [np.int64(4), np.int64(57)], [np.int64(12), np.int64(20)], [np.int64(11), np.int64(23)], [np.int64(17), np.int64(10)], [np.int64(19), np.int64(12)], [np.int64(8), np.int64(15)], [np.int64(

TypeError: linear(): argument 'input' (position 1) must be Tensor, not list