In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from SystemDegradationEnv import MDPEnvironment
import time
import numpy as np
import json

In [None]:
# Instantiate the environment (state_size=15, action_size=8) and write it to
# "mdp_environment.json" so a later cell can re-create it via `load_from`.
env = MDPEnvironment(action_size=8, state_size=15)
env.save_environment("mdp_environment.json")
print("✅ Environment gespeichert als 'mdp_environment.json'")

In [None]:
class CustomPPOAgent(PPO):
    """PPO subclass sketching an undiscounted, reference-relative advantage.

    The intended advantage is ``A_t = r_t + V(s_{t+1}) - V(s_ref)`` (no
    discount factor).

    ``learn()`` keeps working on this class because ``train()`` falls back to
    the stock PPO update when it is called without arguments. In that case
    the modified advantage is NOT applied; training is plain PPO.

    NOTE(review): the explicit ``train(env, total_timesteps)`` loop below
    depends on ``policy.value_function``, ``env.state_0`` and a ``values``
    keyword on ``learn()``, none of which is defined in this notebook.
    Treat that path as an unfinished prototype until those are confirmed.
    """

    def compute_advantage(self, rewards, values, reference_value):
        """Return ``rewards[t] + values[t + 1] - reference_value`` for every step.

        Args:
            rewards: Per-step rewards (array-like, or a scalar for one step).
            values: Value estimates indexed so that ``values[t + 1]`` is the
                value of the state reached after step ``t``; it therefore
                needs at least ``len(rewards) + 1`` entries.
            reference_value: Value of the reference state, subtracted from
                every step.

        Returns:
            ``np.ndarray`` with one advantage per entry in ``rewards``.

        Raises:
            IndexError: If ``values`` is too short to supply ``values[t + 1]``
                for every step.
        """
        rewards = np.atleast_1d(np.asarray(rewards))
        n_steps = len(rewards)
        if n_steps == 0:
            return np.array([])

        values = np.asarray(values)
        if values.ndim == 0 or len(values) < n_steps + 1:
            raise IndexError(
                f"values must contain at least len(rewards) + 1 = {n_steps + 1} "
                "entries so that values[t + 1] exists for every step"
            )

        # No discounting: the successor value enters with weight 1.
        return rewards + values[1:n_steps + 1] - reference_value

    def train(self, env=None, total_timesteps=None):
        """Run a PPO update, or the experimental per-step loop.

        Called without arguments (this is how ``learn()`` invokes it in
        Stable-Baselines3), it performs the standard PPO update on the
        current rollout buffer.

        Called with ``env`` and ``total_timesteps``, it runs the original
        per-step prototype loop.

        Args:
            env: Environment to step through in the experimental loop.
            total_timesteps: Number of environment steps for that loop.

        Raises:
            TypeError: If only one of ``env`` / ``total_timesteps`` is given.
        """
        if env is None and total_timesteps is None:
            # Overriding train() with required parameters would make every
            # learn() call fail, so delegate to the stock implementation.
            return super().train()
        if env is None or total_timesteps is None:
            raise TypeError("train() needs both 'env' and 'total_timesteps', or neither")

        obs = env.reset()
        for _ in range(total_timesteps):
            action, _ = self.predict(obs)
            next_obs, reward, done, _ = env.step(action)

            # Compute modified advantage.
            # NOTE(review): `policy.value_function` and `env.state_0` are not
            # defined in this notebook; SB3's documented value accessor is
            # `policy.predict_values(...)` - confirm which one is intended.
            reference_value = self.policy.value_function(env.state_0)  # reference value
            # NOTE(review): compute_advantage reads values[t + 1], so it needs
            # the value of `next_obs` as well; only V(obs) is passed here.
            advantage = self.compute_advantage(reward, self.policy.value_function(obs), reference_value)

            # Train with modified loss.
            # NOTE(review): upstream `learn()` does not document a `values`
            # keyword - verify; this call presumably raises TypeError.
            self.learn(total_timesteps=1, values=advantage)

            obs = next_obs if not done else env.reset()

In [None]:
# Load your custom environment
env = MDPEnvironment(load_from="mdp_environment.json")

# Vectorized environment (recommended for PPO). The factory returns the one
# `env` instance created above, so this is only valid with n_envs=1.
vec_env = make_vec_env(lambda: env, n_envs=1)

# Create PPO model
model = PPO("MlpPolicy", vec_env, verbose=0, learning_rate=3e-4)

# Train the PPO agent and measure wall-clock duration.
start_time = time.time()
model.learn(total_timesteps=10000)
end_time = time.time()
# Subtract exactly once: previously the elapsed time was stored in `end_time`
# and `start_time` was subtracted a second time in the print statement.
training_seconds = end_time - start_time
print(f"Dauer des Trainings {training_seconds}")

# Save the trained model
model.save("ppo_maintenance")

In [None]:
def extract_policy(model, env):
    """Build a state -> action table by querying the model for every state.

    Args:
        model: Object exposing ``predict(observation, deterministic=True)``
            that returns ``(action, state)``.
        env: Environment whose ``observation_space.n`` gives the number of
            discrete states.

    Returns:
        dict mapping each state index (``int``) to the chosen action. A
        single-component action is stored as a plain Python scalar, a
        multi-component action as a list, so the table can be passed to
        ``json.dump`` without NumPy serialization errors.
    """
    policy = {}
    for state in range(env.observation_space.n):  # iterate over every possible state
        action, _ = model.predict(np.array([state]), deterministic=True)
        # `predict` hands back a NumPy array; unwrap it to built-in types.
        flat_action = np.asarray(action).reshape(-1)
        if flat_action.size == 1:
            policy[state] = flat_action[0].item()
        else:
            policy[state] = flat_action.tolist()
    return policy

In [None]:
# Restore the trained model from disk and tabulate its action for each state.
model = PPO.load("ppo_maintenance")
policy = extract_policy(model, env)

print("Optimale Politik:")
for state in policy:
    action = policy[state]
    print(f"π({state}) = {action}")

In [None]:


def _json_default(value):
    """Convert NumPy scalars/arrays, which ``json`` cannot encode on its own."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.item() if value.size == 1 else value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Persist the policy table; `default` handles actions that are still NumPy values.
with open("ppo_policy.json", "w") as file:
    json.dump(policy, file, indent=4, default=_json_default)

with open("ppo_policy.json", "r") as file:
    # JSON object keys are always strings, so convert them back to the
    # integer state indices used as keys in `policy`.
    loaded_policy = {int(state): action for state, action in json.load(file).items()}
print(loaded_policy)

In [None]:
def _reset_observation(reset_result):
    """Return the observation from ``reset()``, accepting ``obs`` or ``(obs, info)``."""
    if (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    ):
        return reset_result[0]
    return reset_result


def _split_step(step_result):
    """Normalize a 4- or 5-element ``step()`` result to ``(obs, reward, done)``."""
    if len(step_result) == 5:
        observation, reward, terminated, truncated, _ = step_result
        return observation, reward, terminated or truncated
    observation, reward, done, _ = step_result
    return observation, reward, done


def _state_key(state):
    """Return the first element of ``state`` as a plain Python scalar."""
    return np.asarray(state).reshape(-1)[0].item()


def evaluate_policy(policy, env, episodes=100):
    """Roll out a tabular policy and report the mean episode reward.

    Args:
        policy: dict mapping a state to an action. States missing from the
            table fall back to action ``0``. String keys (as produced by a
            JSON round trip) are also recognized.
        env: Environment with ``reset()`` and ``step(action)``. Both the
            ``obs`` / 4-tuple and the ``(obs, info)`` / 5-tuple conventions
            are accepted.
        episodes: Number of episodes to run.

    Returns:
        Mean of the per-episode reward sums (also printed).
    """
    total_rewards = []

    for _ in range(episodes):
        state = _reset_observation(env.reset())
        done = False
        episode_reward = 0

        while not done:
            key = _state_key(state)
            # Unknown state -> default action 0; try the string form of the
            # key as well so JSON-reloaded tables are not silently ignored.
            action = policy.get(key, policy.get(str(key), 0))
            state, reward, done = _split_step(env.step(action))
            episode_reward += reward

        total_rewards.append(episode_reward)

    avg_reward = np.mean(total_rewards)
    print(f"Durchschnittliche Belohnung mit Politik: {avg_reward:.2f}€")
    return avg_reward

evaluate_policy(policy, env, episodes=100)