# In [1]:
import gym
from stable_baselines3 import PPO
import numpy as np
from pyswarm import pso
from stable_baselines3.common.evaluation import evaluate_policy
import gym
from gym import spaces
import numpy as np
from gym.envs.registration import register
from pyenergyplus.api import EnergyPlusAPI

class EnergyPlusEnv(gym.Env):
    """Placeholder Gym environment for HVAC control via EnergyPlus.

    The environment exposes a 4-way discrete action space and a 4-element
    float32 observation vector. ``step`` and ``reset`` currently return
    hard-coded placeholder values; no EnergyPlus call is made yet.

    NOTE: ``step`` always returns ``done=False``, so an episode never ends on
    its own. Callers that need finite episodes must impose a step limit.
    """

    def __init__(self):
        """Define the action/observation spaces and set the initial state."""
        super().__init__()

        # Four discrete HVAC settings. Intended labels (from the original
        # author's note): low, mid, high, off -- the index-to-setting mapping
        # is not implemented yet.
        self.action_space = spaces.Discrete(4)

        # Observation vector, per the original author's note:
        # [indoor temp, outdoor temp, energy consumption, occupancy status].
        # Bounds are built as float32 so they match the declared dtype exactly.
        self.observation_space = spaces.Box(
            low=np.array([0, -30, 0, 0], dtype=np.float32),
            high=np.array([50, 50, 100, 1], dtype=np.float32),
            dtype=np.float32,
        )

        # Initialize state
        self.state = None
        self.reset()

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: Index into ``action_space``. Currently ignored by the
                placeholder implementation.

        Returns:
            A 4-tuple ``(observation, reward, done, info)``. The observation
            is a fixed float32 array, the reward is always ``-1``, ``done`` is
            always ``False`` and ``info`` is an empty dict.
        """
        # Placeholder observation; replace with an actual call to EnergyPlus.
        # dtype must be float32 to match observation_space (a bare np.array of
        # Python ints would default to an integer dtype).
        new_state = np.array([22, 10, 30, 1], dtype=np.float32)
        reward = -1  # Replace with actual reward calculation
        done = False  # Replace with actual done condition

        self.state = new_state
        return new_state, reward, done, {}

    def reset(self):
        """Reset to the placeholder initial state and return it.

        Returns:
            The initial observation as a float32 array.
        """
        # Placeholder initial observation; replace with actual reset logic.
        initial_state = np.array([20, 10, 0, 1], dtype=np.float32)
        self.state = initial_state
        return initial_state

    def render(self, mode='human'):
        """Render the environment (optional; currently a no-op)."""
        pass

    def close(self):
        """Close the environment (optional; currently a no-op)."""
        pass

# Register the environment with Gym.
# EnergyPlusEnv.step() never reports done=True, so without a step cap the
# evaluate_policy() call below would wait forever for an episode to finish.
# Passing max_episode_steps makes gym.make() wrap the env in a TimeLimit that
# ends every episode after that many steps.
MAX_EPISODE_STEPS = 200  # placeholder horizon; revisit once a real done condition exists
gym.envs.registration.register(
    id='EnergyPlus-v0',
    entry_point='__main__:EnergyPlusEnv',
    max_episode_steps=MAX_EPISODE_STEPS,
)

# Create an instance of the environment
env = gym.make('EnergyPlus-v0')

# Define the PPO model with an MLP policy
model = PPO('MlpPolicy', env, verbose=1)

# Train the PPO model
model.learn(total_timesteps=10000)

# Save the trained model (the file name is historical; the model is PPO)
model.save("q_learning_model")

# Evaluate the trained model over 10 episodes
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

# Define the fitness function for PSO
def fitness(params):
    """Score one (learning rate, gamma) candidate for the PSO search.

    A fresh PPO agent is built on the module-level ``env`` with the given
    hyperparameters, trained for 5000 timesteps and evaluated over 10
    episodes.

    Args:
        params: Two-element sequence ``(learning_rate, gamma)``.

    Returns:
        The negated mean evaluation reward, since the optimizer minimizes.
    """
    lr, discount = params

    # Build a silent PPO agent for this candidate
    candidate = PPO(
        'MlpPolicy',
        env,
        learning_rate=lr,
        gamma=discount,
        verbose=0,
    )
    candidate.learn(total_timesteps=5000)

    # evaluate_policy returns (mean, std); only the mean is used here
    episode_mean = evaluate_policy(candidate, env, n_eval_episodes=10)[0]

    # Flip the sign so that a higher reward gives a lower objective value
    return -episode_mean

# Search box for the two tuned PPO hyperparameters, ordered as
# [learning rate, gamma] to match what fitness() unpacks.
lb = [1e-5, 0.8]
ub = [1e-2, 0.99]

# Run particle swarm optimization over the box (10 particles, 10 iterations)
best_params, best_fitness = pso(fitness, lb, ub, maxiter=10, swarmsize=10)

# Split the best position into named hyperparameters
optimal_learning_rate, optimal_gamma = best_params

print("Best Parameters Found: Learning Rate = {}, Gamma = {}".format(optimal_learning_rate, optimal_gamma))
print("Best Fitness (Negative Mean Reward):", best_fitness)

# Train a fresh PPO model with the optimized hyperparameters
final_model = PPO(
    'MlpPolicy',
    env,
    gamma=optimal_gamma,
    learning_rate=optimal_learning_rate,
    verbose=1,
)
final_model.learn(total_timesteps=10000)

# Evaluate the final model over 10 episodes and report the result
mean_reward, std_reward = evaluate_policy(final_model, env, n_eval_episodes=10)
print(f"Final Model Mean Reward: {mean_reward} +/- {std_reward}")
