# GAIA-DRL: DDPG Training
This notebook demonstrates how to train the GAIA-DRL agent using synthetic IoT and geospatial data.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
from gym import spaces

# Load the state-vector samples, which include the NDVI component (Vt).
df = pd.read_csv('../data/Vt_samples.csv')
# Preview the first rows (shown as the cell output when run in a notebook).
df.head()

In [None]:
class GAIAEnv(gym.Env):
    """Gym environment that replays the rows of a DataFrame as one episode.

    Every step consumes one row of ``df``. The observation is that row's
    ``[Rt, Et, Lt, It, Vt]`` values and the reward is a fixed weighted sum of
    the same columns. The action is accepted but does not influence the
    reward or the next observation.
    """

    def __init__(self, df):
        """Store the data and declare the action/observation spaces.

        Args:
            df: table with the columns ``Rt``, ``Et``, ``Lt``, ``It`` and
                ``Vt``; one row per environment step.

        Raises:
            ValueError: if ``df`` has no rows (there would be no observation
                to return from ``reset``).
        """
        super().__init__()
        if len(df) == 0:
            raise ValueError('GAIAEnv requires a DataFrame with at least one row')
        self.df = df
        self.max_steps = len(df)
        self.action_space = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(5,), dtype=np.float32)
        self.reset()

    def reset(self):
        """Rewind to the first row and return its observation."""
        self.current_step = 0
        return self._get_obs()

    def _get_obs(self):
        """Return the current row as a float32 vector ``[Rt, Et, Lt, It, Vt]``.

        Once the episode has finished, ``current_step`` equals ``max_steps``,
        which is one past the last row. The index is clamped so the terminal
        observation repeats the last row instead of raising ``IndexError``.
        """
        index = min(self.current_step, self.max_steps - 1)
        row = self.df.iloc[index]
        return np.array([row['Rt'], row['Et'], row['Lt'], row['It'], row['Vt']], dtype=np.float32)

    def step(self, action):
        """Consume the current row and advance by one step.

        Args:
            action: ignored by the reward and the transition.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``done`` becomes
            True once every row has been consumed and ``info`` is an empty
            dict. Call ``reset()`` before stepping again after ``done``.
        """
        row = self.df.iloc[self.current_step]
        reward = 0.25 * row['Et'] + 0.25 * row['Rt'] - 0.2 * row['Lt'] - 0.2 * row['It'] + 0.1 * row['Vt']
        self.current_step += 1
        done = self.current_step >= self.max_steps
        return self._get_obs(), reward, done, {}

In [None]:
# Smoke test: build the environment and print its first observation.
env = GAIAEnv(df)
obs = env.reset()
print('Initial Observation:', obs)

# Take 5 steps with actions sampled from the action space and print each reward.
for _ in range(5):
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)
    print(f'Action: {action}, Reward: {reward}')

## DDPG Implementation
Basic implementation of the Deep Deterministic Policy Gradient (DDPG) algorithm.

In [None]:
from tensorflow.keras import layers

class OUActionNoise:
    """Stateful noise generator following a discretised Ornstein-Uhlenbeck update.

    Each call moves the previous sample towards ``mean`` at rate ``theta`` and
    adds Gaussian noise scaled by ``std_deviation * sqrt(dt)``.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        """Store the process parameters and initialise the state.

        Args:
            mean: array the process reverts to; its shape is the sample shape.
            std_deviation: scale applied to the Gaussian term.
            theta: mean-reversion rate.
            dt: time step of the discretisation.
            x_initial: starting state; zeros shaped like ``mean`` when None.
        """
        self.mean = mean
        self.std_dev = std_deviation
        self.theta = theta
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new sample."""
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        diffusion = self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        # The new sample is kept as the state for the next call.
        self.x_prev = self.x_prev + drift + diffusion
        return self.x_prev

    def reset(self):
        """Restore the state to ``x_initial``, or to zeros if none was given."""
        self.x_prev = np.zeros_like(self.mean) if self.x_initial is None else self.x_initial

In [None]:
class Buffer:
    """Fixed-capacity circular replay buffer of (state, action, reward, next_state).

    Once ``buffer_capacity`` transitions have been recorded, new transitions
    overwrite the oldest ones.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64, num_states=5, num_actions=1):
        """Allocate the storage arrays.

        Args:
            buffer_capacity: maximum number of transitions kept.
            batch_size: number of transitions returned by ``sample``.
            num_states: length of a state vector.
            num_actions: length of an action vector.
        """
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Total number of record() calls; may exceed buffer_capacity.
        self.buffer_counter = 0
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    def record(self, obs_tuple):
        """Store one transition ``(state, action, reward, next_state)``."""
        # Wrap around so the oldest entry is overwritten once the buffer is full.
        index = self.buffer_counter % self.buffer_capacity
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.buffer_counter += 1

    def sample(self):
        """Draw a random mini-batch of recorded transitions.

        Indices are drawn uniformly with replacement from the filled part of
        the buffer, so a batch can be produced even when fewer than
        ``batch_size`` transitions have been recorded.

        Returns:
            Tuple ``(states, actions, rewards, next_states)`` of float32
            arrays, each with ``batch_size`` rows.

        Raises:
            ValueError: if nothing has been recorded yet.
        """
        filled = min(self.buffer_counter, self.buffer_capacity)
        if filled == 0:
            raise ValueError('cannot sample from an empty replay buffer')
        batch_indices = np.random.choice(filled, self.batch_size)
        # float32 so the batches can be mixed with float32 network outputs.
        return (
            self.state_buffer[batch_indices].astype(np.float32),
            self.action_buffer[batch_indices].astype(np.float32),
            self.reward_buffer[batch_indices].astype(np.float32),
            self.next_state_buffer[batch_indices].astype(np.float32),
        )

In [None]:
def get_actor():
    """Build the actor network: a 5-value state in, one action in (0, 1) out.

    Two ReLU hidden layers of 64 units feed a single sigmoid output unit.
    """
    state_in = layers.Input(shape=(5,))
    hidden = state_in
    for _ in range(2):
        hidden = layers.Dense(64, activation='relu')(hidden)
    # Sigmoid keeps the action inside the open interval (0, 1).
    action_out = layers.Dense(1, activation='sigmoid')(hidden)
    return tf.keras.Model(state_in, action_out)

def get_critic():
    """Build the critic network: (state, action) in, one unbounded value out.

    The 5-value state and the 1-value action are concatenated, passed through
    two ReLU hidden layers of 64 units, then a single linear output unit.
    """
    state_in = layers.Input(shape=(5,))
    action_in = layers.Input(shape=(1,))
    hidden = layers.Concatenate()([state_in, action_in])
    for _ in range(2):
        hidden = layers.Dense(64, activation='relu')(hidden)
    # No activation: the value estimate is not restricted to a range.
    q_value = layers.Dense(1)(hidden)
    return tf.keras.Model([state_in, action_in], q_value)

## Training Loop for GAIA-DRL

In [None]:
# Exploration noise: zero-mean process with a single component, scale std_dev.
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

# Online networks, trained directly by the gradient updates.
actor_model = get_actor()
critic_model = get_critic()

# Target networks, used to compute the critic's training target.
target_actor = get_actor()
target_critic = get_critic()

# Start the target networks as exact copies of the online networks.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

In [None]:
# Learning rates for the two Adam optimizers (the critic's is twice the actor's).
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

# Replay buffer: capacity 50000 transitions, mini-batches of 64.
buffer = Buffer(50000, 64)

In [None]:
@tf.function
def update(state_batch, action_batch, reward_batch, next_state_batch):
    """Run one gradient step on the critic and then one on the actor.

    Args:
        state_batch: states the transitions started from.
        action_batch: actions taken in those states.
        reward_batch: rewards received.
        next_state_batch: states the transitions ended in.

    The critic is regressed onto ``reward + 0.99 * target_critic(next_state,
    target_actor(next_state))``; the actor is then updated to maximise the
    critic's value of its own actions.
    """
    # Critic step: minimise the mean squared error against the target value.
    with tf.GradientTape() as critic_tape:
        next_actions = target_actor(next_state_batch, training=True)
        next_q = target_critic([next_state_batch, next_actions], training=True)
        td_target = reward_batch + 0.99 * next_q
        q_pred = critic_model([state_batch, action_batch], training=True)
        critic_loss = tf.reduce_mean(tf.square(td_target - q_pred))
    critic_vars = critic_model.trainable_variables
    critic_grads = critic_tape.gradient(critic_loss, critic_vars)
    critic_optimizer.apply_gradients(zip(critic_grads, critic_vars))

    # Actor step: the loss is the negated mean critic value, so minimising it
    # raises the value of the actions the actor proposes.
    with tf.GradientTape() as actor_tape:
        proposed_actions = actor_model(state_batch, training=True)
        proposed_q = critic_model([state_batch, proposed_actions], training=True)
        actor_loss = -tf.reduce_mean(proposed_q)
    actor_vars = actor_model.trainable_variables
    actor_grads = actor_tape.gradient(actor_loss, actor_vars)
    actor_optimizer.apply_gradients(zip(actor_grads, actor_vars))

In [None]:
# Train for a fixed number of episodes, one pass over the data per episode.
env = GAIAEnv(df)
epochs = 50
# Soft-update rate: fraction of the online weights blended into the target
# networks after every gradient step.
tau = 0.005
all_rewards = []

for ep in range(epochs):
    prev_state = env.reset()
    episodic_reward = 0
    for _ in range(env.max_steps):
        # The actor expects a batch dimension, so wrap the single state.
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = actor_model(tf_prev_state)
        action = action.numpy()[0] + ou_noise()
        # Exploration noise can push the action outside the action space;
        # clip it back to the declared bounds before using it.
        action = np.clip(action, env.action_space.low, env.action_space.high)
        state, reward, done, _ = env.step(action)
        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward
        prev_state = state

        # Only learn once more than one batch of transitions is stored.
        if buffer.buffer_counter > buffer.batch_size:
            states, actions, rewards, next_states = buffer.sample()
            update(states, actions, rewards, next_states)
            # Let the target networks slowly track the online networks;
            # without this they would stay at their initial weights forever.
            for target_net, online_net in (
                (target_actor, actor_model),
                (target_critic, critic_model),
            ):
                for target_var, online_var in zip(target_net.variables, online_net.variables):
                    target_var.assign(tau * online_var + (1.0 - tau) * target_var)

        # Stop stepping as soon as the environment reports the episode is over.
        if done:
            break

    all_rewards.append(episodic_reward)
    print(f'Episode {ep+1}, Reward: {episodic_reward:.2f}')

In [None]:
# Learning curve: total reward collected in each training episode.
plt.plot(all_rewards)
plt.xlabel('Episódio')
plt.ylabel('Recompensa acumulada')
plt.title('Recompensa total por episódio')
plt.grid(True)
plt.show()

## Finalização: Salvando resultados e preparando para análise comparativa

In [None]:
# Save the per-episode rewards to a CSV file (episodes numbered from 1).
episode_numbers = list(range(1, len(all_rewards) + 1))
rewards_df = pd.DataFrame({'episode': episode_numbers, 'reward': all_rewards})
rewards_path = '../data/rewards_gaia_drl.csv'
rewards_df.to_csv(rewards_path, index=False)
print("Recompensas salvas em '../data/rewards_gaia_drl.csv'")

In [None]:
# Sugestão para comparação futura com baseline (exemplo)
# baseline_rewards = [100, 110, 95, ...]  # valores fixos simulados
# plt.plot(baseline_rewards, label='Baseline')
# plt.plot(all_rewards, label='GAIA-DRL')
# plt.legend()
# plt.title('Comparação entre GAIA-DRL e estratégia estática')
# plt.xlabel('Episódio')
# plt.ylabel('Recompensa')
# plt.grid(True)
# plt.show()