# **01_exploration.ipynb**

### **Preparamos el entorno**

In [1]:
import numpy as np
import time
import pandas as pd
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim 

### **Creamos el ambiente**

In [2]:
# Sanity check: play one episode with uniformly random actions in a rendered
# window, to confirm the environment works before any training happens.
env = gym.make("CartPole-v1", render_mode="human")
try:
    obs, _ = env.reset()
    done = False

    while not done:
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        # The episode is over as soon as the environment reports either flag.
        done = terminated or truncated

    # Leave the final frame on screen for a moment before tearing down.
    time.sleep(2)
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()

  from pkg_resources import resource_stream, resource_exists


### **Vemos si Tenemos una GPU Disponible**

In [3]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Usando Dispositivo: ", device)

Usando Dispositivo:  cpu


### **Definimos la red neuronal**

In [4]:
class PolicyNetwork(nn.Module):
    """Small feed-forward network: Linear -> ReLU -> Linear.

    Maps inputs with ``obs_size`` features to ``n_actions`` raw scores.
    No softmax is applied here; callers normalise the scores themselves.

    Args:
        obs_size: Number of input features.
        n_actions: Number of output scores.
        hidden_size: Width of the single hidden layer (default 128).
    """

    def __init__(self, obs_size, n_actions, hidden_size=128):
        super().__init__()
        # Layers are created in forward order so seeded initialisation is
        # reproducible and the state-dict keys stay ``net.0.*`` / ``net.2.*``.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw per-action scores for ``x``."""
        scores = self.net(x)
        return scores

In [5]:
# Size the policy from the environment's spaces, then build the network on the
# selected device together with its Adam optimizer.
n_actions = env.action_space.n
obs_size = env.observation_space.shape[0]

policy_net = PolicyNetwork(obs_size, n_actions)
policy_net = policy_net.to(device)
optimizer = optim.Adam(params=policy_net.parameters(), lr=0.01)

### **Función para Generar Datos de los Episodios**

In [6]:
def generate_episode(env, policy_net, device):
    """Play one episode, sampling every action from the policy's softmax output.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated, info)``.
        policy_net: Callable that maps a ``(1, obs_size)`` float32 tensor to
            per-action scores along dimension 1.
        device: Torch device on which the observation tensor is created.

    Returns:
        Tuple ``(total_reward, obs_list, action_list)``: the summed reward,
        the observation seen before each action, and the sampled actions
        (Python ints), in step order.
    """
    obs, _ = env.reset()
    obs_list = []
    action_list = []
    total_reward = 0.0
    done = False

    # Rollouts only collect data, so no autograd graph is needed.
    with torch.no_grad():
        while not done:
            # as_tensor + unsqueeze builds the (1, obs_size) batch directly from
            # the array; torch.tensor([obs]) goes through a slow list-of-ndarray
            # path and emits a UserWarning.
            obs_v = torch.as_tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            action_probs = torch.softmax(policy_net(obs_v), dim=1)
            action = torch.multinomial(action_probs, 1).item()

            obs_list.append(obs)
            action_list.append(action)

            obs, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            done = terminated or truncated

    return total_reward, obs_list, action_list

In [7]:
# Example: let the agent play one episode with the still-untrained network.

env = gym.make("CartPole-v1")
total_reward, obs_list, action_list = generate_episode(env, policy_net, device)

# Print each result under its label, in the same order as it was returned.
for label, value in (
    ("total_reward:", total_reward),
    ("\nobs_list:", obs_list),
    ("\naction_list:", action_list),
):
    print(label, value)

total_reward: 16.0

obs_list: [array([-0.03007691,  0.01168857,  0.01957042, -0.02855192], dtype=float32), array([-0.02984314, -0.18370849,  0.01899938,  0.27024087], dtype=float32), array([-0.03351731, -0.37909633,  0.02440419,  0.5688552 ], dtype=float32), array([-0.04109924, -0.18432502,  0.0357813 ,  0.28395936], dtype=float32), array([-0.04478574,  0.01026881,  0.04146048,  0.00277293], dtype=float32), array([-0.04458036,  0.20477238,  0.04151595, -0.27654582], dtype=float32), array([-0.04048492,  0.3992782 ,  0.03598503, -0.555851  ], dtype=float32), array([-0.03249935,  0.20366998,  0.02486801, -0.2520513 ], dtype=float32), array([-0.02842595,  0.39842817,  0.01982698, -0.5367878 ], dtype=float32), array([-0.02045739,  0.59326583,  0.00909123, -0.8231581 ], dtype=float32), array([-0.00859207,  0.78826225, -0.00737193, -1.1129678 ], dtype=float32), array([ 0.00717317,  0.9834802 , -0.02963129, -1.4079542 ], dtype=float32), array([ 0.02684278,  1.178957  , -0.05779038, -1.709751  

  obs_v = torch.tensor([obs], dtype=torch.float32).to(device)


### **Generar Múltiples Episodios y Seleccionar los Mejores**

In [8]:
def generate_batch(env, policy_net, device, batch_size, percentile=70):
    """Play ``batch_size`` episodes and keep the steps of the best ones.

    Episodes whose total reward is at or above the ``percentile``-th
    percentile of the batch are kept; their observations and actions are
    concatenated, in episode order, into the training tensors.

    Args:
        env: Environment passed through to ``generate_episode``.
        policy_net: Policy passed through to ``generate_episode``.
        device: Torch device for the returned tensors.
        batch_size: Number of episodes to play.
        percentile: Percentile (0-100) of episode rewards used as the cut-off.

    Returns:
        Tuple ``(train_obs, train_actions, mean_reward, reward_threshold)``:
        a float32 tensor of kept observations, a long tensor of the matching
        actions, the mean reward over all episodes, and the cut-off reward.
    """
    # Each entry is the (total_reward, obs_list, action_list) of one episode.
    episodes = [generate_episode(env, policy_net, device) for _ in range(batch_size)]
    rewards = [episode_reward for episode_reward, _, _ in episodes]

    reward_threshold = np.percentile(rewards, percentile)

    train_obs = []
    train_actions = []
    for episode_reward, episode_obs, episode_actions in episodes:
        if episode_reward >= reward_threshold:
            train_obs.extend(episode_obs)
            train_actions.extend(episode_actions)

    # Stack into a single ndarray first: torch.tensor() on a list of ndarrays
    # takes a slow element-wise path and emits a UserWarning.
    obs_array = np.asarray(train_obs, dtype=np.float32)
    train_obs = torch.as_tensor(obs_array, device=device)
    train_actions = torch.tensor(train_actions, dtype=torch.long, device=device)

    return train_obs, train_actions, np.mean(rewards), reward_threshold

In [9]:
# Example: generate one batch with the still-untrained network and inspect it.

env = gym.make("CartPole-v1")
train_o, train_a, avg_rew, threshold = generate_batch(
    env, policy_net, device, batch_size=100
)

# Print each result under its label, in the same order as it was returned.
for label, value in (
    ("train_obs:", train_o),
    ("\ntrain_actions:", train_a),
    ("\navg_reward:", avg_rew),
    ("\nthreshold:", threshold),
):
    print(label, value)

train_obs: tensor([[ 0.0276,  0.0492,  0.0194, -0.0081],
        [ 0.0286,  0.2441,  0.0192, -0.2946],
        [ 0.0335,  0.0487,  0.0134,  0.0041],
        ...,
        [ 0.1215,  1.1329, -0.1277, -1.5074],
        [ 0.1441,  0.9395, -0.1578, -1.2572],
        [ 0.1629,  1.1363, -0.1829, -1.5948]])

train_actions: tensor([1, 0, 1,  ..., 0, 1, 0])

avg_reward: 23.92

threshold: 28.299999999999997


### **Entrenamos la Red con los Mejores Episodios**

In [10]:
def train_step(policy_net, optimizer, train_obs, train_actions):
    """Run one optimisation step and return the loss as a Python float.

    The policy's scores for ``train_obs`` are fitted to ``train_actions``
    with a mean cross-entropy loss, i.e. the kept actions are treated as
    classification targets.

    Args:
        policy_net: Model producing per-action scores for ``train_obs``.
        optimizer: Optimizer holding ``policy_net``'s parameters.
        train_obs: Batch of observations.
        train_actions: Long tensor of target action indices, one per row.

    Returns:
        The loss value computed before the parameter update.
    """
    # Clear gradients left over from the previous step before accumulating.
    optimizer.zero_grad()

    logits = policy_net(train_obs)
    loss = nn.functional.cross_entropy(logits, train_actions)

    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    return loss_value

In [11]:
# One optimisation step on the example batch (train_o, train_a); the returned
# loss is the cell's last expression, so the notebook displays it.
train_step(policy_net, optimizer, train_o, train_a)

0.6802474856376648

### **Entrenamiento Iterativo**

In [12]:
# Iterative training: each iteration plays a batch of episodes, keeps the ones
# at or above the reward percentile, and takes one optimisation step on them.
env = gym.make("CartPole-v1")

n_iterations, batch_size, percentile = 50, 50, 70

start_time = time.time()

for i in range(n_iterations):
    batch = generate_batch(env, policy_net, device, batch_size, percentile)
    train_obs, train_actions, reward_mean, reward_threshold = batch
    loss = train_step(policy_net, optimizer, train_obs, train_actions)
    print(
        f"Iter {i}: loss={loss:.3f}, mean_reward={reward_mean:.1f}, reward_threshold={reward_threshold:.1f}"
    )

end_time = time.time()
elapsed_time = end_time - start_time

print(f"\nEntrenamiento completado en {elapsed_time:.2f} segundos")

Iter 0: loss=0.682, mean_reward=30.2, reward_threshold=32.0
Iter 1: loss=0.670, mean_reward=30.8, reward_threshold=36.0
Iter 2: loss=0.658, mean_reward=32.1, reward_threshold=36.3
Iter 3: loss=0.652, mean_reward=30.5, reward_threshold=36.0
Iter 4: loss=0.640, mean_reward=32.9, reward_threshold=36.0
Iter 5: loss=0.624, mean_reward=51.1, reward_threshold=57.6
Iter 6: loss=0.607, mean_reward=50.4, reward_threshold=57.2
Iter 7: loss=0.607, mean_reward=60.0, reward_threshold=62.6
Iter 8: loss=0.598, mean_reward=53.2, reward_threshold=54.2
Iter 9: loss=0.592, mean_reward=71.5, reward_threshold=72.0
Iter 10: loss=0.585, mean_reward=60.1, reward_threshold=69.3
Iter 11: loss=0.580, mean_reward=68.7, reward_threshold=74.0
Iter 12: loss=0.577, mean_reward=89.1, reward_threshold=95.8
Iter 13: loss=0.569, mean_reward=100.7, reward_threshold=99.6
Iter 14: loss=0.565, mean_reward=107.9, reward_threshold=119.2
Iter 15: loss=0.557, mean_reward=141.1, reward_threshold=164.9
Iter 16: loss=0.556, mean_rew

In [13]:
def play_trainde_agent(env, policy_net, n_episodes, device=None):
    """Play ``n_episodes`` episodes with the policy, printing each total reward.

    Actions are sampled from the softmax of the policy's scores. The
    environment is closed when the function exits, including on error.

    Args:
        env: Environment exposing ``reset``, ``step``, ``render`` and ``close``.
        policy_net: Callable mapping a ``(1, obs_size)`` float32 tensor to
            per-action scores along dimension 1.
        n_episodes: Number of episodes to play.
        device: Torch device for the observation tensor. When omitted, the
            device of the policy's first parameter is used (CPU if the policy
            exposes no parameters), so the function no longer depends on a
            notebook-global ``device`` variable.
    """
    if device is None:
        parameters = getattr(policy_net, "parameters", None)
        first_param = next(parameters(), None) if callable(parameters) else None
        device = first_param.device if first_param is not None else torch.device("cpu")

    try:
        # Playback only: no autograd graph is needed.
        with torch.no_grad():
            for episode in range(n_episodes):
                obs, _ = env.reset()
                done = False
                total_reward = 0.0

                while not done:
                    env.render()
                    # Build the (1, obs_size) batch without the slow
                    # list-of-ndarray path of torch.tensor([obs]).
                    obs_v = torch.as_tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
                    action_probs = torch.softmax(policy_net(obs_v), dim=1)
                    action = torch.multinomial(action_probs, 1).item()

                    obs, reward, terminated, truncated, info = env.step(action)
                    total_reward += reward
                    done = terminated or truncated

                print(f"Episode {episode+1}: total_reward={total_reward}")
    finally:
        # Release the environment even if playback is interrupted.
        env.close()

In [14]:
# Watch the policy play three episodes in a rendered window; the function
# closes the environment itself once it has finished.
env = gym.make("CartPole-v1", render_mode="human")
play_trainde_agent(env, policy_net, n_episodes=3)

Episode 1: total_reward=469.0
Episode 2: total_reward=500.0
Episode 3: total_reward=500.0
