In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal



In [4]:

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class PolicyNetwork(nn.Module):
    """Plain feed-forward policy: state vector in, raw action vector out.

    The network is two 64-unit ReLU hidden layers followed by a linear output
    layer, so the returned action is unbounded (no squashing is applied).

    Args:
        state_dim: Size of the last dimension of the input state.
        action_dim: Size of the last dimension of the returned action.
    """

    def __init__(self, state_dim=5, action_dim=1):
        super().__init__()
        hidden = 64
        # Layers are created in the same order they are applied.
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_dim),  # Output insulin dose (continuous)
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Map ``state`` of shape ``(..., state_dim)`` to ``(..., action_dim)``."""
        action = self.net(state)
        return action

In [5]:
class PPOPolicy(nn.Module):
    """Gaussian actor plus state-value critic on a shared MLP trunk.

    ``forward`` returns the parameters of a diagonal Normal over actions;
    ``critic`` returns a scalar value estimate per state. The training cell in
    this notebook calls ``policy.critic(states)``, so the value head is part of
    this class rather than a separate network.

    Args:
        state_dim: Size of the last dimension of the input state.
        action_dim: Number of action components.

    Note:
        ``value_head`` adds two entries to the ``state_dict``. Checkpoints
        saved without them must be loaded with
        ``load_state_dict(..., strict=False)``.
    """

    def __init__(self, state_dim=5, action_dim=1):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        self.actor_mean = nn.Linear(64, action_dim)
        # Despite the name this holds log(std): forward() exponentiates it, which
        # keeps std strictly positive. Zeros therefore mean an initial std of 1.
        self.actor_std = nn.Parameter(torch.zeros(1, action_dim))  # Learnable std
        # Created after the actor layers so their random initialisation under a
        # fixed seed is the same as before the value head was added.
        self.value_head = nn.Linear(64, 1)

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``.

        ``mean`` has shape ``(..., action_dim)``. ``std`` is state-independent
        and always has shape ``(1, action_dim)``; it broadcasts against ``mean``
        when both are passed to ``Normal``.
        """
        x = self.shared(state)
        mean = self.actor_mean(x)
        std = torch.exp(self.actor_std)
        return mean, std

    def critic(self, state):
        """Return the estimated state value, one scalar per input state.

        The trailing singleton dimension of the value head is squeezed, so an
        input of shape ``(T, state_dim)`` yields shape ``(T,)`` and can be
        subtracted from a 1-D tensor of returns without broadcasting to
        ``(T, T)``.
        """
        return self.value_head(self.shared(state)).squeeze(-1)

In [6]:
def reward(glucose, target_min=70, target_max=180):
    """Score a single glucose reading against a target range.

    Args:
        glucose: Glucose reading, in the same units as the thresholds
            (presumably mg/dL -- confirm against the environment).
        target_min: Lower bound of the in-range band (inclusive).
        target_max: Upper bound of the in-range band (inclusive).

    Returns:
        ``1.0`` when ``target_min <= glucose <= target_max``. Below the band the
        penalty is ten times the shortfall; above it the penalty equals the
        excess. Low readings are deliberately punished ten times harder than
        high readings.
    """
    if glucose < target_min:
        return -10 * (target_min - glucose)  # Hypoglycemia penalty
    if glucose > target_max:
        return -(glucose - target_max)       # Hyperglycemia penalty
    return 1.0                               # Reward for in-range

In [7]:

# Initialize policy and optimizer
policy = PPOPolicy(state_dim=5, action_dim=1).to(device)
optimizer = optim.Adam(policy.parameters(), lr=3e-4)

# Hyperparameters
GAMMA = 0.99       # discount factor for the reward-to-go
CLIP_EPS = 0.2     # PPO clip range: ratios are clamped to [1 - eps, 1 + eps] = [0.8, 1.2]
PPO_EPOCHS = 4     # gradient steps per episode; with one step the ratio is always 1
VALUE_COEF = 0.5   # weight of the value-regression term when a critic is available

# NOTE: `env` is not created in any visible cell (running this cell raised
# NameError). It must be built in an earlier cell and follow the classic Gym API
# used below: reset() -> state, step(action) -> (next_state, reward, done, info).


def _discounted_returns(step_rewards, gamma):
    """Return the discounted reward-to-go of one episode as a 1-D float tensor."""
    returns = []
    running = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for r in reversed(step_rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32, device=device)


# Use policy.critic as a learned baseline when the policy provides one;
# otherwise fall back to the mean episode return so the loop still runs.
value_fn = getattr(policy, "critic", None)

# Training loop
for episode in range(1000):
    state = env.reset()
    states, actions, rewards, old_log_probs = [], [], [], []

    # ---- Rollout: collect one episode with the current (soon "old") policy ----
    while True:
        # assumes each observation is a flat vector of length state_dim -- TODO confirm
        state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device)
        with torch.no_grad():  # rollout statistics are constants for the update
            mean, std = policy(state_tensor)
            dist = Normal(mean, std)
            action = dist.sample()
            # Sum over action components: joint log-density of the diagonal Gaussian.
            log_prob = dist.log_prob(action).sum()

        # `step_reward`, not `reward`: the latter would overwrite the reward()
        # function defined in an earlier cell.
        next_state, step_reward, done, _ = env.step(action.cpu().numpy())

        states.append(state_tensor)
        actions.append(action.reshape(-1))
        rewards.append(float(step_reward))
        old_log_probs.append(log_prob)
        state = next_state

        if done:
            break

    # ---- Batch the episode ----
    states = torch.stack(states)                # (T, state_dim)
    actions = torch.stack(actions)              # (T, action_dim)
    old_log_probs = torch.stack(old_log_probs)  # (T,)
    returns = _discounted_returns(rewards, GAMMA)  # (T,)

    # Advantages are fixed targets: computed once, outside the autograd graph.
    with torch.no_grad():
        if value_fn is not None:
            baseline = value_fn(states).reshape(-1)
        else:
            baseline = returns.mean()
    advantages = returns - baseline

    # ---- PPO update (simplified) ----
    for epoch in range(PPO_EPOCHS):
        mean, std = policy(states)
        log_probs = Normal(mean, std).log_prob(actions).sum(dim=-1)

        # Clipped surrogate loss
        ratios = torch.exp(log_probs - old_log_probs)
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1.0 - CLIP_EPS, 1.0 + CLIP_EPS) * advantages
        loss = -torch.min(surr1, surr2).mean()

        if value_fn is not None:
            # Regress the critic onto the empirical returns.
            values = value_fn(states).reshape(-1)
            loss = loss + VALUE_COEF * (returns - values).pow(2).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

NameError: name 'env' is not defined