In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
import csv
import pandas as pd
from datetime import datetime


In [8]:


# Observation vector fed to the networks, in this order:
#   [current_glucose, glucose_trend, heart_rate, heart_rate_trend, insulin_on_board]
state_dim = 5
# One continuous output: the insulin dose (steps of 0.05 units)
action_dim = 1

# Training hyperparameters
batch_size = 256
cql_weight = 5.0  # Strength of the CQL penalty
alpha = 0.2  # Entropy coefficient

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"



In [9]:

class SACCQL(nn.Module):
    """Deterministic actor plus twin Q-critics for SAC/CQL-style training.

    All three sub-networks are two-hidden-layer ReLU MLPs:

    * ``actor`` maps a state to an action squashed into [-1, 1] by ``Tanh``
      (the caller is expected to rescale it to the insulin range).
    * ``q1`` / ``q2`` each map a concatenated ``[state, action]`` vector to a
      single Q-value.

    Args:
        state_size: Number of state features. ``None`` (default) uses the
            notebook-level ``state_dim``.
        action_size: Number of action components. ``None`` (default) uses the
            notebook-level ``action_dim``.
        hidden_size: Width of both hidden layers in every sub-network.
    """

    def __init__(self, state_size=None, action_size=None, hidden_size=256):
        super().__init__()
        # Fall back to the notebook configuration so that ``SACCQL()`` builds
        # exactly the same architecture as before.
        if state_size is None:
            state_size = state_dim
        if action_size is None:
            action_size = action_dim

        # Actor (policy) network; Tanh keeps the output in [-1, 1]
        self.actor = nn.Sequential(
            *self._mlp_layers(state_size, action_size, hidden_size),
            nn.Tanh(),
        )

        # Critic networks (twin Q-functions) over concatenated [state, action]
        self.q1 = nn.Sequential(
            *self._mlp_layers(state_size + action_size, 1, hidden_size)
        )
        self.q2 = nn.Sequential(
            *self._mlp_layers(state_size + action_size, 1, hidden_size)
        )

    @staticmethod
    def _mlp_layers(in_features, out_features, hidden_size):
        """Return the layer list of a two-hidden-layer ReLU MLP."""
        return [
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_features),
        ]

    def act(self, state):
        """Return the actor's action for ``state`` as a NumPy array.

        Args:
            state: Array-like of shape ``(state_size,)`` or
                ``(batch, state_size)``; it is converted to a float32 tensor.

        Returns:
            ``numpy.ndarray`` with the trailing dimension replaced by the
            action size, with values in [-1, 1].
        """
        # Place the input wherever the actor's weights actually live instead of
        # trusting a module-level device string; this avoids a device mismatch
        # when the model has not been moved to that device.
        actor_device = next(self.actor.parameters()).device
        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float32, device=actor_device)
            action = self.actor(state)
        return action.cpu().numpy()




In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset

class DiabetesDataset(Dataset):
    """Transition dataset built from parallel 1-D time series (placeholder data).

    Each sample is a dict with the keys ``"state"``, ``"action"``,
    ``"reward"``, ``"next_state"`` and ``"done"``, where ``next_state`` is the
    state one timestep later.

    Args:
        sequence_length: Accepted for interface compatibility.
            NOTE(review): the value is never read by this class.
    """

    def __init__(self, sequence_length=1):
        # Placeholder series, each of shape (num_timesteps,); replace with the
        # real recordings.
        self.glucose = np.random.randn(10000)
        self.glucose_deriv = np.random.randn(10000)
        self.heart_rate = np.random.randn(10000)
        self.hr_deriv = np.random.randn(10000)
        self.iob = np.random.randn(10000)
        self.insulin_doses = np.random.randn(10000)

        # Example reward: penalize deviations from the target glucose (100 mg/dL)
        self.rewards = -np.abs(self.glucose - 100)

        # States: stack the 5 feature series -> shape (num_timesteps, 5)
        self.states = np.column_stack([
            self.glucose,
            self.glucose_deriv,
            self.heart_rate,
            self.hr_deriv,
            self.iob
        ])

        # Next states: shift states by one timestep. np.roll wraps the first row
        # around to the last position; __len__ excludes that last index.
        self.next_states = np.roll(self.states, shift=-1, axis=0)

        # "Done" flags (0 = episode continues, 1 = episode ends).
        # Assume episodes never terminate (modify for real data).
        self.dones = np.zeros(len(self.states))
        # NOTE(review): this flags the final timestep, but __len__ excludes that
        # index, so samples within range(len(self)) never carry done = 1.
        self.dones[-1] = 1

    def __len__(self):
        """Number of usable transitions (the last timestep has no successor)."""
        return len(self.states) - 1

    def __getitem__(self, idx):
        """Return the transition starting at timestep ``idx`` as a dict."""
        return {
            "state": self.states[idx],                          # Shape: (5,)
            # Wrap the scalar dose so the action has shape (1,); batches then
            # collate to (batch, 1) and can be concatenated with the states
            # along dim 1.
            "action": np.atleast_1d(self.insulin_doses[idx]),   # Shape: (1,)
            "reward": self.rewards[idx],                        # Scalar
            "next_state": self.next_states[idx],                # Shape: (5,)
            "done": self.dones[idx]                             # Scalar
        }

In [None]:
# Load your historical dataset (replace with your data)
class DiabetesDataset(torch.utils.data.Dataset):
    """Placeholder offline dataset of (state, action, reward, next_state, done).

    All arrays are filled with random values and are meant to be replaced with
    real data. They are stored as float32 so that the tensors a DataLoader
    builds from them match the default dtype of ``nn.Linear`` parameters.

    NOTE: the notebook defines another class with this same name in an earlier
    cell; when cells run in order, this definition replaces it.

    Args:
        num_samples: Number of transitions to generate.
        state_size: Number of state features. ``None`` (default) uses the
            notebook-level ``state_dim``.
        action_size: Number of action components. ``None`` (default) uses the
            notebook-level ``action_dim``.
    """

    def __init__(self, num_samples=10000, state_size=None, action_size=None):
        # Fall back to the notebook configuration so ``DiabetesDataset()``
        # keeps producing the same array shapes as before.
        if state_size is None:
            state_size = state_dim
        if action_size is None:
            action_size = action_dim

        # Replace with real data
        self.states = np.random.randn(num_samples, state_size).astype(np.float32)
        # Insulin doses
        self.actions = (np.random.randn(num_samples, action_size) * 0.05).astype(np.float32)
        # Reward = f(glucose)
        self.rewards = np.random.randn(num_samples).astype(np.float32)
        self.next_states = np.random.randn(num_samples, state_size).astype(np.float32)
        # No transition is marked terminal in the placeholder data
        self.dones = np.zeros(num_samples, dtype=np.float32)

    def __len__(self):
        """Number of stored transitions."""
        return len(self.states)

    def __getitem__(self, idx):
        """Return ``(state, action, reward, next_state, done)`` for ``idx``."""
        return (
            self.states[idx], self.actions[idx], self.rewards[idx],
            self.next_states[idx], self.dones[idx]
        )

# Build the (placeholder) dataset and a shuffled mini-batch loader over it
dataset = DiabetesDataset()
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize networks and optimizers
model = SACCQL().to(device)
optimizer_actor = optim.Adam(model.actor.parameters(), lr=3e-4)
optimizer_critic = optim.Adam(list(model.q1.parameters()) + list(model.q2.parameters()), lr=3e-4)

# Loop settings
num_epochs = 1000
gamma = 0.99  # Discount factor applied to the bootstrapped next-state value
num_random_actions = 10  # Uniform actions sampled per state for the CQL logsumexp
mse_loss = nn.MSELoss()

# Training loop
for epoch in range(num_epochs):
    for states, actions, rewards, next_states, dones in dataloader:
        # The DataLoader already yields tensors and preserves the dataset's
        # NumPy dtype (float64 unless the arrays were cast). The legacy
        # torch.FloatTensor(tensor) constructor rejects a tensor of another
        # dtype, so cast and move with .to() instead.
        states = states.to(device=device, dtype=torch.float32)
        actions = actions.to(device=device, dtype=torch.float32)
        rewards = rewards.to(device=device, dtype=torch.float32).unsqueeze(1)
        next_states = next_states.to(device=device, dtype=torch.float32)
        dones = dones.to(device=device, dtype=torch.float32).unsqueeze(1)

        # ---- Critic update (TD error + CQL penalty) ----
        # Bootstrapped target from the smaller of the two critics; no gradient
        # flows through the target.
        with torch.no_grad():
            next_actions = model.actor(next_states)
            next_inputs = torch.cat([next_states, next_actions], 1)
            q1_next = model.q1(next_inputs)
            q2_next = model.q2(next_inputs)
            q_next = torch.min(q1_next, q2_next)
            target_q = rewards + (1 - dones) * gamma * q_next

        # Current Q-values on the dataset actions
        data_inputs = torch.cat([states, actions], 1)
        current_q1 = model.q1(data_inputs)
        current_q2 = model.q2(data_inputs)

        # TD loss
        td_loss = mse_loss(current_q1, target_q) + mse_loss(current_q2, target_q)

        # CQL penalty, per critic: logsumexp_a' Q(s, a') - Q(s, a_data).
        # The logsumexp runs over several uniformly sampled actions in [-1, 1]
        # for each state, not over the two critics.
        random_actions = torch.rand(
            states.shape[0], num_random_actions, actions.shape[1], device=states.device
        ) * 2 - 1                                                    # (B, N, A)
        tiled_states = states.unsqueeze(1).expand(-1, num_random_actions, -1)  # (B, N, S)
        random_inputs = torch.cat([tiled_states, random_actions], 2)
        q1_rand = model.q1(random_inputs).squeeze(-1)                # (B, N)
        q2_rand = model.q2(random_inputs).squeeze(-1)                # (B, N)
        cql_penalty = (
            (torch.logsumexp(q1_rand, dim=1).mean() - current_q1.mean())
            + (torch.logsumexp(q2_rand, dim=1).mean() - current_q2.mean())
        )

        # Total critic loss
        critic_loss = td_loss + cql_weight * cql_penalty

        optimizer_critic.zero_grad()
        critic_loss.backward()
        optimizer_critic.step()

        # ---- Actor update ----
        # Built only after the critic step: optimizer.step() modifies the
        # critic weights in place, and backpropagating through a graph that
        # saved the old weights raises an autograd RuntimeError.
        # The actor is deterministic, so the loss is just the negated
        # pessimistic Q-value; there is no entropy term.
        pred_actions = model.actor(states)
        pred_inputs = torch.cat([states, pred_actions], 1)
        q1_pred = model.q1(pred_inputs)
        q2_pred = model.q2(pred_inputs)
        actor_loss = -torch.min(q1_pred, q2_pred).mean()

        # This backward pass also leaves gradients on the critics; they are
        # discarded by optimizer_critic.zero_grad() on the next iteration.
        optimizer_actor.zero_grad()
        actor_loss.backward()
        optimizer_actor.step()