In [None]:
import pandas as pd

# Load data
data = pd.read_csv("/Users/colepuls/CS/MUHackathon2025/Raw_data/diabetes.csv")

In [None]:
# Check first few rows
data.head()

In [None]:
# Get basic stats
data.describe()

In [None]:
# Check for missing values or weird entries
data.isnull().sum()

In [None]:
data.info()

In [413]:
# Count duplicate how many duplicate rows exist
num_dups = data.duplicated().sum()
num_dups

np.int64(0)

In [414]:
data.shape

(768, 9)

In [415]:
# Remove any dups
if num_dups > 0:
    data.drop_duplicates(inpalce=True)
data.shape

(768, 9)

In [416]:
# List columns where zeros are likely invalid or indicate missing data
import numpy as np

cols_to_clean = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Replace 0 with nan, indicates missing value
for c in cols_to_clean:
    data[c].replace(0, np.nan, inplace=True)

# Check how many nans are in columns
data[cols_to_clean].isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].replace(0, np.nan, inplace=True)


Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
dtype: int64

In [None]:
data.shape

In [None]:
# Impute missing values, fill with column means
data[cols_to_clean] = data[cols_to_clean].fillna(data[cols_to_clean].mean())
data.shape

In [None]:
data.head()

In [417]:
import torch

# Split targets
x = data.drop(columns=["Outcome"]).values
y = data["Outcome"].values

In [None]:
x.shape

In [None]:
y.shape

In [None]:
# Convert into tensors
x_tensor = torch.tensor(x, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long) # for classification labels

In [None]:
# Calculate mean and std for each feature (column)
mean = x_tensor.mean(dim=0)
std = x_tensor.std(dim=0)

std[std == 0] = 1e-7 # avoid division by zero

# Normalize!!!
x_tensor = (x_tensor - mean) / std # x tensor is now normalized column wise

In [None]:
from torch.utils.data import Dataset

class DiabetesDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

# initialize dataset    
dataset = DiabetesDataset(x_tensor, y_tensor)

In [None]:
len(dataset) # number of samples

In [None]:
sample_features = dataset[0]
sample_labels = dataset[0]

In [None]:
sample_features

In [None]:
sample_labels

In [None]:
# Split into training and validation sets
from torch.utils.data import random_split

# 80/20 split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

len(train_dataset)


In [None]:
len(val_dataset)

In [None]:
# Create data loaders
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# test
for batch_features, batch_labels in train_loader:
    print(batch_features.shape, batch_labels.shape)
    break


In [None]:
# compute stats after normalization
check_mean = torch.mean(torch.vstack([dataset[i][0] for i in range(len(dataset))]), dim=0)
check_std = torch.std(torch.vstack([dataset[i][0] for i in range(len(dataset))]), dim=0)

check_mean

In [None]:
check_std

In [None]:
# neural network
import torch.nn as nn

class SimpleNet(nn.Module):
    def __init__(self, input_dim, hidden_dim=16, output_dim=2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [None]:
import torch.optim as optim

model = SimpleNet(input_dim=x.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)


In [421]:
# Validation
model.eval()
val_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for val_features, val_labels in val_loader:
        val_outputs = model(val_features)
        loss = criterion(val_outputs, val_labels)
        val_loss += loss.item()

        _, predicted = torch.max(val_outputs, dim=1)
        correct += (predicted == val_labels).sum().item()
        total += val_labels.size(0)
avg_val_loss = val_loss / len(val_loader)
accuracy = 100 * correct / total

print(f"Epoch {epoch+1}/{epochs} | "
      f"Train Loss: {avg_train_loss:.4f} | "
      f"Val Loss: {avg_val_loss:.4f} | "
      f"Val Acc: {accuracy:.2f}%")

Epoch 5/5 | Train Loss: 0.5654 | Val Loss: 0.5041 | Val Acc: 77.92%
