In [24]:
# youssef ahmed ibrahim 
# 223101109

In [25]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm




In [26]:
# Location of the CSV. The default is the original local path; set the
# DIABETES_DATA_PATH environment variable to run the notebook on another
# machine without editing this cell.
data_path = os.environ.get(
    "DIABETES_DATA_PATH",
    r"A:\future\collage\sem 5\NN\assignment\1\diabetes_dataset.csv",
)

df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nClass distribution:")
print(df['diagnosed_diabetes'].value_counts())


Dataset shape: (100000, 31)

First few rows:
   age  gender ethnicity education_level  income_level employment_status  \
0   58    Male     Asian      Highschool  Lower-Middle          Employed   
1   48  Female     White      Highschool        Middle          Employed   
2   60    Male  Hispanic      Highschool        Middle        Unemployed   
3   74  Female     Black      Highschool           Low           Retired   
4   46    Male     White        Graduate        Middle           Retired   

  smoking_status  alcohol_consumption_per_week  \
0          Never                             0   
1         Former                             1   
2          Never                             1   
3          Never                             0   
4          Never                             1   

   physical_activity_minutes_per_week  diet_score  ...  hdl_cholesterol  \
0                                 215         5.7  ...               41   
1                                 143         6

In [27]:
# Heading followed by the per-column count of null entries in `df`.
print("Missing values:", df.isna().sum(), sep="\n")


Missing values:
age                                   0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
employment_status                     0
smoking_status                        0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
glucose_fasting         

In [29]:
class DiabetesDataset(Dataset):
    """In-memory dataset pairing a feature matrix with integer class labels.

    Args:
        features: array-like with one row per sample; stored as a float32
            tensor.
        labels: array-like with one class index per sample; stored as an
            int64 tensor.

    Raises:
        ValueError: if `features` and `labels` do not contain the same number
            of samples.
    """

    def __init__(self, features, labels):
        # Explicit dtypes instead of the legacy FloatTensor/LongTensor
        # constructors: the conversion is spelled out and works for any
        # numeric array-like input.
        self.features = torch.as_tensor(features, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.long)

        # __len__ is driven by the labels, so a mismatch would otherwise only
        # surface later as an indexing error (or as silently unused rows).
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at position `idx`."""
        return self.features[idx], self.labels[idx]


In [None]:
# Remove data leakage columns
# diabetes_stage: literally tells if someone has diabetes (Type 2, No Diabetes, etc.)
# diabetes_risk_score: likely calculated based on diagnosis
print("⚠️ Removing data leakage columns...")
df_clean = df.drop(columns=['diabetes_stage', 'diabetes_risk_score'])
print("Removed: diabetes_stage, diabetes_risk_score\n")

# Encode categorical variables (one-hot; drop_first=True keeps k-1 dummy
# columns for a categorical with k levels).
categorical_cols = ['gender', 'ethnicity', 'education_level', 'income_level',
                    'employment_status', 'smoking_status']

print("Encoding categorical variables...")
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)

print(f"\nOriginal features (with leakage): {df.shape[1] - 1}")
print(f"Features after removing leakage and encoding: {df_encoded.shape[1] - 1}")

# Separate features and labels. The encoded frame mixes numeric and dummy
# columns, so the feature matrix is requested as float64 explicitly; plain
# `.values` on a mixed-dtype frame can come back as an object array.
X = df_encoded.drop(columns='diagnosed_diabetes').to_numpy(dtype=np.float64)
y = df_encoded['diagnosed_diabetes'].to_numpy()

print(f"\nFeatures shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Standardize the features (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on the full matrix, i.e. before the
# train/test split performed later in the notebook, so the rows that end up in
# the test set contribute to the mean/std. Fitting on the training rows only
# would avoid that, but requires deciding the split first - TODO restructure.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nFeature scaling completed.")
print(f"Mean of scaled features: {X_scaled.mean():.6f}")
print(f"Std of scaled features: {X_scaled.std():.6f}")


⚠️ Removing data leakage columns...
Removed: diabetes_stage, diabetes_risk_score

Encoding categorical variables...

Original features (with leakage): 30
Features after removing leakage and encoding: 40

Features shape: (100000, 40)
Labels shape: (100000,)

Feature scaling completed.
Mean of scaled features: -0.000000
Std of scaled features: 1.000000


In [31]:
full_dataset = DiabetesDataset(X_scaled, y)

# 80/20 train/test split.
train_len = int(0.8 * len(full_dataset))
test_len = len(full_dataset) - train_len
# A dedicated, seeded generator makes the split identical on every
# Restart & Run All instead of depending on the global RNG state.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    full_dataset, [train_len, test_len], generator=split_generator
)

print(f"Total samples: {len(full_dataset)}")
print(f"Training samples: {train_len}")
print(f"Testing samples: {test_len}")

# Pinned (page-locked) host memory only benefits host-to-GPU copies, and recent
# PyTorch versions warn when it is requested without an accelerator, so it is
# enabled only when CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0, pin_memory=use_pinned_memory)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=0, pin_memory=use_pinned_memory)

print(f"\nNumber of training batches: {len(train_loader)}")
print(f"Number of testing batches: {len(test_loader)}")


Total samples: 100000
Training samples: 80000
Testing samples: 20000

Number of training batches: 313
Number of testing batches: 79


In [32]:
class MLPClassifier(nn.Module):
    """Fully connected classifier built from Linear -> BatchNorm1d -> ReLU blocks.

    Each hidden block is optionally followed by Dropout; a final Linear layer
    maps the last hidden width to `num_classes` raw logits (no softmax).

    Args:
        input_dim: number of input features per sample.
        num_classes: number of output logits.
        hidden_dims: width of each hidden block, in order. The default
            reproduces the original 256-128-64-32 stack.
        dropout_rates: dropout probability after each hidden block; a rate of
            0 means no Dropout layer is inserted for that block. The default
            reproduces the original 0.3 / 0.3 / 0.2 / none configuration.

    Raises:
        ValueError: if `hidden_dims` and `dropout_rates` differ in length.
    """

    def __init__(self, input_dim, num_classes=2,
                 hidden_dims=(256, 128, 64, 32),
                 dropout_rates=(0.3, 0.3, 0.2, 0.0)):
        super().__init__()

        if len(hidden_dims) != len(dropout_rates):
            raise ValueError(
                f"hidden_dims and dropout_rates must have the same length, "
                f"got {len(hidden_dims)} and {len(dropout_rates)}"
            )

        layers = []
        in_features = input_dim
        for width, rate in zip(hidden_dims, dropout_rates):
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.BatchNorm1d(width))
            layers.append(nn.ReLU())
            # Skipping the layer for a zero rate keeps the Sequential indices
            # (and therefore state_dict keys) of the default model unchanged.
            if rate != 0:
                layers.append(nn.Dropout(rate))
            in_features = width

        # Output head: raw class logits.
        layers.append(nn.Linear(in_features, num_classes))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits produced by the layer stack for `x`."""
        return self.model(x)


In [None]:
# Seed PyTorch's global RNG before building the network so the initial weights
# are the same on every Restart & Run All; later draws from the global RNG
# (e.g. dropout masks) then start from a known state as well.
torch.manual_seed(42)

# One input unit per feature column of X; two output logits, one per class.
input_dim = X.shape[1]
num_classes = 2

model = MLPClassifier(input_dim=input_dim, num_classes=num_classes)

print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters())}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")


MLPClassifier(
  (model): Sequential(
    (0): Linear(in_features=40, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=64, out_features=32, bias=True)
    (13): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): Linear(in_features=32, out_features=2, bias=True)
  )
)

Total parameters: 54754
Trainable parameters: 54754


In [34]:
# Pick the device and move the model first. The optimizer is constructed
# afterwards so it is built from the parameters as they live on the target
# device, which is the order recommended by the torch.optim documentation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"Using device: {device}")

# Cross-entropy over the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


Using device: cpu


In [35]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over `train_loader`.

    Args:
        model: network to train; switched to training mode here.
        train_loader: iterable of `(features, labels)` batches.
        criterion: loss callable taking `(outputs, labels)`.
            Assumes it returns the batch-mean loss (the default
            `reduction='mean'`) - confirm if a different reduction is used.
        optimizer: optimizer stepping the model's parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        tuple: `(epoch_loss, epoch_acc)` - the loss averaged over all samples
        seen and the accuracy in percent.

    Raises:
        ValueError: if the loader yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0

    for features, labels in tqdm(train_loader, desc="Training"):
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight each batch's mean loss by its size so a smaller final batch
        # does not count as much as a full one in the epoch average.
        loss_sum += loss.item() * batch_size
        # detach() replaces the deprecated `.data` access for the prediction.
        predicted = outputs.detach().argmax(dim=1)
        total += batch_size
        correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("train_loader yielded no samples")

    epoch_loss = loss_sum / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc



In [36]:
def evaluate(model, test_loader, criterion, device):
    """Score `model` on `test_loader` without updating its weights.

    Args:
        model: network to evaluate; switched to eval mode here.
        test_loader: iterable of `(features, labels)` batches.
        criterion: loss callable taking `(outputs, labels)`.
            Assumes it returns the batch-mean loss (the default
            `reduction='mean'`) - confirm if a different reduction is used.
        device: device the batches are moved to before the forward pass.

    Returns:
        tuple: `(epoch_loss, epoch_acc, all_preds, all_labels, all_probs)` -
        the loss averaged over all samples, the accuracy in percent, and three
        flat lists (one entry per sample, as NumPy scalars): predicted class
        index, true label, and softmax probability of class index 1.

    Raises:
        ValueError: if the loader yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for features, labels in tqdm(test_loader, desc="Evaluating"):
            features, labels = features.to(device), labels.to(device)

            outputs = model(features)
            loss = criterion(outputs, labels)
            probs = torch.softmax(outputs, dim=1)
            predicted = outputs.argmax(dim=1)

            batch_size = labels.size(0)
            # Weight each batch's mean loss by its size so a smaller final
            # batch does not count as much as a full one in the average.
            loss_sum += loss.item() * batch_size
            total += batch_size
            correct += (predicted == labels).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            # Column 1 is the probability assigned to class index 1.
            all_probs.extend(probs[:, 1].cpu().numpy())

    if total == 0:
        raise ValueError("test_loader yielded no samples")

    epoch_loss = loss_sum / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc, all_preds, all_labels, all_probs

In [37]:
# Number of passes over the training data, plus per-epoch metric histories
# (each list gains one entry per epoch). The histories are reset here, but
# `model` and `optimizer` come from earlier cells, so re-running only this
# cell continues training from the current weights.
num_epochs = 10
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []

print("Starting training...\n")

for epoch in range(num_epochs):
    print(f"Epoch [{epoch+1}/{num_epochs}]")

    # One optimisation pass over the training loader.
    train_loss, train_acc = train_epoch(
        model=model,
        train_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)

    # Score the current weights on the test loader; the prediction, label and
    # probability lists that evaluate() also returns are not needed here.
    test_loss, test_acc, _, _, _ = evaluate(
        model=model,
        test_loader=test_loader,
        criterion=criterion,
        device=device,
    )
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)

    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%\n")

print("Training completed!")



Starting training...

Epoch [1/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 200.92it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 616.66it/s]


Train Loss: 0.3382, Train Acc: 85.20%
Test Loss: 0.2752, Test Acc: 88.50%

Epoch [2/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 207.49it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 615.80it/s]


Train Loss: 0.2818, Train Acc: 88.20%
Test Loss: 0.2569, Test Acc: 89.53%

Epoch [3/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 207.10it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 603.79it/s]


Train Loss: 0.2681, Train Acc: 88.87%
Test Loss: 0.2502, Test Acc: 90.09%

Epoch [4/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 228.12it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 620.10it/s]


Train Loss: 0.2617, Train Acc: 89.30%
Test Loss: 0.2466, Test Acc: 90.08%

Epoch [5/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 206.92it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 461.81it/s]


Train Loss: 0.2589, Train Acc: 89.48%
Test Loss: 0.2435, Test Acc: 90.51%

Epoch [6/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 179.64it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 546.94it/s]


Train Loss: 0.2556, Train Acc: 89.68%
Test Loss: 0.2422, Test Acc: 90.47%

Epoch [7/10]


Training: 100%|██████████| 313/313 [00:02<00:00, 150.28it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 403.74it/s]


Train Loss: 0.2541, Train Acc: 89.79%
Test Loss: 0.2401, Test Acc: 90.75%

Epoch [8/10]


Training: 100%|██████████| 313/313 [00:02<00:00, 131.98it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 436.42it/s]


Train Loss: 0.2497, Train Acc: 89.99%
Test Loss: 0.2433, Test Acc: 90.69%

Epoch [9/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 170.04it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 484.01it/s]


Train Loss: 0.2505, Train Acc: 89.87%
Test Loss: 0.2351, Test Acc: 90.70%

Epoch [10/10]


Training: 100%|██████████| 313/313 [00:01<00:00, 172.33it/s]
Evaluating: 100%|██████████| 79/79 [00:00<00:00, 513.08it/s]

Train Loss: 0.2485, Train Acc: 90.07%
Test Loss: 0.2379, Test Acc: 90.88%

Training completed!



