<a href="https://colab.research.google.com/github/chdmitr2/Deep-Learning-22961/blob/main/maman13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Maman 13

Dmitriy Chudnovsky 324793900

Question 1

In [13]:
# ============================================================
# 0.  Imports & reproducibility
# ------------------------------------------------------------

import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import os

# Detect Google Colab by probing for its runtime-only package; the import
# itself is the test, the module is not used here.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# Seed the Python, NumPy and PyTorch generators with the same value so that
# shuffling and weight initialisation repeat from run to run.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# ============================================================
# 1.  Load local diabetes dataset (assumed already preprocessed)
# ------------------------------------------------------------

if IN_COLAB:
    from google.colab import drive
    try:
        drive.mount('/content/drive')
    except Exception as e:
        # Stay best-effort (an existing mount may still be usable), but show
        # the real failure instead of guessing that Drive is already mounted.
        print(
            f"Google Drive mount failed ({type(e).__name__}: {e}); "
            "continuing in case it is already mounted."
        )

    # Path to the dataset inside Google Drive
    data_path = "/content/drive/MyDrive/Colab Notebooks/diabetes.csv"

else:
    # Outside Colab the file is looked up in the current working directory.
    data_path = 'diabetes.csv'

# Read the dataset; the file is parsed as tab-separated despite its name.
df = pd.read_csv(data_path, delimiter="\t")

# If there is no "Y" column, treat the last column as the target.
if "Y" not in df.columns:
    df.rename(columns={df.columns[-1]: "Y"}, inplace=True)

# (a) rename every non-target column to f0, f1, ... in its existing order
feature_cols = [col for col in df.columns if col != "Y"]
df.rename(columns={col: f"f{i}" for i, col in enumerate(feature_cols)}, inplace=True)

# (b) decile class of the target: qcut yields bin indices 0-9, shifted to 1-10
df["Class_decile"] = pd.qcut(df["Y"], q=10, labels=False) + 1

# ============================================================
# 2.  Custom PyTorch Dataset
# ------------------------------------------------------------
# Define a custom Dataset class for the Diabetes dataset
class DiabetesDataset(Dataset):
    """One split (train or test) of the diabetes table as PyTorch tensors.

    The input frame must contain a numeric ``Y`` target column; every column
    other than ``Y``, ``Class_decile`` and ``Class`` is used as a feature.

    Attributes set by the constructor:
        n_classes: number of quantile classes (10 for deciles, 100 for percentiles).
        df: the rows of the selected split, including the label columns.
        features: float32 tensor of the feature columns.
        y_class: int64 tensor of 0-based quantile class labels.
        y_reg: float32 tensor of the raw ``Y`` values.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        split: str = "train",
        test_size: float = 0.2,
        use_percentiles: bool = False,
        random_state: int = 42,
    ):
        """Build the requested split.

        Args:
            dataframe: source table; it is copied, never modified.
            split: ``"train"`` or ``"test"``.
            test_size: forwarded to ``train_test_split``.
            use_percentiles: label with 100 quantile bins of ``Y`` instead of
                the 10 decile bins.
            random_state: forwarded to ``train_test_split``; use the same value
                for the train and the test instance so the two splits come
                from the same partition.

        Raises:
            ValueError: if ``split`` is neither ``"train"`` nor ``"test"``.
        """
        # Raise instead of `assert`: assertions are stripped under `python -O`,
        # which would silently treat any unknown split name as "test".
        if split not in {"train", "test"}:
            raise ValueError("`split` must be 'train' or 'test'")

        df = dataframe.copy()

        # Choose whether to use 100 percentiles or 10 deciles for classification
        if use_percentiles:
            df["Class"] = pd.qcut(df["Y"], q=100, labels=False) + 1  # labels 1-100
            self.n_classes = 100
        else:
            # Reuse precomputed deciles when the caller supplies them; otherwise
            # derive them here so the class does not depend on outside preprocessing.
            if "Class_decile" not in df.columns:
                df["Class_decile"] = pd.qcut(df["Y"], q=10, labels=False) + 1
            df["Class"] = df["Class_decile"]  # labels 1-10
            self.n_classes = 10

        # Split the dataset into training and testing sets
        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state
        )
        self.df = train_df if split == "train" else test_df

        # Features are everything except the label columns. `errors="ignore"`
        # tolerates a frame without "Class_decile" in the percentile case.
        self.features = torch.tensor(
            self.df.drop(
                columns=["Y", "Class_decile", "Class"], errors="ignore"
            ).values,
            dtype=torch.float32,
        )

        # Class labels are shifted to start at 0; regression keeps the raw target.
        self.y_class = torch.tensor(self.df["Class"].values - 1, dtype=torch.long)
        self.y_reg = torch.tensor(self.df["Y"].values, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows in this split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with keys ``x``, ``y_class``, ``y_reg``."""
        return {
            "x": self.features[idx],
            "y_class": self.y_class[idx],
            "y_reg": self.y_reg[idx],
        }

# ============================================================
# 3.  Model builders
# ------------------------------------------------------------

# Build a classifier neural network
def make_classifier(in_dim, n_classes):
    """Return an MLP mapping in_dim -> 256 -> 128 -> n_classes.

    Each hidden layer is followed by ReLU; the output layer has no activation.
    """
    hidden_1 = nn.Linear(in_dim, 256)
    hidden_2 = nn.Linear(256, 128)
    head = nn.Linear(128, n_classes)
    return nn.Sequential(hidden_1, nn.ReLU(), hidden_2, nn.ReLU(), head)

# Build a regressor neural network
def make_regressor(in_dim):
    """Return an MLP mapping in_dim -> 64 -> 32 -> 1.

    Each hidden layer is followed by ReLU; the single output has no activation.
    """
    widths = (in_dim, 64, 32)
    layers = []
    # Hidden stack: one Linear + ReLU pair per consecutive width pair.
    for fan_in, fan_out in zip(widths, widths[1:]):
        layers.append(nn.Linear(fan_in, fan_out))
        layers.append(nn.ReLU())
    layers.append(nn.Linear(widths[-1], 1))
    return nn.Sequential(*layers)

# ============================================================
# 4.  Training / evaluation functions
# ------------------------------------------------------------

# Train the model for one epoch
def train_epoch(model, loader, criterion, optimizer, task="class"):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        model: module called on ``batch["x"]``.
        loader: iterable of dict batches with keys ``x``, ``y_class``, ``y_reg``.
        criterion: loss function; assumed to return the batch *mean*, since
            each batch loss is re-weighted by the batch size below.
        optimizer: optimizer stepping ``model``'s parameters.
        task: ``"class"`` trains against ``batch["y_class"]``; any other value
            trains against ``batch["y_reg"]`` with the model output squeezed
            on dimension 1.

    Returns:
        The per-sample mean loss over the samples actually processed, or
        ``nan`` when the loader yields no samples.
    """
    model.train()
    total_loss = 0.0
    n_seen = 0
    for batch in loader:
        n_batch = len(batch["x"])
        optimizer.zero_grad()
        preds = model(batch["x"])  # Forward pass
        if task == "class":
            loss = criterion(preds, batch["y_class"])  # Classification loss
        else:
            loss = criterion(preds.squeeze(1), batch["y_reg"])  # Regression loss
        loss.backward()    # Backward pass
        optimizer.step()   # Update weights
        total_loss += loss.item() * n_batch
        n_seen += n_batch
    # Divide by the samples actually seen rather than len(loader.dataset): the
    # two differ when the loader skips samples (drop_last=True, samplers).
    if n_seen == 0:
        return float("nan")
    return total_loss / n_seen

# Evaluate the model without gradients
@torch.no_grad()
def evaluate(model, loader, task="class"):
    """Score ``model`` on every batch of ``loader`` with gradients disabled.

    For ``task == "class"`` the prediction is the argmax over dimension 1 and
    the result is the accuracy against ``batch["y_class"]``. For any other
    task the output is squeezed on dimension 1 and the result is the R² score
    against ``batch["y_reg"]``.
    """
    model.eval()
    is_classification = task == "class"
    target_key = "y_class" if is_classification else "y_reg"

    predicted, expected = [], []
    for batch in loader:
        output = model(batch["x"])
        # Reduce the raw output to one prediction per sample.
        output = output.argmax(1) if is_classification else output.squeeze(1)
        predicted.append(output.cpu().numpy())
        expected.append(batch[target_key].cpu().numpy())

    y_true = np.concatenate(expected)
    y_pred = np.concatenate(predicted)
    if is_classification:
        return accuracy_score(y_true, y_pred)
    return r2_score(y_true, y_pred)

# ============================================================
# 5.  Create DataLoaders
# ------------------------------------------------------------
batch_size = 10

# Decile-labelled train and test datasets built from the same frame
train_ds, test_ds = (
    DiabetesDataset(df, split=part, use_percentiles=False)
    for part in ("train", "test")
)

# Only the training loader shuffles; the test loader keeps a fixed order
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

# Pull one training batch and show it as a sanity check
print("\n=== One sample batch ===")
sample = next(iter(train_loader))
print(
    f"x shape: {sample['x'].shape}",
    f"y_class: {sample['y_class']}",
    f"y_reg  : {sample['y_reg']}",
    sep="\n",
)

# ============================================================
# 6.  Train a classifier on decile classes
# ------------------------------------------------------------

# Decile classifier: input width and class count are taken from the dataset
classifier = make_classifier(
    in_dim=train_ds.features.shape[1],
    n_classes=train_ds.n_classes,
)
criterion_cls = nn.CrossEntropyLoss()                     # Loss function for classification
opt = torch.optim.Adam(classifier.parameters(), lr=1e-3)  # Optimizer

print("\n=== Decile Classification Results ===")
for epoch in range(30):
    train_epoch(classifier, train_loader, criterion_cls, opt, task="class")
    # Report test accuracy only after every 10th epoch
    if epoch % 10 != 9:
        continue
    acc = evaluate(classifier, test_loader, task="class")
    print(f"[Decile‑CLS] epoch {epoch+1:02d}  |  test-accuracy = {acc*100:.2f}%")

# ============================================================
# 7.  Train a regressor on Y values
# ------------------------------------------------------------

# Regressor on the raw Y target, trained with mean squared error
regressor = make_regressor(train_ds.features.shape[1])
criterion_reg = nn.MSELoss()
opt_r = torch.optim.Adam(regressor.parameters(), lr=1e-3)

print("\n=== Standard Regression Results ===")
for epoch in range(50):
    train_epoch(regressor, train_loader, criterion_reg, opt_r, task="reg")
    # Report the test R² only after every 10th epoch
    if epoch % 10 != 9:
        continue
    r2 = evaluate(regressor, test_loader, task="reg")
    print(f"[Regression] epoch {epoch+1:02d}  |  R² on test = {r2:.3f}")

# Written answer to the assignment question, printed with the results
print(
    """
Explain the difference in the performance of the networks. Why is one better than the other?
 •  The regression model performs better because it naturally fits the task of predicting a continuous value,
    while classification into deciles makes the problem harder and artificial — which harms the performance.
    """
)

# ============================================================
# 8.  (Optional) Train a classifier on 100 percentiles
# ------------------------------------------------------------

# Same pipeline as the decile classifier, but labelled with 100 percentile bins
train_ds_pct, test_ds_pct = (
    DiabetesDataset(df, split=part, use_percentiles=True)
    for part in ("train", "test")
)
train_loader_pct = DataLoader(train_ds_pct, batch_size=batch_size, shuffle=True)
test_loader_pct = DataLoader(test_ds_pct, batch_size=batch_size, shuffle=False)

# Classifier with one output per percentile class
classifier_pct = make_classifier(
    train_ds_pct.features.shape[1],
    n_classes=train_ds_pct.n_classes,
)
criterion_pct = nn.CrossEntropyLoss()
opt_pct = torch.optim.Adam(classifier_pct.parameters(), lr=1e-3)

print("\n=== Percentile Classification Results ===")
for epoch in range(50):
    train_epoch(classifier_pct, train_loader_pct, criterion_pct, opt_pct, task="class")
    # Report test accuracy only after every 10th epoch
    if epoch % 10 != 9:
        continue
    acc_pct = evaluate(classifier_pct, test_loader_pct, task="class")
    print(f"[Percentile‑CLS] epoch {epoch+1:02d}  |  test-accuracy = {acc_pct*100:.2f}%")

# Written answer to the assignment question, printed with the results
print(
    """
Deciles vs Percentiles? —
    •  Deciles are generally preferred for most tasks
            because they provide a better balance between accuracy and the number of categories.
    •  Percentiles are useful if very specific categories are needed,
            but with lower performance due to class imbalance.
    """
)

# ============================================================
# 9.  Full regression model (ignores Class)
# ------------------------------------------------------------

# Second regressor; its input width is the number of frame columns left after
# removing the target and the decile label. It reuses the decile loaders and
# the MSE criterion defined for the standard regressor.
n_inputs_full = df.columns.drop(["Y", "Class_decile"]).size
regressor_full = make_regressor(in_dim=n_inputs_full)
opt_f = torch.optim.Adam(regressor_full.parameters(), lr=1e-3)

print("\n=== Full Regression Results ===")
for epoch in range(50):
    train_epoch(regressor_full, train_loader, criterion_reg, opt_f, task="reg")
    # Report the test R² only after every 10th epoch
    if epoch % 10 != 9:
        continue
    r2_full = evaluate(regressor_full, test_loader, task="reg")
    print(f"[Full‑Regression] epoch {epoch+1:02d}  |  R² on test = {r2_full:.3f}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

=== One sample batch ===
x shape: torch.Size([10, 10])
y_class: tensor([0, 5, 4, 2, 6, 2, 7, 0, 7, 0])
y_reg  : tensor([ 52., 160., 123.,  90., 170.,  87., 198.,  59., 232.,  60.])

=== Decile Classification Results ===
[Decile‑CLS] epoch 10  |  test-accuracy = 12.36%
[Decile‑CLS] epoch 20  |  test-accuracy = 13.48%
[Decile‑CLS] epoch 30  |  test-accuracy = 15.73%

=== Standard Regression Results ===
[Regression] epoch 10  |  R² on test = 0.216
[Regression] epoch 20  |  R² on test = 0.278
[Regression] epoch 30  |  R² on test = 0.317
[Regression] epoch 40  |  R² on test = 0.325
[Regression] epoch 50  |  R² on test = 0.335

Explain the difference in the performance of the networks. Why is one better than the other?
 •  The regression model performs better because it naturally fits the task of predicting a continuous value,
    while classification into deciles