# Imports

In [1]:
# We use PyTorch for models, scikit-learn for preprocessing and metrics.

import pandas as pd  # For data loading and manipulation
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.preprocessing import StandardScaler  # For scaling features
from sklearn.metrics import r2_score, mean_squared_error  # For evaluation
import torch  # For neural networks
import torch.nn as nn  # For model layers
import torch.optim as optim  # For optimizer
from torch.utils.data import Dataset, DataLoader  # For data loading in PyTorch
import matplotlib.pyplot as plt  # For plotting losses
import os  # For directory creation and saving files
import joblib  # For saving scaler

# Device configuration: Use GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Exploratory Data Analysis (Finding answers for Step-1)

In [2]:
# Load the dataset and print key statistics to answer report questions.

# Load the dataset with specified encoding
df = pd.read_csv('/content/cancer_reg-1.csv', encoding='latin-1')

# Print basic info for Step 1
print("Number of samples (Question 1):", df.shape[0])
print("Number of features (Question 4):", df.shape[1] - 1)  # Excluding label
print("Missing information (Question 5):")
print(df.isnull().sum())  # Shows missing per column
print("Label (Question 6): TARGET_deathRate")

# Compute min and max in the dataset (Question 3)
numerical_df = df.select_dtypes(include='number')
min_val = numerical_df.min().min()
max_val = numerical_df.max().max()
print("Min value in dataset:", min_val)
print("Max value in dataset:", max_val)

# Describe for more insights
print(df.describe())

Number of samples (Question 1): 3047
Number of features (Question 4): 33
Missing information (Question 5):
avgAnnCount                   0
avgDeathsPerYear              0
TARGET_deathRate              0
incidenceRate                 0
medIncome                     0
popEst2015                    0
povertyPercent                0
studyPerCap                   0
binnedInc                     0
MedianAge                     0
MedianAgeMale                 0
MedianAgeFemale               0
Geography                     0
AvgHouseholdSize              0
PercentMarried                0
PctNoHS18_24                  0
PctHS18_24                    0
PctSomeCol18_24            2285
PctBachDeg18_24               0
PctHS25_Over                  0
PctBachDeg25_Over             0
PctEmployed16_Over          152
PctUnemployed16_Over          0
PctPrivateCoverage            0
PctPrivateCoverageAlone     609
PctEmpPrivCoverage            0
PctPublicCoverage             0
PctPublicCoverageAlone       

# Data Pre-Processing (Convert **binnedInc** format)

In [3]:
# Pre-process the data: drop unnecessary columns, convert binnedInc, fill missing with train means,
# split into train/val/test, scale features, save CSVs and scaler.

# Define function to parse binnedInc to midpoint
def parse_bin(s):
    s = s.replace('[', '').replace(']', '').replace('(', '').replace(')', '')
    parts = s.split(',')
    if len(parts) == 2:
        low = float(parts[0].strip())
        high = float(parts[1].strip())
        return (low + high) / 2
    else:
        return np.nan

# Drop Geography (unique categorical, not useful)
df = df.drop('Geography', axis=1)

# Convert binnedInc
df['binnedInc'] = df['binnedInc'].apply(parse_bin)

# Split indices for reproducibility
train_idx, temp_idx = train_test_split(df.index, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

# Create split DataFrames
df_train = df.loc[train_idx].copy()
df_val = df.loc[val_idx].copy()
df_test = df.loc[test_idx].copy()

# Save splits as CSVs (full, with labels)
df_train.to_csv('train.csv', index=False, encoding='latin-1')
df_val.to_csv('val.csv', index=False, encoding='latin-1')
df_test.to_csv('test.csv', index=False, encoding='latin-1')

# Fill missing with train means
for col in df_train.columns:
    if df_train[col].dtype in ['float64', 'int64'] and df_train[col].isnull().any():
        train_mean = df_train[col].mean()
        df_train[col] = df_train[col].fillna(train_mean)
        df_val[col] = df_val[col].fillna(train_mean)
        df_test[col] = df_test[col].fillna(train_mean)

# Separate features and labels
X_train = df_train.drop('TARGET_deathRate', axis=1).values
y_train = df_train['TARGET_deathRate'].values
X_val = df_val.drop('TARGET_deathRate', axis=1).values
y_val = df_val['TARGET_deathRate'].values
X_test = df_test.drop('TARGET_deathRate', axis=1).values
y_test = df_test['TARGET_deathRate'].values

# Scale features (fit on train)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Save scaler for test_model
joblib.dump(scaler, 'scaler.pkl')

# Custom Dataset class
class CancerDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create datasets and loaders (larger batch for GPU)
train_dataset = CancerDataset(X_train, y_train)
val_dataset = CancerDataset(X_val, y_val)
test_dataset = CancerDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

input_size = X_train.shape[1]
print("Pre-processing complete. Input size:", input_size)

Pre-processing complete. Input size: 32


# Defining Model Class

In [4]:
# DNN class for linear (no hidden) and deep/wide networks.

class DNN(nn.Module):
    def __init__(self, input_size, hidden_layers=[]):
        super(DNN, self).__init__()
        layers = []
        in_size = input_size
        for h in hidden_layers:
            layers.append(nn.Linear(in_size, h))
            layers.append(nn.ReLU())
            in_size = h
        layers.append(nn.Linear(in_size, 1))  # Regression output
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# Training and Evaluation Functions

In [6]:
# Train with early stopping, gradient clipping. Evaluate and handle NaN.

def train_model(model, train_loader, val_loader, lr=0.01, epochs=500, loss_fn=nn.MSELoss(), patience=50):
    model.to(device)
    criterion = loss_fn
    optimizer = optim.SGD(model.parameters(), lr=lr)
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    patience_counter = 0
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            inputs, labels = batch[0].to(device), batch[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if torch.isnan(loss).any():
                print(f"NaN loss at epoch {epoch+1}. Stopping.")
                return train_losses, val_losses
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs, labels = batch[0].to(device), batch[1].to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        # Print every 50 epochs for screenshots
        if (epoch + 1) % 50 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    return train_losses, val_losses

def evaluate_model(model, test_loader):
    model.to(device)
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch[0].to(device), batch[1].to(device)
            outputs = model(inputs)
            y_pred.extend(outputs.cpu().numpy().flatten())
            y_true.extend(labels.cpu().numpy().flatten())
    if np.isnan(y_pred).any():
        print("NaN in predictions.")
        return float('inf'), float('-inf')
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"Test MSE: {mse:.4f}, Test R2: {r2:.4f}")
    return mse, r2

def test_model(model_path, test_csv_path, hidden_layers, input_size=32):
    model = DNN(input_size, hidden_layers)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    test_df = pd.read_csv(test_csv_path, encoding='latin-1')
    if 'Geography' in test_df.columns:
        test_df = test_df.drop('Geography', axis=1)
    test_df['binnedInc'] = test_df['binnedInc'].apply(parse_bin)
    for col in test_df.columns:
        if test_df[col].dtype in ['float64', 'int64'] and test_df[col].isnull().any():
            test_df[col] = test_df[col].fillna(test_df[col].mean())
    scaler = joblib.load('scaler.pkl')
    X_test_new = scaler.transform(test_df.values)
    test_tensor = torch.tensor(X_test_new, dtype=torch.float32).to(device)

    with torch.no_grad():
        predictions = model(test_tensor).cpu().numpy().flatten()
    print("Predictions:", predictions)
    return predictions

# Train and Evaluate All Models (Answers for steps 2,5,6)

In [7]:
# Define 13 architectures: 5 mandatory DNN + linear, 4 wide, 4 deep.
# Train with different LR, save plots for LR=0.01, find best, save models.

# Create plots folder
os.makedirs('/content/plots', exist_ok=True)

# Architectures
architectures = {
    'Linear': [],
    'DNN-16': [16],
    'DNN-30-8': [30, 8],
    'DNN-30-16-8': [30, 16, 8],
    'DNN-30-16-8-4': [30, 16, 8, 4],
    'DNN-128': [128],  # Wide 1
    'DNN-256-128': [256, 128],  # Wide 2
    'DNN-512-256-128': [512, 256, 128],  # Wide 3
    'DNN-1024-512-256-128': [1024, 512, 256, 128],  # Wide 4
    'DNN-32x4': [32] * 4,  # Deep 1
    'DNN-16x5': [16] * 5,  # Deep 2
    'DNN-8x6': [8] * 6,  # Deep 3
    'DNN-64-32-16-8-4': [64, 32, 16, 8, 4],  # Deep 4
}

# Learning rates
lrs = [0.1, 0.01, 0.001, 0.0001]

# Results storage
step2_r2 = {}  # LR=0.01
step5_r2 = {name: {} for name in architectures}
step6_mse = {}  # LR=0.01
best_model_name = None
best_r2 = -float('inf')
best_hidden_layers = []

# Train loop
for model_name, hidden_layers in architectures.items():
    print(f"\nTraining {model_name}")
    for lr in lrs:
        model = DNN(input_size, hidden_layers)
        train_losses, val_losses = train_model(model, train_loader, val_loader, lr=lr, epochs=500)
        mse, r2 = evaluate_model(model, test_loader)
        step5_r2[model_name][lr] = r2
        if lr == 0.01:
            step2_r2[model_name] = r2
            step6_mse[model_name] = mse

            # Save plot with caption
            plt.figure(figsize=(10, 6))
            plt.plot(train_losses, label='Train Loss')
            plt.plot(val_losses, label='Val Loss')
            plt.xlabel('Epoch')
            plt.ylabel('MSE Loss')
            plt.title(f'Loss Curve for {model_name}')
            plt.legend()
            caption = f"Training and Validation Loss for {model_name} over {len(train_losses)} epochs (LR=0.01).\nFinal Train Loss: {train_losses[-1]:.4f}, Final Val Loss: {val_losses[-1]:.4f}.\nThis plot shows convergence; early stopping applied if no improvement for 50 epochs."
            plt.figtext(0.5, -0.1, caption, wrap=True, ha='center', fontsize=10)
            plt.tight_layout(rect=[0, 0.03, 1, 0.95])
            plt.savefig(f'/content/plots/{model_name}_loss.png', bbox_inches='tight')
            plt.close()

            # Track best
            if r2 > best_r2:
                best_r2 = r2
                best_model_name = model_name
                best_hidden_layers = hidden_layers
                torch.save(model.state_dict(), 'best_dnn.pth')

    # Save linear separately
    if model_name == 'Linear':
        torch.save(model.state_dict(), 'linear.pth')

# Print results
print("\nStep 2 Results (R2 for LR=0.01):")
for name, r2 in step2_r2.items():
    print(f"{name}: {r2:.4f}")

print("\nStep 5 Results (R2 for different LR):")
for name, lr_dict in step5_r2.items():
    print(name)
    for lr, r2 in lr_dict.items():
        print(f"LR {lr}: {r2:.4f}")

print("\nStep 6 MSE (for LR=0.01):")
for name, mse in step6_mse.items():
    print(f"{name}: {mse:.4f}")

# Try MAE for DNN-30-16-8, LR=0.01
print("\nTrying MAE loss for DNN-30-16-8")
model_mae = DNN(input_size, [30, 16, 8])
train_losses_mae, val_losses_mae = train_model(model_mae, train_loader, val_loader, lr=0.01, loss_fn=nn.L1Loss())
mse_mae, r2_mae = evaluate_model(model_mae, test_loader)
print(f"With MAE: Test R2 {r2_mae:.4f}")

print(f"\nBest model: {best_model_name} with R2 {best_r2:.4f}")


Training Linear
Epoch 50/500, Train Loss: 353.6811, Val Loss: 422.3233
Early stopping at epoch 64
Test MSE: 442.3259, Test R2: 0.4430
Epoch 50/500, Train Loss: 12534.4308, Val Loss: 12063.7306
Epoch 100/500, Train Loss: 2064.8459, Val Loss: 1935.7911
Epoch 150/500, Train Loss: 357.3207, Val Loss: 415.2481
Early stopping at epoch 189
Test MSE: 440.8216, Test R2: 0.4449
Epoch 50/500, Train Loss: 30192.8149, Val Loss: 29606.4165
Epoch 100/500, Train Loss: 27788.7417, Val Loss: 27226.2938
Epoch 150/500, Train Loss: 25497.7778, Val Loss: 24961.3845
Epoch 200/500, Train Loss: 23317.4128, Val Loss: 22809.7394
Epoch 250/500, Train Loss: 21242.9474, Val Loss: 20763.7374
Epoch 300/500, Train Loss: 19268.2506, Val Loss: 18820.0108
Epoch 350/500, Train Loss: 17395.3183, Val Loss: 16976.9704
Epoch 400/500, Train Loss: 15616.0604, Val Loss: 15227.3852
Epoch 450/500, Train Loss: 13939.3527, Val Loss: 13579.5818
Epoch 500/500, Train Loss: 12362.4627, Val Loss: 12031.6218
Test MSE: 12721.0387, Test R2