<a href="https://colab.research.google.com/github/gulshan0201/DATA-Science/blob/main/ML_LAB_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 1: Install and Import Libraries

!pip install torch torchvision

import torch
from torchvision import datasets, transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split

# Step 2: Load Dataset

# ToTensor is the only preprocessing step applied to the MNIST images.
transform = transforms.Compose([transforms.ToTensor()])
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)

# Step 3: Data Parallelism Example

# Two disjoint random halves stand in for the shards two workers would hold.
train1, train2 = random_split(train_dataset, [30000, 30000])
loader1, loader2 = (
    DataLoader(shard, batch_size=64, shuffle=True) for shard in (train1, train2)
)

# Model
class Net(nn.Module):
    """Two-layer fully connected network: 784 inputs -> 128 hidden -> 10 outputs."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Flatten ``x`` into rows of 784 values and return the raw 10-way scores."""
        flat = x.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Helper function to compute gradients on one subset
def compute_gradients(data_loader):
    """Return the gradients of the global ``model`` for one batch of ``data_loader``.

    Only the first batch the loader yields is used. The gradients are cloned so
    that a later ``model.zero_grad()`` does not affect the returned list.

    Args:
        data_loader: iterable yielding ``(data, target)`` batches.

    Returns:
        A list with one cloned gradient tensor per entry of
        ``model.parameters()``, in the same order.

    Raises:
        ValueError: if ``data_loader`` yields no batches. Without this check the
            backward pass would be skipped and cloning ``p.grad`` would fail on
            ``None`` with a far less helpful error.
    """
    model.zero_grad()
    batch = next(iter(data_loader), None)
    if batch is None:
        raise ValueError("data_loader yielded no batches; cannot compute gradients")

    data, target = batch
    data, target = data.to(device), target.to(device)
    loss = criterion(model(data), target)
    loss.backward()
    return [p.grad.clone() for p in model.parameters()]

# Training loop with gradient averaging
for epoch in range(2):
    # Each "worker" contributes the gradients of one batch from its own shard.
    grads1 = compute_gradients(loader1)
    grads2 = compute_gradients(loader2)

    # Install the element-wise mean of both gradient sets on the shared model.
    with torch.no_grad():
        for param, grad_a, grad_b in zip(model.parameters(), grads1, grads2):
            param.grad = (grad_a + grad_b) / 2.0

    optimizer.step()
    print(f"Epoch {epoch+1} completed with averaged gradients.")

# Step 4: Model Parallelism Example

# Define devices
# First device: GPU 0 when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device0 = torch.device("cuda:0")
else:
    device0 = torch.device("cpu")
# Second device: GPU 1 when it exists, otherwise the same device as device0.
device1 = torch.device("cuda:1") if torch.cuda.device_count() > 1 else device0

# Split model across two devices
class SplitNet(nn.Module):
    """Two linear layers placed on ``device0`` and ``device1`` respectively."""

    def __init__(self):
        super().__init__()
        self.part1 = nn.Linear(28 * 28, 256).to(device0)
        self.part2 = nn.Linear(256, 10).to(device1)

    def forward(self, x):
        """Run the first layer on ``device0``, then hand the activations to ``device1``."""
        flat = x.view(-1, 28 * 28).to(device0)
        hidden = F.relu(self.part1(flat)).to(device1)
        return self.part2(hidden)

# Initialize model, loss, and optimizer
model = SplitNet()
criterion = nn.CrossEntropyLoss().to(device1)
# One parameter group per model part, both sharing the same learning rate.
optimizer = optim.Adam(
    [{'params': part.parameters()} for part in (model.part1, model.part2)],
    lr=0.001,
)

# One-batch demonstration
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
data, target = next(iter(train_loader))
data = data.to(device0)      # inputs go to the device holding the first layer
target = target.to(device1)  # labels go to the device where the loss is computed

optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()

print("Model parallelism step completed successfully.")




100%|██████████| 9.91M/9.91M [00:00<00:00, 126MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 13.8MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 106MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 10.5MB/s]


Epoch 1 completed with averaged gradients.
Epoch 2 completed with averaged gradients.
Model parallelism step completed successfully.


# Task 1:

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split, TensorDataset
import numpy as np

# --- Task 1: Load and Describe the New Dataset ---

print("--- Loading and Describing ENB2012_data.csv ---")
file_name = 'ENB2012_data.csv'
df = pd.read_csv(file_name)

# Describe the dataset: every column except the two targets is a feature.
print(f"Number of training samples: {len(df)}")
features = df.columns.drop(['Y1', 'Y2'])
targets = ['Y1', 'Y2']
print(f"Features ({len(features)}): {list(features)}")
print(f"Target variables ({len(targets)}): {list(targets)}")
print("This is a regression task, not a classification task.")
print("-" * 30 + "\n")

# Prepare data for PyTorch (float32 tensors wrapped in a TensorDataset)
X = torch.tensor(df[features].values.astype(np.float32))
y = torch.tensor(df[targets].values.astype(np.float32))
full_dataset = TensorDataset(X, y)  # Uses every row of the CSV


# --- Task 2: Adapt and Run Step 3 (Data Parallelism) ---

print("--- Running Adapted Data Parallelism Example (Step 3) ---")
# Split the dataset into two halves. The sizes are derived from the data
# instead of being hard-coded, so the split also works when the CSV does not
# hold exactly 768 rows (768 rows still gives 384 + 384).
first_half = len(full_dataset) // 2
train1, train2 = random_split(
    full_dataset, [first_half, len(full_dataset) - first_half]
)
loader1 = DataLoader(train1, batch_size=64, shuffle=True)
loader2 = DataLoader(train2, batch_size=64, shuffle=True)


# Model - ADAPTED for 8 inputs and 2 outputs
class Net(nn.Module):
    """Regression network: 8 input features -> 128 hidden units -> 2 outputs."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 128)   # consumes the 8 feature columns (X1-X8)
        self.fc2 = nn.Linear(128, 2)   # produces the 2 targets (Y1, Y2)

    def forward(self, x):
        """Return the two outputs for ``x``; the input is passed through unreshaped."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
model_dp = Net().to(device)
optimizer_dp = optim.Adam(model_dp.parameters(), lr=0.001)
# Mean squared error, because the targets are continuous values.
criterion_dp = nn.MSELoss()

# Helper function to compute gradients on one subset
def compute_gradients(data_loader, model, criterion):
    """Return cloned gradients of ``model`` for the first batch of ``data_loader``.

    The batch is moved to the device that holds the model's parameters, so the
    function no longer depends on a module-level ``device`` variable matching
    the model that was passed in.

    Args:
        data_loader: iterable yielding ``(data, target)`` batches.
        model: the ``nn.Module`` whose gradients are computed.
        criterion: callable mapping ``(output, target)`` to a scalar loss.

    Returns:
        A list of cloned gradient tensors, one per parameter that received a
        gradient, in ``model.parameters()`` order; ``None`` if the loader
        yields no batches.
    """
    model.zero_grad()
    batch = next(iter(data_loader), None)
    if batch is None:
        print("Data loader is empty.")
        return None

    data, target = batch
    # A model without parameters has no device to move to; leave the batch as is.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        data = data.to(first_param.device)
        target = target.to(first_param.device)

    loss = criterion(model(data), target)
    loss.backward()
    return [p.grad.clone() for p in model.parameters() if p.grad is not None]

# Training loop with gradient averaging
for epoch in range(2):
    grads1 = compute_gradients(loader1, model_dp, criterion_dp)
    grads2 = compute_gradients(loader2, model_dp, criterion_dp)

    # A loader that produced no batch makes this step impossible.
    if any(g is None for g in (grads1, grads2)):
        print(f"Epoch {epoch+1} skipped due to empty data loader.")
        continue

    # Parameters that currently hold a gradient; both lists must line up with these.
    valid_params = [p for p in model_dp.parameters() if p.grad is not None]
    expected = len(valid_params)

    if not (grads1 and grads2 and len(grads1) == expected and len(grads2) == expected):
        print(f"Epoch {epoch+1} skipped due to gradient mismatch.")
        continue

    # Replace each gradient with the mean of the two per-subset gradients.
    with torch.no_grad():
        for param, grad_a, grad_b in zip(valid_params, grads1, grads2):
            param.grad = (grad_a + grad_b) / 2.0

    optimizer_dp.step()
    print(f"Epoch {epoch+1} completed with averaged gradients.")

print("-" * 30 + "\n")


# --- Task 3: Adapt and Run Step 4 (Model Parallelism) ---

print("--- Running Adapted Model Parallelism Example (Step 4) ---")

# First device: GPU 0 when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device0 = torch.device("cuda:0")
else:
    device0 = torch.device("cpu")
# Second device: GPU 1 when it exists, otherwise the same device as device0.
device1 = torch.device("cuda:1") if torch.cuda.device_count() > 1 else device0
print(f"Using device0: {device0}")
print(f"Using device1: {device1} (Will be same as device0 if not multi-GPU)")

# Split model across two devices - ADAPTED
class SplitNet(nn.Module):
    """Regression network split in two: 8 -> 256 on ``device0``, 256 -> 2 on ``device1``."""

    def __init__(self):
        super().__init__()
        self.part1 = nn.Linear(8, 256).to(device0)   # takes the 8 input features
        self.part2 = nn.Linear(256, 2).to(device1)   # emits the 2 outputs

    def forward(self, x):
        """Run the first layer on ``device0``, then hand the activations to ``device1``."""
        hidden = F.relu(self.part1(x.to(device0)))
        return self.part2(hidden.to(device1))

model_mp = SplitNet()
# The loss is evaluated where the model output lives, i.e. on device1.
criterion_mp = nn.MSELoss().to(device1)

# One parameter group per model part so the optimizer updates both halves.
optimizer_mp = optim.Adam(
    [{'params': part.parameters()} for part in (model_mp.part1, model_mp.part2)],
    lr=0.001,
)

# One-batch demonstration
train_loader_mp = DataLoader(full_dataset, batch_size=64, shuffle=True)
data, target = next(iter(train_loader_mp))

# Inputs start on the first device; targets go where the loss is computed.
data, target = data.to(device0), target.to(device1)

optimizer_mp.zero_grad()
output = model_mp(data)
loss = criterion_mp(output, target)
loss.backward()
optimizer_mp.step()

print("Model parallelism step completed successfully.")
print("-" * 30 + "\n")

--- Loading and Describing ENB2012_data.csv ---
Number of training samples: 768
Features (8): ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
Target variables (2): ['Y1', 'Y2']
This is a regression task, not a classification task.
------------------------------

--- Running Adapted Data Parallelism Example (Step 3) ---
Using device: cpu
Epoch 1 completed with averaged gradients.
Epoch 2 completed with averaged gradients.
------------------------------

--- Running Adapted Model Parallelism Example (Step 4) ---
Using device0: cpu
Using device1: cpu (Will be same as device0 if not multi-GPU)
Model parallelism step completed successfully.
------------------------------



# Task 2

In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split, TensorDataset
import numpy as np
import time

# --- Setup: Load Data and Define Device ---

print("--- Loading and Describing ENB2012_data.csv ---")
# Load data
file_name = 'ENB2012_data.csv'
df = pd.read_csv(file_name)

# Prepare data for PyTorch: feature columns are everything except the targets.
targets = ['Y1', 'Y2']
features = df.columns.drop(['Y1', 'Y2'])
X = torch.tensor(df[features].values.astype(np.float32))
y = torch.tensor(df[targets].values.astype(np.float32))
full_dataset = TensorDataset(X, y)
full_loader = DataLoader(full_dataset, batch_size=64)

# Define device
# Note: On a single-GPU or CPU machine, all devices will be the same.
# This is a *simulation* of parallelism.
device_count = 0
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()

device_list = (
    [torch.device(f"cuda:{i}") for i in range(device_count)]
    if device_count > 0
    else [torch.device("cpu")] * 8  # Max devices we'll simulate
)

device = device_list[0]
print(f"Running simulation on main device: {device}")
print(f"Total available devices for simulation: {max(1, device_count)}")

# --- Helper Functions ---

# Helper for Data Parallelism (DP)
def compute_gradients_dp(data_loader, model, criterion, device):
    """Compute ``model``'s gradients on the first batch of ``data_loader``.

    Args:
        data_loader: iterable yielding ``(data, target)`` batches.
        model: module whose gradients are reset and then recomputed.
        criterion: callable mapping ``(output, target)`` to a scalar loss.
        device: device the batch is moved to before the forward pass.

    Returns:
        Cloned gradients of every parameter that received one, in
        ``model.parameters()`` order, or ``None`` when the loader is empty.
    """
    model.zero_grad()
    first_batch = next(iter(data_loader), None)
    if first_batch is None:
        return None  # Loader is empty

    inputs, labels = first_batch
    inputs = inputs.to(device)
    labels = labels.to(device)
    batch_loss = criterion(model(inputs), labels)
    batch_loss.backward()

    collected = []
    for param in model.parameters():
        if param.grad is not None:
            collected.append(param.grad.clone())
    return collected

# Helper to get final loss
def get_final_loss(model, data_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is evaluated in eval mode with gradients disabled, and its
    previous train/eval mode is restored afterwards so that calling this
    between training phases does not silently leave it in eval mode.

    Args:
        model: module to evaluate. If it has a ``part1`` attribute it is
            treated as a split (model-parallel) model and inputs are sent to
            the device of ``part1``'s weights; otherwise they go to ``device``.
        data_loader: iterable yielding ``(data, target)`` batches.
        criterion: callable mapping ``(output, target)`` to a scalar loss.
        device: device used for inputs of non-split models.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    n_batches = 0
    try:
        with torch.no_grad():
            for data, target in data_loader:
                if hasattr(model, 'part1'):
                    data = data.to(model.part1.weight.device)  # first device
                else:
                    data = data.to(device)
                output = model(data)
                # Compare on whichever device produced the output. This covers
                # split models without requiring a ``part_final`` attribute.
                target = target.to(output.device)
                total_loss += criterion(output, target).item()
                n_batches += 1
    finally:
        model.train(was_training)

    if n_batches == 0:
        raise ValueError("data_loader yielded no batches; cannot compute a loss")
    return total_loss / n_batches

# --- Task 2.1: Data Parallelism Benchmark ---

print("\n--- Starting Data Parallelism Benchmark ---")
# Quality is reported as MSE (Mean Squared Error) rather than "accuracy".
N_EPOCHS = 5  # Number of simulated steps
splits_to_test = [2, 4, 8]
dp_results = []

# Define the model architecture for DP
class DPNet(nn.Module):
    """Regression network used for the data-parallel runs: 8 -> 128 -> 2."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Apply the hidden layer with ReLU, then the linear output layer."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

for n_splits in splits_to_test:
    print(f"Testing DP with {n_splits} splits...")
    model_dp = DPNet().to(device)
    optimizer_dp = optim.Adam(model_dp.parameters(), lr=0.001)
    criterion_dp = nn.MSELoss()

    # Equal-sized splits; the last one absorbs any remainder.
    split_size = len(full_dataset) // n_splits
    split_sizes = [split_size] * (n_splits - 1)
    split_sizes.append(len(full_dataset) - sum(split_sizes))

    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()

    for epoch in range(N_EPOCHS):
        # Named `subsets`, not `datasets`, so the torchvision `datasets`
        # module imported by an earlier cell is not overwritten.
        subsets = random_split(full_dataset, split_sizes)
        loaders = [DataLoader(ds, batch_size=64, shuffle=True) for ds in subsets]

        # One gradient set per split, each computed from a single batch.
        all_grads = []
        for loader in loaders:
            grads = compute_gradients_dp(loader, model_dp, criterion_dp, device)
            if grads:
                all_grads.append(grads)

        if not all_grads:
            continue

        # Average gradients across the splits, parameter by parameter.
        with torch.no_grad():
            valid_params = [p for p in model_dp.parameters() if p.grad is not None]
            for p_idx, p in enumerate(valid_params):
                sum_grad = torch.stack([g[p_idx] for g in all_grads]).sum(dim=0)
                p.grad = sum_grad / len(all_grads)

        optimizer_dp.step()

    # CUDA kernels run asynchronously; wait for them so the elapsed time
    # covers the work that was actually launched.
    if torch.device(device).type == "cuda":
        torch.cuda.synchronize(device)
    training_time = time.perf_counter() - start_time
    final_loss = get_final_loss(model_dp, full_loader, criterion_dp, device)
    dp_results.append((n_splits, final_loss, training_time))

print("--- DP Benchmark Complete ---")

# --- Task 2.2: Model Parallelism Benchmark ---

print("\n--- Starting Model Parallelism Benchmark ---")
N_EPOCHS = 5  # Train for 5 epochs
parts_to_test = [2, 4]
mp_results = []

# Helper to get a device from our list, looping if needed
def get_device(idx):
    """Return the ``device_list`` entry for ``idx``, wrapping around past the end."""
    wrapped = idx % len(device_list)
    return device_list[wrapped]

# Define the MP model architectures
class MPNet_2Parts(nn.Module):
    """Two-stage network (8 -> 256 -> 2) with each stage on its own device slot."""

    def __init__(self):
        super().__init__()
        self.part1 = nn.Linear(8, 256).to(get_device(0))
        self.part2 = nn.Linear(256, 2).to(get_device(1))
        # Alias for the last stage so callers can look up the output device.
        self.part_final = self.part2

    def forward(self, x):
        """Move ``x`` to each stage's device in turn; ReLU follows the first stage only."""
        hidden = F.relu(self.part1(x.to(self.part1.weight.device)))
        return self.part2(hidden.to(self.part2.weight.device))

class MPNet_4Parts(nn.Module):
    """Four-stage network (8 -> 128 -> 256 -> 128 -> 2), one device slot per stage."""

    def __init__(self):
        super().__init__()
        self.part1 = nn.Linear(8, 128).to(get_device(0))
        self.part2 = nn.Linear(128, 256).to(get_device(1))
        self.part3 = nn.Linear(256, 128).to(get_device(2))
        self.part4 = nn.Linear(128, 2).to(get_device(3))
        # Alias for the last stage so callers can look up the output device.
        self.part_final = self.part4

    def forward(self, x):
        """Pass ``x`` through the stages, moving it to each stage's device first."""
        # The three hidden stages are each followed by a ReLU.
        for stage in (self.part1, self.part2, self.part3):
            x = F.relu(stage(x.to(stage.weight.device)))
        # The output stage is linear (no activation).
        return self.part4(x.to(self.part4.weight.device))

for n_parts in parts_to_test:
    print(f"Testing MP with {n_parts} parts...")
    # Two parts selects the 2-stage model; any other value the 4-stage model.
    if n_parts == 2:
        model_mp = MPNet_2Parts()
    else:
        model_mp = MPNet_4Parts()

    optimizer_mp = optim.Adam(model_mp.parameters(), lr=0.001)
    final_device = model_mp.part_final.weight.device
    criterion_mp = nn.MSELoss().to(final_device)

    model_mp.train()
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()

    for epoch in range(N_EPOCHS):
        for data, target in full_loader:
            data = data.to(get_device(0))  # Data to first device
            target = target.to(final_device)  # Target to last device

            optimizer_mp.zero_grad()
            output = model_mp(data)
            loss = criterion_mp(output, target)
            loss.backward()
            optimizer_mp.step()

    # CUDA work is asynchronous: wait on every GPU the model uses before
    # reading the clock, otherwise pending kernels are left out of the timing.
    for used_device in {p.device for p in model_mp.parameters()}:
        if used_device.type == "cuda":
            torch.cuda.synchronize(used_device)
    training_time = time.perf_counter() - start_time
    final_loss = get_final_loss(model_mp, full_loader, criterion_mp, device)
    mp_results.append((n_parts, final_loss, training_time))

print("--- MP Benchmark Complete ---")

# --- Task 2.3: Report Results ---

print("\n\n--- Benchmark Results ---")

# Data Parallelism Table: one row per tested number of splits.
dp_df = pd.DataFrame(
    dp_results,
    columns=['Split Datasets', 'Final MSE', 'Training Time (s)'],
)
print("\nData Parallelism Results (Simulation):", dp_df.to_string(index=False), sep="\n")

# Model Parallelism Table: one row per tested number of model parts.
mp_df = pd.DataFrame(
    mp_results,
    columns=['Model Parts', 'Final MSE', 'Training Time (s)'],
)
print("\nModel Parallelism Results (Simulation):", mp_df.to_string(index=False), sep="\n")

--- Loading and Describing ENB2012_data.csv ---
Running simulation on main device: cpu
Total available devices for simulation: 1

--- Starting Data Parallelism Benchmark ---
Testing DP with 2 splits...
Testing DP with 4 splits...
Testing DP with 8 splits...
--- DP Benchmark Complete ---

--- Starting Model Parallelism Benchmark ---
Testing MP with 2 parts...
Testing MP with 4 parts...
--- MP Benchmark Complete ---


--- Benchmark Results ---

Data Parallelism Results (Simulation):
 Split Datasets   Final MSE  Training Time (s)
              2  156.507947           0.020960
              4  119.146825           0.035024
              8 1712.233561           0.061678

Model Parallelism Results (Simulation):
 Model Parts  Final MSE  Training Time (s)
           2 103.379667           0.115690
           4  40.459468           0.235382
