### Lab 3.1: Batching and Regularization

In this lab you will learn how to set up a dataset to be processed in batches, rather than processing the entire dataset in each training iteration, and explore neural network regularization.

In [1]:
import numpy as np
import torch

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset with id 2 through ucimlrepo.
adult = fetch_ucirepo(id=2)

# Features and targets are provided as pandas DataFrames.
X, y = adult.data.features, adult.data.targets

# Print the dataset-level metadata first, then the per-variable information.
print(adult.metadata, adult.variables, sep="\n")

In [None]:
# Display the names of the feature columns so the ones to keep can be chosen.
X.columns

In [4]:
# Encode the income label as an integer class: 0 for '<=50K', 1 for '>50K'.
# Each class is listed both with and without a trailing period.
#
# The column is read from `adult.data.targets` rather than from the current
# `y` so that this cell can be re-run safely: after the first run `y` is
# already the encoded Series, and indexing it with 'income' again would fail.
y = adult.data.targets['income'].map({'<=50K':0,'<=50K.':0,'>50K':1,'>50K.':1})

In [5]:
# Keep only the six columns that are used as model inputs.
input_columns = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X = X[input_columns]

In [6]:
# Unwrap the pandas objects into NumPy arrays; the features are cast to float64.
X, y = X.values.astype(np.float64), y.values

To make the learning algorithm work more smoothly, we will subtract the mean of each feature.

Here `np.mean` calculates a mean, and `axis=0` tells NumPy to calculate the mean over the rows (calculate the mean of each column).

In [7]:
# Centre every feature column on zero by removing its column-wise mean.
X = X - X.mean(axis=0)

Now we will convert our `X` and `y` arrays to torch Tensors.

In [8]:
# Features become a float32 tensor, labels an int64 tensor.
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.int64)

### Exercises

1. Divide the data into train and test splits.
2. Create a neural network for this dataset.
3. Use `TensorDataset` and `DataLoader` to batch the dataset during training.  
4. Use the `weight_decay` parameter of `optim.SGD` to introduce L2 regularization during training. Evaluate the effect of regularization on test set accuracy.

In [9]:
# 1. Divide data into train and test splits
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# load `sklearn.model_selection`, so accessing it as an attribute of the
# package can raise AttributeError in a fresh kernel.
import sklearn.model_selection

# 75% of the rows go to the training set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=0, train_size = .75)

In [13]:
# 2. Create a neural network for this dataset.
# Layer widths: 6 input features -> 100 hidden units -> 2 output scores.
n_inputs, n_hidden, n_outputs = 6, 100, 2

mlp_model = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, n_hidden),
    torch.nn.ReLU(),  # non-linearity between the two linear layers
    torch.nn.Linear(n_hidden, n_outputs),
)

# Cross-entropy loss, minimised with plain stochastic gradient descent (SGD).
lr = 1e-2
loss_fn = torch.nn.CrossEntropyLoss()
opt = torch.optim.SGD(params=mlp_model.parameters(), lr=lr)

In [18]:

# 3. Use TensorDataset and DataLoader to batch the dataset during training.

batch = 32  # number of rows per mini-batch, shared by both loaders

# Pair each feature row with its label so that both are batched together.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
# The training loader reshuffles the rows every epoch.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch, shuffle=True)

# The test loader keeps the rows in their original order.
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch, shuffle=False)

epochs = 100

for epoch in range(epochs):
    # Reset the accumulator at the start of every epoch so that the value
    # printed below is the mean batch loss of this epoch only, not a running
    # total over all epochs so far.
    total_loss = 0

    for batch_X, batch_y in train_dataloader:
        opt.zero_grad()  # Zero out gradients

        z = mlp_model(batch_X)  # Forward pass
        loss = loss_fn(z, batch_y)  # Compute loss

        loss.backward()  # Backpropagation
        opt.step()  # Apply gradients

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dataloader):.4f}")



IndexError: Target 1 is out of bounds.

In [17]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Load and preprocess the dataset
from ucimlrepo import fetch_ucirepo

# Fetch the dataset with id 2; features and targets are pandas DataFrames.
adult = fetch_ucirepo(id=2)
X, y = adult.data.features, adult.data.targets

# Labels: encode the income strings as 0 / 1 and unwrap to a NumPy array.
income_to_class = {'<=50K': 0, '<=50K.': 0, '>50K': 1, '>50K.': 1}
y = y['income'].map(income_to_class).values

# Features: keep six columns and unwrap to a float64 NumPy array.
input_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
X = X[input_columns].values.astype('float64')

# Centre every feature column on zero by removing its column-wise mean.
X = X - X.mean(axis=0)

# Convert to PyTorch tensors: float32 features, int64 labels.
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.int64)

# Hold out 25% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)

# Define the datasets and dataloaders
# Pair each feature row with its label so that both are batched together.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# The training and evaluation loops later in this cell iterate over
# `train_dataloader` and `test_dataloader`. Create them here so the cell does
# not depend on loaders left over in the kernel from a different cell.
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # reshuffled every epoch
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # fixed order for evaluation



# Define the neural network
class NeuralNetwork(nn.Module):
    """Fully connected classifier: input_size -> 64 -> 32 -> 2, with ReLU after the first two layers.

    Args:
        input_size: Number of features in each input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # Three linear layers of decreasing width; the last one emits 2 scores.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)
        # A single ReLU module is reused after fc1 and after fc2.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of the final linear layer (2 values per row of ``x``)."""
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        hidden = self.fc2(hidden)
        hidden = self.relu(hidden)
        return self.fc3(hidden)

# Initialize the model, loss function, and optimizer
# The input width is taken from the number of columns in the training features.
n_features = X_train.shape[1]
model = NeuralNetwork(input_size=n_features)
criterion = nn.CrossEntropyLoss()
# A non-zero weight_decay adds L2 regularization to the SGD update.
optimizer = optim.SGD(params=model.parameters(), weight_decay=1e-4, lr=0.01)

# Training the model
epochs = 100
for epoch in range(epochs):
    model.train()  # make sure the network is in training mode
    total_loss = 0  # sum of the batch losses seen in this epoch

    for batch_X, batch_y in train_dataloader:
        # Clear gradients left from the previous step, then run forward/backward.
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()

        # Update the parameters and record this batch's loss.
        optimizer.step()
        total_loss += loss.item()

    # Report the mean batch loss for the epoch that just finished.
    mean_loss = total_loss/len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")

# Evaluate the model on the test set
model.eval()  # switch the network to evaluation mode
correct, total = 0, 0

# Gradients are not needed for scoring.
with torch.no_grad():
    for batch_X, batch_y in test_dataloader:
        outputs = model(batch_X)
        # The predicted class is the index of the largest output in each row.
        predicted = torch.max(outputs, 1).indices
        correct += (predicted == batch_y).sum().item()
        total += batch_y.size(0)

# Fraction of test rows whose predicted class matches the label.
accuracy = correct / total
print(f"Test Set Accuracy: {accuracy:.4f}")


KeyboardInterrupt: 