<a href="https://colab.research.google.com/github/federicovilla55/optML_mini_project/blob/setup/Simple_MLP_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openml



In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
import time
from getpass import getpass
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

**CONNECT TO GIT**

You will need to create a GitHub Personal Access Token (PAT), store it somewhere safe, and use it to authenticate with git from this notebook. (Creating one takes about 30 seconds.)

In [None]:
# Read the GitHub token interactively with getpass instead of hard-coding it in the notebook.
token = getpass("Enter your GitHub token: ")
# NOTE(review): the token is embedded in the clone URL, so git records it as the
# remote URL inside the cloned repository's .git/config — treat the clone as sensitive.
repo_url = f"https://{token}@github.com/federicovilla55/optML_mini_project.git"

# Clone the repository, change into it, and show the working-tree state.
!git clone {repo_url}
%cd optML_mini_project
!git status

Enter your GitHub token: ··········
Cloning into 'optML_mini_project'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 7 (delta 0), reused 7 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (7/7), done.


Where to work in git

In [None]:
# Create a new branch:

#!git checkout -b branch_name

# Switch to an existing branch:

#!git checkout existing_branch_name

# List all branches:

#!git branch

# Push a new branch to GitHub (if needed):

#!git push -u origin branch_name

Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
remote: 
remote: Create a pull request for 'setup' on GitHub by visiting:[K
remote:      https://github.com/federicovilla55/optML_mini_project/pull/new/setup[K
remote: 
To https://github.com/federicovilla55/optML_mini_project.git
 * [new branch]      setup -> setup
Branch 'setup' set up to track remote branch 'setup' from 'origin'.


Run this at the end to push the notebook to git

In [None]:
# List /content (including hidden entries) to check what exists there before copying.
!ls -la /content

total 20
drwxr-xr-x 1 root root 4096 Mar 31 17:03 .
drwxr-xr-x 1 root root 4096 Mar 31 15:34 ..
drwxr-xr-x 4 root root 4096 Mar 24 13:34 .config
drwxr-xr-x 3 root root 4096 Mar 31 17:03 optML_mini_project
drwxr-xr-x 1 root root 4096 Mar 24 13:34 sample_data


In [None]:
# Copy the notebook into the cloned repository, then stage, commit and push it.
# NOTE(review): in the recorded run the cp failed ("cannot stat
# '/content/Simple_MLP_setup.ipynb'"), so `git add` matched nothing and no commit
# was made. The notebook file must exist at that path before this cell can work.
!cp /content/Simple_MLP_setup.ipynb /content/optML_mini_project/
%cd /content/optML_mini_project
!ls -la
!git add Simple_MLP_setup.ipynb
!git commit -m "test"
!git push

cp: cannot stat '/content/Simple_MLP_setup.ipynb': No such file or directory
/content/optML_mini_project
total 20
drwxr-xr-x 3 root root 4096 Mar 31 17:03 .
drwxr-xr-x 1 root root 4096 Mar 31 17:03 ..
drwxr-xr-x 8 root root 4096 Mar 31 17:17 .git
-rw-r--r-- 1 root root  119 Mar 31 17:03 project.ipynb
-rw-r--r-- 1 root root  230 Mar 31 17:03 README.md
fatal: pathspec 'Simple_MLP_setup.ipynb' did not match any files
On branch setup
Your branch is up to date with 'origin/setup'.

nothing to commit, working tree clean
Everything up-to-date


**DATA SET**

UCI Adult dataset (also known as the "Census Income" dataset). This dataset is widely used for binary classification tasks and contains demographic information (such as age, work class, education, marital status, etc.) with the goal of predicting whether an individual earns more than $50K per year.

Size: 48,842 instances originally; around 45,000 remain after rows with missing values are dropped.

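Features: 14 raw demographic attributes, a mix of numerical and categorical. The categorical attributes are one-hot encoded (which increases the number of columns), and all resulting columns are standardized.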

In [None]:
# Seed the NumPy and PyTorch global RNGs so runs are repeatable.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# 1. Load and preprocess the UCI Adult dataset (OpenML "adult", version 2).
adult = fetch_openml(name='adult', version=2, as_frame=True)
df = adult.frame

# Treat '?' as missing, drop incomplete rows, then separate features and target.
df = df.replace('?', np.nan).dropna()
# Binary label: 1 when the (whitespace-stripped) class is '>50K', otherwise 0.
y = df['class'].apply(lambda x: 1 if x.strip() == '>50K' else 0).values
X = df.drop(columns=['class'])

# One-hot encode categorical variables.
X = pd.get_dummies(X)

# Convert to numpy arrays. X_np holds the encoded but still *unscaled* features;
# scaling happens after the split so that no test-set statistics leak into training.
X_np = X.values.astype(np.float32)
y_np = y.astype(np.int64)

# Split the dataset into training (80%) and testing (20%), stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=seed, stratify=y_np)

# Standardize every column (numerical and one-hot alike). The scaler is fitted on
# the training split only and then applied unchanged to the test split; fitting on
# the full dataset would let test data influence the preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32, copy=False)
X_test = scaler.transform(X_test).astype(np.float32, copy=False)

# Create TensorDatasets (the DataLoaders are built later, once batch_size is chosen).
train_dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_dataset = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# The input dimension is determined by the number of features in the preprocessed data.
input_dim = X_np.shape[1]
num_classes = 2  # binary classification

Function that creates the architecture

In [None]:
# Helper that maps an activation name to a fresh nn activation module.
def get_activation_fn(name):
    """Return a new activation module for `name` (matched case-insensitively).

    Supported names are 'relu', 'sigmoid' and 'tanh'; any other name raises ValueError.
    """
    factories = {
        'relu': nn.ReLU,
        'sigmoid': nn.Sigmoid,
        'tanh': nn.Tanh,
    }
    factory = factories.get(name.lower())
    if factory is None:
        raise ValueError("Unsupported activation function. Choose from 'relu', 'sigmoid', or 'tanh'.")
    return factory()

# Configurable multi-layer perceptron
class MLP(nn.Module):
    """Stack of equally wide Linear layers sharing one activation, followed by a Linear output layer."""

    def __init__(self, input_dim, num_classes, amount_layers=2, neurons_per_layer=256,
                 activation='relu', dropout_rate=0.0):
        """
        input_dim: Number of input features.
        num_classes: Size of the output layer (one logit per class).
        amount_layers: Number of hidden layers; at least one is always created.
        neurons_per_layer: Width of every hidden layer.
        activation: Name of the activation ('relu', 'sigmoid', or 'tanh').
        dropout_rate: Dropout probability applied after each hidden activation;
            dropout is only created when this is greater than 0.
        """
        super().__init__()
        self.hidden_layers = nn.ModuleList()
        self.activation = get_activation_fn(activation)

        # Input width of each hidden layer: the first reads the raw features,
        # every later one reads the previous hidden layer's output.
        fan_ins = [input_dim] + [neurons_per_layer] * (amount_layers - 1)
        self.hidden_layers.extend([nn.Linear(fan_in, neurons_per_layer) for fan_in in fan_ins])

        # Final projection to class logits.
        self.output_layer = nn.Linear(neurons_per_layer, num_classes)

        # Dropout module exists only when a positive rate was requested.
        self.use_dropout = dropout_rate > 0
        if self.use_dropout:
            self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return raw logits for `x`; no activation is applied to the output layer."""
        hidden = x
        for linear in self.hidden_layers:
            hidden = self.activation(linear(hidden))
            if self.use_dropout:
                hidden = self.dropout(hidden)
        return self.output_layer(hidden)

In [None]:
# 3. Define the training function (one pass over the training loader).
def train_model(model, optimizer, train_loader, grad_noise_std=0.0):
    """Train `model` for one pass over `train_loader` and return the mean loss.

    Args:
        model: Network to optimise; it is switched to training mode here.
        optimizer: Optimizer whose `zero_grad()` / `step()` drive the updates.
        train_loader: Iterable yielding `(data, target)` batches.
        grad_noise_std: If greater than 0, Gaussian noise with this standard
            deviation is added to every parameter gradient before the step.

    Returns:
        Cross-entropy loss averaged over the samples actually processed.

    Raises:
        ValueError: If `train_loader` yields no samples.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()   # Loss function
    running_loss = 0.0
    samples_seen = 0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()

        # Optionally inject gradient noise
        if grad_noise_std > 0.0:
            with torch.no_grad():
                for param in model.parameters():
                    if param.grad is not None:
                        noise = torch.randn_like(param.grad) * grad_noise_std
                        param.grad.add_(noise)

        optimizer.step()
        # The criterion returns the batch mean, so weight it by the batch size
        # to accumulate a per-sample sum.
        n_batch = data.size(0)
        running_loss += loss.item() * n_batch
        samples_seen += n_batch

    if samples_seen == 0:
        raise ValueError("train_loader yielded no samples; cannot compute an average loss.")

    # Normalise by the samples seen rather than len(train_loader.dataset): the two
    # differ when the loader drops the last batch or samples only part of the dataset.
    avg_loss = running_loss / samples_seen
    return avg_loss

In [None]:
# Accuracy of a model over a data loader.
def evaluate_model(model, data_loader):
    """Return the fraction of samples in `data_loader` that `model` classifies correctly.

    The model is put into eval mode and gradients are disabled while predicting;
    the predicted class is the argmax over dimension 1 of the model output.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            predictions = model(inputs).argmax(dim=1)
            n_correct += predictions.eq(labels).sum().item()
            n_seen += labels.size(0)
    return n_correct / n_seen

In [None]:
# 5. Train one MLP configuration and plot its training-loss curve.
def _loader_with_workers(loader, num_workers):
    """Return `loader`, rebuilt to use `num_workers` worker processes when that is requested and possible."""
    can_rebuild = (
        num_workers > 0
        and isinstance(loader, DataLoader)
        and loader.num_workers != num_workers
        and loader.batch_sampler is not None
        and not isinstance(loader.dataset, torch.utils.data.IterableDataset)
    )
    if not can_rebuild:
        return loader
    # Reusing the existing batch sampler keeps the batching and ordering
    # configured on the original loader; only the worker count changes.
    return DataLoader(
        loader.dataset,
        batch_sampler=loader.batch_sampler,
        num_workers=num_workers,
        collate_fn=loader.collate_fn,
        pin_memory=loader.pin_memory,
    )


def run_experiment(name, train_loader, neurons_per_layer, amount_layers, activation, learning_rate, epochs, dropout_rate=0.0, weight_decay=0.0, grad_noise_std=0.0, use_asgd=False, num_workers=0):
    """Train a freshly initialised MLP, print per-epoch losses, plot them, and return the model.

    Args:
        name: Label printed when the experiment starts.
        train_loader: Loader yielding `(data, target)` training batches.
        neurons_per_layer: Width of each hidden layer.
        amount_layers: Number of hidden layers.
        activation: Activation name passed to `MLP`.
        learning_rate: Learning rate for the optimizer.
        epochs: Number of passes over `train_loader`.
        dropout_rate: Dropout probability passed to `MLP`.
        weight_decay: Weight-decay coefficient passed to the optimizer.
        grad_noise_std: Standard deviation of the gradient noise used by `train_model`.
        use_asgd: Use `optim.ASGD` instead of `optim.SGD` when True.
        num_workers: When greater than 0, number of DataLoader worker processes
            used to load the training batches. The default of 0 leaves
            `train_loader` exactly as passed in.

    Returns:
        The trained `MLP` instance.
    """
    print(f"\nStarting experiment: {name}")

    # Previously `num_workers` was accepted but silently ignored.
    train_loader = _loader_with_workers(train_loader, num_workers)

    model = MLP(input_dim, num_classes, amount_layers, neurons_per_layer, activation, dropout_rate).to(device)
    train_losses = []

    start_time = time.time()
    # Choose optimizer.
    if use_asgd:
        optimizer = optim.ASGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    else:
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(1, epochs+1):
        train_loss = train_model(model, optimizer, train_loader, grad_noise_std=grad_noise_std)
        train_losses.append(train_loss)
        print(f"Epoch {epoch}: Train Loss={train_loss:.4f}")

    elapsed_time = time.time() - start_time
    print(f"\nTraining completed in {elapsed_time:.2f} seconds.")

    # Plot the training loss after each epoch.
    plt.figure()
    plt.plot(range(1, epochs + 1), train_losses, marker='o')
    plt.xlabel("Epoch")
    plt.ylabel("Training Loss")
    plt.title("Training Loss vs. Epoch")
    plt.grid(True)
    plt.show()

    return model

**EXPERIMENTS**

In [None]:
# Architecture settings for the MLP
activation = 'relu'        # one of 'relu', 'sigmoid', or 'tanh'
neurons_per_layer = 128    # width of each hidden layer
amount_layers = 3          # number of hidden layers


In [None]:
# Optimisation settings used for training
learning_rate = 0.01
epochs = 10
batch_size = 64

# Training batches are shuffled; test batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Experiment 1: Baseline — plain SGD with dropout, weight decay and gradient noise all off.

baseline_trained_model = run_experiment(
    name="Baseline",
    train_loader=train_loader,
    neurons_per_layer=neurons_per_layer,
    amount_layers=amount_layers,
    activation=activation,
    learning_rate=learning_rate,
    epochs=epochs,
    dropout_rate=0.0,
    weight_decay=0.0,
    grad_noise_std=0.0,
    use_asgd=False,
    num_workers=0,
)



In [None]:
# Experiment 2: Dropout regularization — same setup as the baseline, with dropout_rate=0.5.

dropout_trained_model = run_experiment(
    name="Dropout",
    train_loader=train_loader,
    neurons_per_layer=neurons_per_layer,
    amount_layers=amount_layers,
    activation=activation,
    learning_rate=learning_rate,
    epochs=epochs,
    dropout_rate=0.5,
    weight_decay=0.0,
    grad_noise_std=0.0,
    use_asgd=False,
    num_workers=0,
)


In [None]:
# Experiment 3: Weight decay (L2 regularization) — same setup as the baseline, with weight_decay=1e-4.

weight_decay_trained_model = run_experiment(
    name="Weight Decay",
    train_loader=train_loader,
    neurons_per_layer=neurons_per_layer,
    amount_layers=amount_layers,
    activation=activation,
    learning_rate=learning_rate,
    epochs=epochs,
    dropout_rate=0.0,
    weight_decay=1e-4,
    grad_noise_std=0.0,
    use_asgd=False,
    num_workers=0,
)


In [None]:
# Experiment 4: Gradient noise injection — same setup as the baseline, with grad_noise_std=0.01.

grad_noise_trained_model = run_experiment(
    name="Gradient Noise",
    train_loader=train_loader,
    neurons_per_layer=neurons_per_layer,
    amount_layers=amount_layers,
    activation=activation,
    learning_rate=learning_rate,
    epochs=epochs,
    dropout_rate=0.0,
    weight_decay=0.0,
    grad_noise_std=0.01,
    use_asgd=False,
    num_workers=0,
)


In [None]:
# Experiment 5: ASGD optimizer (use_asgd=True), no explicit regularization, requesting num_workers=2.

asgd_trained_model = run_experiment(
    name="ASGD (2 workers)",
    train_loader=train_loader,
    neurons_per_layer=neurons_per_layer,
    amount_layers=amount_layers,
    activation=activation,
    learning_rate=learning_rate,
    epochs=epochs,
    dropout_rate=0.0,
    weight_decay=0.0,
    grad_noise_std=0.0,
    use_asgd=True,
    num_workers=2,
)
