In [2]:
## Train a 2-layer network with a correlation loss; only the output-layer weights are updated
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import matplotlib.pyplot as plt

# Define the Ramp activation function
class RampActivation(nn.Module):
    """Ramp nonlinearity: clips every entry of its input to the interval [0, 1]."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` with entries below 0 raised to 0 and entries above 1 lowered to 1."""
        lower, upper = 0, 1
        clipped = torch.clamp(x, lower, upper)
        return clipped


# Define a simple 2-layer network
class TwoLayerNet(nn.Module):
    """Two-layer network: linear -> ramp -> linear.

    The first layer starts with i.i.d. +/-1 (Rademacher) weights and a zero
    bias. The second layer starts with all weights and its bias at zero, so a
    freshly built network outputs 0 for every input.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.ramp = RampActivation()  # ramp (clip to [0, 1]) in place of ReLU
        self.fc2 = nn.Linear(hidden_size, output_size)

        # First-layer weights: Bernoulli(1/2) draws in {0, 1} mapped to {-1, +1}.
        coin_flips = torch.bernoulli(torch.full_like(self.fc1.weight.data, 0.5))
        self.fc1.weight.data = 2 * coin_flips - 1

        # Every remaining parameter (fc1 bias, fc2 weights, fc2 bias) starts at zero.
        for param in (self.fc1.bias, self.fc2.weight, self.fc2.bias):
            param.data.fill_(0)

    def forward(self, x):
        """Apply fc1, the ramp activation, then fc2."""
        hidden = self.ramp(self.fc1(x))
        return self.fc2(hidden)



def run_ramp(input_size, hidden_size, num_epochs, batch_size=1000, learning_rate=0.01):
    """Train the output weights of a ramp ``TwoLayerNet`` with a correlation loss.

    Each epoch draws a fresh batch of +/-1 inputs (each coordinate is an
    independent fair coin flip); the target of a sample is the product of all
    its input coordinates. The first layer and the output bias are frozen at
    their initial values, so plain SGD on ``-(output * target).mean()`` only
    updates ``fc2.weight``.

    Args:
        input_size: Input dimension.
        hidden_size: Number of hidden units.
        num_epochs: Maximum number of SGD steps (one fresh batch per step).
        batch_size: Number of samples drawn per step (default 1000).
        learning_rate: SGD step size (default 0.01).

    Returns:
        ``(lossvec, accvec)``: lists of Python floats holding the loss and the
        accuracy of every epoch, both measured on that epoch's fresh batch
        *before* the update. Accuracy is ``(1 + mean(sign(output) * target)) / 2``,
        so an output of exactly 0 counts as half correct. Training stops
        early, after recording, as soon as a batch is classified perfectly.
    """
    output_size = 1

    model = TwoLayerNet(input_size, hidden_size, output_size)

    # Freeze everything except the output weights *before* building the
    # optimizer, and hand the optimizer only what it is supposed to update.
    model.fc1.weight.requires_grad = False
    model.fc1.bias.requires_grad = False
    model.fc2.bias.requires_grad = False
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(trainable, lr=learning_rate)

    def loss_fn(output, target):
        # Negative correlation between prediction and label.
        return -(output * target).mean()

    lossvec, accvec = [], []

    for epoch in range(num_epochs):
        # Fresh batch of uniform +/-1 inputs; label = product of all coordinates.
        inputs = 1 - 2 * torch.bernoulli(torch.ones((batch_size, input_size)) * 1 / 2)
        targets = inputs.prod(dim=1).reshape(-1, 1)
        outputs = model(inputs)

        loss = loss_fn(outputs, targets)
        # The accuracy is a metric only; keep it out of the autograd graph.
        with torch.no_grad():
            acc = (1 + (torch.sign(outputs) * targets).mean()) / 2

        lossvec.append(loss.item())
        accvec.append(acc.item())

        # Perfect batch: stop before spending a backward pass on a step
        # that would never be applied.
        if acc.item() > 1 - 1e-12:
            break

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return lossvec, accvec




In [3]:
# Define a simple 2-layer network
class TwoLayerNet_relu(nn.Module):
    """Two-layer network: linear -> ReLU -> linear.

    The first layer starts with i.i.d. +/-1 (Rademacher) weights and a zero
    bias. The second layer starts with all weights and its bias at zero, so a
    freshly built network outputs 0 for every input.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

        # First-layer weights: Bernoulli(1/2) draws in {0, 1} mapped to {-1, +1}.
        half = torch.full_like(self.fc1.weight.data, 0.5)
        signs = 2 * torch.bernoulli(half) - 1
        self.fc1.weight.data = signs

        # Every remaining parameter (fc1 bias, fc2 weights, fc2 bias) starts at zero.
        for param in (self.fc1.bias, self.fc2.weight, self.fc2.bias):
            param.data.fill_(0)

    def forward(self, x):
        """Apply fc1, ReLU, then fc2."""
        pre_activation = self.fc1(x)
        hidden = self.relu(pre_activation)
        return self.fc2(hidden)



def run_relu(input_size, hidden_size, num_epochs, batch_size=1000, learning_rate=0.01):
    """Train the output weights of a ReLU ``TwoLayerNet_relu`` with a correlation loss.

    Each epoch draws a fresh batch of +/-1 inputs (each coordinate is an
    independent fair coin flip); the target of a sample is the product of all
    its input coordinates. The first layer and the output bias are frozen at
    their initial values, so plain SGD on ``-(output * target).mean()`` only
    updates ``fc2.weight``.

    Args:
        input_size: Input dimension.
        hidden_size: Number of hidden units.
        num_epochs: Maximum number of SGD steps (one fresh batch per step).
        batch_size: Number of samples drawn per step (default 1000).
        learning_rate: SGD step size (default 0.01).

    Returns:
        ``(lossvec, accvec)``: lists of Python floats holding the loss and the
        accuracy of every epoch, both measured on that epoch's fresh batch
        *before* the update. Accuracy is ``(1 + mean(sign(output) * target)) / 2``,
        so an output of exactly 0 counts as half correct. Training stops
        early, after recording, as soon as a batch is classified perfectly.
    """
    output_size = 1

    model = TwoLayerNet_relu(input_size, hidden_size, output_size)

    # Freeze everything except the output weights *before* building the
    # optimizer, and hand the optimizer only what it is supposed to update.
    model.fc1.weight.requires_grad = False
    model.fc1.bias.requires_grad = False
    model.fc2.bias.requires_grad = False
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(trainable, lr=learning_rate)

    def loss_fn(output, target):
        # Negative correlation between prediction and label.
        return -(output * target).mean()

    lossvec, accvec = [], []

    for epoch in range(num_epochs):
        # Fresh batch of uniform +/-1 inputs; label = product of all coordinates.
        inputs = 1 - 2 * torch.bernoulli(torch.ones((batch_size, input_size)) * 1 / 2)
        targets = inputs.prod(dim=1).reshape(-1, 1)
        outputs = model(inputs)

        loss = loss_fn(outputs, targets)
        # The accuracy is a metric only; keep it out of the autograd graph.
        with torch.no_grad():
            acc = (1 + (torch.sign(outputs) * targets).mean()) / 2

        lossvec.append(loss.item())
        accvec.append(acc.item())

        # Perfect batch: stop before spending a backward pass on a step
        # that would never be applied.
        if acc.item() > 1 - 1e-12:
            break

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return lossvec, accvec




In [1]:
# Width sweep for the ramp network: for every width exponent e and input
# dimension d, train Texp independent models and plot the mean +/- std of the
# best per-epoch accuracy each run reached.

# Seed torch's global RNG so that re-running this cell reproduces the figure.
SEED = 0
torch.manual_seed(SEED)

T = 150      # maximum number of training steps per run
Texp = 10    # independent repetitions per (exponent, dimension) pair

# Hidden width is int(d**e + 10) for each exponent e; labels[i] describes elist[i].
elist = [0.5, 1, 1.5, 2]
# Raw strings: '\s' is not a valid escape sequence, so the non-raw form of
# '\sqrt' triggers an invalid-escape warning on current Python versions.
labels = [r'$w=\sqrt{d}$', r'$w=d$', r'$w=d^{1.5}$', r'$w=d^2$']

dlist = [50, 100, 150, 200]

fig, ax = plt.subplots(figsize=(6, 3.5))


for i, e in enumerate(elist):
    acc_mean, acc_std = [], []
    for d in dlist:
        acctemp = []
        h = int(d**e + 10)
        for trial in range(Texp):
            lossvec, accvec = run_ramp(d, h, T)
            print(f'e: {e}, d: {d}, exp: {trial}')
            print(f'Max accuracy: {max(accvec)}')
            # Score of a run = best accuracy over all of its epochs.
            acctemp.append(max(accvec))

        acc_mean.append(np.mean(acctemp))
        acc_std.append(np.std(acctemp))

    acc_mean = np.array(acc_mean)
    acc_std = np.array(acc_std)

    # One curve per width exponent, with a +/- one standard deviation band.
    ax.plot(dlist, acc_mean, label=labels[i], marker='o')
    ax.fill_between(dlist, acc_mean - acc_std, acc_mean + acc_std, alpha=0.2)


ax.legend(fontsize=12, loc='best', frameon=True, shadow=True)


ax.set_ylabel('Test Accuracy', fontsize=14)
ax.set_xlabel('Input Dimension', fontsize=14)
ax.grid(visible=True, linestyle='--', linewidth=0.5)
ax.tick_params(axis='both', labelsize=12)


plt.tight_layout()
plt.show()

        


In [26]:
# Write the current `fig` to a PDF in the working directory;
# bbox_inches='tight' crops the page to the drawn content.
fig.savefig('corr_layerwise_width_ramp.pdf',bbox_inches='tight')