In [1]:
import math
import torch
from torch import Tensor
import torch.nn as nn

## Dataset

In [2]:
from src.gaussian_dataset import GaussianDataset
from torch.utils.data import DataLoader

N = 10
D = 5
data_size = 1000

train_size = int(0.8 * data_size)
test_size = data_size - train_size

ds_train = GaussianDataset(
    num_samples=train_size,
    shape=(N, D),
    var1=1.0,
    var2=0.8,
    static=False,
)

ds_test = GaussianDataset(
    num_samples=test_size,
    shape=(N, D),
    var1=1.0,
    var2=0.8,
    static=True,
)

dl_train = DataLoader(
    dataset=ds_train,
    batch_size=32,
    shuffle=False,
)

dl_test = DataLoader(
    dataset=ds_test,
    batch_size=32,
    shuffle=False,
)

## Models

In [3]:
from src.training import BinaryTrainer
from src.permutation import Permutation, create_all_permutations, create_permutations_from_generators
from src.layers import LinearEquivariant, LinearInvariant

device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


### Canonization Based

In [4]:
class CanonicalModel(nn.Module):
    def __init__(self, model: nn.Module) -> None:
        super().__init__()
        self.model = model.to(device)

    def forward(self, x: Tensor) -> Tensor:
        x = torch.sort(x, dim=-1, descending=True).values
        return self.model(x)

In [None]:
layers = nn.Sequential(
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=N * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=1),
    nn.Sigmoid(),
)

model = CanonicalModel(layers)

trainer = BinaryTrainer(
    model=model,
    loss_fn=nn.BCELoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
    device=device,
    log=False,
)

trainer.fit(
    dl_train=dl_train,
    dl_test=dl_test,
    num_epochs=300,
    print_every=10,
)

In [5]:
from src.layers import PositionalEncoding

layers = nn.Sequential(
    PositionalEncoding(d_model=D, max_len=N),
    nn.TransformerEncoder(
        nn.TransformerEncoderLayer(batch_first=True, d_model=D, nhead=1),
        num_layers=2,
    ),
    nn.Sigmoid(),
)

model = CanonicalModel(layers)

trainer = BinaryTrainer(
    model=model,
    loss_fn=nn.BCELoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
    device=device,
    log=False,
)

trainer.fit(
    dl_train=dl_train,
    dl_test=dl_test,
    num_epochs=300,
    print_every=10,
)



--- EPOCH 1/300 ---
train_batch:   0%|          | 0/25 [00:00<?, ?it/s]


ValueError: Using a target size (torch.Size([32])) that is different to the input size (torch.Size([1600])) is deprecated. Please ensure they have the same size.

### Symmetrization Network

In [None]:
from typing import Iterable, Callable, Iterator
from collections import deque


class SymmetryModel(nn.Module):
    def __init__(
        self,
        model: nn.Module,
        perm_creator: Callable[[None], Iterator[Permutation]],
        chunksize: int = 1,
    ) -> None:
        super().__init__()
        self.model = model
        self.perm_creator = perm_creator
        self.chunksize = chunksize

    def _chunk(self, data: Iterable[Permutation], chunksize: int) -> Iterable[list[Permutation]]:
        data_iter: Iterable[Permutation] = iter(data)
        buffer: deque[Permutation] = deque()

        while True:
            try:
                buffer.append(next(data_iter))
            except StopIteration:
                break

            if len(buffer) == chunksize:
                yield list(buffer)
                buffer.clear()

        if buffer:
            yield list(buffer)

    def forward(self, x: Tensor) -> Tensor:
        total = 0
        result = None

        perms = self.perm_creator()
        for perm_chunk in self._chunk(perms, self.chunksize):
            chunksize = len(perm_chunk)
            total += chunksize
            permuted = torch.vstack([perm(x) for perm in perm_chunk])

            output: Tensor = self.model.forward(permuted)
            output = output.reshape(chunksize, output.shape[0] // chunksize, *output.shape[1:])
            output = torch.sum(output, dim=0)

            if result is None:
                result = output
            else:
                result = result + output

        result = result / total

        return result

In [None]:
layers = nn.Sequential(
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=N * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=1),
    nn.Sigmoid(),
)

shift_perm = torch.arange(N) + 1
shift_perm[-1] = 0

model = SymmetryModel(
    layers,
    perm_creator=lambda: create_permutations_from_generators([Permutation(shift_perm)]),
    chunksize=10,
)

trainer = BinaryTrainer(
    model=model,
    loss_fn=nn.BCELoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=0.01),
    device=device,
    log=False,
)

trainer.fit(
    dl_train=dl_train,
    dl_test=dl_test,
    num_epochs=300,
    print_every=10,
)

### Sampled Symmetrization Network

In [None]:
layers = nn.Sequential(
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=N * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=1),
    nn.Sigmoid(),
)

num = int(math.factorial(N) * 0.05)
num = 10

model = SymmetryModel(
    layers,
    perm_creator=lambda: (Permutation(torch.randperm(N, dtype=torch.long)) for _ in range(num)),
    chunksize=10,
)

trainer = BinaryTrainer(
    model=model,
    loss_fn=nn.BCELoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
    device=device,
    log=False,
)

trainer.fit(
    dl_train=dl_train,
    dl_test=dl_test,
    num_epochs=300,
    print_every=10,
)

### Linear Equivariant

In [None]:
model = nn.Sequential(
    LinearEquivariant(in_channels=D, out_channels=10),
    nn.ReLU(),
    LinearEquivariant(in_channels=10, out_channels=10),
    nn.ReLU(),
    LinearInvariant(in_channels=10, out_channels=1),
    nn.Sigmoid(),
)

trainer = BinaryTrainer(
    model=model,
    loss_fn=nn.BCELoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
    device=device,
    log=False,
)

trainer.fit(
    dl_train=dl_train,
    dl_test=dl_test,
    num_epochs=300,
    print_every=10,
)

### Standard with Augmentation

In [None]:
class Augmentation(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        """
        Randomly permute the input tensor along the channel dimension.

        Args:
            x (Tensor): Input tensor of shape (batch_size, d, channel)
        """
        rnd = torch.randn_like(x)
        indices = rnd.argsort(dim=-1)
        result = torch.gather(x, -1, indices)
        return result


model = nn.Sequential(
    Augmentation(),
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=N * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=10 * D),
    nn.ReLU(),
    nn.Linear(in_features=10 * D, out_features=1),
    nn.Sigmoid(),
)

trainer = BinaryTrainer(
    model=model,
    loss_fn=nn.BCELoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
    device=device,
    log=False,
)

trainer.fit(
    dl_train=dl_train,
    dl_test=dl_test,
    num_epochs=300,
    print_every=10,
)

---
---

### Question 4: Challenges encountered during Implementation:

##### Numeric Errors:

The first challenge encountered is in the implementation of the invariant and equivariant layers.
The main implementation challenge rose from the fact that in the lecture, the equivariant layer is formulated as follows:

$$ F(x) : \mathbb{R}^{n \times d} \rightarrow \mathbb{R}^{n \times d'} $$

$$ F(x)_j = \sum _{i=1} ^ {d} L_{ij}(x) $$ 
where $L_{ij}(x)$ is a single feature linear equivariant layer.

Technically, this implementation is indeed correct, but the summation over all $L_{ij}(x)$ might causes layer outputs to blow-up.  
As result, the outputs of the $F \circ a \circ F ...$ become very large.

Our network is composed of these layers $\phi \circ F \circ a \circ F ...$, when $\phi$ is the sigmoid function that returns values between 0 and 1.

Since the last layer of the network is a sigmoid function, and the results of the previous layers are very large (their absolute value), the sigmoid function saturates and returns either 0.0 or 1.0. Because the sigmoid function got saturated, the propagated gradients become 0, hence the network does not learn.

To resolve this issue we defined the equivariant layer as follows:

$$ F(x)_j = \frac{1}{d} \sum _{i=1} ^ {d} L_{ij}(x) $$ 

This formulation still retains the equivariance property, but it prevents the layer outputs from blowing-up.

*Note: We applied the same averaging technique to the invariant layers as well.*

##### Overfitting:

Another big issue we encountered was overfitting. To overcome it, we added an option to dynamically generate the data every time the `Dataset` is accessed. 
This way, the model never sees the same data twice, and not able to overfit. That indeed resolved completely the overfitting issue.
For the comparative analysis, we didn't use this option.

### Question 8:

Currently, we're using the symmetry group $S_n$ over the channel dimensions.
A better symmetry group to use would be $S_n \times S_d$ when $S_n$ acts on the channel dimension and $S_d$ acts on the feature dimension. The reason this symmetry group is suitable is because each feature is a vector of length $d$ generated from a normal distribution, and any permutation of the vector does not change the probability of it being generated, nor the underlying distribution that generated it. Since the model tries to detect the underlying distribution, it should be invariant to permutations of the feature dimensions.

Formally:

$$ \Pr(x_1, x_2, ... x_n \sim \mathcal{N}(0, I) \; | \; x_1, x_2, ... x_n) = 
\Pr(\sigma \cdot x_1, \sigma \cdot x_2, ... \sigma \cdot x_n \sim \mathcal{N}(0, I) \; | \; x_1, x_2, ... ,x_n, \forall \sigma \in S_d) $$

when $x_i$ is a feature vector of length $d$ and $\sigma$ is a permutation of the feature dimensions
(remember that each input sample is composed of $n$ feature vectors of length $d$).