[![open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DSSHN_lRd0A_tPBwYBi6zlOd_9N1DBJ3#scrollTo=dpz7yKFTYXPZ)

## HW Requirement

• Develop your own WAN algorithm and make the 
corresponding code.\
• Once you have the code, you will apply the code to learn 
your dataset to get a better code. Better means the 
better hyperparameter setting regarding your dataset.\
• The training and test dataset is 80%/20%.\
• The performance comparison benchmark is your best 
weight-tuning module.

## Model

In [2]:
import torch
from torch import nn, optim, Generator
from torch.utils.data import DataLoader, Dataset, random_split
from numpy.random import choice
from typing import Iterable, Callable, Type, Optional, Union, Tuple, List


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from operator import mul


def product(nums: Iterable[Type], func: Callable[[Type, Type], Type] = mul) -> Type:
    """return product of iterable"""
    _it = iter(nums)
    v: Type = next(_it)
    for _v in _it:
        v = func(v, _v)
    return v


In [4]:
from collections import deque


class TwoLayerNetwork(nn.Module):

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, init_method: Callable[[torch.Tensor], torch.Tensor], active_func: Callable[[], nn.modules.module.Module],
                 DO: float, if_BN: bool, store_size: int = 1):
        super(TwoLayerNetwork, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.if_BN = if_BN
        # dropout
        self.do = nn.Dropout(DO)
        # first layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        # batch norm
        self.bn1 = nn.BatchNorm1d(hidden_size)
        # activation
        self.active_func = active_func()
        # second layer
        self.fc2 = nn.Linear(hidden_size, num_classes)
        # initialize
        for param in self.parameters():
            init_method(param)
        self.storage: deque[List[nn.Parameter]] = deque(maxlen=store_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out: torch.Tensor = self.do(x)
        out = self.fc1(out)
        if self.if_BN:
            out = self.bn1(out)
        out = self.active_func(out)
        out = self.fc2(out)
        return out


In [5]:
class WD_Regularization(nn.Module):
    def __init__(self):
        super(WD_Regularization, self).__init__()


class L2_Regularization(WD_Regularization):
    def __init__(self, weight_decay: float):
        super(L2_Regularization, self).__init__()
        if weight_decay <= 0:
            raise ValueError("param weight_decay can not <=0!!")
        self.weight_decay = weight_decay

    def forward(self, model: nn.Module) -> Union[torch.Tensor, float]:
        reg = 0
        for name, parameter in model.named_parameters():
            if "weight" in name:
                reg += torch.sum(parameter**2)
        return self.weight_decay * reg


In [6]:
def validate(model: TwoLayerNetwork, device: str, valset: Dataset[torch.Tensor], criterion: nn.modules.loss._Loss) \
        -> Tuple[float, float]:
    """return loss, accuracy"""
    # Validate the model
    model.to(device)
    val_loss = 0.0
    val_correct = 0
    model.eval()
    with torch.no_grad():
        for x, y in DataLoader(valset, batch_size=32, shuffle=True):
            x: torch.Tensor = x.view(-1, model.input_size).to(device)
            y: torch.Tensor = y.to(device)
            outputs: torch.Tensor = model(x)
            loss: torch.Tensor = criterion(outputs, y)
            val_loss += loss.item() * x.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == y).sum().item()
        val_loss /= len(valset)
        val_accuracy = val_correct / len(valset)
    return val_loss, val_accuracy


In [7]:
def train(model: TwoLayerNetwork, opt: Callable[..., optim.Optimizer], device: str, epochs: float, learning_rate: float, trainset: Dataset[torch.Tensor], valset: Dataset[torch.Tensor], criterion: nn.modules.loss._Loss,
          sched: Optional[Callable[[optim.Optimizer], optim.lr_scheduler._LRScheduler]], wd_reg: Optional[WD_Regularization], learning_goal: float, min_lr: float, if_lr_adjust: bool, if_BN: bool, drop_rate: float) \
        -> List[Tuple[float, float, float, float]]:
    """
    Params:
        model
        opt
        device
        epochs
        learing_rate
        criterion
        y: label of data
        wd_reg, BN, DO: regularization
    Results:
        history: train_loss, train_accuracy, val_loss, val_accuracy of each epochs
    """
    def forward_backward(optimizer: optim.Optimizer, criterion: nn.modules.loss._Loss, wd_reg: Optional[WD_Regularization], model: TwoLayerNetwork, y: torch.Tensor,
                         BN: Optional[nn.modules.batchnorm._BatchNorm], DO: Optional[nn.modules.dropout._DropoutNd]) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Params:
            optimizer
            criterion
            model
            y: label of data
            wd_reg, BN, DO: regularization
        Results:
            ouputs: f(x)
            loss_all: f(x) - y
        """
        optimizer.zero_grad()
        outputs = model(x)
        outputs = outputs if not DO else DO(outputs)
        loss_all: torch.Tensor = criterion(
            outputs, y) + wd_reg(model) if wd_reg else criterion(outputs, y)
        loss_all.backward()
        optimizer.step()
        return loss_all, outputs
    if epochs < 1:
        raise ValueError("Invalid epoch!!")
    if not 0 <= drop_rate < 1:
        raise ValueError("Invalid dropout rate!!")
    # init
    epoch = 0
    init_lr = learning_rate
    origin_if_BN = model.if_BN
    model.if_BN = if_BN
    pre_loss = float("inf") if if_lr_adjust else None
    batch_norm = nn.BatchNorm1d(model.hidden_size).to(
        device) if if_BN else None
    drop_out = nn.Dropout(drop_rate).to(device) if drop_rate != 0. else None
    model.to(device)
    # if not model.storage[-1]
    model.storage.append(list(model.parameters()))
    optimizer = opt(model.storage[-1], lr=learning_rate)
    scheduler = sched(optimizer) if sched else None
    history = []
    # Train the model
    while epoch < epochs:
        train_loss = 0.0
        train_correct = 0
        model.train()
        for x, y in DataLoader(trainset, batch_size=32, shuffle=True):
            x: torch.Tensor = x.view(-1, model.input_size).to(device)
            y: torch.Tensor = y.to(device)
            loss_all, outputs = forward_backward(
                optimizer, criterion, wd_reg, model, y, batch_norm, drop_out)
            # Learning rate adjustment
            if pre_loss:
                while pre_loss <= loss_all.item():
                    # learning rate vanishing
                    if learning_rate < min_lr:
                        # return history
                        learning_rate = init_lr
                        optimizer = opt(model.storage[-1], lr=learning_rate)
                        loss_all, outputs = forward_backward(
                            optimizer, criterion, wd_reg, model, y, batch_norm, drop_out)
                        # raise ValueError(f"{learning_rate} < {min_lr}")
                        break
                    learning_rate *= 0.7
                    optimizer = opt(model.storage[-1], lr=learning_rate)
                    loss_all, outputs = forward_backward(
                        optimizer, criterion, wd_reg, model, y, batch_norm, drop_out)
                learning_rate *= 1.2
                pre_loss = loss_all.item()
            train_loss += loss_all.item() * x.size(0)
            predicted: torch.Tensor = torch.max(outputs.data, 1)[1]
            train_correct += (predicted == y).sum().item()
            model.storage.append(list(model.parameters()))
        train_loss /= len(trainset)
        train_accuracy = train_correct / len(trainset)
        # Validate the model
        val_loss, val_accuracy = validate(
            model=model, device=device, valset=valset, criterion=criterion)
        # Log statics
        history.append((train_loss, train_accuracy, val_loss, val_accuracy))
        # Stopping criteria
        if learning_goal < val_accuracy:
            return history
        # Update loop
        if scheduler:
            scheduler.step()
        epoch += 1
    # restore model
    model.if_BN = origin_if_BN
    return history


In [8]:
def test(model: TwoLayerNetwork, device: str, testset: Dataset[torch.Tensor]) -> float:
    """return accuracy"""
    return validate(model=model, device=device, valset=testset, criterion=nn.CrossEntropyLoss())[1]


In [9]:
def analogizing(model: TwoLayerNetwork, device: str, trainset: Dataset[torch.Tensor], learning_goal: float, criterion: nn.modules.loss._Loss):
    x = torch.stack([x for x, _ in trainset]
                    ).view(-1, model.input_size).to("cpu")
    y = torch.Tensor([y for _, y in trainset]).to("cpu")
    total_amount = len(x)
    # get wrong correct indices
    new_fc1_w = model.fc1.weight.data.to(device)
    new_fc1_b = model.fc1.bias.data.to(device)
    new_fc2_w = model.fc2.weight.data.to(device)
    relu = nn.ReLU()
    logits: torch.Tensor = x
    model.eval()
    with torch.no_grad():
        outputs = relu(x.to(device) @ new_fc1_w.T + new_fc1_b
                       ) @ new_fc2_w.T + model.fc2.bias.data.to(device)
        logits, predicted = torch.max(outputs.data, 1)
        success_condition = predicted == y.to(device)
        wrong_indices = torch.nonzero(success_condition != True).to("cpu")
    #
    init_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: nn.init.xavier_uniform_(
        tensor=x) if len(x.shape) > 1 else x
    wrong_pointer = len(wrong_indices)
    train_correct: int = 0
    loss: float = float("inf")
    history: List[Tuple[float, float]] = []
    while wrong_pointer > 0 and loss > learning_goal:
        fc1 = nn.Linear(len(x[0]), 3).to(device)
        fc2 = nn.Linear(3, product(model.fc2.bias.size()),
                        False).to(device)
        # with torch.no_grad():
        target = torch.zeros(*torch.Size((total_amount,)))
        wrong_pointer -= 1
        pointer = wrong_indices[wrong_pointer]
        catagory = int(y[pointer])
        target[pointer] = catagory
        target = target.to(device)
        fc2.weight.data[:, :] = 0
        fc2.weight.data[catagory, 0] = -2
        fc2.weight.data[catagory, 1] = 1
        fc2.weight.data[catagory, 2] = 1
        delta = 0
        intercept = 0
        nonz = x
        # randomly generate hyperplane which only contain the target x
        while nonz.size() != (1, 2) or nonz.tolist()[0][0] != pointer:
            for p in fc1.parameters():
                init_func(p)
            distances = x.to(device) @ fc1.weight.data[0].T
            intercept = distances[pointer]
            distances -= intercept
            # get the shortest distance of other x to hyperplane
            if (delta := torch.min(torch.abs(torch.cat(
                    (distances[:pointer], distances[pointer + 1:])
            )))) == 0:
                continue
            fc1.bias.data[1] = -intercept + (delta / 2)
            fc1.bias.data[2] = -intercept - (delta / 2)
            # check if delta too small for float32(default)
            if fc1.bias.data[1] == fc1.bias.data[2]:
                continue
            fc1.bias.data[0] = -intercept
            fc1.weight.data[1:] = fc1.weight.data[0]
            outputs = relu(x.to(device) @ fc1.weight.data.T + fc1.bias.data
                           ) @ fc2.weight.data.T
            nonz = torch.nonzero(outputs)
        # adjust weight in order to make the output of correct category greater than the others
        fc2.weight.data *= logits[pointer].item(
        ) / outputs[pointer].sum() + 1
        new_fc1_w = torch.cat((new_fc1_w, fc1.weight.data)).to(device)
        new_fc1_b = torch.cat((new_fc1_b, fc1.bias.data)).to(device)
        new_fc2_w = torch.cat((new_fc2_w, fc2.weight.data), 1).to(device)
        with torch.no_grad():
            outputs = relu(x.to(device) @ new_fc1_w.T + new_fc1_b
                           ) @ new_fc2_w.T + model.fc2.bias.data.to(device)
            loss = criterion(
                outputs, y.to(device=device, dtype=torch.long)).item()
            _, predicted = torch.max(outputs.data, 1)
            train_correct = (predicted == y.to(device)).sum().item()
        history.append((loss, train_correct / total_amount))
    # construct new model
    new_model = TwoLayerNetwork(model.input_size, len(new_fc1_b), product(
        model.fc2.bias.size()), lambda _: _, lambda: model.active_func, model.do.p, model.if_BN)
    for name, param in model.named_parameters():
        layer_name, variable_type = name.split(".")
        if layer_name == "fc1":
            setattr(getattr(getattr(new_model, layer_name), variable_type),
                    "data", eval(f"new_{layer_name}_{variable_type[0]}"))
        elif layer_name == "fc2":
            if variable_type == "weight":
                setattr(getattr(getattr(new_model, layer_name), variable_type),
                        "data", eval(f"new_{layer_name}_{variable_type[0]}"))
            elif variable_type == "bias":
                new_model.fc2.bias.data[:] = model.fc2.bias.data[:]
            else:
                pass
                setattr(getattr(new_model, layer_name), variable_type, param)
        else:
            setattr(getattr(new_model, layer_name), variable_type, param)
    return new_model

# Dataset

### pytorch dataset

In [10]:
# load pytorch dataset
from torchvision import datasets, transforms


def getPytorchData(train: float = 0.8, remain: float = 0.1) \
        -> Tuple[Dataset[torch.Tensor], Dataset[torch.Tensor], Dataset[torch.Tensor], int, int]:
    """
    Params:
        train: train_amount / total_amount or 1 - valid_amount / total_amount
        remain: reduce data amount to save time
    Results:
        trainloader, valloader, testloader: dataloader
        datum_size: size of datum
        class_amount: amount of types
    """
    # preprocess: flatten, normalize, drop 90%, split
    transform = transforms.transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    if not 0 < train <= 1:
        raise ValueError()
    if not 0 < remain <= 1:
        raise ValueError()
    # Split the training set into training and validation sets
    trainset = datasets.FashionMNIST(
        root="./data/", train=True, download=False, transform=transform)
    train_count = int(train * remain * len(trainset))
    valid_count = int((1 - train) * remain * len(trainset))
    if train_count == 0 or valid_count == 0:
        raise ValueError()
    datum_size = product(trainset[0][0].size())
    class_amount = len(trainset.classes)
    testset = datasets.FashionMNIST(
        root="./data/", train=False, download=False, transform=transform)
    print(train_count, valid_count, len(testset))
    trainset, valset, _ = random_split(
        trainset, (train_count, valid_count, len(trainset) - train_count - valid_count), Generator().manual_seed(42))
    # Create dataloaders to load the data in batches
    # trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
    # valloader = DataLoader(valset, batch_size=32, shuffle=True)
    # testloader = DataLoader(testset, batch_size=32, shuffle=True)
    return trainset, valset, testset, datum_size, class_amount


# WAN

### setting

In [16]:
device = "cuda" if torch.cuda.is_available(
) else "mps" if torch.backends.mps.is_available() else "cpu"
trainset, valset, testset, input_size, output_size = getPytorchData()
criterion = nn.CrossEntropyLoss()


4800 1199 10000


In [19]:
# hidden_size = 11
# epochs = 300
# init: Callable[[torch.Tensor], torch.Tensor] = lambda x: nn.init.xavier_uniform_(
#     tensor=x) if len(x.shape) > 1 else x
# active = nn.ReLU
# model = TwoLayerNetwork(input_size, hidden_size,
#                         output_size, init, active, 0., False)
# optimize = optim.SGD
# schedule = None
# learning_goal = 0.8 #
# learning_rate = 0.001
# min_lr = learning_rate * 1e-5
# l2_reg = L2_Regularization(0.001)
# RG_EB_LG_UA_BN_DO_baseline = test(model, device, testset)
# RG_EB_LG_UA_BN_DO_history = train(model, optimize, device, epochs, learning_rate,
#                    trainset, valset, criterion, schedule, l2_reg, learning_goal, min_lr, True, False, 0.)
# RG_EB_LG_UA_BN_DO_result = test(model, device, testset)
# print(RG_EB_LG_UA_BN_DO_baseline, RG_EB_LG_UA_BN_DO_history, RG_EB_LG_UA_BN_DO_result, sep="\n")
# model_path = r"./data/rg_eb_lg_ua_"
# torch.save(model, model_path)


0.1058
[(2.122917129993439, 0.241875, 1.8850674040621773, 0.371976647206005), (1.8249681035677592, 0.37166666666666665, 1.6204896290367896, 0.4445371142618849), (1.560087955792745, 0.463125, 1.3555002062394284, 0.5754795663052544), (1.314503967364629, 0.6285416666666667, 1.1403866603137853, 0.6788990825688074), (1.1036804835001628, 0.6920833333333334, 0.9596145864920979, 0.7097581317764804), (0.9680635674794515, 0.7283333333333334, 0.8577197387976085, 0.7381150959132611), (0.8847484676043192, 0.7395833333333334, 0.7977340597724596, 0.7447873227689742), (0.8284859573841095, 0.75, 0.7567762555034087, 0.7481234361968306), (0.7906862459580104, 0.759375, 0.7200900295856498, 0.7698081734778982), (0.7592342120409011, 0.7695833333333333, 0.6948466327808815, 0.7764804003336113), (0.7355515336990357, 0.7725, 0.6783705689292634, 0.7689741451209341), (0.7138904406627019, 0.7764583333333334, 0.6551016444360543, 0.7748123436196831), (0.6969207811355591, 0.7833333333333333, 0.6392939514274693, 0.7856

In [20]:
model_path = r"./data/rg_eb_lg_ua_"
model: TwoLayerNetwork = torch.load(model_path)
model


TwoLayerNetwork(
  (do): Dropout(p=0.0, inplace=False)
  (fc1): Linear(in_features=784, out_features=11, bias=True)
  (bn1): BatchNorm1d(11, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (active_func): ReLU()
  (fc2): Linear(in_features=11, out_features=10, bias=True)
)

### WAN

In [21]:
new_model = analogizing(model, device, trainset, 0.4, criterion)
new_model

TwoLayerNetwork(
  (do): Dropout(p=0.0, inplace=False)
  (fc1): Linear(in_features=784, out_features=2063, bias=True)
  (bn1): BatchNorm1d(2063, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (active_func): ReLU()
  (fc2): Linear(in_features=2063, out_features=10, bias=True)
)

In [23]:
new_model_path = r"./data/anlg_rg_eb_lg_ua_"
torch.save(new_model, new_model_path)

In [22]:
print(validate(model, device, trainset, criterion))
print(validate(model, device, valset, criterion))
print(validate(model, device, testset, criterion))
print(validate(new_model, device, trainset, criterion))
print(validate(new_model, device, valset, criterion))
print(validate(new_model, device, testset, criterion))


(0.5762234632174174, 0.80625)
(0.5797953142336352, 0.8081734778982486)
(0.6225299583435059, 0.7828)
(0.3981437009572983, 0.9222916666666666)
(0.5982372802779314, 0.8006672226855713)
(0.6369789076805115, 0.776)
