In [5]:
import torch
from torch import nn, optim, Generator
from torch.utils.data import DataLoader, Dataset, random_split
from numpy.random import choice
from typing import Iterable, Callable, Type, Optional, Union, Tuple, List

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [7]:
# download data(zipped csv) from kaggle with username and apikey
# import os
# import json
# from kaggle.api.kaggle_api_extended import KaggleApi
# with open("./kaggle.json", "r") as j:
#     for (k, v) in json.load(j).items():
#         os.environ[k] = v
# api = KaggleApi()
# api.authenticate()
# # https://www.kaggle.com/competitions/cafa-5-protein-function-prediction
# # datasetname
# api.dataset_download_files('arnabchaki/data-science-salaries-2023', path="./data/", unzip=True)

In [8]:
from operator import mul


def product(nums: Iterable[Type], func: Callable[[Type, Type], Type] = mul) -> Type:
    """return product of iterable"""
    _it = iter(nums)
    v: Type = next(_it)
    for _v in _it:
        v = func(v, _v)
    return v

In [29]:
ds_salaries: pd.DataFrame = pd.read_csv("./data/ds_salaries.csv")
nonnumerical_column_encoders = {
    c: LabelEncoder() for c, dt in ds_salaries.dtypes.items() if dt == 'O'}
ds_salaries[list(nonnumerical_column_encoders.keys())] = pd.DataFrame(
    e.fit_transform(ds_salaries[c]) for c, e in nonnumerical_column_encoders.items()).T
#

In [30]:
feature = torch.from_numpy(
    ds_salaries[ds_salaries.columns[:4].append(ds_salaries.columns[7:])].to_numpy(dtype=np.float32))


In [33]:
feature


tensor([[2.0230e+03, 3.0000e+00, 2.0000e+00,  ..., 1.0000e+02, 2.5000e+01,
         0.0000e+00],
        [2.0230e+03, 2.0000e+00, 0.0000e+00,  ..., 1.0000e+02, 7.0000e+01,
         2.0000e+00],
        [2.0230e+03, 2.0000e+00, 0.0000e+00,  ..., 1.0000e+02, 7.0000e+01,
         2.0000e+00],
        ...,
        [2.0200e+03, 0.0000e+00, 2.0000e+00,  ..., 1.0000e+02, 7.0000e+01,
         2.0000e+00],
        [2.0200e+03, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+02, 7.0000e+01,
         0.0000e+00],
        [2.0210e+03, 3.0000e+00, 2.0000e+00,  ..., 5.0000e+01, 3.8000e+01,
         0.0000e+00]])

In [32]:
nn.functional.normalize(feature, dim=1)


tensor([[9.9776e-01, 1.4796e-03, 9.8642e-04,  ..., 4.9321e-02, 1.2330e-02,
         0.0000e+00],
        [9.9697e-01, 9.8564e-04, 0.0000e+00,  ..., 4.9282e-02, 3.4497e-02,
         9.8564e-04],
        [9.9697e-01, 9.8564e-04, 0.0000e+00,  ..., 4.9282e-02, 3.4497e-02,
         9.8564e-04],
        ...,
        [9.9722e-01, 0.0000e+00, 9.8735e-04,  ..., 4.9368e-02, 3.4557e-02,
         9.8735e-04],
        [9.9746e-01, 0.0000e+00, 0.0000e+00,  ..., 4.9379e-02, 3.4565e-02,
         0.0000e+00],
        [9.9908e-01, 1.4831e-03, 9.8870e-04,  ..., 2.4718e-02, 1.8785e-02,
         0.0000e+00]])

In [34]:
salary = torch.reshape(torch.tensor(
    ds_salaries.salary_in_usd.to_numpy(dtype=np.float32)), shape=(ds_salaries.salary_in_usd.size,))


In [36]:
nn.functional.normalize(salary, dim=0)


tensor([0.0093, 0.0032, 0.0027,  ..., 0.0113, 0.0108, 0.0102])

In [56]:
from torchvision import transforms


class DS_SalaryDataset(Dataset):
    """DS Salary dataset."""

    def __init__(self, csv_path="./data/ds_salaries.csv", transform=transforms.transforms.Normalize(0.5, 0.5)):
        """
        Args:
            csv_path (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        ds_salaries: pd.DataFrame = pd.read_csv(csv_path)
        self.nonnumerical_column_encoders = {
            c: LabelEncoder() for c, dt in ds_salaries.dtypes.items() if dt == 'O'}
        ds_salaries[list(self.nonnumerical_column_encoders.keys())] = pd.DataFrame(
            e.fit_transform(ds_salaries[c]) for c, e in self.nonnumerical_column_encoders.items()).T
        #
        self.feature = nn.functional.normalize(torch.from_numpy(
            ds_salaries[ds_salaries.columns[:4].append(ds_salaries.columns[7:])].to_numpy(dtype=np.float32)), dim=1)

        # target
        self.salary = nn.functional.normalize(torch.reshape(torch.tensor(
            ds_salaries.salary_in_usd.to_numpy(dtype=np.float32)), shape=(ds_salaries.salary_in_usd.size,)), dim=0)

    def __len__(self):
        return self.salary.size()[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.feature[idx], self.salary[idx]


def getCustomizedData():
    # preprocess
    dataset = DS_SalaryDataset()
    # train test split
    train_count = int(0.7 * len(dataset))
    valid_count = int(0.2 * len(dataset))
    test_count = len(dataset) - train_count - valid_count
    print(train_count, valid_count, test_count)
    trainset, valset, testset = random_split(
        dataset, (train_count, valid_count, test_count), Generator().manual_seed(42))
    datum_size = product(trainset[0][0].size())
    return trainset, valset, testset, datum_size

In [38]:
getCustomizedData()


2628 751 376


(<torch.utils.data.dataset.Subset at 0x14f9d7797f0>,
 <torch.utils.data.dataset.Subset at 0x14f9d779a60>,
 <torch.utils.data.dataset.Subset at 0x14f9d779f40>,
 8)

In [10]:
from collections import deque


class TwoLayerNetwork(nn.Module):

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, init_method: Callable[[torch.Tensor], torch.Tensor], active_func: Callable[[], nn.modules.module.Module],
                 DO: float, if_BN: bool, store_size: int = 1):
        super(TwoLayerNetwork, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.if_BN = if_BN
        # dropout
        self.do = nn.Dropout(DO)
        # first layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        # batch norm
        self.bn1 = nn.BatchNorm1d(hidden_size)
        # activation
        self.active_func = active_func()
        # second layer
        self.fc2 = nn.Linear(hidden_size, num_classes)
        # initialize
        for param in self.parameters():
            init_method(param)
        self.storage: deque[List[nn.Parameter]] = deque(maxlen=store_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out: torch.Tensor = self.do(x)
        out = self.fc1(out)
        if self.if_BN:
            out = self.bn1(out)
        out = self.active_func(out)
        out = self.fc2(out)
        return out

In [41]:
class WD_Regularization(nn.Module):
    def __init__(self):
        super(WD_Regularization, self).__init__()


class L2_Regularization(WD_Regularization):
    def __init__(self, weight_decay: float):
        super(L2_Regularization, self).__init__()
        if weight_decay <= 0:
            raise ValueError("param weight_decay can not <=0!!")
        self.weight_decay = weight_decay

    def forward(self, model: nn.Module) -> Union[torch.Tensor, float]:
        reg = 0
        for name, parameter in model.named_parameters():
            if name in ("fc1.weight", "fc2.weight"):
                reg += torch.sum(parameter**2)
        return self.weight_decay * reg


In [44]:
def validate(model: TwoLayerNetwork, device: str, valset: Dataset[torch.Tensor], criterion: nn.modules.loss._Loss) \
        -> Tuple[float, float]:
    """return loss, accuracy"""
    # Validate the model
    model.to(device)
    criterion.to(device)
    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for x, y in DataLoader(valset, batch_size=32, shuffle=True):
            x: torch.Tensor = x.view(-1, model.input_size).to(device)
            y: torch.Tensor = y.to(device)
            outputs: torch.Tensor = model(x)
            loss: torch.Tensor = criterion(outputs, y)
            val_loss += loss.item() * x.size(0)
        val_loss /= len(valset)
    return val_loss

In [48]:
def train(model: TwoLayerNetwork, opt: Callable[..., optim.Optimizer], device: str, epochs: float, learning_rate: float, trainset: Dataset[torch.Tensor], valset: Dataset[torch.Tensor], criterion: nn.modules.loss._Loss,
          sched: Optional[Callable[[optim.Optimizer], optim.lr_scheduler._LRScheduler]], wd_reg: Optional[WD_Regularization], learning_goal: float, min_lr: float, if_lr_adjust: bool, if_BN: bool, drop_rate: float) \
        -> List[Tuple[float, float, float, float]]:
    """
    Params:
        model
        opt
        device
        epochs
        learing_rate
        criterion
        y: label of data
        wd_reg, BN, DO: regularization
    Results:
        history: train_loss, train_accuracy, val_loss, val_accuracy of each epochs
    """
    def forward_backward(optimizer: optim.Optimizer, criterion: nn.modules.loss._Loss, wd_reg: Optional[WD_Regularization], model: TwoLayerNetwork, y: torch.Tensor,
                         BN: Optional[nn.modules.batchnorm._BatchNorm], DO: Optional[nn.modules.dropout._DropoutNd]) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Params:
            optimizer
            criterion
            model
            y: label of data
            wd_reg, BN, DO: regularization
        Results:
            ouputs: f(x)
            loss_all: f(x) - y
        """
        optimizer.zero_grad()
        outputs = model(x)
        outputs = outputs if not DO else DO(outputs)
        loss_all: torch.Tensor = criterion(
            outputs, y) + wd_reg(model) if wd_reg else criterion(outputs, y)
        loss_all.backward()
        optimizer.step()
        return loss_all, outputs
    if epochs < 1:
        raise ValueError("Invalid epoch!!")
    if not 0 <= drop_rate < 1:
        raise ValueError("Invalid dropout rate!!")
    # init
    epoch = 0
    init_lr = learning_rate
    origin_if_BN = model.if_BN
    model.if_BN = if_BN
    pre_loss = float("inf") if if_lr_adjust else None
    batch_norm = nn.BatchNorm1d(model.hidden_size).to(
        device) if if_BN else None
    drop_out = nn.Dropout(drop_rate).to(device) if drop_rate != 0. else None
    model.to(device)
    # if not model.storage[-1]
    model.storage.append(list(model.parameters()))
    optimizer = opt(model.storage[-1], lr=learning_rate)
    scheduler = sched(optimizer) if sched else None
    history = []
    # Train the model
    while epoch < epochs:
        train_loss = 0.0
        train_correct = 0
        model.train()
        for x, y in DataLoader(trainset, batch_size=32, shuffle=True):
            x: torch.Tensor = x.view(-1, model.input_size).to(device)
            y: torch.Tensor = y.to(device=device, dtype=torch.float32)
            loss_all, outputs = forward_backward(
                optimizer, criterion, wd_reg, model, y, batch_norm, drop_out)
            # Learning rate adjustment
            if pre_loss:
                while pre_loss <= loss_all.item():
                    # learning rate vanishing
                    if learning_rate < min_lr:
                        # return history
                        learning_rate = init_lr
                        optimizer = opt(model.storage[-1], lr=learning_rate)
                        loss_all, outputs = forward_backward(
                            optimizer, criterion, wd_reg, model, y, batch_norm, drop_out)
                        # raise ValueError(f"{learning_rate} < {min_lr}")
                        break
                    learning_rate *= 0.7
                    optimizer = opt(model.storage[-1], lr=learning_rate)
                    loss_all, outputs = forward_backward(
                        optimizer, criterion, wd_reg, model, y, batch_norm, drop_out)
                learning_rate *= 1.2
                pre_loss = loss_all.item()
            train_loss += loss_all.item() * x.size(0)
            model.storage.append(list(model.parameters()))
        train_loss /= len(trainset)
        # Validate the model
        val_loss = validate(
            model=model, device=device, valset=valset, criterion=criterion)
        # Log statics
        history.append((train_loss, val_loss))
        # Stopping criteria
        if learning_goal > train_loss:
            return history
        # Update loop
        if scheduler:
            scheduler.step()
        epoch += 1
    # restore model
    model.if_BN = origin_if_BN
    return history

In [46]:
def test(model: TwoLayerNetwork, device: str, testset: Dataset[torch.Tensor]) -> float:
    """return accuracy"""
    return validate(model=model, device=device, valset=testset, criterion=nn.MSELoss())

In [15]:
def analogizing(model: TwoLayerNetwork, device: str, trainset: Dataset[torch.Tensor], learning_goal: float, criterion: nn.modules.loss._Loss):
    x = torch.stack([x for x, _ in trainset]
                    ).view(-1, model.input_size).to("cpu")
    y = torch.Tensor([y for _, y in trainset]).to("cpu")
    total_amount = len(x)
    # get wrong correct indices
    new_fc1_w = model.fc1.weight.data.to(device)
    new_fc1_b = model.fc1.bias.data.to(device)
    new_fc2_w = model.fc2.weight.data.to(device)
    relu = nn.ReLU()
    logits: torch.Tensor = x
    model.eval()
    with torch.no_grad():
        outputs = relu(x.to(device) @ new_fc1_w.T + new_fc1_b
                       ) @ new_fc2_w.T + model.fc2.bias.data.to(device)
        logits, predicted = torch.max(outputs.data, 1)
        success_condition = predicted == y.to(device)
        wrong_indices = torch.nonzero(success_condition != True).to("cpu")
    #
    init_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: nn.init.xavier_uniform_(
        tensor=x) if len(x.shape) > 1 else x
    wrong_pointer = len(wrong_indices)
    train_correct: int = 0
    loss: float = float("inf")
    history: List[Tuple[float, float]] = []
    while wrong_pointer > 0 and loss > learning_goal:
        fc1 = nn.Linear(len(x[0]), 3).to(device)
        fc2 = nn.Linear(3, product(model.fc2.bias.size()),
                        False).to(device)
        # with torch.no_grad():
        target = torch.zeros(*torch.Size((total_amount,)))
        wrong_pointer -= 1
        pointer = wrong_indices[wrong_pointer]
        catagory = int(y[pointer])
        target[pointer] = catagory
        target = target.to(device)
        fc2.weight.data[:, :] = 0
        fc2.weight.data[catagory, 0] = -2
        fc2.weight.data[catagory, 1] = 1
        fc2.weight.data[catagory, 2] = 1
        delta = 0
        intercept = 0
        nonz = x
        # randomly generate hyperplane which only contain the target x
        while nonz.size() != (1, 2) or nonz.tolist()[0][0] != pointer:
            for p in fc1.parameters():
                init_func(p)
            distances = x.to(device) @ fc1.weight.data[0].T
            intercept = distances[pointer]
            distances -= intercept
            # get the shortest distance of other x to hyperplane
            if (delta := torch.min(torch.abs(torch.cat(
                    (distances[:pointer], distances[pointer + 1:])
            )))) == 0:
                continue
            fc1.bias.data[1] = -intercept + (delta / 2)
            fc1.bias.data[2] = -intercept - (delta / 2)
            # check if delta too small for float32(default)
            if fc1.bias.data[1] == fc1.bias.data[2]:
                continue
            fc1.bias.data[0] = -intercept
            fc1.weight.data[1:] = fc1.weight.data[0]
            outputs = relu(x.to(device) @ fc1.weight.data.T + fc1.bias.data
                           ) @ fc2.weight.data.T
            nonz = torch.nonzero(outputs)
        # adjust weight in order to make the output of correct category greater than the others
        fc2.weight.data *= logits[pointer].item(
        ) / outputs[pointer].sum() + 1
        new_fc1_w = torch.cat((new_fc1_w, fc1.weight.data)).to(device)
        new_fc1_b = torch.cat((new_fc1_b, fc1.bias.data)).to(device)
        new_fc2_w = torch.cat((new_fc2_w, fc2.weight.data), 1).to(device)
        with torch.no_grad():
            outputs = relu(x.to(device) @ new_fc1_w.T + new_fc1_b
                           ) @ new_fc2_w.T + model.fc2.bias.data.to(device)
            loss = criterion(
                outputs, y.to(device=device, dtype=torch.long)).item()
            _, predicted = torch.max(outputs.data, 1)
            train_correct = (predicted == y.to(device)).sum().item()
        history.append((loss, train_correct / total_amount))
    # construct new model
    new_model = TwoLayerNetwork(model.input_size, len(new_fc1_b), product(
        model.fc2.bias.size()), lambda _: _, lambda: model.active_func, model.do.p, model.if_BN)
    for name, param in model.named_parameters():
        layer_name, variable_type = name.split(".")
        if layer_name == "fc1":
            setattr(getattr(getattr(new_model, layer_name), variable_type),
                    "data", eval(f"new_{layer_name}_{variable_type[0]}"))
        elif layer_name == "fc2":
            if variable_type == "weight":
                setattr(getattr(getattr(new_model, layer_name), variable_type),
                        "data", eval(f"new_{layer_name}_{variable_type[0]}"))
            elif variable_type == "bias":
                new_model.fc2.bias.data[:] = model.fc2.bias.data[:]
            else:
                pass
                setattr(getattr(new_model, layer_name), variable_type, param)
        else:
            setattr(getattr(new_model, layer_name), variable_type, param)
    return new_model


In [57]:
device = "cuda" if torch.cuda.is_available(
) else "mps" if torch.backends.mps.is_available() else "cpu"
trainset, valset, testset, input_size = getCustomizedData()
criterion = nn.MSELoss()


2628 751 376


In [17]:
hidden_size = 32
init: Callable[[torch.Tensor], torch.Tensor] = lambda x: nn.init.xavier_uniform_(
    tensor=x) if len(x.shape) > 1 else x
active = nn.ReLU
model = TwoLayerNetwork(input_size, hidden_size, 1,
                        init, active, 0., False).to(device)

### Baseline

### Train

In [60]:
hidden_size = 64
epochs = 20
init: Callable[[torch.Tensor], torch.Tensor] = lambda x: nn.init.xavier_uniform_(
    tensor=x) if len(x.shape) > 1 else x
active = nn.ReLU
model = TwoLayerNetwork(input_size, hidden_size,
                        1, init, active, 0., True)
optimize = optim.SGD
schedule = None
learning_rate = 0.001
min_lr = learning_rate * 1e-5
l2_reg = L2_Regularization(0.0001)
baseline = test(model, device, testset)
learning_goal = baseline * 0.02
history = train(model, optimize, device, epochs, learning_rate,
                trainset, valset, criterion, schedule, l2_reg, learning_goal, min_lr, True, False, 0.)
result = test(model, device, testset)
print(baseline, history, result, sep="\n")
model_path = r"./data/final"
torch.save(model, model_path)

0.050912660407893204
[(0.014791576618639132, 6.024629773929239e-05), (0.0017932431949189151, 6.103930424340194e-05), (0.0017934633336346058, 6.0895479822473365e-05), (0.0017929784528332758, 6.084292637991574e-05), (0.0017931283259540343, 6.11068695996294e-05), (0.0017926669374811414, 6.116548242771654e-05), (0.0017922889606749337, 6.085791536608398e-05), (0.0017926629256709411, 6.091984465509621e-05), (0.0017921632546307787, 6.0715429585719404e-05), (0.0017918162058494543, 6.112065367365303e-05), (0.001791715381498331, 6.033067929166426e-05), (0.0017916151507413143, 6.060882911411456e-05), (0.0017914815631849604, 6.0591439284765894e-05), (0.0017912373828447936, 6.0565408475837485e-05), (0.0017910158234014767, 6.074030030569615e-05), (0.001790999388463988, 6.066077001017398e-05), (0.0017906635226412496, 6.058076548401096e-05), (0.001790369498292537, 6.103196702144957e-05), (0.001790301861876517, 6.081939027014563e-05), (0.0017903644628425894, 5.97822542432367e-05)]
5.315698024091073e-05