In [127]:
import pandas as pd
import numpy as np

In [104]:
# Download the dataset (zipped csv) from Kaggle, authenticating with the
# username and API key stored in ./kaggle.json
import os
import json
from kaggle.api.kaggle_api_extended import KaggleApi
# Export every key/value pair found in kaggle.json as an environment variable.
# NOTE(review): presumably the keys in the file are already the variable names
# the Kaggle client reads -- confirm against the actual kaggle.json.
with open("./kaggle.json", "r") as j:
    for (k, v) in json.load(j).items():
        os.environ[k] = v
api = KaggleApi()
api.authenticate()
# Dataset identifier passed below: 'arnabchaki/data-science-salaries-2023'.
# The archive is unzipped into ./data/.
api.dataset_download_files('arnabchaki/data-science-salaries-2023', path="./data/", unzip=True)


In [105]:
# Load the salary csv, print its (rows, columns) shape and preview the first rows
ds_salaries = pd.read_csv("./data/ds_salaries.csv")
print(ds_salaries.shape)
ds_salaries.head()

(3755, 11)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [106]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
ds_salaries.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


In [126]:
# For every column: show only the number of distinct values when there are
# more than 10 of them, otherwise show the full value counts.
# NOTE(review): the recorded output lists integer codes for the categorical
# columns, so this cell was last executed after the label-encoding cell.
print([len(ds_salaries.value_counts(c))
       if len(ds_salaries.value_counts(c)) > 10
       else f"{c}: {ds_salaries.value_counts(c)}"
       for c in ds_salaries.columns])

['work_year: work_year\n2023    1785\n2022    1664\n2021     230\n2020      76\ndtype: int64', 'experience_level: experience_level\n3    2516\n2     805\n0     320\n1     114\ndtype: int64', 'employment_type: employment_type\n2    3718\n3      17\n0      10\n1      10\ndtype: int64', 93, 815, 20, 1035, 78, 'remote_ratio: remote_ratio\n0      1923\n100    1643\n50      189\ndtype: int64', 72, 'company_size: company_size\n1    3153\n0     454\n2     148\ndtype: int64']


In [108]:
# Column names of the loaded table
ds_salaries.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [110]:
# Column dtypes; object columns are the ones that get label-encoded
ds_salaries.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [112]:
from sklearn.preprocessing import LabelEncoder

In [117]:
# One (not yet fitted) LabelEncoder per object-dtype column, keyed by column name
nonnumerical_column_encoders = {
    c: LabelEncoder() for c, dt in ds_salaries.dtypes.items() if dt == 'O'}

In [123]:
# Fit each encoder on its column and overwrite that column with the integer codes.
# The generator yields one encoded array per column (one DataFrame row each),
# hence the transpose before assigning back.
# NOTE: this mutates ds_salaries in place; re-running the encoder-creation cell
# afterwards finds no object columns any more.
ds_salaries[list(nonnumerical_column_encoders.keys())] = pd.DataFrame(
    e.fit_transform(ds_salaries[c]) for c, e in nonnumerical_column_encoders.items()).T

In [125]:
# Column dtypes after encoding: every column is now an integer type
ds_salaries.dtypes

work_year             int64
experience_level      int32
employment_type       int32
job_title             int32
salary                int64
salary_currency       int32
salary_in_usd         int64
employee_residence    int32
remote_ratio          int64
company_location      int32
company_size          int32
dtype: object

In [134]:
# Column names again, as a reference for the positional slicing used for the features
ds_salaries.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [133]:
# Feature columns: the first four plus everything from position 7 on, i.e. all
# columns except salary, salary_currency and salary_in_usd
ds_salaries.columns[:4].append(ds_salaries.columns[7:])

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'employee_residence', 'remote_ratio', 'company_location',
       'company_size'],
      dtype='object')

In [156]:
# Number of target values (one per row)
len(ds_salaries.salary_in_usd)

3755

In [157]:
# Same count through Series.size
ds_salaries.salary_in_usd.size

3755

In [151]:
# Target column as an integer tensor.
# NOTE(review): torch is only imported in a later cell, so this cell fails on a
# fresh top-to-bottom run.
torch.tensor(ds_salaries.salary_in_usd.to_numpy())

tensor([ 85847,  30000,  25500,  ..., 105000, 100000,  94665])

In [135]:
import torch
from torch import nn, optim, Generator
from torch.utils.data import DataLoader, Dataset, random_split
from numpy.random import choice
from typing import Iterable, Callable, Type, Optional, Union, Tuple, List


  from .autonotebook import tqdm as notebook_tqdm


In [136]:
from operator import mul


def product(nums: Iterable[Type], func: Callable[[Type, Type], Type] = mul) -> Type:
    """Reduce ``nums`` from left to right with ``func`` (multiplication by default).

    The first element seeds the running value, so a one-element iterable yields
    that element unchanged and an empty iterable propagates ``StopIteration``.
    """
    stream = iter(nums)
    running: Type = next(stream)
    # A private sentinel marks exhaustion so that any real value (None included)
    # can appear in the iterable.
    exhausted = object()
    while (item := next(stream, exhausted)) is not exhausted:
        running = func(running, item)
    return running


In [158]:
class DS_SalaryDataset(Dataset):
    """Data-science salary table served as ``(feature, salary)`` tensor pairs.

    Every non-numeric column is label-encoded to integer codes; the fitted
    encoders are kept in ``nonnumerical_column_encoders`` (keyed by column
    name). The feature matrix holds every column except the salary-related
    ones listed in ``NON_FEATURE_COLUMNS``; the target is ``TARGET_COLUMN``.

    Attributes:
        nonnumerical_column_encoders: dict of column name -> fitted LabelEncoder.
        feature: float32 tensor with one row per record.
        salary: 1-D tensor of the target column, in the dtype pandas parsed it as.
    """

    # Column used as the prediction target
    TARGET_COLUMN = "salary_in_usd"
    # Columns describing the pay itself; keeping them as inputs would leak the target
    NON_FEATURE_COLUMNS = ("salary", "salary_currency", "salary_in_usd")

    def __init__(self, csv_path: str = "./data/ds_salaries.csv"):
        """
        Args:
            csv_path (string): Path to the salary csv file. Defaults to
                ``./data/ds_salaries.csv``.
        """
        frame: pd.DataFrame = pd.read_csv(csv_path)
        # Detect non-numeric columns through the dtype API rather than comparing
        # with 'O', so pandas string dtypes are encoded as well.
        self.nonnumerical_column_encoders = {
            c: LabelEncoder() for c, dt in frame.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dt)}
        for column, encoder in self.nonnumerical_column_encoders.items():
            frame[column] = encoder.fit_transform(frame[column])
        # Select the features by name instead of by position so that a changed
        # column order cannot silently pull a salary column into the inputs.
        feature_columns = [
            c for c in frame.columns if c not in self.NON_FEATURE_COLUMNS]
        self.feature = torch.from_numpy(
            frame[feature_columns].to_numpy(dtype=np.float32))
        # Series.to_numpy() is already one-dimensional, no reshape required
        self.salary = torch.tensor(frame[self.TARGET_COLUMN].to_numpy())

    def __len__(self):
        """Number of records."""
        return self.salary.shape[0]

    def __getitem__(self, idx):
        """Return ``(feature, salary)`` for ``idx``; tensor indices are converted to Python values first."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.feature[idx], self.salary[idx]


def getCustomizedData():
    """Build the salary dataset and split it 70/20/10 with a fixed seed.

    Prints the three split sizes and returns
    ``(trainset, valset, testset, datum_size)`` where ``datum_size`` is the
    number of values in one feature sample.
    """
    full_set = DS_SalaryDataset()
    total = len(full_set)
    # The test split takes whatever the two truncated counts leave over
    n_train = int(0.7 * total)
    n_valid = int(0.2 * total)
    n_test = total - n_train - n_valid
    print(n_train, n_valid, n_test)
    seeded = Generator().manual_seed(42)
    trainset, valset, testset = random_split(
        full_set, (n_train, n_valid, n_test), seeded)
    first_feature = trainset[0][0]
    return trainset, valset, testset, product(first_feature.size())

In [138]:
from collections import deque


class TwoLayerNetwork(nn.Module):
    """Fully connected network with one hidden layer.

    Pipeline of ``forward``: dropout on the input -> ``fc1`` -> optional batch
    norm (only when ``if_BN`` is true) -> activation -> ``fc2``.
    """

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, init_method: Callable[[torch.Tensor], torch.Tensor], active_func: Callable[[], nn.modules.module.Module],
                 DO: float, if_BN: bool, store_size: int = 1):
        """
        Args:
            input_size: number of input features.
            hidden_size: number of hidden units.
            num_classes: number of outputs of the second layer.
            init_method: called once on every parameter of the network.
            active_func: zero-argument factory returning the activation module.
            DO: dropout probability applied to the input.
            if_BN: whether ``forward`` routes the hidden values through batch norm.
            store_size: maximum length of the ``storage`` deque.
        """
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.if_BN = if_BN
        # Sub-modules are registered in pipeline order: dropout, first layer,
        # batch norm, activation, second layer.
        self.do = nn.Dropout(DO)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.active_func = active_func()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        # Hand every registered parameter (batch-norm ones included) to the initializer
        for tensor in self.parameters():
            init_method(tensor)
        # Bounded history of parameter lists, filled by the caller
        self.storage: deque[List[nn.Parameter]] = deque(maxlen=store_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of inputs to the outputs of the second layer."""
        hidden: torch.Tensor = self.fc1(self.do(x))
        if self.if_BN:
            hidden = self.bn1(hidden)
        return self.fc2(self.active_func(hidden))


In [139]:
class WD_Regularization(nn.Module):
    """Common base type for weight-decay regularizers."""

    def __init__(self):
        super().__init__()


class L2_Regularization(WD_Regularization):
    """Weight-decay penalty: ``weight_decay`` times the sum of squared weights."""

    def __init__(self, weight_decay: float):
        """Store the coefficient; a non-positive ``weight_decay`` raises ValueError."""
        super().__init__()
        if not weight_decay > 0:
            raise ValueError("param weight_decay can not <=0!!")
        self.weight_decay = weight_decay

    def forward(self, model: nn.Module) -> Union[torch.Tensor, float]:
        """Return the penalty over every parameter of ``model`` whose name contains "weight".

        A model without such parameters yields the plain number ``weight_decay * 0``.
        """
        squared_norms = (
            torch.sum(tensor**2)
            for label, tensor in model.named_parameters()
            if "weight" in label
        )
        return self.weight_decay * sum(squared_norms)


In [140]:
def validate(model: TwoLayerNetwork, device: str, valset: Dataset[torch.Tensor], criterion: nn.modules.loss._Loss) \
        -> Tuple[float, float]:
    """Evaluate ``model`` on ``valset`` and return ``(mean loss, accuracy)``.

    The model is moved to ``device`` and switched to eval mode (it is left in
    that mode). Batches of 32 are drawn in shuffled order; the loss is averaged
    per sample and a prediction is the index of the largest output along dim 1.
    """
    model.to(device)
    model.eval()
    loss_sum = 0.0
    hits = 0
    loader = DataLoader(valset, batch_size=32, shuffle=True)
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.view(-1, model.input_size).to(device)
            labels = labels.to(device)
            scores: torch.Tensor = model(inputs)
            batch_loss: torch.Tensor = criterion(scores, labels)
            # criterion yields a batch mean; weight it by the batch size so the
            # final division gives a per-sample mean
            loss_sum += batch_loss.item() * inputs.size(0)
            guessed = torch.max(scores.data, 1)[1]
            hits += (guessed == labels).sum().item()
    sample_count = len(valset)
    return loss_sum / sample_count, hits / sample_count


In [141]:
def train(model: TwoLayerNetwork, opt: Callable[..., optim.Optimizer], device: str, epochs: float, learning_rate: float, trainset: Dataset[torch.Tensor], valset: Dataset[torch.Tensor], criterion: nn.modules.loss._Loss,
          sched: Optional[Callable[[optim.Optimizer], optim.lr_scheduler._LRScheduler]], wd_reg: Optional[WD_Regularization], learning_goal: float, min_lr: float, if_lr_adjust: bool, if_BN: bool, drop_rate: float) \
        -> List[Tuple[float, float, float, float]]:
    """Train ``model`` on ``trainset`` in shuffled batches of 32.

    Params:
        model: network optimised in place
        opt: optimizer factory, called as ``opt(parameters, lr=...)``
        device: device the model and every batch are moved to
        epochs: maximum number of epochs, must be >= 1
        learning_rate: initial learning rate
        trainset, valset: datasets yielding ``(x, y)`` pairs
        criterion: loss applied to ``(outputs, y)``
        sched: optional factory building a scheduler from the initial optimizer;
            the scheduler is stepped once per epoch
        wd_reg: optional regularizer; ``wd_reg(model)`` is added to the loss
        learning_goal: training stops as soon as the validation accuracy exceeds it
        min_lr: once the adaptive learning rate falls below it, the rate is reset
            to its initial value
        if_lr_adjust: enable the per-batch learning-rate search
        if_BN: value ``model.if_BN`` takes while training; the previous value is
            restored on every exit path (normal end, early stop, exception)
        drop_rate: probability of the dropout applied to the model outputs, 0 disables it
    Results:
        history: train_loss, train_accuracy, val_loss, val_accuracy of each epoch
    Raises:
        ValueError: if ``epochs < 1`` or ``drop_rate`` is not in ``[0, 1)``
    """
    def forward_backward(optimizer: optim.Optimizer, x: torch.Tensor, y: torch.Tensor) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Run one optimisation step on batch ``(x, y)`` and return ``(loss, outputs)``."""
        optimizer.zero_grad()
        outputs = model(x)
        if drop_out is not None:
            outputs = drop_out(outputs)
        loss_all: torch.Tensor = criterion(outputs, y)
        if wd_reg:
            loss_all = loss_all + wd_reg(model)
        loss_all.backward()
        optimizer.step()
        return loss_all, outputs

    if epochs < 1:
        raise ValueError("Invalid epoch!!")
    if not 0 <= drop_rate < 1:
        raise ValueError("Invalid dropout rate!!")
    # init
    epoch = 0
    init_lr = learning_rate
    # None means "no learning-rate search"; inf lets the first batch always pass
    pre_loss = float("inf") if if_lr_adjust else None
    drop_out = nn.Dropout(drop_rate).to(device) if drop_rate != 0. else None
    history: List[Tuple[float, float, float, float]] = []
    origin_if_BN = model.if_BN
    model.if_BN = if_BN
    try:
        model.to(device)
        model.storage.append(list(model.parameters()))
        optimizer = opt(model.storage[-1], lr=learning_rate)
        # NOTE: the scheduler stays bound to this first optimizer even when the
        # learning-rate search below replaces `optimizer`.
        scheduler = sched(optimizer) if sched else None
        # Train the model
        while epoch < epochs:
            train_loss = 0.0
            train_correct = 0
            model.train()
            for x, y in DataLoader(trainset, batch_size=32, shuffle=True):
                x = x.view(-1, model.input_size).to(device)
                y = y.to(device)
                loss_all, outputs = forward_backward(optimizer, x, y)
                # Learning rate adjustment. Compare with None explicitly: a
                # previous loss of exactly 0.0 must not switch the search off.
                if pre_loss is not None:
                    # While the loss did not improve, shrink the rate and take
                    # another step on the same batch.
                    while pre_loss <= loss_all.item():
                        if learning_rate < min_lr:
                            # learning rate vanished: restart from the initial
                            # rate, take one step and give up on this batch
                            learning_rate = init_lr
                            optimizer = opt(
                                model.storage[-1], lr=learning_rate)
                            loss_all, outputs = forward_backward(
                                optimizer, x, y)
                            break
                        learning_rate *= 0.7
                        optimizer = opt(model.storage[-1], lr=learning_rate)
                        loss_all, outputs = forward_backward(optimizer, x, y)
                    # Grow the stored rate; it takes effect the next time an
                    # optimizer is rebuilt.
                    learning_rate *= 1.2
                    pre_loss = loss_all.item()
                train_loss += loss_all.item() * x.size(0)
                predicted: torch.Tensor = torch.max(outputs.data, 1)[1]
                train_correct += (predicted == y).sum().item()
                model.storage.append(list(model.parameters()))
            train_loss /= len(trainset)
            train_accuracy = train_correct / len(trainset)
            # Validate the model
            val_loss, val_accuracy = validate(
                model=model, device=device, valset=valset, criterion=criterion)
            # Log statistics
            history.append(
                (train_loss, train_accuracy, val_loss, val_accuracy))
            # Stopping criteria
            if learning_goal < val_accuracy:
                return history
            # Update loop
            if scheduler:
                scheduler.step()
            epoch += 1
        return history
    finally:
        # restore model, also on early stop or error
        model.if_BN = origin_if_BN


In [142]:
def test(model: TwoLayerNetwork, device: str, testset: Dataset[torch.Tensor],
         criterion: Optional[nn.modules.loss._Loss] = None) -> float:
    """Return the accuracy of ``model`` on ``testset``.

    Params:
        model, device, testset: forwarded to ``validate``
        criterion: loss evaluated by ``validate``; only the loss half of its
            result depends on it, and that half is discarded here. Defaults to
            a fresh ``nn.CrossEntropyLoss()``, the previously hard-coded choice.
    Results:
        accuracy: second element of ``validate``'s result
    """
    if criterion is None:
        criterion = nn.CrossEntropyLoss()
    return validate(model=model, device=device, valset=testset, criterion=criterion)[1]


In [143]:
def analogizing(model: TwoLayerNetwork, device: str, trainset: Dataset[torch.Tensor], learning_goal: float, criterion: nn.modules.loss._Loss):
    """Return an enlarged copy of ``model`` with three extra hidden units per misclassified sample.

    The network is evaluated by hand as ``relu(x @ W1.T + b1) @ W2.T + b2``;
    dropout, batch norm and ``model.active_func`` take no part in that
    computation. For every training sample this forward pass misclassifies
    (processed from the last wrong index to the first), a 3-unit ReLU block
    that responds to that sample only is found by random search and appended
    to the hidden layer. The loop stops when all initially wrong samples were
    processed or when ``criterion`` is no longer above ``learning_goal``.

    Params:
        model: source network; its fc1/fc2 tensors seed the enlarged weights
        device: device used for the matrix computations
        trainset: dataset yielding ``(x, y)`` pairs, materialised completely in memory
        learning_goal: loss threshold that ends the loop
        criterion: loss evaluated on the whole training set after each added block
    Results:
        new_model: TwoLayerNetwork holding the enlarged fc1/fc2 parameters

    NOTE(review): ``history`` (loss and accuracy after each added block) is
    collected but not returned.
    """
    # Materialise the whole training set as one feature matrix and one label vector on the CPU
    x = torch.stack([x for x, _ in trainset]
                    ).view(-1, model.input_size).to("cpu")
    y = torch.Tensor([y for _, y in trainset]).to("cpu")
    total_amount = len(x)
    # Working tensors that grow as blocks are appended; they start from the model's own values
    new_fc1_w = model.fc1.weight.data.to(device)
    new_fc1_b = model.fc1.bias.data.to(device)
    new_fc2_w = model.fc2.weight.data.to(device)
    relu = nn.ReLU()
    # Placeholder only; replaced by the per-sample maximum output just below
    logits: torch.Tensor = x
    model.eval()
    with torch.no_grad():
        outputs = relu(x.to(device) @ new_fc1_w.T + new_fc1_b
                       ) @ new_fc2_w.T + model.fc2.bias.data.to(device)
        logits, predicted = torch.max(outputs.data, 1)
        success_condition = predicted == y.to(device)
        # Indices of the misclassified samples, shape (count, 1)
        wrong_indices = torch.nonzero(success_condition != True).to("cpu")
    # Xavier-uniform for tensors with more than one dimension, biases are left untouched
    init_func: Callable[[torch.Tensor], torch.Tensor] = lambda x: nn.init.xavier_uniform_(
        tensor=x) if len(x.shape) > 1 else x
    wrong_pointer = len(wrong_indices)
    train_correct: int = 0
    loss: float = float("inf")
    history: List[Tuple[float, float]] = []
    while wrong_pointer > 0 and loss > learning_goal:
        # Fresh 3-unit block: fc1 maps the inputs to 3 hidden units, fc2 (no bias) maps them to the outputs
        fc1 = nn.Linear(len(x[0]), 3).to(device)
        fc2 = nn.Linear(3, product(model.fc2.bias.size()),
                        False).to(device)
        # with torch.no_grad():
        # NOTE(review): target is filled in here but not read again in this function
        target = torch.zeros(*torch.Size((total_amount,)))
        wrong_pointer -= 1
        # pointer is a one-element tensor holding the index of the sample to repair
        pointer = wrong_indices[wrong_pointer]
        catagory = int(y[pointer])
        target[pointer] = catagory
        target = target.to(device)
        # Only the row of the correct category is non-zero. With weights (-2, 1, 1) and the
        # biases set below, the three ReLUs add up (in exact arithmetic) to a bump that is
        # non-zero only where the projected distance to the target lies within delta / 2.
        fc2.weight.data[:, :] = 0
        fc2.weight.data[catagory, 0] = -2
        fc2.weight.data[catagory, 1] = 1
        fc2.weight.data[catagory, 2] = 1
        delta = 0
        intercept = 0
        # Any value that fails the loop test works here; x forces at least one search round
        nonz = x
        # Randomly draw projection directions until the block output is non-zero for
        # exactly one entry and that entry belongs to the target sample
        while nonz.size() != (1, 2) or nonz.tolist()[0][0] != pointer:
            for p in fc1.parameters():
                init_func(p)
            # Project every sample on the first weight row and measure relative to the target
            distances = x.to(device) @ fc1.weight.data[0].T
            intercept = distances[pointer]
            distances -= intercept
            # Smallest distance of any other sample; zero means another sample
            # projects onto the same point, so draw again
            if (delta := torch.min(torch.abs(torch.cat(
                    (distances[:pointer], distances[pointer + 1:])
            )))) == 0:
                continue
            fc1.bias.data[1] = -intercept + (delta / 2)
            fc1.bias.data[2] = -intercept - (delta / 2)
            # delta too small to be represented in float32 (the default): draw again
            if fc1.bias.data[1] == fc1.bias.data[2]:
                continue
            fc1.bias.data[0] = -intercept
            # All three units share the projection direction; only their biases differ
            fc1.weight.data[1:] = fc1.weight.data[0]
            outputs = relu(x.to(device) @ fc1.weight.data.T + fc1.bias.data
                           ) @ fc2.weight.data.T
            nonz = torch.nonzero(outputs)
        # Scale the block so that, for the target sample, it adds the largest original output
        # plus its own response to the correct category, lifting that category above the others
        fc2.weight.data *= logits[pointer].item(
        ) / outputs[pointer].sum() + 1
        # Append the block: 3 more rows for fc1 weight/bias, 3 more columns for fc2 weight
        new_fc1_w = torch.cat((new_fc1_w, fc1.weight.data)).to(device)
        new_fc1_b = torch.cat((new_fc1_b, fc1.bias.data)).to(device)
        new_fc2_w = torch.cat((new_fc2_w, fc2.weight.data), 1).to(device)
        # Re-evaluate loss and accuracy of the enlarged network on the whole training set
        with torch.no_grad():
            outputs = relu(x.to(device) @ new_fc1_w.T + new_fc1_b
                           ) @ new_fc2_w.T + model.fc2.bias.data.to(device)
            loss = criterion(
                outputs, y.to(device=device, dtype=torch.long)).item()
            _, predicted = torch.max(outputs.data, 1)
            train_correct = (predicted == y.to(device)).sum().item()
        history.append((loss, train_correct / total_amount))
    # Construct the new model: identity initializer, the same activation module instance,
    # the same dropout probability and batch-norm flag, hidden size = number of fc1 biases
    new_model = TwoLayerNetwork(model.input_size, len(new_fc1_b), product(
        model.fc2.bias.size()), lambda _: _, lambda: model.active_func, model.do.p, model.if_BN)
    for name, param in model.named_parameters():
        layer_name, variable_type = name.split(".")
        if layer_name == "fc1":
            # eval resolves the local tensor new_fc1_w / new_fc1_b from the parameter name
            setattr(getattr(getattr(new_model, layer_name), variable_type),
                    "data", eval(f"new_{layer_name}_{variable_type[0]}"))
        elif layer_name == "fc2":
            if variable_type == "weight":
                # eval resolves the local tensor new_fc2_w
                setattr(getattr(getattr(new_model, layer_name), variable_type),
                        "data", eval(f"new_{layer_name}_{variable_type[0]}"))
            elif variable_type == "bias":
                # The output bias is unchanged by the added blocks
                new_model.fc2.bias.data[:] = model.fc2.bias.data[:]
            else:
                pass
                setattr(getattr(new_model, layer_name), variable_type, param)
        else:
            # Remaining layers (bn1 here) receive the original Parameter object itself.
            # NOTE(review): new_model.bn1 was built for the enlarged hidden size while these
            # parameters keep the original size -- verify before enabling if_BN on the result.
            setattr(getattr(new_model, layer_name), variable_type, param)
    return new_model

In [159]:
# Prefer a CUDA GPU, then Apple MPS, and fall back to the CPU
device = "cuda" if torch.cuda.is_available(
) else "mps" if torch.backends.mps.is_available() else "cpu"
# Seeded train / validation / test splits; input_size is the number of values per feature sample
trainset, valset, testset, input_size = getCustomizedData()
# NOTE(review): CrossEntropyLoss expects class-index targets, whereas the dataset
# targets are raw salary_in_usd values -- confirm the intended loss.
criterion = nn.CrossEntropyLoss()


2628 751 376


In [None]:
# Experiment configuration
hidden_size = 11
epochs = 300
# Xavier-uniform for tensors with more than one dimension; 1-D tensors (biases) are left as created
init: Callable[[torch.Tensor], torch.Tensor] = lambda x: nn.init.xavier_uniform_(
    tensor=x) if len(x.shape) > 1 else x
active = nn.ReLU
# One output unit, no dropout, batch norm off.
# NOTE(review): a single output combined with CrossEntropyLoss and argmax-based
# accuracy looks inconsistent with salary targets -- confirm the intended setup.
model = TwoLayerNetwork(input_size, hidden_size, 1, init, active, 0., False)
optimize = optim.SGD
schedule = None
learning_goal = 0.6 # training stops once the validation accuracy exceeds this value
learning_rate = 0.001
min_lr = learning_rate * 1e-5
l2_reg = L2_Regularization(0.001)
# Accuracy on the test split before training
RG_EB_LG_UA_BN_DO_baseline = test(model, device, testset)
# Positional tail of the call: if_lr_adjust=True, if_BN=False, drop_rate=0.
RG_EB_LG_UA_BN_DO_history = train(model, optimize, device, epochs, learning_rate,
                   trainset, valset, criterion, schedule, l2_reg, learning_goal, min_lr, True, False, 0.)
# Accuracy on the test split after training
RG_EB_LG_UA_BN_DO_result = test(model, device, testset)
print(RG_EB_LG_UA_BN_DO_baseline, RG_EB_LG_UA_BN_DO_history, RG_EB_LG_UA_BN_DO_result, sep="\n")
# torch.save on the module object serialises the whole model, not only its parameters
model_path = r"./data/rg_eb_lg_ua_"
torch.save(model, model_path)
