In [2]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("wb")
#
# import os
#
# os.environ["WANDB_API_KEY"] = secret_value_0
# os.environ["WANDB_MODE"] = "online"
#
# import wandb

# wandb.init(project="test3", entity="ckodserteam")

In [3]:
import math
import random

import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from torch import functional as F
import numpy as np
import math
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
import torchvision
from torchvision import transforms
from matplotlib import pyplot
from torch.utils.data import (
    Dataset,
    ConcatDataset,
    Subset,
    DataLoader,
    RandomSampler,
)

In [4]:
def norm(A):
    return torch.sum(A*A).item()

class LinearGreedy(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight):
        ctx.save_for_backward(input, weight)
        return torch.matmul(input, torch.transpose(weight, 0, 1))

    @staticmethod
    def backward(ctx, grad_output):
        '''
        :param ctx:
        :param grad_output:
        :return: real gradient for myweight, cut effect of neurons when calculating input grad.
        '''
        eps = 1e-8
        input, weight = ctx.saved_tensors
        weight_grad_initial=torch.matmul(torch.transpose(grad_output, 0, 1), input)
        grad_input_initial=torch.matmul(grad_output, weight)
        f = torch.sum(weight_grad_initial*weight_grad_initial, dim=1)
        f=torch.ones_like(f)
        f_normalize=f/(torch.mean(f)+eps)
        grad_output_normalized=grad_output/(f_normalize+eps)
        grad_input=torch.matmul(grad_output_normalized, weight)
        grad_weight=torch.matmul(torch.transpose(grad_output_normalized, 0, 1), input)

        # print("weight_grad", norm(weight_grad_initial), "->", norm(grad_weight))
        # print("grad_input", norm(grad_input_initial), "->", norm(grad_input))

        return grad_input, grad_weight


    
class GreedyLinearMult(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        linear = nn.Linear(input_size, output_size)
        self.weight = torch.nn.parameter.Parameter(data=linear.weight.clone(), requires_grad=True)
        self.bias = torch.nn.parameter.Parameter(data=linear.bias.clone(), requires_grad=True)
        self.register_parameter("Gweight", self.weight)
        self.register_parameter("Gbias", self.bias)
        with torch.no_grad():
            self.bias_initial_norm=torch.linalg.norm(self.bias.data)
            self.weigth_initial_norm=torch.linalg.matrix_norm(self.weight.data)
        
        
    def record(self):
        with torch.no_grad():
            self.bias_initial_norm=torch.linalg.norm(self.mybias.data)
            self.weigth_initial_norm=torch.linalg.matrix_norm(self.myweight.data)
    
    def normalize(self):
        with torch.no_grad():
            bias_norm=torch.linalg.norm(self.mybias.data)
            weigth_norm=torch.linalg.matrix_norm(self.myweight.data)
            self.myweight.data*=(self.weigth_initial_norm/weigth_norm)
            self.mybias.data*=(self.bias_initial_norm/bias_norm)  
            
    def forward(self, input):
        return LinearGreedy.apply(input, self.weight)+self.bias
                
class GLinear(nn.Module):
    def __init__(self, input_size, output_size, mode, activation):
        super().__init__()
        assert mode in ["greedy", "normal", "intel"]
        self.mode=mode
        self.activation=activation
        
        
        if self.mode == "normal":
            self.linear = nn.Linear(input_size, output_size)
        else:
            self.linear = GreedyLinearMult(input_size, output_size)
        
        if self.mode == "intel":
            self.bn=nn.BatchNorm1d(output_size, affine=False)
        
    
    def change_mode(self, new_mode):
        self.mode=new_mode
        if self.mode == "intel":
            self.bn=nn.BatchNorm1d(output_size, affine=False)
        raise NotImplementedError
        
    def forward(self, input: torch.tensor):
        x = self.linear(input)
        if self.mode == "intel":
            x=self.bn(x)
        if self.activation:
            x=self.activation(x)
        return x

In [5]:
#configs

lr = 0.1
epochs = 200
batch_size = 512
device = "cuda"
dataset_name = "FashionMNIST"
hidden_states=[1024*8,256*8*4,256*8*4,256*8*4,256*8, 128]
mode="greedy"

run="debug"
noise_eps, pgd_eps, iters=2, 0.5, 10

start_from=25
milestones=[50, 100, 150]

# runwandb=wandb.init(
#       # Set the project where this run will be logged
#       project="test4", 
#       # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
#       name=f"{run}", 
#       # Track hyperparameters and run metadata
#       config={
#     "learning_rate": lr,
#     "epochs": epochs,
#     "batch_size": batch_size,
#     "dataset_name":dataset_name,
#     "prevent_stategic_output_decline": prevent_stategic_output_decline,
#     "prevent_stategic_weight_increase": prevent_stategic_weight_increase,
#     "self_interest_neurons":self_interest_neurons,
#     "model_hidden_states": hidden_states,
#     "noise_eps": noise_eps, 
#     "pgd_eps": pgd_eps, 
#     "iters":iters,
#     "milestones":milestones,
#     "after_start_prevent_stategic_weight_increase": after_start_prevent_stategic_weight_increase,
#     "after_start_self_interest_neurons": after_start_self_interest_neurons,
#     "start_from":start_from,
#     "self_interest_layer":self_interest_layer
#     })

In [6]:
# datasets
mean = {
    'MNIST': np.array([0.1307]),
    'FashionMNIST': np.array([0.2859])
}
std = {
    'MNIST': 0.3081,
    'FashionMNIST': 0.2859
}
train_transforms = {
    'MNIST': [transforms.RandomCrop(28, padding=1, padding_mode='edge')],
    'FashionMNIST': [transforms.RandomCrop(28, padding=1, padding_mode='edge')]
}

dataset_class = torchvision.datasets.FashionMNIST
default_transform = [transforms.ToTensor(),
                     transforms.Normalize(mean[dataset_name], [std[dataset_name]] * len(mean[dataset_name]))]

train_transform = transforms.Compose(train_transforms[dataset_name] + default_transform)
test_transform = transforms.Compose(default_transform)
# build data sets
trainDataset = dataset_class(root="./data", train=True, transform=train_transform, download=True)
testDataset = dataset_class(root="./data", train=False, transform=test_transform, download=True)

# build data loaders
trainDataloader = DataLoader(trainDataset,
                             batch_size=batch_size, num_workers=1, sampler=None, drop_last=True, shuffle=True)

testDataloader = DataLoader(testDataset, 
                            batch_size=batch_size, num_workers=1, sampler=None, drop_last=True, shuffle=True)


In [7]:
# model
class ClassifierMLP(torch.nn.Module):
    def __init__(self, hidden_layer_size, class_num, mode):
        super(ClassifierMLP, self).__init__()
        layers = []
        input_size = 784
        self.mode=mode
        for i, h in enumerate(hidden_layer_size):
            layers.append(GLinear(input_size, h, mode, nn.ReLU()))
            input_size = h
        layers.append(GLinear(input_size, class_num, mode, None))
        self.deep = nn.Sequential(*layers)

    def forward(self, x):
        x = torch.flatten(x, start_dim=1)
        x=self.deep(x)
        return x

In [8]:
# change model state
def set_to_eval(model):
    model.eval()
            

def set_to_train(model):
    model.train()

In [9]:
# PGD Attack
# datasets
pgd_mean = {
    'MNIST': np.array([0.1307]),
    'FashionMNIST': np.array([0.2859])
}
pgd_std = {
    'MNIST': 0.3081,
    'FashionMNIST': 0.2859
}
def pgd_attack(model, images, labels, eps=0.3, alpha=2/255, iters=40, t=1):
    images = images.to(device)
    labels = labels.to(device)
    loss = nn.CrossEntropyLoss()
        
    ori_images = images.data
    min_val=((0-pgd_mean[dataset_name][0])/pgd_std[dataset_name])
    max_val=(1-pgd_mean[dataset_name][0])/pgd_mean[dataset_name][0]
    for i in range(iters) :    
        images.requires_grad = True
        outputs = model(images, t)

        model.zero_grad()
        cost = loss(outputs, labels).to(device)
        cost.backward()

        adv_images = images + alpha*images.grad.sign()
        eta = torch.clamp(adv_images - ori_images, min=-eps, max=eps)
        images = torch.clamp(ori_images + eta, min=min_val, max=max_val).detach_()
            
    return images

def add_noise(images, eps):
    min_val=((0-pgd_mean[dataset_name][0])/pgd_std[dataset_name])
    max_val=(1-pgd_mean[dataset_name][0])/pgd_std[dataset_name]
    eta=torch.normal(0.0, eps/2, images.shape, device=device)
    images = torch.clamp(images + eta, min=min_val, max=max_val).detach_() 
    return images

In [10]:
def track_model(model, epoch, step=0):
    for name, param in model.named_parameters():
        try:
            if len(param.data.shape)==1:
                wandb.log({f"w_{name}_norm2":torch.linalg.norm(param.data) , 'epoch': epoch, 'batch': step})
                wandb.log({f"grad_{name}_norm2":torch.linalg.norm(param.grad) , 'epoch': epoch, 'batch': step})
            else:
                wandb.log({f"w_{name}_norm2":torch.linalg.matrix_norm(param.data) , 'epoch': epoch, 'batch': step})
                wandb.log({f"grad_{name}_norm2":torch.linalg.matrix_norm(param.grad) , 'epoch': epoch, 'batch': step})
        except:
            pass
#                 print(name)

In [11]:
def normal_eval(model, testDataloader, epoch, loss_func):
    with torch.no_grad():
        correct = 0
        total = 0
        loss = 0
        for step, (x, y) in enumerate(testDataloader):
            # forward pass
            x, y = x.to(device), y.to(device)
            output = model(x)
            loss_c = loss_func(output, y)
            output = torch.argmax(output, dim=1)
            loss += loss_c.detach().item()
            total += x.shape[0]
            correct += torch.sum(y == output).detach().item()

        acc=(correct / total)
        loss=loss / len(testDataloader)
#         wandb.log({"test_loss": loss,'epoch': epoch})
#         wandb.log({"accuracy": acc,'epoch': epoch})
        print(f'normal accuracy={round(acc, 3)}, loss={round(loss, 3)}')
        
def robust_eval(model, testDataloader, epoch, loss_func, eps, iters, t=1):
    correct = 0
    total = 0
    loss = 0
    for step, (x, y) in enumerate(testDataloader):
        # forward pass
        x, y = x.to(device), y.to(device)
        x=pgd_attack(model, x, y, eps=eps, alpha=(eps/iters)*2.5, iters=iters, t=t)
        with torch.no_grad():
            output = model(x)
            loss_c = loss_func(output, y)
            output = torch.argmax(output, dim=1)
            loss += loss_c.detach().item()
            total += x.shape[0]
            correct += torch.sum(y == output).detach().item()

    acc=(correct / total)
    loss=loss / len(testDataloader)
    if t==1:
        wandb.log({"robust_test_loss": loss,'epoch': epoch})
        wandb.log({"robust_accuracy": acc,'epoch': epoch})
        print(f'robust accuracy={round(acc, 3)}, loss={round(loss, 3)}')
    else:
        return acc, loss
    
def strong_robust_eval(model, testDataloader, epoch, loss_func, eps, iters):
    t=0.001
    while t<100:
        t*=2
        if t!=1:
            acc, loss=robust_eval(model, testDataloader, epoch, loss_func, eps, iters, t)
            wandb.log({"robust_test_loss_t": loss,'t': t})
            wandb.log({"robust_accuracy": acc,'t': t})
            print(f'robust accuracy={round(acc, 3)}, loss={round(loss, 3)}, t={t}, steps={iters}')
    
def noise_robust_eval(model, testDataloader, epoch, loss_func, eps):
    correct = 0
    total = 0
    loss = 0
    for step, (x, y) in enumerate(testDataloader):
        # forward pass
        x, y = x.to(device), y.to(device)
        x=add_noise(x, eps=eps)
        with torch.no_grad():
            output = model(x)
            loss_c = loss_func(output, y)
            output = torch.argmax(output, dim=1)
            loss += loss_c.detach().item()
            total += x.shape[0]
            correct += torch.sum(y == output).detach().item()

    acc=(correct / total)
    loss=loss / len(testDataloader)
    wandb.log({"noise robust_test_loss": loss,'epoch': epoch})
    wandb.log({"noise robust_accuracy": acc,'epoch': epoch})
    print(f'noise robust accuracy={round(acc, 3)}, loss={round(loss, 3)}')

    
def check_cos_similarity(model, testDataloader, loss_func, layer):
    testing_model = ClassifierMLP(hidden_states, 10, prevent_stategic_output_decline, prevent_stategic_weight_increase, self_interest_neurons).to(device)
    testing_model.load_state_dict(model.state_dict())
    testing_optimizer = torch.optim.SGD(params=testing_model.parameters(), lr=0.0001)
    testing_optimizer.zero_grad()
    tx=None
    for x, y in testDataloader:
        x= x.to(device)
        tx= x
        break
    y=testing_model.assumsion_check_forward(tx, layer, 0)
    ty=torch.normal(0.0, 3, y.shape, device=device)
    loss=torch.sum((y*ty))
    
    loss.backward()
    testing_optimizer.step()
    with torch.no_grad():
        new_y=testing_model.assumsion_check_forward(tx, layer, 0)
        diff=new_y-y
        ty=ty/torch.sum(ty*ty)
        diff=diff/torch.sum(diff*diff)
        cos=torch.sum(ty*diff)
        return cos.cpu().item()
        
def assumsion_check(model, testDataloader, loss_func):
    for i in range(1,7):
        cos_i=check_cos_similarity(model, testDataloader, loss_func, i)
        print(f"cos_{i}={cos_i}")
        wandb.log({f"cos_{i}": cos_i,'epoch': epoch, 'batch': step})
        


In [12]:
device="cuda"
model = ClassifierMLP(hidden_states, 10, "greedy").to(device)
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name)
# print(model)

AssertionError: Torch not compiled with CUDA enabled

In [None]:
loss_func=torch.nn.CrossEntropyLoss()
epoch=0

In [116]:
for epoch in range(0,epochs):
#     if epoch==start_from:
#         model.self_interest_neurons=after_start_self_interest_neurons
#         model.prevent_stategic_weight_increase=after_start_prevent_stategic_weight_increase
#         self_interest_neurons=after_start_self_interest_neurons
#         prevent_stategic_weight_increase=after_start_prevent_stategic_weight_increase
        
    set_to_train(model)
    avgloss = 0
    with tqdm(total=len(trainDataloader), position=0, leave=False) as pbar:
        for step, (x, y) in enumerate(trainDataloader):
            # forward pass
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = loss_func(output, y)
            print(f"loss:{loss}")
            # backward pass
            loss.backward()
            optimizer.step()
            pbar.update()
                
#             if step==int(len(trainDataloader)/2):
#                 model.normalize_weights()
                
            #tracking
            avgloss += loss.detach().item()
            if step%10==1:
                optimizer.zero_grad()
#                 model.normalize_weights()
                
            if step%40==1:
                print(f"train_loss:{avgloss/40}  epoch:{epoch}, batch:{step}")
#                 wandb.log({"train_loss": avgloss/20,'epoch': epoch, 'batch': step})
                avgloss=0
                
#             if step==0:
#                 track_model(model, epoch)
    scheduler.step()
#     wandb.log({"learning_rate_plot": scheduler.get_last_lr()[-1],'epoch': epoch})
#     optimizer.zero_grad()
#     model.normalize_weights()
#     set_to_eval(model)
    # normal_eval(model, testDataloader, epoch, loss_func)
#     robust_eval(model, testDataloader, epoch, loss_func, pgd_eps, iters)  
#     noise_robust_eval(model, testDataloader, epoch, loss_func, noise_eps)
#     set_to_train(model)
#     assumsion_check(model, testDataloader, loss_func)  

  0%|          | 0/117 [00:00<?, ?it/s]

loss:2.299886465072632


  1%|          | 1/117 [00:11<21:38, 11.20s/it]

loss:2.30244517326355


  2%|▏         | 2/117 [00:20<18:48,  9.81s/it]

train_loss:0.11505829095840454  epoch:0, batch:1
loss:2.3009252548217773


  3%|▎         | 3/117 [00:28<17:46,  9.35s/it]

loss:2.3012654781341553


  3%|▎         | 4/117 [00:37<17:11,  9.13s/it]

loss:2.2973484992980957


                                               

KeyboardInterrupt: 

In [None]:
# best_weights=wandb.restore('final_model_199.pt', run_path="ckodserteam/test3/3sem9zdl")
# print(best_weights.name)
# st=torch.load(best_weights.name)
# model.load_state_dict(st)

In [None]:
# set_to_eval(model)
# normal_eval(model, testDataloader, epoch, loss_func)
# robust_eval(model, testDataloader, epoch, loss_func, pgd_eps, iters)  
# noise_robust_eval(model, testDataloader, epoch, loss_func, noise_eps)
# set_to_train(model)
# assumsion_check(model, testDataloader, loss_func)  

In [None]:
# set_to_eval(model)
# strong_robust_eval(model, testDataloader, epoch, loss_func, pgd_eps, iters)

In [None]:
# torch.save(model.state_dict(),os.path.join(wandb.run.dir, f"final_model_{epoch}.pt"))
# wandb.save(f"final_model_{epoch}.pt")
        
# runwandb.finish()