<a href="https://colab.research.google.com/github/denny0323/LabelCorruption/blob/main/DFL/DFL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys

# Base folder used throughout the notebook: it is added to the module search
# path here and later passed as `pkl_dir` and used as the checkpoint root.
# NOTE(review): presumably a mounted Google Drive folder on Colab — TODO confirm.
path = '/content/drive/MyDrive/Colab Notebooks/'

# Guarded so that re-running this cell does not append the same entry again.
if path not in sys.path:
    sys.path.append(path)

In [None]:
import sys, os
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.optim as optim
import matplotlib.pyplot as plt

# NOTE(review): the wildcard imports below hide where names come from. This
# notebook uses `torch`, `F`, `WBMs`, `WBMsSiamese`, `WBMsSiamese_standard`,
# `SiameseNet`, `CLLoss`, `ContrastiveLoss` and `GMeans` without importing them
# by name, so they presumably arrive through these three project modules
# (located via the earlier `sys.path.append(path)`) — TODO confirm and import
# them explicitly.
from util_SB import *
from modelDFL import *
from gmeans import *

import numpy as np
import pandas as pd
import seaborn as sbn

# NOTE(review): duplicates the `matplotlib.pyplot as plt` import above.
from matplotlib import pyplot as plt

import warnings
# Suppresses all warnings for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [None]:
from scipy.stats import entropy

In [None]:
def predict(x, Siamese, features_reps):
    """Classify one sample by its nearest representative in embedding space.

    Args:
        x: A single, un-batched input sample. A leading batch axis is added
            before it is passed to ``Siamese.forward_once``; it must already be
            on the same device as the model.
        Siamese: Model exposing ``forward_once(batch) -> embedding``.
        features_reps: Non-empty sequence of pre-computed representative
            embeddings (each the output of ``forward_once`` on a batch of one).
            The position of an embedding in the sequence is the value returned
            when it is the closest one.

    Returns:
        A 0-dim integer tensor holding the index of the representative with the
        smallest pairwise distance to the embedding of ``x``.

    Raises:
        ValueError: If ``features_reps`` is empty.
    """
    if len(features_reps) == 0:
        raise ValueError("features_reps must contain at least one representative embedding")

    with torch.no_grad():
        # The query embedding does not depend on the representative, so it is
        # computed once instead of once per loop iteration.
        query = Siamese.forward_once(x.unsqueeze(0))

        # Sized from the input: a fixed-size buffer would leave zero-filled
        # slots (which always win the argmin) when fewer representatives are
        # given, and overflow when more are given.
        distances = torch.zeros(len(features_reps))
        for i, rep in enumerate(features_reps):
            # Follow the query's device rather than forcing CUDA.
            rep = rep.to(query.device)
            distances[i] = F.pairwise_distance(rep, query, keepdim=True).squeeze().cpu()
    return torch.argmin(distances)

In [None]:
# Training set loaded without a NOISE_LEVEL argument. The experiment cell below
# compares its labels (`.Y`) with those of the noisy training set to count how
# many samples had their label changed.
train_set_track = WBMs(pkl_dir=path, train=True)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, average_precision_score
from sklearn.metrics import recall_score, precision_score

# ---------------------------------------------------------------------------
# Experiment configuration (same values as before, now named in one place)
# ---------------------------------------------------------------------------
NOISE_LEVELS = [0.20, 0.50, 0.70, 0.75]
NUM_CLASSES = 7                  # one representative test sample is picked per class
EXTRACT_CHUNK = 50               # rows embedded per forward pass during feature extraction
MAX_EXTRACT_RETRIES = 3          # attempts before giving up on NaN embeddings
NOISE_DISTANCE_THRESHOLD = 0.7   # distance to the cluster centre at/above which a sample is flagged as noise
PAIR_DISTANCE_THRESHOLD = 0.5    # pairs with a smaller embedding distance are predicted as label True/1
EVAL_EVERY = 10                  # evaluate on the test pairs every this many training batches
PRETRAINED_CKPT = path+'/preSiamese/'+'Pretrain_best_epoch_192_valLoss_0.08648_valAcc_0.96296_noise_0.00_bestAccVal_0.96296.pth'


def _embed_in_chunks(model, inputs, chunk_size):
    """Embed every row of ``inputs`` with ``model.forward_once``, chunk by chunk.

    The chunk outputs are concatenated, so no row can be left uninitialised
    when the number of rows is not a multiple of ``chunk_size``.
    """
    chunks = [model.forward_once(inputs[start:start + chunk_size])
              for start in range(0, len(inputs), chunk_size)]
    return torch.cat(chunks, dim=0)


def _evaluate_pairs(model, loader, loss_fn, distance_threshold):
    """Return ``(mean per-batch loss, accuracy)`` over ALL batches of ``loader``.

    The model is switched to eval mode for the pass and its previous
    train/eval mode is restored afterwards.
    """
    was_training = model.training
    model.eval()
    loss_sum, n_batches, n_correct, n_seen = 0.0, 0, 0, 0
    with torch.no_grad():
        for test1, test2, label_test in loader:
            test1, test2, label_test = test1.cuda(), test2.cuda(), label_test.cuda()
            output1_test, output2_test = model(test1, test2)
            loss_sum += loss_fn(output1_test, output2_test, label_test).item()

            dist = F.pairwise_distance(output1_test, output2_test, keepdim=True)
            pred = dist < distance_threshold
            n_correct += torch.sum(pred == label_test).item()
            n_seen += test1.size()[0]
            n_batches += 1
    model.train(was_training)
    return loss_sum / max(n_batches, 1), n_correct / max(n_seen, 1)


experiment_results = {}
batch_size = 256
test_batch_size = 100
train_set_track = WBMs(pkl_dir=path, train=True)

for NOISE_LEVEL in NOISE_LEVELS:

    train_set = WBMs(pkl_dir=path, train=True, NOISE_LEVEL=NOISE_LEVEL)
    # Counted before any relabeling below mutates `train_set.Y`.
    print("# of noise samples : ", sum([y!=y_n for y, y_n in zip(train_set_track.Y, train_set.Y)]))
    test_set = WBMs(pkl_dir=path, train=False)
    test_set_siamese = WBMsSiamese_standard(test_set)
    test_loader = DataLoader(test_set_siamese, batch_size=test_batch_size, shuffle=False)

    ### 0. Load the base model
    Siamese = SiameseNet().cuda()
    Siamese.load_state_dict(torch.load(PRETRAINED_CKPT, map_location='cuda'))

    # Pick the first test sample seen for each class as that class's representative.
    indices = [-1] * NUM_CLASSES
    representatives = [None] * NUM_CLASSES
    for i, x, y in test_set:
        if indices[y] == -1:
            representatives[y] = x
            indices[y] = i
        if all(idx >= 0 for idx in indices):
            break
    missing = [c for c, rep in enumerate(representatives) if rep is None]
    if missing:
        raise RuntimeError("no test sample found for class(es) %s; cannot build representatives" % missing)

    optimizer = optim.Adam(Siamese.parameters(), lr = 3e-6)
    epochs = 10
    best_acc = 0
    new_class = max(train_set.Y)+1      # next unused label id for "new class" clusters
    theta_low, theta_high = [0.3, 0.9]  # impurity thresholds for relabeling

    criterion = CLLoss()
    criterion_test = ContrastiveLoss()

    for epoch in range(1, epochs+1):
        ### 1. Extract features (on CPU, in inference mode)
        Siamese = Siamese.to('cpu')
        Siamese.eval()
        with torch.no_grad():
            # Bounded retry: the previous unbounded `while True` would spin
            # forever if the embeddings kept coming back with NaNs.
            for _attempt in range(MAX_EXTRACT_RETRIES):
                tensor_X = _embed_in_chunks(Siamese, train_set.X, EXTRACT_CHUNK)
                if not torch.isnan(tensor_X).any():
                    break
                del tensor_X
                torch.cuda.empty_cache()
            else:
                raise RuntimeError("feature extraction produced NaN embeddings in %d consecutive attempts"
                                   % MAX_EXTRACT_RETRIES)

            # Alias, not a copy: the relabeling below intentionally persists in
            # `train_set.Y` across epochs, as in the original cell.
            label = train_set.Y

            ### 2. Gmeans clustering
            cpu_X = tensor_X.cpu()
            features_np = cpu_X.detach().numpy()
            gmeans = GMeans(min_obs=10, random_state=33, strictness=4)
            gmeans.fit(features_np)

            clusters = gmeans.labels_
            centers = gmeans.centers_

            ##### check noise (outlier detection): far from own cluster centre
            isnoise = np.array([np.linalg.norm(feat - centers[cid]) >= NOISE_DISTANCE_THRESHOLD
                                for feat, cid in zip(features_np, clusters)])

            ##### Cluster impurity measuring
            for c in np.unique(clusters):
                idx = clusters==c
                cluster_labels = label[idx]
                label_counts = np.unique(cluster_labels, return_counts=True)[1]

                Imp_c = entropy(label_counts)                                   # label entropy inside the cluster
                R_c = int(torch.mode(torch.from_numpy(cluster_labels), 0)[0])  # most frequent label in the cluster

                ##### Class relabeling
                if Imp_c <= theta_low:
                    # Nearly pure cluster: everyone takes the majority label.
                    label[idx] = R_c

                elif theta_high < Imp_c:
                    # Very mixed cluster: treat it as a brand-new class.
                    label[idx] = new_class
                    new_class += 1

                else:
                    # In-between: members disagreeing with the majority label are
                    # flagged as noise. The mask is restricted to this cluster;
                    # the previous `label==int(lbl)` mask matched that label
                    # across the whole dataset.
                    isnoise[idx & (label != R_c)] = True

        # 3. discriminative feature learning
        siamese_dataset = TensorDataset(train_set.X,
                                        torch.from_numpy(np.asarray(list(zip(label, clusters, isnoise))))
                                      )

        train_set_siamese = WBMsSiamese(siamese_dataset)
        train_loader = DataLoader(train_set_siamese, batch_size=batch_size, shuffle=True)
        # Taken from the loader itself so the "last batch" test below is exact
        # even when the dataset size is a multiple of `batch_size`.
        num_iter = len(train_loader)

        Siamese = Siamese.cuda()
        Siamese.train()
        for batch_idx, data in enumerate(train_loader):
            img0, img1, Uij, Cij, Iij = data
            img0, img1, Uij, Cij, Iij = img0.cuda(), img1.cuda(), \
                                        Uij.cuda(), Cij.cuda(), Iij.cuda()

            optimizer.zero_grad()

            output0, output1 = Siamese(img0, img1)

            loss = criterion(output0, output1, Uij, Cij, Iij)
            loss.backward()

            optimizer.step()

            if batch_idx % EVAL_EVERY == 0 or batch_idx+1 == num_iter:
                # Accuracy/loss over the whole test loader (previously only the
                # last test mini-batch decided `acc` and `best_acc`).
                test_loss, acc = _evaluate_pairs(Siamese, test_loader, criterion_test,
                                                 PAIR_DISTANCE_THRESHOLD)
                if acc > best_acc:
                    best_acc = acc

                sys.stdout.write('\r')
                sys.stdout.write('%s:%.2f-%s | Epoch [%3d/%3d] Iter[%3d/%3d] Train_loss: %.6f | Test_loss: %.6f | acc: %.4f | best_acc: %.4f'
                        %('WBMs', train_set.NOISE_LEVEL, train_set.noise_mode, epoch, epochs, batch_idx+1, num_iter,
                        loss.item(), test_loss, acc, best_acc))
                sys.stdout.flush()

            if batch_idx+1 == num_iter:
                sys.stdout.write('\n')

        # Release per-epoch tensors/objects before the next extraction pass.
        del img0, img1, Uij, Cij, Iij
        del tensor_X, cpu_X, features_np, gmeans, clusters, centers, isnoise
        del siamese_dataset, train_set_siamese, train_loader

    ### 4. Nearest-representative classification on the test set
    # eval() is set BEFORE embedding the representatives so that they and the
    # test samples are embedded in the same mode.
    Siamese.eval()
    with torch.no_grad():
        features_reps = [Siamese.forward_once(rep.cuda().unsqueeze(0)) for rep in representatives]

    test_loader_for_classification = DataLoader(test_set, batch_size=test_batch_size, shuffle=False)
    test_loss = 0.0
    all_true, all_pred = [], []

    with torch.no_grad():
        for _sample_ids, x, y in test_loader_for_classification:
            x = x.cuda(); y = y.cuda()
            x_features = Siamese.forward_once(x)
            y_pred = torch.from_numpy(np.asarray([int(predict(xx, Siamese, features_reps)) for xx in x]))
            # Embedding of the predicted class representative for every sample.
            rep_xs = torch.cat([features_reps[int(k)] for k in y_pred], dim=0)

            loss = criterion_test(x_features.cuda(), rep_xs.cuda(), (y_pred.cuda()==y).int())*100
            test_loss += loss.item()

            all_true.append(y.cpu().numpy())
            all_pred.append(y_pred.numpy())

    # Normalise by the loader that was actually iterated.
    test_loss /= len(test_loader_for_classification)

    # Metrics are computed once over the full test set: averaging per-batch
    # macro scores is distorted by classes that are absent from a batch and by
    # a smaller final batch.
    y_true_all = np.concatenate(all_true)
    y_pred_all = np.concatenate(all_pred)
    MicroF1 = f1_score(y_true_all, y_pred_all, average='micro')
    MacroF1 = f1_score(y_true_all, y_pred_all, average='macro')
    EMR = accuracy_score(y_true_all, y_pred_all)
    Precision = precision_score(y_true_all, y_pred_all, average='macro')
    Recall = recall_score(y_true_all, y_pred_all, average='macro')

    print('┌ MicroF1: %5f\n│ MacroF1: %5f\n│ EMR: %5f\n│ Precision: %5f\n│ Recall: %5f\n└ ContrastiveLoss: %5f\n' %(MicroF1, MacroF1, EMR, Precision, Recall, test_loss))
    del train_set, test_set, test_set_siamese, test_loader
    experiment_results[NOISE_LEVEL] = {'MicroF1':MicroF1, 'MacroF1':MacroF1, 'EMR':EMR, 'Precision':Precision, 'Recall':Recall}

# of noise samples :  4632
WBMs:0.20-symm | Epoch [  1/ 10] Iter[ 91/ 91] Train_loss: 0.000000 | Test_loss: 0.265854 | acc: 0.7222 | best_acc: 0.8444
WBMs:0.20-symm | Epoch [  2/ 10] Iter[ 91/ 91] Train_loss: 0.152310 | Test_loss: 0.378389 | acc: 0.6889 | best_acc: 0.8444
WBMs:0.20-symm | Epoch [  3/ 10] Iter[ 91/ 91] Train_loss: 0.314406 | Test_loss: 0.252158 | acc: 0.7778 | best_acc: 0.8444
WBMs:0.20-symm | Epoch [  4/ 10] Iter[ 91/ 91] Train_loss: 0.198331 | Test_loss: 0.331085 | acc: 0.6556 | best_acc: 0.8444
WBMs:0.20-symm | Epoch [  5/ 10] Iter[ 91/ 91] Train_loss: 0.209942 | Test_loss: 0.383944 | acc: 0.6667 | best_acc: 0.8444
WBMs:0.20-symm | Epoch [  6/ 10] Iter[ 91/ 91] Train_loss: 0.079844 | Test_loss: 0.419296 | acc: 0.6222 | best_acc: 0.8444
WBMs:0.20-symm | Epoch [  7/ 10] Iter[ 91/ 91] Train_loss: 0.131067 | Test_loss: 0.502868 | acc: 0.6222 | best_acc: 0.8444
WBMs:0.20-symm | Epoch [  8/ 10] Iter[ 91/ 91] Train_loss: 0.157313 | Test_loss: 0.575982 | acc: 0.6111 | best_a

In [None]:
import numpy as np
import pandas as pd

# `experiment_results` is a dict of dicts: its outer keys (noise levels) become
# the columns and the inner keys (metric names) become the row index.
result_df = pd.DataFrame(data=experiment_results)

In [None]:
# Show the metric-by-noise-level summary table built in the previous cell.
display(result_df)

Unnamed: 0,0.20,0.50,0.70,0.75
MicroF1,0.509655,0.532146,0.521552,0.544023
MacroF1,0.430109,0.444232,0.44199,0.483596
EMR,0.509655,0.532146,0.521552,0.544023
Precision,0.45352,0.461577,0.470303,0.503534
Recall,0.472578,0.495255,0.499282,0.543338


In [None]:
import os
# Asks the operating system to shut down once the cells above have finished.
# NOTE(review): `-s -t 0` is the Windows `shutdown` syntax, while `path` points
# at a Colab-style `/content/drive/...` location; the saved output `256` is a
# non-zero status, so the command presumably failed in that run — TODO confirm
# the intended environment before relying on this cell.
os.system('shutdown -s -t 0')

256

In [None]:
# NOTE(review): same call as the first display cell, but the saved output below
# differs from it, so it presumably comes from a different run of the
# experiment — TODO record which configuration produced it.
display(result_df)

Unnamed: 0,0.20,0.50,0.70,0.75
MicroF1,0.170115,0.244119,0.373525,0.361648
MacroF1,0.129488,0.185097,0.275998,0.299626
EMR,0.170115,0.244119,0.373525,0.361648
Precision,0.163462,0.240035,0.340528,0.369727
Recall,0.228961,0.266648,0.30412,0.364257


In [None]:
# NOTE(review): same call as the first display cell, but the saved output below
# differs from it, so it presumably comes from a different run of the
# experiment — TODO record which configuration produced it.
display(result_df)

Unnamed: 0,0.20,0.50,0.70,0.75
MicroF1,0.593985,0.591034,0.568946,0.576111
MacroF1,0.480629,0.481915,0.460345,0.471385
EMR,0.593985,0.591034,0.568946,0.576111
Precision,0.534257,0.532627,0.472998,0.473816
Recall,0.534528,0.530063,0.51328,0.507456
