In [1]:
from __future__ import print_function, division
import os
import torch


# DataLoader은 Dataset을 샘플에 쉽게 접근할 수 있도록 순회가능한 객체(iterable)로 감쌉니다
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
from torchvision.transforms import ToTensor
import torchvision.models as models 

import pprint
from datetime import datetime



import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from collections import defaultdict

from network_architectures import MNIST_BN_32_64_256
from active_learn import argparser

In [2]:
import numpy as np
original_data = datasets.MNIST(
        root="data",
        train=True,
        download=True,
        transform=ToTensor()
    )

test_data = datasets.MNIST(
        root="data",
        train=False,
        download=True,
        transform=ToTensor()
    )


original_all = []
original_dataset = []
original_label = [] 

for i, sample in enumerate(original_data) : 
    original_all.append(sample)
    feature = np.array(sample[0])
    original_dataset.append(feature)
    original_label.append([sample[1], i])
    

test_dataset = [] 
test_label = [] 

for i, test in enumerate(test_data) : 
    feature = np.array(test[0])
    test_dataset.append(feature)
    test_label.append([test[1], i])


unlabeled_dataset = original_dataset[:]
unlabeled_dataset_label = original_label[:]
labeled_dataset = [] 
labeled_dataset_label = []

c_labeled_dataset = [] 
c_labeled_dataset_label = []

count_subgraph = defaultdict(list)

In [3]:
PATH = './weights/MNIST/'

CAE = torch.load(PATH + 'CAE.pt')  
CAE.load_state_dict(torch.load(PATH + 'CAE_state_dict.pt'))  

sample_size = 50
if len(unlabeled_dataset) < sample_size:
    sample_size = len(unlabeled_dataset)

In [4]:
use_cuda = True

device = torch.device("cuda" if use_cuda else "cpu")
    # use_cuda가 true라면 kwargs를 다음과 같이 지정하기. 
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}


In [28]:
from active_learn import active_sample

un_sample_index, c_sample_index, radius  = active_sample(unlabeled_dataset, labeled_dataset, c_labeled_dataset, sample_size, model=CAE, device=device)

sample_data = [unlabeled_dataset[i] for i in un_sample_index]
sample_label = [unlabeled_dataset_label[i] for i in un_sample_index]

for i in un_sample_index[::-1] : 
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]

if len(c_sample_index) != 0 : 
    c_sample_data = [c_labeled_dataset[i] for i in c_sample_index]
    c_sample_label = [c_labeled_dataset_label[i] for i in c_sample_index]
    sample_data = np.concatenate((sample_data, c_sample_data), axis=0) 
    sample_label = np.concatenate((sample_label, c_sample_label), axis=0) 

    for i in c_sample_index[::-1] : 
        np.delete(c_labeled_dataset, i, axis=0)
        np.delete(c_labeled_dataset_label, i, axis=0)

if len(labeled_dataset_label) == 0 :  
    labeled_dataset = sample_data[:]
    labeled_dataset_label = sample_label[:]
else : 
    labeled_dataset = np.concatenate((labeled_dataset,sample_data),axis=0)
    labeled_dataset_label = np.concatenate((labeled_dataset_label, sample_label), axis =0)


Max distance from cluster : 11.48


In [29]:
from active_learn import adjacency_subgraph, make_subgraph

subgraph, density_subgraph = make_subgraph(labeled_dataset_label, original_dataset, radius, CAE)
# 여기에서 sample_dataset을 sample_data로 변경 필요!

dist_class, adj_dist, classified_subgraph_index, pseudo_class_label = adjacency_subgraph(labeled_dataset, labeled_dataset_label, radius, CAE, 0)

print("Well work!")

from active_learn import first_classification, check_performance
f_classification = first_classification(classified_subgraph_index, pseudo_class_label, subgraph, density_subgraph, 0, 0)
num_classification, score, dic_score = check_performance(f_classification,original_label)


Well work!


In [30]:
num_classification, score

(3709, 0.9956861687786466)

In [31]:
# CS1 방법을 적용 후, unlabeled dataset 구분하기 
erase_dataset_ori_index = []
pre_index = [j[1] for j in c_labeled_dataset_label]

for i in f_classification.keys(): 
    index = f_classification[i]
    
    index = list(set(index) - set(pre_index))

    new_labeled_dataset = [original_dataset[j] for j in index]
    new_labeled_dataset_label = [ [i,j] for j in index ]
    new_erase_original_index = [new_labeled_dataset_label[j][1] for j in range(len(new_labeled_dataset_label))]

    if len(c_labeled_dataset_label) == 0 : 
        c_labeled_dataset = new_labeled_dataset
        c_labeled_dataset_label = new_labeled_dataset_label

    # new_labeled_dataset 이 없는 경우가 발생한다 조치 필요!        
    else : 
        c_labeled_dataset = np.concatenate((c_labeled_dataset, new_labeled_dataset), axis=0)
        c_labeled_dataset_label = np.concatenate((c_labeled_dataset_label, new_labeled_dataset_label), axis =0)
    
    erase_dataset_ori_index += new_erase_original_index

erase_unlabeled_index = [np.where(np.array(unlabeled_dataset_label).T[1] == i)[0][0]  for i in erase_dataset_ori_index]
erase_unlabeled_index.sort()


for i in erase_unlabeled_index[::-1] : 
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]

In [9]:
def update_count_subgraph(count_subgraph, original_dataset_label, subgraph, num_class) : 
    all_index = [i[1] for i in original_dataset_label]
    for i in all_index : 
        count = [0]*num_class
        i_subgraph = np.where(subgraph[:, i]==1)[0]
        for j in i_subgraph : 
            count[labeled_dataset_label[j][0]] += 1

        if sum(count) != 0 : count_subgraph[i].append([count, radius[0]]) 

    
    return count_subgraph

def check_CS1(c_labeled_dataset_label, count_subgraph) : 
    c_index = [i[1] for i in c_labeled_dataset_label]
    c_index.sort()
    restore_index = [] 

    for num, index in enumerate(c_index) : 
        # count 개수만 불러오기. 아 이건 좀 아닌 것 같긴 하다. 처음에 거리가 멀리 있었던 값에 걸쳐질 수 있는 거니까. 
        # sample의 크기를 키워서 반지름의 크기를 줄이는 방법밖에 없나...?
        # 또는 최근의 횟수만 불러온다던가.. 
        count_list = np.array(count_subgraph[index])[:,0]
        for j,count in enumerate(count_list) : 
            if j ==0 : sum_count = count 
            else : sum_count = [x + y for x,y in zip(sum_count, count)] 

        c_label = c_labeled_dataset_label[num][0]
        # c_label 외에 다른 클래스에 subgraph에 속한 적이 있다면 
        if int(sum_count[c_label]) != sum(sum_count) : 
            restore_index.append(num)
    
    restore_index.sort()
    return restore_index


In [32]:
# unlabeled의 변화를 Folliwing 하지 못하기 때문에 original index로 접근해야 한다. 
update_count_subgraph(count_subgraph, original_label, subgraph, 10)

delete_c_index = check_CS1(c_labeled_dataset_label, count_subgraph) 

"""
for i in delete_c_index[::-1] : 
    unlabeled_dataset = np.concatenate((unlabeled_dataset, c_labeled_dataset[i]), axis =0)
    unlabeled_dataset_label = np.concatenate((unlabeled_dataset_label, c_labeled_dataset_label[i]), axis=0)
    del c_labeled_dataset[i]
    del c_labeled_dataset_label[i]
"""


  count_list = np.array(count_subgraph[index])[:,0]


'\nfor i in delete_c_index[::-1] : \n    unlabeled_dataset = np.concatenate((unlabeled_dataset, c_labeled_dataset[i]), axis =0)\n    unlabeled_dataset_label = np.concatenate((unlabeled_dataset_label, c_labeled_dataset_label[i]), axis=0)\n    del c_labeled_dataset[i]\n    del c_labeled_dataset_label[i]\n'

In [11]:
len(count_subgraph)

59950

In [89]:
import numpy as np


# batch 단위로 들어오는 구나. 그럼 data loarder을 통해서 input 값들을 조정할 필요가 있겠는 걸 
def mixup_data(x, y, mixup_alpha =4):
    lam = np.random.beta(mixup_alpha, mixup_alpha) # scalar 값 
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).cuda() # shuffle 한 index 반환 
    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)


class SC1_LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self):
        super(SC1_LabelSmoothingCrossEntropy, self).__init__()
    def forward(self, y, targets, smoothing=0.1): # y는 hard labeling. SC2 도 hard labeling 형태로 반환해야겠네 
        confidence = 1. - smoothing
        log_probs = F.log_softmax(y, dim=-1) # 예측 확률 계산
        true_probs = torch.zeros_like(log_probs)
        true_probs.fill_(smoothing / (y.shape[1] - 1))
        true_probs.scatter_(1, targets.data.unsqueeze(1), confidence) # 정답 인덱스의 정답 확률을 confidence로 변경
        return torch.mean(torch.sum(true_probs * -log_probs, dim=-1)) # negative log likelihood


class SC2_LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self):
        super(SC2_LabelSmoothingCrossEntropy, self).__init__()
        
    def forward(self, prob, _): # y는 hard labeling. SC2 도 hard labeling 형태로 반환해야겠네 
        prob = torch.tensor(prob)
        log_probs = F.log_softmax(prob, dim=-1) # 예측 확률 계산
        return torch.mean(torch.sum(prob * -log_probs, dim=-1)) # negative log likelihood


In [108]:


def cal_prob(unlabeled_index, count_subgraph) : 
    i_count_subgraph = count_subgraph[unlabeled_index].copy()
    num_iteration = len(i_count_subgraph)
    min_radius = i_count_subgraph[num_iteration-1][1]

    for i , p_count in enumerate(i_count_subgraph) : 
        i_count_subgraph[i] = p_count[0] / ((i+1) *p_count[1] / (min_radius *num_iteration))
        i_count_subgraph[i] = F.softmax(torch.Tensor(i_count_subgraph[i]))
    
    if num_iteration == 1 : result = i_count_subgraph[0].numpy()
        
    else : result = np.sum(i_count_subgraph, axis=0) / len(i_count_subgraph)
    return result 

In [109]:
a = cal_prob(122, count_subgraph)
print(a)
test = SC2_LabelSmoothingCrossEntropy() 

print(test.forward(a,_))

tensor([0.1409, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.2433,
        0.0770])
tensor(2.2775)


  i_count_subgraph[i] = F.softmax(torch.Tensor(i_count_subgraph[i]))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  prob = torch.tensor(prob)


In [63]:
def MNIST_train(model, device, labeled_dataset, labeled_dataset_label, optimizer, criterion, epoch):
    model.train()
    

    labeled_dataset = torch.tensor(labeled_dataset)
    labeled_dataset_label = torch.tensor(labeled_dataset_label)
    
    all_data = [(labeled_dataset[i], labeled_dataset_label[i][0]) for i in range(len(labeled_dataset_label))]
    
    if criterion == "hard labeling" : batch_size = 4
    elif criterion == SC1_LabelSmoothingCrossEntropy : batch_size = 32
    else : batch_size = 100

    data_loader = DataLoader(all_data, batch_size= batch_size)

    for i, (data, target) in enumerate(data_loader):
        data = data.view(-1, 1, 28,28)
        target = target.type(torch.LongTensor)
        data, target = data.to(device), target.to(device)
        data, target_a, target_b, lam = mixup_data(data, target)

        optimizer.zero_grad()
        output = model(data) #여기가 문제가 생기는 지점 

        # loss 함수 수정 필요. 
        if criterion == "hard labeling" : loss = mixup_criterion(F.nll_loss, output, target_a, target_b, lam)
        else: loss = mixup_criterion(criterion, output, target_a, target_b, lam)
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, i, len(data_loader),
                100. * i / len(data_loader), loss.item()))
    return model

def MNIST_test(model, device, test_dataset, test_dataset_label, criterion) :
    model.eval()
    test_loss = 0
    correct = 0


    test_dataset = torch.tensor(test_dataset)
    test_dataset_label = torch.tensor(test_dataset_label)

    all_data = [(test_dataset[i], test_dataset_label[i][0]) for i in range(len(test_dataset_label))]
    data_loader = DataLoader(all_data, batch_size=32)

    # dataloader에 index가 가능한가? 
    with torch.no_grad():
        for data, target in data_loader:
            target = target.type(torch.LongTensor)
            data = data.view(-1, 1, 28,28)
            data, target = data.to(device), target.to(device)

            output = model(data)
            if criterion == "hard labeling" : F.nll_loss(output, target, reduction='sum').item()
            else : test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_dataset),
        100. * correct / len(test_dataset)))

    return correct/len(test_dataset)


In [112]:
criterion = SC2_LabelSmoothingCrossEntropy() 

SC2_label = np.array([])
for item in unlabeled_dataset_label : 
    index = item[1] 
    prob = cal_prob(index, count_subgraph)
    SC2_label = np.append(SC2_label, np.array([prob, index]))


neural_2 = MNIST_BN_32_64_256(10).to(device)
optimizer2 = optim.Adam(neural_2.parameters(), lr=0.001) # setup the optimizer
scheduler2 = StepLR(optimizer2, step_size = 10, gamma=0.005)

for epoch in range(1, 4):
    neural_2 = MNIST_train(neural_2, device, unlabeled_dataset, SC2_label, optimizer2, criterion, epoch)        
    scheduler2.step()

accuracy = MNIST_test(neural_2, device, test_dataset, test_label, criterion) 

  i_count_subgraph[i] = F.softmax(torch.Tensor(i_count_subgraph[i]))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  SC2_label = np.append(SC2_label, np.array([prob, index]))


ValueError: only one element tensors can be converted to Python scalars

In [111]:
unlabeled_dataset_label

[[5, 0],
 [0, 1],
 [4, 2],
 [9, 4],
 [2, 5],
 [1, 6],
 [3, 7],
 [1, 8],
 [4, 9],
 [3, 10],
 [5, 11],
 [3, 12],
 [6, 13],
 [1, 14],
 [7, 15],
 [2, 16],
 [8, 17],
 [6, 18],
 [9, 19],
 [4, 20],
 [0, 21],
 [9, 22],
 [1, 24],
 [2, 25],
 [4, 26],
 [3, 27],
 [2, 28],
 [7, 29],
 [3, 30],
 [8, 31],
 [6, 32],
 [9, 33],
 [0, 34],
 [5, 35],
 [6, 36],
 [0, 37],
 [7, 38],
 [6, 39],
 [1, 40],
 [8, 41],
 [7, 42],
 [9, 43],
 [3, 44],
 [9, 45],
 [8, 46],
 [5, 47],
 [9, 48],
 [3, 49],
 [3, 50],
 [0, 51],
 [7, 52],
 [4, 53],
 [9, 54],
 [8, 55],
 [0, 56],
 [9, 57],
 [4, 58],
 [4, 61],
 [6, 62],
 [0, 63],
 [4, 64],
 [5, 65],
 [6, 66],
 [1, 67],
 [0, 69],
 [1, 70],
 [7, 71],
 [6, 73],
 [3, 74],
 [0, 75],
 [2, 76],
 [1, 78],
 [7, 79],
 [9, 80],
 [0, 81],
 [2, 82],
 [6, 83],
 [7, 84],
 [8, 85],
 [3, 86],
 [9, 87],
 [0, 88],
 [4, 89],
 [6, 90],
 [7, 91],
 [4, 92],
 [6, 93],
 [8, 94],
 [0, 95],
 [7, 96],
 [8, 97],
 [3, 98],
 [1, 99],
 [5, 100],
 [7, 101],
 [7, 103],
 [1, 104],
 [1, 105],
 [6, 106],
 [3, 107],
 [

In [58]:

neural_1 = MNIST_BN_32_64_256(10).to(device)
    #neural = RGB_48_96_192_gp().to(device)
    #neural = RGB_128_256_down_gp.to(device)

optimizer1 = optim.Adam(neural_1.parameters(), lr=0.001) # setup the optimizer
scheduler1 = StepLR(optimizer1, step_size = 10, gamma=0.005)

    # Label data만 사용  
for epoch in range(1, 20):
    neural_1 = MNIST_train(neural_1, device, labeled_dataset, labeled_dataset_label, optimizer1, "hard labeling", epoch)        
    scheduler1.step()
accuracy = MNIST_test(neural_1, device, test_dataset, test_label, 'hard labeling')


Test set: Average loss: 0.0000, Accuracy: 3922/10000 (39%)



In [59]:
criterion = SC1_LabelSmoothingCrossEntropy()

for epoch in range(1, 20):
    neural_1 = MNIST_train(neural_1, device, c_labeled_dataset, c_labeled_dataset_label, optimizer1, criterion, epoch)        
    scheduler1.step()

accuracy = MNIST_test(neural_1, device, test_dataset, test_label, criterion)


Test set: Average loss: 0.0649, Accuracy: 4701/10000 (47%)



In [50]:
SC2_label

[[tensor([0.1409, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.2433,
          0.0770]),
  0],
 [tensor([0.3263, 0.0587, 0.0587, 0.1313, 0.0587, 0.0587, 0.0587, 0.0587, 0.1313,
          0.0587]),
  1],
 [tensor([0.0702, 0.0702, 0.0702, 0.0702, 0.1552, 0.0702, 0.0702, 0.1342, 0.1552,
          0.1342]),
  2],
 [tensor([0.0728, 0.0728, 0.0728, 0.0728, 0.0728, 0.0728, 0.0728, 0.1294, 0.2318,
          0.1294]),
  4],
 [tensor([0.1409, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.2433,
          0.0770]),
  5],
 [tensor([0.0660, 0.2077, 0.2077, 0.0660, 0.0660, 0.0660, 0.0660, 0.1227, 0.0660,
          0.0660]),
  6],
 [tensor([0.1409, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.0770, 0.2433,
          0.0770]),
  7],
 [tensor([0.0632, 0.3783, 0.1161, 0.0632, 0.0632, 0.0632, 0.0632, 0.0632, 0.0632,
          0.0632]),
  8],
 [tensor([0.0824, 0.0824, 0.0824, 0.0824, 0.0824, 0.0824, 0.0824, 0.0824, 0.2581,
          0.0824]),
  9],
 [tensor([0.1342, 0.0702, 0.

In [None]:
def check_class(subgraph, density_subgraph, M, labeled_dataset_label) : 
    num_sample = np.shape(subgraph)[1]

    classification =[-1]*num_sample 
    classified_index = []

    filtered_subgraph_index = []


    # xk 가 속한 subgraph의 label이 모두 같을 확률 
    for i in range(num_sample) : 
        in_subgraph_index = np.where(subgraph[ :,i] ==1)
        if len(in_subgraph_index[0])==0 : continue 
        i_class = labeled_dataset_label[in_subgraph_index[0][0]]

        for j in in_subgraph_index[1:] :  
            if i_class != labeled_dataset_label[j] : 
                i_class = -1
                continue
        if i_class != -1 : 
            classification[i] = i_class
            classified_index.append(i)
    
    score = len(classified_index)
    pseudo_label = [classification[i] for i in classified_index]

    return score, classified_index, pseudo_label