In [1]:
from __future__ import print_function, division
import os
import torch


# DataLoader은 Dataset을 샘플에 쉽게 접근할 수 있도록 순회가능한 객체(iterable)로 감쌉니다
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
from torchvision.transforms import ToTensor
import torchvision.models as models 

import pprint
from datetime import datetime



import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from collections import defaultdict

from network_architectures import MNIST_BN_32_64_256
from active_learn import argparser

In [2]:
import numpy as np

original_data = datasets.EMNIST(
        root="data",
        split='letters',
        train=True,
        download=True,
        transform=ToTensor()
    )

test_data = datasets.MNIST(
        root="data",
        train=False,
        download=True,
        transform=ToTensor()
    )


original_all = []
original_dataset = []
original_label = [] 

for i, sample in enumerate(original_data) : 
    original_all.append(sample)
    feature = np.array(sample[0])
    original_dataset.append(feature)
    original_label.append([sample[1], i])
    

test_dataset = [] 
test_label = [] 

for i, test in enumerate(test_data) : 
    feature = np.array(test[0])
    test_dataset.append(feature)
    test_label.append([test[1], i])


unlabeled_dataset = original_dataset[:]
unlabeled_dataset_label = original_label[:]
labeled_dataset = [] 
labeled_dataset_label = []

sc1_labeled_dataset = [] 
sc1_labeled_dataset_label = []

sc2_labeled_dataset = [] 
sc2_labeled_dataset_label = []

count_subgraph = defaultdict(list)

In [4]:
for i in unlabeled_dataset_label[:100] : 
    print(i[0])

23
7
16
15
23
17
13
11
22
24
10
14
18
21
26
21
21
24
19
5
2
25
9
5
10
21
11
24
12
1
17
9
1
24
18
1
8
4
1
9
7
21
3
16
2
20
10
12
11
20
3
6
13
15
11
4
23
12
21
1
16
14
23
10
5
12
6
2
2
1
2
1
2
8
21
13
24
5
7
7
22
24
5
20
19
12
5
3
8
26
11
26
8
16
10
8
26
12
25
9


In [3]:
PATH = './weights/MNIST/'

CAE = torch.load(PATH + 'CAE.pt')  
CAE.load_state_dict(torch.load(PATH + 'CAE_state_dict.pt'))  

sample_size = 50
if len(unlabeled_dataset) < sample_size:
    sample_size = len(unlabeled_dataset)

In [4]:
use_cuda = True

device = torch.device("cuda" if use_cuda else "cpu")
    # use_cuda가 true라면 kwargs를 다음과 같이 지정하기. 
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}


In [5]:
from active_learn import active_sample

un_sample_index, c_sample_index, radius  = active_sample(unlabeled_dataset, labeled_dataset, sc1_labeled_dataset, sample_size, model=CAE, device=device)

sample_data = [unlabeled_dataset[i] for i in un_sample_index]
sample_label = [unlabeled_dataset_label[i] for i in un_sample_index]

for i in un_sample_index[::-1] : 
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]

if len(c_sample_index) != 0 : 
    c_sample_data = [sc1_labeled_dataset[i] for i in c_sample_index]
    c_sample_label = [sc1_labeled_dataset_label[i] for i in c_sample_index]
    sample_data = np.concatenate((sample_data, c_sample_data), axis=0) 
    sample_label = np.concatenate((sample_label, c_sample_label), axis=0) 

    for i in c_sample_index[::-1] : 
        np.delete(sc1_labeled_dataset, i, axis=0)
        np.delete(sc1_labeled_dataset_label, i, axis=0)

if len(labeled_dataset_label) == 0 :  
    labeled_dataset = sample_data[:]
    labeled_dataset_label = sample_label[:]
else : 
    labeled_dataset = np.concatenate((labeled_dataset,sample_data),axis=0)
    labeled_dataset_label = np.concatenate((labeled_dataset_label, sample_label), axis =0)


Max distance from cluster : 17.69


In [6]:
from active_learn import adjacency_subgraph, make_subgraph

subgraph, density_subgraph = make_subgraph(labeled_dataset_label, original_dataset, radius, CAE)
# 여기에서 sample_dataset을 sample_data로 변경 필요!

dist_class, adj_dist, classified_subgraph_index, pseudo_class_label = adjacency_subgraph(labeled_dataset, labeled_dataset_label, radius, CAE, 0)

print("Well work!")

from active_learn import first_classification, check_performance
f_classification = first_classification(classified_subgraph_index, pseudo_class_label, subgraph, density_subgraph, 0, 0)
num_classification, score, dic_score = check_performance(f_classification,original_label)


Well work!


In [7]:
num_classification, score

(847, 0.9976387249114522)

In [8]:
# CS1 방법을 적용 후, unlabeled dataset 구분하기 
erase_dataset_ori_index = []
pre_index = [j[1] for j in sc1_labeled_dataset_label]

for i in f_classification.keys(): 
    index = f_classification[i]
    
    index = list(set(index) - set(pre_index))

    new_labeled_dataset = [original_dataset[j] for j in index]
    new_labeled_dataset_label = [ [i,j] for j in index ]
    new_erase_original_index = [new_labeled_dataset_label[j][1] for j in range(len(new_labeled_dataset_label))]

    if len(sc1_labeled_dataset_label) == 0 : 
        sc1_labeled_dataset = new_labeled_dataset
        sc1_labeled_dataset_label = new_labeled_dataset_label

    # new_labeled_dataset 이 없는 경우가 발생한다 조치 필요!        
    else : 
        sc1_labeled_dataset = np.concatenate((sc1_labeled_dataset, new_labeled_dataset), axis=0)
        sc1_labeled_dataset_label = np.concatenate((sc1_labeled_dataset_label, new_labeled_dataset_label), axis =0)
    
    erase_dataset_ori_index += new_erase_original_index

erase_unlabeled_index = [np.where(np.array(unlabeled_dataset_label).T[1] == i)[0][0]  for i in erase_dataset_ori_index]
erase_unlabeled_index.sort()


for i in erase_unlabeled_index[::-1] : 
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]

In [9]:
# unlabeled의 변화를 Folliwing 하지 못하기 때문에 original index로 접근해야 한다. 
from active_learn import update_count_subgraph
update_count_subgraph(count_subgraph, original_label, labeled_dataset_label, subgraph, radius)


defaultdict(list,
            {0: [[[0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             1: [[[1, 0, 0, 0, 0, 1, 0, 0, 1, 0], 17.686516]],
             2: [[[0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             3: [[[0, 3, 0, 0, 0, 0, 0, 0, 0, 0], 17.686516]],
             4: [[[0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             5: [[[1, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             6: [[[0, 2, 0, 0, 0, 0, 0, 1, 0, 0], 17.686516]],
             7: [[[1, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             8: [[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 17.686516]],
             9: [[[0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             10: [[[1, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             11: [[[0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 17.686516]],
             12: [[[1, 0, 0, 0, 0, 0, 0, 0, 1, 0], 17.686516]],
             13: [[[1, 0, 0, 0, 1, 0, 1, 0, 1, 0], 17.686516]],
             14: [[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 17.686516]],
             15: [[[0, 0, 0, 0, 

In [10]:
import numpy as np


# batch 단위로 들어오는 구나. 그럼 data loarder을 통해서 input 값들을 조정할 필요가 있겠는 걸 
def mixup_data(x, y, mixup_alpha =4):
    lam = np.random.beta(mixup_alpha, mixup_alpha) # scalar 값 
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).cuda() # shuffle 한 index 반환 
    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)


class SC1_LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self):
        super(SC1_LabelSmoothingCrossEntropy, self).__init__()
    def forward(self, y, targets, smoothing=0.1): # y는 hard labeling. SC2 도 hard labeling 형태로 반환해야겠네 
        confidence = 1. - smoothing
        log_probs = F.log_softmax(y, dim=-1) # 예측 확률 계산
        true_probs = torch.zeros_like(log_probs)
        true_probs.fill_(smoothing / (y.shape[1] - 1))
        true_probs.scatter_(1, targets.data.unsqueeze(1), confidence) # 정답 인덱스의 정답 확률을 confidence로 변경
        return torch.mean(torch.sum(true_probs * -log_probs, dim=-1)) # negative log likelihood


class SC2_LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self):
        super(SC2_LabelSmoothingCrossEntropy, self).__init__()
        
    def forward(self, prob, _): # y는 hard labeling. SC2 도 hard labeling 형태로 반환해야겠네 
        prob = torch.tensor(prob)
        log_probs = F.log_softmax(prob, dim=-1) # 예측 확률 계산
        return torch.mean(torch.sum(prob * -log_probs, dim=-1)) # negative log likelihood


In [11]:
from active_learn import second_classification 
sc2_classification = second_classification(unlabeled_dataset_label, count_subgraph, 0.6)
num_classification, score, dic_score = check_performance(sc2_classification,original_label)
print("SC2 performance : num_classification", num_classification, "score", score )

TypeError: max(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [None]:

erase_dataset_ori_index = []

for i in sc2_classification.keys(): 
    index = sc2_classification[i]

    new_labeled_dataset = [original_dataset[j] for j in index]
    new_labeled_dataset_label = [ [i,j] for j in index ]

    if len(sc2_labeled_dataset_label) == 0 : 
        sc2_labeled_dataset = new_labeled_dataset[:]
        sc2_labeled_dataset_label = new_labeled_dataset_label[:]

    else : 
        sc2_labeled_dataset = np.concatenate((sc2_labeled_dataset, new_labeled_dataset), axis=0)
        sc2_labeled_dataset_label = np.concatenate((sc2_labeled_dataset_label, new_labeled_dataset_label), axis=0)

    erase_dataset_ori_index += index

erase_unlabeled_index = [np.where(np.array(unlabeled_dataset_label).T[1] == i)[0][0]  for i in erase_dataset_ori_index]
erase_unlabeled_index.sort()

for i in erase_unlabeled_index[::-1] : 
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]

In [None]:
from active_learn import mixup_data, mixup_criterion

def MNIST_train(model, device, labeled_dataset, labeled_dataset_label, optimizer, criterion, epoch):
    model.train()
    

    labeled_dataset = torch.tensor(labeled_dataset)
    labeled_dataset_label = torch.tensor(labeled_dataset_label)
    
    all_data = [(labeled_dataset[i], labeled_dataset_label[i][0]) for i in range(len(labeled_dataset_label))]
    
    if criterion == "hard labeling" : batch_size = 4
    elif criterion == SC1_LabelSmoothingCrossEntropy : batch_size = 32
    else : batch_size = 100

    data_loader = DataLoader(all_data, batch_size= batch_size)

    for i, (data, target) in enumerate(data_loader):
        data = data.view(-1, 1, 28,28)
        target = target.type(torch.LongTensor)
        data, target = data.to(device), target.to(device)
        data, target_a, target_b, lam = mixup_data(data, target)

        optimizer.zero_grad()
        output = model(data) #여기가 문제가 생기는 지점 

        # loss 함수 수정 필요. 
        if criterion == "hard labeling" : loss = mixup_criterion(F.nll_loss, output, target_a, target_b, lam)
        else: loss = mixup_criterion(criterion, output, target_a, target_b, lam)
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, i, len(data_loader),
                100. * i / len(data_loader), loss.item()))
    return model

def MNIST_test(model, device, test_dataset, test_dataset_label, criterion) :
    model.eval()
    test_loss = 0
    correct = 0


    test_dataset = torch.tensor(test_dataset)
    test_dataset_label = torch.tensor(test_dataset_label)

    all_data = [(test_dataset[i], test_dataset_label[i][0]) for i in range(len(test_dataset_label))]
    data_loader = DataLoader(all_data, batch_size=32)

    # dataloader에 index가 가능한가? 
    with torch.no_grad():
        for data, target in data_loader:
            target = target.type(torch.LongTensor)
            data = data.view(-1, 1, 28,28)
            data, target = data.to(device), target.to(device)

            output = model(data)
            if criterion == "hard labeling" : F.nll_loss(output, target, reduction='sum').item()
            else : test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_dataset),
        100. * correct / len(test_dataset)))

    return correct/len(test_dataset)


In [None]:

neural_1 = MNIST_BN_32_64_256(10).to(device)
    #neural = RGB_48_96_192_gp().to(device)
    #neural = RGB_128_256_down_gp.to(device)

optimizer1 = optim.Adam(neural_1.parameters(), lr=0.001) # setup the optimizer
scheduler1 = StepLR(optimizer1, step_size = 10, gamma=0.005)

    # Label data만 사용  
for epoch in range(1, 20):
    neural_1 = MNIST_train(neural_1, device, labeled_dataset, labeled_dataset_label, optimizer1, "hard labeling", epoch)        
    scheduler1.step()
accuracy = MNIST_test(neural_1, device, test_dataset, test_label, 'hard labeling')



  test_dataset = torch.tensor(test_dataset)



Test set: Average loss: 0.0000, Accuracy: 4900/10000 (49%)



In [None]:
from active_learn import SC1_LabelSmoothingCrossEntropy
criterion = SC1_LabelSmoothingCrossEntropy()

for epoch in range(1, 20):
    neural_1 = MNIST_train(neural_1, device, sc1_labeled_dataset, sc1_labeled_dataset_label, optimizer1, criterion, epoch)        
    scheduler1.step()

accuracy = MNIST_test(neural_1, device, test_dataset, test_label, criterion)


Test set: Average loss: 0.0714, Accuracy: 4922/10000 (49%)



In [None]:
criterion = SC1_LabelSmoothingCrossEntropy() 

for epoch in range(1, 20):
    neural_1 = MNIST_train(neural_1, device, sc2_labeled_dataset, sc2_labeled_dataset_label, optimizer1, criterion, epoch)        
    scheduler1.step()

accuracy = MNIST_test(neural_1, device, test_dataset, test_label, criterion) 


Test set: Average loss: 0.0795, Accuracy: 4594/10000 (46%)



In [None]:
def check_class(subgraph, density_subgraph, M, labeled_dataset_label) : 
    num_sample = np.shape(subgraph)[1]

    classification =[-1]*num_sample 
    classified_index = []

    filtered_subgraph_index = []


    # xk 가 속한 subgraph의 label이 모두 같을 확률 
    for i in range(num_sample) : 
        in_subgraph_index = np.where(subgraph[ :,i] ==1)
        if len(in_subgraph_index[0])==0 : continue 
        i_class = labeled_dataset_label[in_subgraph_index[0][0]]

        for j in in_subgraph_index[1:] :  
            if i_class != labeled_dataset_label[j] : 
                i_class = -1
                continue
        if i_class != -1 : 
            classification[i] = i_class
            classified_index.append(i)
    
    score = len(classified_index)
    pseudo_label = [classification[i] for i in classified_index]

    return score, classified_index, pseudo_label