In [None]:
from __future__ import print_function, division
import os
import torch


# DataLoader은 Dataset을 샘플에 쉽게 접근할 수 있도록 순회가능한 객체(iterable)로 감쌉니다
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
from torchvision.transforms import ToTensor
import torchvision.models as models 

import pprint
from datetime import datetime



import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from collections import defaultdict

from auto_encoder import MNIST_BN_32_64_256, ae_train

from active_learn import argparser

In [2]:
import numpy as np
original_data = datasets.EMNIST(
        root="data",
        split='letters',
        train=True,
        download=True,
        transform=ToTensor()
    )


original_all = []
original_dataset = []
original_label = [] 

for i, sample in enumerate(original_data) : 
    original_all.append(sample)
    feature = np.array(sample[0])
    original_dataset.append(feature)
    original_label.append([sample[1], i])
    
unlabeled_dataset = original_dataset[:]
unlabeled_dataset_label = original_label[:]
labeled_dataset = [] 
labeled_dataset_label = []

c_labeled_dataset = [] 
c_labeled_dataset_label = []

count_subgraph = defaultdict(list)

In [3]:
use_cuda = True


device = torch.device("cuda" if use_cuda else "cpu")
    # use_cuda가 true라면 kwargs를 다음과 같이 지정하기. 
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # 데이터 변경시 수정 필요 
ae_training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor())
    # 데이터 변경시 수정 필요 
ae_test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor())

kwargs = {'num_workers': 1, 'pin_memory': True} if True else {}

In [4]:
PATH = './weights/MNIST/'

CAE = torch.load(PATH + 'CAE.pt')  
CAE.load_state_dict(torch.load(PATH + 'CAE_state_dict.pt'))  

sample_size = 50
if len(unlabeled_dataset) < sample_size:
    sample_size = len(unlabeled_dataset)

In [13]:
from active_learn import get_features
from coreset import Coreset_Greedy

def active_sample(unlabeled_dataset, labeled_dataset, c_labeled_dataset, sample_size, model=None, device="cuda"):
    labeled_features = get_features(model, labeled_dataset, device) # (img_name, features)
    unlabeled_features = get_features(model, unlabeled_dataset, device)# (img_name, features)
    c_labeled_features = get_features(model, c_labeled_dataset, device)# (img_name, features)


    all_features = labeled_features +  unlabeled_features + c_labeled_features
        # label data의 index가 어디까지인지 표기. 
    labeled_indices = np.arange(0,len(labeled_features))

    coreset = Coreset_Greedy(all_features, len(labeled_features))

        # unlabeled 데이터에서 sample_size 만큼 center point 뽑기, 당시 반지름 뽑기
    new_batch, max_distance = coreset.sample(labeled_indices, sample_size)

    c_labeled_sample_index = [] 
    unlabeled_sample_index = [] 

    for index in new_batch : 
        if index >= len(labeled_features) + len(unlabeled_features) : 
            c_labeled_sample_index.append(index - len(labeled_features) - len(unlabeled_features))
        else : unlabeled_sample_index.append(index - len(labeled_features))

    unlabeled_sample_index.sort() 
    c_labeled_sample_index.sort() 

    return unlabeled_sample_index, c_labeled_sample_index, max_distance

In [28]:

un_sample_index, c_sample_index, radius  = active_sample(unlabeled_dataset, labeled_dataset, c_labeled_dataset, sample_size, model=CAE, device=device)

sample_data = [unlabeled_dataset[i] for i in un_sample_index]
sample_label = [unlabeled_dataset_label[i] for i in un_sample_index]

for i in un_sample_index[::-1] : 
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]

if len(c_sample_index) != 0 : 
    c_sample_data = [c_labeled_dataset[i] for i in c_sample_index]
    c_sample_label = [c_labeled_dataset_label[i] for i in c_sample_index]
    sample_data = np.concatenate((sample_data, c_sample_data), axis=0) 
    sample_label = np.concatenate((sample_label, c_sample_label), axis=0) 

    for i in c_sample_index[::-1] : 
        np.delete(c_labeled_dataset, i, axis=0)
        np.delete(c_labeled_dataset_label, i, axis=0)

if len(labeled_dataset_label) == 0 :  
    labeled_dataset = sample_data[:]
    labeled_dataset_label = sample_label[:]
else : 
    labeled_dataset = np.concatenate((labeled_dataset,sample_data),axis=0)
    labeled_dataset_label = np.concatenate((labeled_dataset_label, sample_label), axis =0)


Max distance from cluster : 9.13


In [15]:

from active_learn import get_features
from sklearn.metrics import pairwise_distances

def make_subgraph(sampling_label, original_dataset, radii, model):
    x = [original_dataset[i[1]] for i in sampling_label] 
    dataset = original_dataset

    if model is not None : 
        x = get_features(model, x, device="cuda")
        dataset = get_features(model, dataset, device="cuda")

    dist = pairwise_distances(x,dataset, metric='euclidean')

    subgraph= dist.copy()
    density_subgraph = []
    for i, row in enumerate(dist) : 
        for j, distance in enumerate(row) : 
            if distance > radii or j == sampling_label[i][1] : subgraph[i,j] =int(0) 
            else : subgraph[i,j] = int(1) 
        
        density_subgraph.append(sum(subgraph[i]))
    

    return np.array(subgraph), density_subgraph

In [29]:
from active_learn import adjacency_subgraph

subgraph, density_subgraph = make_subgraph(labeled_dataset_label, original_dataset, radius, CAE)
# 여기에서 sample_dataset을 sample_data로 변경 필요!

dist_class, adj_dist, classified_subgraph_index, pseudo_class_label = adjacency_subgraph(labeled_dataset, labeled_dataset_label, radius, CAE, 0)

print("Well work!")

Well work!


In [30]:
from active_learn import first_classification, check_performance
f_classification = first_classification(classified_subgraph_index, pseudo_class_label, subgraph, density_subgraph, 0, 0)
num_classification, score, dic_score = check_performance(f_classification,original_label)


In [31]:
# CS1 방법을 적용 후, unlabeled dataset 구분하기 
erase_dataset_ori_index = []
pre_index = [j[1] for j in c_labeled_dataset_label]

for i in f_classification.keys(): 
    index = f_classification[i]
    
    index = list(set(index) - set(pre_index))

    new_labeled_dataset = [original_dataset[j] for j in index]
    new_labeled_dataset_label = [ [i,j] for j in index ]
    new_erase_original_index = [new_labeled_dataset_label[j][1] for j in range(len(new_labeled_dataset_label))]

    if len(c_labeled_dataset_label) == 0 : 
        c_labeled_dataset = new_labeled_dataset
        c_labeled_dataset_label = new_labeled_dataset_label

    # new_labeled_dataset 이 없는 경우가 발생한다 조치 필요!        
    else : 
        c_labeled_dataset = np.concatenate((c_labeled_dataset, new_labeled_dataset), axis=0)
        c_labeled_dataset_label = np.concatenate((c_labeled_dataset_label, new_labeled_dataset_label), axis =0)
    
    erase_dataset_ori_index += new_erase_original_index

erase_unlabeled_index = [np.where(np.array(unlabeled_dataset_label).T[1] == i)[0][0]  for i in erase_dataset_ori_index]
erase_unlabeled_index.sort()


for i in erase_unlabeled_index[::-1] : 
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]

In [32]:
dic_score

defaultdict(list,
            {0: [1400, 0.9992857142857143],
             1: [2038, 1.0],
             4: [89, 0.8651685393258427],
             6: [262, 0.9847328244274809],
             7: [5, 1.0]})

In [1]:
# unlabeled의 변화를 Folliwing 하지 못하기 때문에 original index로 접근해야 한다. 
from active_learn import update_count_subgraph
update_count_subgraph(count_subgraph, original_label, subgraph)

NameError: name 'count_subgraph' is not defined

In [22]:
def check_CS1(c_labeled_dataset_label, count_subgraph) : 
    c_index = [i[1] for i in c_labeled_dataset_label]
    c_index.sort()
    restore_index = [] 

    for num, index in enumerate(c_index) : 
        # count 개수만 불러오기. 아 이건 좀 아닌 것 같긴 하다. 처음에 거리가 멀리 있었던 값에 걸쳐질 수 있는 거니까. 
        # sample의 크기를 키워서 반지름의 크기를 줄이는 방법밖에 없나...?
        # 또는 최근의 횟수만 불러온다던가.. 
        count_list = np.array(count_subgraph[index])[:,0]
        for j,count in enumerate(count_list) : 
            if j ==0 : sum_count = count 
            else : sum_count = [x + y for x,y in zip(sum_count, count)] 

        c_label = c_labeled_dataset_label[num][0]
        # c_label 외에 다른 클래스에 subgraph에 속한 적이 있다면 
        if int(sum_count[c_label]) != sum(sum_count) : 
            restore_index.append(num)
    
    restore_index.sort()
    return restore_index


In [15]:
delete_c_index = check_CS1(c_labeled_dataset_label, count_subgraph) 

"""
for i in delete_c_index[::-1] : 
    unlabeled_dataset = np.concatenate((unlabeled_dataset, c_labeled_dataset[i]), axis =0)
    unlabeled_dataset_label = np.concatenate((unlabeled_dataset_label, c_labeled_dataset_label[i]), axis=0)
    del c_labeled_dataset[i]
    del c_labeled_dataset_label[i]
"""


  count_list = np.array(count_subgraph[index])[:,0]


'\nfor i in delete_c_index[::-1] : \n    unlabeled_dataset = np.concatenate((unlabeled_dataset, c_labeled_dataset[i]), axis =0)\n    unlabeled_dataset_label = np.concatenate((unlabeled_dataset_label, c_labeled_dataset_label[i]), axis=0)\n    del c_labeled_dataset[i]\n    del c_labeled_dataset_label[i]\n'

In [None]:
def cal_prob(unlabeled_index, count_subgraph) : 
    i_count_subgraph = count_subgraph[unlabeled_index].copy()

    for i , p_count in enumerate(i_count_subgraph) : 
        i_count_subgraph[i] = p_count[0] / p_count[1]
        i_count_subgraph[i] = F.softmax(torch.Tensor(i_count_subgraph[i]))
    
    print("pre", i_count_subgraph)
    i_count_subgraph = np.array(i_count_subgraph)
    print("after", i_count_subgraph)
    result = np.sum(i_count_subgraph, axis=0) / len(i_count_subgraph)
    return result 


In [None]:
cal_prob(122, count_subgraph)

  i_count_subgraph[i] = F.softmax(torch.Tensor(i_count_subgraph[i]))
  i_count_subgraph = np.array(i_count_subgraph)


pre [tensor([0.0994, 0.0994, 0.1053, 0.0994, 0.0994, 0.0994, 0.0994, 0.0994, 0.0994,
        0.0994])]


ValueError: only one element tensors can be converted to Python scalars

In [None]:
count_subgraph

defaultdict(list,
            {0: [[[0, 1, 1, 0, 0, 0, 0, 1, 0, 0], 17.741312],
              [[0, 0, 2, 0, 0, 0, 0, 1, 0, 0], 11.084181]],
             1: [[[0, 0, 1, 0, 0, 0, 1, 0, 0, 0], 17.741312],
              [[1, 0, 1, 0, 0, 0, 0, 0, 0, 0], 11.084181]],
             2: [[[0, 0, 1, 0, 0, 0, 1, 1, 0, 0], 17.741312],
              [[0, 0, 0, 0, 1, 0, 0, 1, 0, 0], 11.084181]],
             3: [[[0, 3, 0, 0, 0, 0, 0, 0, 0, 0], 17.741312],
              [[0, 3, 0, 0, 0, 0, 0, 0, 0, 0], 11.084181]],
             4: [[[0, 1, 1, 0, 0, 0, 0, 1, 0, 0], 17.741312],
              [[0, 0, 1, 0, 1, 0, 0, 1, 0, 0], 11.084181]],
             5: [[[0, 0, 1, 0, 0, 0, 0, 1, 0, 0], 17.741312],
              [[0, 0, 1, 0, 1, 0, 0, 1, 0, 0], 11.084181]],
             6: [[[0, 2, 0, 0, 0, 0, 0, 1, 0, 0], 17.741312],
              [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 11.084181]],
             7: [[[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], 17.741312],
              [[0, 0, 2, 0, 0, 0, 0, 1, 0, 0], 11.084181]],
      