In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt

# DataLoader은 Dataset을 샘플에 쉽게 접근할 수 있도록 순회가능한 객체(iterable)로 감쌉니다
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
from torchvision.transforms import ToTensor
import torchvision.models as models 

import pdb
from datetime import datetime
import argparse
# pretty print. 들여쓰기 등을 지원해준다. 

import pprint
import time
import csv

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# local stuff
# 폴더에 있는 경우 A.B 형태로 기술 
from dsets.mnist import MNIST
from mymodels.mnist_net import Net
from auto_encoder import AutoEncoder, ConvAutoEncoder, ae_train
from train_test import MNIST_train, MNIST_test
from init_pool_tools import obtain_init_pool
from coreset import Coreset_Greedy

In [21]:
original_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
    )


    # MNIST Dataset을 가공할 수 있는 list로 변경. feature와 label 각각 저장 
original_all = []
original_dataset = []
original_label = [] 

for i, sample in enumerate(original_data) : 
    original_all.append(sample)
    feature = np.array(sample[0])
    original_dataset.append(feature)
    original_label.append([sample[1], i]) # original index 포함시키기. 

unlabeled_dataset = original_dataset[:]
unlabeled_dataset_label = original_label[:]
labeled_dataset = [] 
labeled_dataset_label = []

PATH = './weights/MNIST/'
AE = torch.load(PATH + 'AE.pt')  
AE.load_state_dict(torch.load(PATH + 'AE_state_dict.pt'))  

CAE = torch.load(PATH + 'CAE.pt')  
CAE.load_state_dict(torch.load(PATH + 'CAE_state_dict.pt'))  

print("Successfully loaded AE & CAE")

use_cuda = True

torch.manual_seed(23)
device = torch.device("cuda" if use_cuda else "cpu")
    # use_cuda가 true라면 kwargs를 다음과 같이 지정하기. 
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}


Successfully loaded AE & CAE


In [116]:
sample_size = 20

In [67]:
from sklearn.metrics import pairwise_distances

def make_subgraph(sampling_label, original_dataset, radii, model):
    x = [original_dataset[i[1]] for i in sampling_label] 
    dataset = original_dataset

    if model is not None : 
        x = get_features(model, x, device)
        dataset = get_features(model, dataset, device)

    dist = pairwise_distances(x,dataset, metric='euclidean')

    subgraph= dist.copy()
    density_subgraph = []
    for i, row in enumerate(dist) : 
        for j, distance in enumerate(row) : 
            if distance > radii or j == sampling_label[i][1] : subgraph[i,j] =int(0) 
            else : subgraph[i,j] = int(1) 
        
        density_subgraph.append(sum(subgraph[i]))
    

    return np.array(subgraph), density_subgraph


def get_features(model, dataset, device):
    features = []

    # 모델 evaluation 과정에서 사용하지 않아야 하는 layer들을 알아서 off 시키는 함수 
    # model을 evaluate를 할 수 있도록 세팅시키는 것 
    model.eval()
    # torch.no_grad : autograd engine을 비활성화시켜 필요한 메모리를 줄여주고 연산속도를 증가시킴 
    
    dataloader = DataLoader(dataset, batch_size = 64) # 여기서 Dataloader로 보내는 구나. 

    with torch.no_grad() : 
        for sample in dataloader:
            sample = sample.clone().detach().to(device)
        
            output = model.get_codes(sample)
            features = features + list(output.cpu().numpy()) 
    
    return features

def adjacency_subgraph(sample_dataset, sample_label, radii, model, M) :  
    if len(sample_label) <= 1 : return 
    dataset = sample_dataset
    num_subgraph = len(sample_label)
    if model is not None : 
        dataset = get_features(model,dataset, device)
    
    dist = pairwise_distances(dataset, dataset, metric='euclidean')
    adj_dist = dist.copy()
    
    for i, row in enumerate(dist) : 
        for j, distance in enumerate(row) : 
            if distance >= 2*radii[0] : adj_dist[i,j] = int(0)   
            elif 2*radii[0] > distance and distance >= radii[0]  : 
                adj_dist[i,j] = int(1)
            elif i==j : adj_dist[i,j] = int(0) # 자기자신은 제거
            else : 
                print('Break')

    classified_subgraph_index = []


    for i in range(num_subgraph) : 
        i_sub_class = "x"
        adj_index = np.where(adj_dist[ :,i] ==1)[0] 
        if len(adj_index)==0 : continue 
        i_sub_class = sample_label[i][0]

        for j in adj_index :  
            if i_sub_class != sample_label[j][0] : 
                i_sub_class = "x"
                continue
        if i_sub_class != "x" and len(adj_index) >= M : 
            classified_subgraph_index.append(i)
    
    if len(classified_subgraph_index) == 0 : classified_label = []
    else : classified_label = [sample_label[i] for i in classified_subgraph_index]

    return dist, adj_dist, classified_subgraph_index, classified_label

In [143]:
unlabeled_dataset = original_dataset[:]
unlabeled_dataset_label = original_label[:]
labeled_dataset = [] 
labeled_dataset_label = []

In [144]:
from active_learn import active_sample
from active_learn import first_classification, check_performance
ratio = 0.99

log_score = [] 
classified = [] 

if len(unlabeled_dataset) < sample_size:
    sample_size = len(unlabeled_dataset)
    

for i in range(sample_size) : 
    sample_data, sample_index,radius  = active_sample(unlabeled_dataset, labeled_dataset, 1, model=CAE, device=device)

    if i == 0 : 
        sample_dataset = [unlabeled_dataset[i] for i in sample_index]
        sample_label = [unlabeled_dataset_label[i] for i in sample_index]
    
    else : 
        new_sample_dataset = [unlabeled_dataset[i] for i in sample_index]
        new_sample_label = [unlabeled_dataset_label[i] for i in sample_index]
        sample_dataset = np.concatenate((sample_dataset, new_sample_dataset), axis=0)
        sample_label = np.concatenate((sample_label, new_sample_label), axis =0)


    # Sampling에 따른 Dataset 수정 
    if len(labeled_dataset_label) == 0 :  
        labeled_dataset = sample_data[:]
        labeled_dataset_label = sample_label[:]
    else : 
        labeled_dataset = np.concatenate((labeled_dataset,new_sample_dataset),axis=0)
        labeled_dataset_label = np.concatenate((labeled_dataset_label, new_sample_label), axis =0)

    for j in sample_index[::-1] : 
        del unlabeled_dataset[j]
        del unlabeled_dataset_label[j]

    print("Unlabeled pool size: ",len(unlabeled_dataset))
    print("Labeled pool size: ",len(labeled_dataset))

    if i == 0 : continue
    subgraph, density_subgraph = make_subgraph(sample_label, original_dataset, radius, CAE)
    # 마지막 숫자를 통해서 접하는 subgraph의 수 정할 수 있음. 
    dist_class, adj_dist, classified_subgraph_index, pseudo_class_label = adjacency_subgraph(sample_dataset, sample_label, radius, CAE, 0)
    if len(classified_subgraph_index) == 0 : continue 
    f_classification = first_classification(classified_subgraph_index, pseudo_class_label, subgraph, density_subgraph, ratio)
    num_classification, score, dic_score = check_performance(f_classification,original_label)
    log_score += (i, num_classification, score)
    test_set = set([]) 
    for j in classified_subgraph_index : 
        test_set.union(set(np.where(subgraph[ j,:] ==1)[0]))
        print(test_set)
    classified += list(test_set)

    


Max distance from cluster : 170.53
Unlabeled pool size:  59999
Labeled pool size:  1
Max distance from cluster : 110.01
Unlabeled pool size:  59998
Labeled pool size:  2
Max distance from cluster : 105.06
Unlabeled pool size:  59997
Labeled pool size:  3
Max distance from cluster : 101.23
Unlabeled pool size:  59996
Labeled pool size:  4
Max distance from cluster : 65.38
Unlabeled pool size:  59995
Labeled pool size:  5
Max distance from cluster : 62.99
Unlabeled pool size:  59994
Labeled pool size:  6
Max distance from cluster : 60.86
Unlabeled pool size:  59993
Labeled pool size:  7
Max distance from cluster : 57.20
Unlabeled pool size:  59992
Labeled pool size:  8
Max distance from cluster : 52.99
Unlabeled pool size:  59991
Labeled pool size:  9
Max distance from cluster : 52.65
Unlabeled pool size:  59990
Labeled pool size:  10
Max distance from cluster : 44.19
Unlabeled pool size:  59989
Labeled pool size:  11
set()
set()
Max distance from cluster : 43.58
Unlabeled pool size:  59

In [145]:
classified

[]

In [112]:
a = set(classified[0]).union(set(classified[2]))
# 이전에 중복인가? 

In [115]:
len(classified[0])

178