In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt

# DataLoader wraps a Dataset in an iterable so that samples are easy to access.
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
from torchvision.transforms import ToTensor
import torchvision.models as models 

import pdb
from datetime import datetime
import argparse
# pprint: pretty printing with support for indentation and similar formatting.

import pprint
import time
import csv

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# local stuff
# Modules that live inside a folder are referenced as A.B.
from dsets.mnist import MNIST
from mymodels.mnist_net import Net
from auto_encoder import AutoEncoder, ConvAutoEncoder, ae_train
from train_test import train
from init_pool_tools import obtain_init_pool
from coreset import Coreset_Greedy

In [25]:
# Load the MNIST training split (downloaded into ./data when missing).
original_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
    )

# Turn the MNIST dataset into plain lists that can be edited freely.
# Features and labels are stored separately.
original_all = []
original_dataset = []
original_label = []

for i, sample in enumerate(original_data):
    original_all.append(sample)
    feature = np.array(sample[0])
    original_dataset.append(feature)
    # Keep the original dataset index next to the class: [class, index].
    original_label.append([sample[1], i])

# Every sample starts in the unlabeled pool; the labeled pool starts empty.
unlabeled_dataset = original_dataset[:]
unlabeled_dataset_label = original_label[:]
labeled_dataset = []
labeled_dataset_label = []

# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# With CUDA present the checkpoints are loaded exactly as before; without it,
# tensors saved from a GPU are remapped onto the CPU instead of failing.
map_location = None if use_cuda else device

# NOTE(review): torch.load unpickles arbitrary Python objects, so only point
# PATH at checkpoint files from a trusted source.
PATH = './weights/MNIST/'
AE = torch.load(PATH + 'AE.pt', map_location=map_location)
AE.load_state_dict(torch.load(PATH + 'AE_state_dict.pt', map_location=map_location))

CAE = torch.load(PATH + 'CAE.pt', map_location=map_location)
CAE.load_state_dict(torch.load(PATH + 'CAE_state_dict.pt', map_location=map_location))

print("Successfully loaded AE & CAE")

torch.manual_seed(23)
# Extra DataLoader keyword arguments that are only used when running on CUDA.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}


Successfully loaded AE & CAE


In [31]:
from active_learn import active_sample

sample_size = 50
sampling_method = 'ae_coreset'
sample_dataset, sample_index, radius = active_sample(
    unlabeled_dataset,
    labeled_dataset,
    sample_size,
    method=sampling_method,
    model=CAE,
    device=device,
)

# Look up features and [class, original index] entries of the chosen samples
# before the unlabeled pool is modified below.
sample_data = [unlabeled_dataset[i] for i in sample_index]
sample_label = [unlabeled_dataset_label[i] for i in sample_index]

# Update the pools according to the sampling result.
if len(labeled_dataset_label) == 0:
    labeled_dataset = sample_data[:]
    labeled_dataset_label = sample_label[:]
else:
    labeled_dataset = np.concatenate((labeled_dataset, sample_data), axis=0)
    labeled_dataset_label = np.concatenate((labeled_dataset_label, sample_label), axis=0)

# Remove the sampled entries from the unlabeled pool. Positions are deleted
# from the highest to the lowest so earlier deletions cannot shift the ones
# still pending; sorting (and de-duplicating) makes this correct even when
# active_sample does not return the indices in ascending order.
for i in sorted({int(position) for position in sample_index}, reverse=True):
    del unlabeled_dataset[i]
    del unlabeled_dataset_label[i]


Max distance from cluster : 11.56


In [33]:
from sklearn.metrics import pairwise_distances
from active_learn import get_features

def make_subgraph(sampling_label, original_dataset, radii, model):
    """Mark, for every sampled point, which dataset points lie within the radius.

    Args:
        sampling_label: sequence of entries whose element ``[1]`` is the
            sample's index into ``original_dataset``.
        original_dataset: sequence of all data points.
        radii: distance threshold, given as a scalar or a one-element
            sequence/array.
        model: when this is the notebook-level ``AE`` or ``CAE`` object, both
            the sampled points and the whole dataset are passed through
            ``get_features`` before distances are computed; otherwise the raw
            points are used.

    Returns:
        A tuple ``(subgraph, density_subgraph)``. ``subgraph`` has one row per
        sampled point and one column per dataset point, holding 1 where the
        distance does not exceed the radius and 0 elsewhere; the column of the
        sampled point itself is always 0. ``density_subgraph`` is the list of
        row sums of ``subgraph``.

    Raises:
        ValueError: if ``radii`` holds more than one value.
    """
    centers = [original_dataset[entry[1]] for entry in sampling_label]
    dataset = original_dataset

    # Identity check: only the two notebook-level autoencoders trigger encoding.
    if model is AE or model is CAE:
        centers = get_features(model, centers, device)
        dataset = get_features(model, dataset, device)

    dist = pairwise_distances(centers, dataset, metric='euclidean')

    threshold = np.asarray(radii, dtype=np.float64).reshape(-1)
    if threshold.size != 1:
        raise ValueError("radii must contain exactly one radius value")

    # Whole-array comparison instead of a Python loop over every
    # (sample, data point) pair. Only entries strictly beyond the radius
    # become 0, which is the same condition the element-wise loop used.
    subgraph = np.where(dist > threshold, 0, 1).astype(dist.dtype)

    # A sampled point is never counted as a member of its own subgraph.
    own_columns = np.asarray([entry[1] for entry in sampling_label], dtype=np.intp)
    subgraph[np.arange(len(own_columns)), own_columns] = 0

    density_subgraph = list(subgraph.sum(axis=1))

    return subgraph, density_subgraph



In [39]:
# Build the membership matrix and per-subgraph densities for the sampled points.
subgraph, density_subgraph = make_subgraph(
    sampling_label=sample_label,
    original_dataset=original_dataset,
    radii=radius,
    model=CAE,
)


In [40]:
# Show the membership matrix, the densities, and the smallest/largest density.
print(subgraph, density_subgraph, sep="\n")
lowest_density, highest_density = min(density_subgraph), max(density_subgraph)
print(lowest_density, highest_density)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]]
[1621.0, 4.0, 72.0, 106.0, 308.0, 5906.0, 21.0, 13.0, 3.0, 13.0, 10.0, 69.0, 213.0, 137.0, 61.0, 51.0, 3.0, 355.0, 6.0, 22.0, 31.0, 23714.0, 17.0, 1601.0, 11.0, 29.0, 12.0, 124.0, 40.0, 17.0, 48.0, 65.0, 1134.0, 11.0, 5803.0, 5.0, 2.0, 15.0, 10.0, 67.0, 1045.0, 296.0, 16.0, 89.0, 298.0, 37.0, 62.0, 68.0, 29671.0, 451.0]
2.0 29671.0


In [59]:
def adjacency_subgraph(sample_data, sample_label, radii, model, M):
    """Find sampled points whose adjacent sampled points all share their class.

    Two sampled points are adjacent when the distance ``d`` between them
    satisfies ``radii[0] <= d < 2 * radii[0]``.

    Args:
        sample_data: the sampled data points.
        sample_label: one entry per sampled point; element ``[0]`` is the class.
        radii: indexable whose element ``[0]`` is the radius.
        model: when this is the notebook-level ``AE`` or ``CAE`` object, the
            points are passed through ``get_features`` before distances are
            computed.
        M: minimum number of adjacent points required for a point to be kept.

    Returns:
        A tuple ``(dist, adj_dist, classified_subgraph_index, classified_label)``:
        the pairwise distance matrix, the 0/1 adjacency matrix, the positions
        of the sampled points that have at least one adjacent point, at least
        ``M`` adjacent points, and only adjacent points of their own class, and
        the ``sample_label`` entries at those positions.

    Raises:
        ValueError: if ``radii[0]`` holds more than one value.
    """
    # Original author's note: labeled_dataset needs to be replaced by the
    # sample dataset.
    dataset = sample_data
    if model is AE or model is CAE:
        dataset = get_features(model, dataset, device)

    dist = pairwise_distances(dataset, dataset, metric='euclidean')

    radius = np.asarray(radii[0], dtype=np.float64).reshape(-1)
    if radius.size != 1:
        raise ValueError("radii[0] must be a single radius value")
    reach = 2 * radius

    # Same case order as an element-wise if/elif chain: "far" is tested first,
    # then the adjacency ring; whatever is left is closer than one radius.
    far = dist >= reach
    ring = (reach > dist) & (dist >= radius)
    off_diagonal = ~np.eye(dist.shape[0], dtype=bool)
    too_close = ~(far | ring) & off_diagonal

    # Start from "not adjacent" everywhere (this also removes each point's
    # link to itself) and switch on only the ring entries.
    adj_dist = np.zeros_like(dist)
    adj_dist[ring] = 1

    # Distinct points closer than one radius are reported once per matrix
    # entry. They are stored as 0 so the matrix holds only 0/1 flags; keeping
    # the raw distance there would let a distance of exactly 1.0 pass for an
    # adjacency flag.
    # NOTE(review): confirm whether such pairs should count as adjacent.
    for _ in range(int(np.count_nonzero(too_close))):
        print('Break')

    classified_subgraph_index = []
    for i in range(len(sample_label)):
        adj_index = np.where(adj_dist[:, i] == 1)[0]
        if len(adj_index) == 0:
            continue
        own_class = sample_label[i][0]
        # any() stops at the first adjacent point with a different class.
        disagrees = any(own_class != sample_label[j][0] for j in adj_index)
        if not disagrees and len(adj_index) >= M:
            classified_subgraph_index.append(i)

    classified_label = [sample_label[i] for i in classified_subgraph_index]

    return dist, adj_dist, classified_subgraph_index, classified_label

In [72]:
# To check: what happens when M is added as a condition?
# Why do some subgraphs touch more than 6 other subgraphs? Are points duplicated?

dist_class, adj_dist, classified_subgraph_index, pseudo_class_label = adjacency_subgraph(
    sample_data=sample_dataset,
    sample_label=sample_label,
    radii=radius,
    model=CAE,
    M=0,
)

In [73]:
# Distance matrix, kept subgraph positions, their count, and their label entries.
print(
    dist_class,
    classified_subgraph_index,
    len(classified_subgraph_index),
    pseudo_class_label,
    sep="\n",
)


[[  0.       175.12886   73.471    ...  46.456486  43.843826  75.7779  ]
 [175.12886    0.       177.05714  ... 132.01562  152.70993  151.18013 ]
 [ 73.471    177.05714    0.       ...  92.4026    39.476887  26.207045]
 ...
 [ 46.456486 132.01562   92.4026   ...   0.        52.92574   80.31994 ]
 [ 43.843826 152.70993   39.476887 ...  52.92574    0.        32.581097]
 [ 75.7779   151.18013   26.207045 ...  80.31994   32.581097   0.      ]]
[0, 1, 3, 7, 9, 10, 14, 19, 22, 23, 24, 29, 36, 39, 42, 43, 44, 46, 49]
19
[[1, 941], [4, 1448], [0, 4218], [4, 9880], [6, 14129], [6, 16796], [0, 18896], [6, 22962], [0, 27477], [1, 27489], [4, 28128], [0, 37511], [4, 44998], [0, 47470], [0, 50344], [6, 52206], [0, 56469], [0, 57634], [0, 59510]]


In [81]:
# Number of columns in the membership matrix (one per dataset point).
subgraph.shape[1]

60000

In [124]:
from collections import defaultdict
def first_classification(classified_subgraph_index, pseudo_class_label, subgraph, density_subgraph, ratio):
    """Assign the class of each sufficiently dense subgraph to its members.

    Args:
        classified_subgraph_index: row positions (into ``subgraph`` and
            ``density_subgraph``) of the subgraphs to consider.
        pseudo_class_label: one entry per element of
            ``classified_subgraph_index``; element ``[0]`` is the class.
        subgraph: 2-D 0/1 array; row ``k`` marks the members of subgraph ``k``.
        density_subgraph: density value of every subgraph row.
        ratio: fraction of the considered subgraphs to keep, ranked by density
            in descending order. The density found at that rank is the cutoff,
            so subgraphs tied with it are kept too and at least one subgraph
            is always kept. Values above 1 keep every subgraph.

    Returns:
        ``defaultdict(list)`` mapping each class to the column indices of the
        members of the kept subgraphs. It is empty when
        ``classified_subgraph_index`` is empty.
    """
    classification = defaultdict(list)
    if len(classified_subgraph_index) == 0:
        # Nothing to rank; indexing the empty ranking would raise IndexError.
        return classification

    ranked = sorted((density_subgraph[i] for i in classified_subgraph_index), reverse=True)
    rank = int(ratio * len(ranked))
    # Clamp to a valid position: at least the densest subgraph, at most all.
    cutoff = ranked[min(max(rank - 1, 0), len(ranked) - 1)]

    for position, index in enumerate(classified_subgraph_index):
        if density_subgraph[index] < cutoff:
            continue
        members = list(np.where(subgraph[index] == 1)[0])
        # NOTE(review): a point that belongs to several kept subgraphs of the
        # same class is appended once per subgraph, so the list may contain
        # duplicates - confirm whether they should be counted only once.
        classification[pseudo_class_label[position][0]] += members
    return classification

def check_performance(classification, original_label):
    """Measure how often the assigned classes match the original labels.

    Args:
        classification: mapping from class to a list of sample indices that
            were assigned that class.
        original_label: sequence indexed by sample index; element ``[0]`` of
            every entry is the true class.

    Returns:
        A tuple ``(all_count, all_score, score)``: the total number of assigned
        indices, the overall fraction of correct assignments, and a
        ``defaultdict`` mapping every class to ``[number assigned, fraction
        correct]``. A fraction is reported as ``0.0`` when nothing was assigned,
        which avoids a division by zero.
    """
    score = defaultdict(list)
    correct_total = 0
    all_count = 0

    for label in sorted(classification.keys()):
        members = classification[label]
        assigned = len(members)
        correct = sum(1 for index in members if original_label[index][0] == label)

        score[label] = [assigned, correct / assigned if assigned else 0.0]
        correct_total += correct
        all_count += assigned

    all_score = correct_total / all_count if all_count else 0.0

    return all_count, all_score, score




In [128]:
# Classify the members of the kept subgraphs and score them against the true labels.
check = first_classification(
    classified_subgraph_index=classified_subgraph_index,
    pseudo_class_label=pseudo_class_label,
    subgraph=subgraph,
    density_subgraph=density_subgraph,
    ratio=0.99,
)
num_classification, score, dic_score = check_performance(
    classification=check,
    original_label=original_label,
)

print(num_classification, score, dic_score, sep="\n")


4479
0.9995534717570886
defaultdict(<class 'list'>, {0: [1095, 1.0], 1: [3222, 0.999689633767846], 4: [28, 1.0], 6: [134, 0.9925373134328358]})


In [109]:
# Inspect the [class, original index] entry stored for sample 201.
original_label[201]

[1, 201]

In [None]:
def check_class(subgraph, density_subgraph, M, labeled_dataset_label):
    """Pseudo-label every sample whose containing subgraphs agree on one class.

    Args:
        subgraph: 2-D 0/1 array with one row per subgraph and one column per
            sample; 1 marks membership.
        density_subgraph: accepted for interface compatibility; not used.
        M: accepted for interface compatibility; not used.
        labeled_dataset_label: one entry per subgraph row. An entry is either
            a bare class or a sequence whose element ``[0]`` is the class (the
            ``[class, original index]`` form used elsewhere in this notebook).

    Returns:
        A tuple ``(score, classified_index, pseudo_label)``: the number of
        pseudo-labelled samples, their column indices, and for each of them the
        ``labeled_dataset_label`` entry of the first subgraph containing it.
    """
    def _class_of(entry):
        # Compare classes only: two [class, index] entries never match as a
        # whole because their index parts differ.
        return entry[0] if np.ndim(entry) > 0 else entry

    num_sample = np.shape(subgraph)[1]

    classification = [-1] * num_sample
    classified_index = []

    # Keep a sample only when all subgraphs that contain it share one class.
    for i in range(num_sample):
        containing = np.where(subgraph[:, i] == 1)[0]
        if len(containing) == 0:
            continue
        first_entry = labeled_dataset_label[containing[0]]
        first_class = _class_of(first_entry)

        # Iterate over the remaining subgraph indices themselves; slicing the
        # tuple returned by np.where would leave nothing to compare.
        disagrees = any(
            first_class != _class_of(labeled_dataset_label[j])
            for j in containing[1:]
        )
        if not disagrees:
            classification[i] = first_entry
            classified_index.append(i)

    score = len(classified_index)
    pseudo_label = [classification[i] for i in classified_index]

    return score, classified_index, pseudo_label

In [169]:
# Record one result row in the log file under dest_dir.
def log(dest_dir, episode_id, sample_method,  label_dataset_label, num_classification, ratio, accuracy):
    """Append one result row to ``<dest_dir>/log.csv``.

    The header row is written first when the file does not exist yet or is
    empty. Rows already in the file are left untouched; the new row is appended
    instead of re-reading and rewriting the whole file.

    Args:
        dest_dir: directory that holds ``log.csv``; created when missing.
        episode_id: value for the 'Episode Id' column.
        sample_method: value for the 'Sample Method' column.
        label_dataset_label: labeled pool; only its length is recorded, in the
            'Labeled Pool' column.
        num_classification: value for the 'Num of classification' column.
        ratio: value for the 'Ratio' column.
        accuracy: value for the 'Accuracy' column.
    """
    log_file = os.path.join(dest_dir, 'log.csv')

    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    needs_header = not os.path.exists(log_file) or os.path.getsize(log_file) == 0
    row = [episode_id, sample_method, len(label_dataset_label), num_classification, ratio, accuracy]

    # newline='' plus an explicit '\n' terminator keeps one row per line on
    # every platform.
    with open(log_file, 'a', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, lineterminator='\n')
        if needs_header:
            writer.writerow(['Episode Id', 'Sample Method', 'Labeled Pool', 'Num of classification', 'Ratio', 'Accuracy'])
        writer.writerow([str(value) for value in row])

In [171]:
output_dir = "output/"
dataset_name = "MNIST"

dest_dir = os.path.join(output_dir, dataset_name)
episode_id = 0
sample_method = "ae_coreset"
# NOTE(review): the first_classification call saved in this notebook uses
# ratio 0.99 while 0.01 is logged here - confirm which value the log should
# record for the score and count produced by that call.
ratio = 0.01
# num_classification and score come from the check_performance cell above.
accuracy = score

# One directory per run, named by a zero-padded timestamp. Without padding,
# different moments can produce the same name (month 1 + day 11 reads the same
# as month 11 + day 1) and the names do not sort chronologically.
now = datetime.now()
dest_dir_name = os.path.join(dest_dir, now.strftime('%Y%m%d%H%M%S'))

# Creates output_dir, dest_dir and the run directory in one step; existing
# directories are accepted.
os.makedirs(dest_dir_name, exist_ok=True)
save_path = os.path.join(dest_dir_name, 'init.pth')

In [172]:
# Write this run's result row into the run directory's log file.
log(
    dest_dir=dest_dir_name,
    episode_id=episode_id,
    sample_method=sample_method,
    label_dataset_label=labeled_dataset_label,
    num_classification=num_classification,
    ratio=ratio,
    accuracy=accuracy,
)
