In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt

# DataLoader은 Dataset을 샘플에 쉽게 접근할 수 있도록 순회가능한 객체(iterable)로 감쌉니다
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
from torchvision.transforms import ToTensor
import torchvision.models as models 

import pdb
from datetime import datetime
import argparse
# pretty print. 들여쓰기 등을 지원해준다. 

import pprint
import time
import csv

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# local stuff
# 폴더에 있는 경우 A.B 형태로 기술 
from dsets.mnist import MNIST
from mymodels.mnist_net import Net
from auto_encoder import AutoEncoder, ConvAutoEncoder, ae_train
from train_test import train
from init_pool_tools import obtain_init_pool
from coreset import Coreset_Greedy

In [2]:
original_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
    )


    # MNIST Dataset을 가공할 수 있는 list로 변경. feature와 label 각각 저장 
original_all = []
original_dataset = []
original_label = [] 

for sample in original_data : 
    original_all.append(sample)
    feature = np.array(sample[0])
    original_dataset.append(list(feature.reshape(len(feature[0])*len(feature[0][0]))))
    original_label.append(sample[1])

unlabeled_dataset = original_dataset[:]
unlabeled_dataset_label = original_label[:]
labeled_dataset = [] 
labeled_dataset_label = []

PATH = './weights/'
AE = torch.load(PATH + 'AE.pt')  
AE.load_state_dict(torch.load(PATH + 'AE_state_dict.pt'))  

CAE = torch.load(PATH + 'CAE.pt')  
CAE.load_state_dict(torch.load(PATH + 'CAE_state_dict.pt'))  

print("Successfully loaded AE & CAE")



use_cuda = True


torch.manual_seed(23)
device = torch.device("cuda" if use_cuda else "cpu")
    # use_cuda가 true라면 kwargs를 다음과 같이 지정하기. 
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}


Successfully loaded AE & CAE


In [3]:
from active_learn import active_sample, remove_rows

sample_size = 1000
sampling_method = 'ae_coreset'
sample_dataset, sample_index,radius  = active_sample(unlabeled_dataset, labeled_dataset, sample_size, method=sampling_method, model=AE, device=device)

sample_data = [unlabeled_dataset[i] for i in sample_index]
sample_label = [unlabeled_dataset_label[i] for i in sample_index]

    # Sampling에 따른 Datset 수정 
labeled_dataset = sample_data[:]
labeled_dataset_label = sample_label[:]

unlabeled_dataset = remove_rows(unlabeled_dataset, sample_data)
for i in sample_index[::-1] : 
    del unlabeled_dataset_label[i]


Max distance from cluster : 0.18


In [4]:
from sklearn.metrics import pairwise_distances
from active_learn import get_features

def make_subgraph(sampling_index, original_dataset, radii, model):

    x = [original_dataset[i] for i in sampling_index]
    dataset = original_dataset

    if model == AE or model == CAE : 
        x = get_features(model, x, device)
        dataset = get_features(model, dataset, device)

    dist = pairwise_distances(x,dataset, metric='euclidean')

    subgraph= dist.copy()
    density_subgraph = []
    for i, row in enumerate(dist) : 
        for j, distance in enumerate(row) : 
            if distance > radii or j == sampling_index[i] : subgraph[i,j] =int(0) # 자기자신은 제거  
            else : subgraph[i,j] = int(1) 
        
        density_subgraph.append(sum(subgraph[i]))
    

    return np.array(subgraph), density_subgraph




In [5]:
subgraph, density_subgraph = make_subgraph(sample_index, original_dataset, radius, AE)


In [6]:
print(subgraph)
print(density_subgraph)
print(min(density_subgraph) ,max(density_subgraph))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[24.0, 36.0, 162.0, 21.0, 17.0, 122.0, 29.0, 34.0, 10.0, 7.0, 47.0, 44.0, 11.0, 1.0, 147.0, 66.0, 24.0, 120.0, 19.0, 4.0, 16.0, 96.0, 15.0, 93.0, 112.0, 116.0, 42.0, 7.0, 72.0, 24.0, 325.0, 86.0, 11.0, 43.0, 232.0, 19.0, 131.0, 74.0, 18.0, 76.0, 11.0, 38.0, 237.0, 33.0, 14.0, 0.0, 18.0, 58.0, 8.0, 20.0, 6.0, 172.0, 18.0, 283.0, 303.0, 116.0, 33.0, 115.0, 1.0, 2.0, 15.0, 24.0, 151.0, 44.0, 1033.0, 31.0, 40.0, 31.0, 0.0, 11.0, 209.0, 166.0, 34.0, 49.0, 46.0, 20.0, 14.0, 0.0, 1.0, 1393.0, 233.0, 163.0, 4.0, 2.0, 2.0, 23.0, 305.0, 0.0, 6.0, 603.0, 9.0, 132.0, 2.0, 5.0, 1697.0, 47.0, 42.0, 25.0, 6.0, 27.0, 4.0, 40.0, 6.0, 154.0, 3.0, 84.0, 3.0, 32.0, 129.0, 122.0, 4.0, 49.0, 98.0, 10.0, 1.0, 19.0, 324.0, 11.0, 0.0, 0.0, 49.0, 8.0, 44.0, 18.0, 927.0, 420.0, 181.0, 15.0, 158.0, 6.0, 250.0, 21.0, 161.0, 85.0, 486.0, 19.0, 130.0, 985.0, 73.0

In [80]:
# 추가 수정 필요! 뭔가 이상함!
def adjacency_subgraph(labeled_dataset, labeled_dataset_label, radii, model) : 
    dataset = labeled_dataset
    num_subgraph = len(labeled_dataset_label)
    if model == AE or model==CAE : 
        dataset = get_features(model,dataset, device)
    
    dist = pairwise_distances(dataset, dataset, metric='euclidean')
    adj_dist = dist.copy()
    
    for i, row in enumerate(dist) : 
        adj_d = np.where(row < 2*radii[0])[0] 
        for j, distance in enumerate(row) : 
            if distance >= 2*radii[0] : adj_dist[i,j] = int(0)   
            # 뭔가 이상.. 여기선 문제없지만, 밖에서 다시 거리를 재면 성립안하는 경우가 있음. 
            elif 2*radii[0] > distance and distance >= radii[0]  : 
                adj_dist[i,j] = int(1)
            elif i==j : adj_dist[i,j] = int(0) # 자기자신은 제거
            else : 
                print('Break')

    classified_subgraph_index = []

    for i in range(num_subgraph) : 
        i_sub_class = "x"
        adj_index = np.where(adj_dist[ :,i] ==1)
        if len(adj_index[0])==0 : continue 
        i_sub_class = labeled_dataset_label[i]

        for j in adj_index[0] :  
            if i_sub_class != labeled_dataset_label[j] : 
                i_sub_class = "x"
                continue
        if i_sub_class != "x" : 
            classified_subgraph_index.append(i)
    
    classified_label = [labeled_dataset_label[i] for i in classified_subgraph_index]

    return dist, adj_dist, classified_subgraph_index, classified_label

In [81]:
# 확인해야 하는 것. M을 조건으로 추가했을 때 어떤가 
# 왜 접하고 있는 subgraph의 개수가 6개를 넘어가는 게 있지? 점이 중복된 건가? 

dist_class, adj_dist, classified_subgraph_index, pseudo_class_label = adjacency_subgraph(labeled_dataset, labeled_dataset_label, radius, AE)

In [82]:
print(dist_class)
print(classified_subgraph_index)
print(len(classified_subgraph_index))
print(pseudo_class_label)


[[0.        5.0313087 5.5453887 ... 4.70069   6.020795  2.71513  ]
 [5.0313087 0.        4.9629364 ... 6.3302293 2.0796616 2.7819836]
 [5.5453887 4.9629364 0.        ... 9.53124   3.6579835 5.7253575]
 ...
 [4.70069   6.3302293 9.53124   ... 0.        8.268933  3.945808 ]
 [6.020795  2.0796616 3.6579835 ... 8.268933  0.        4.448273 ]
 [2.71513   2.7819836 5.7253575 ... 3.945808  4.448273  0.       ]]
[0, 1, 4, 5, 7, 9, 10, 12, 14, 15, 17, 19, 20, 22, 23, 25, 26, 27, 29, 31, 33, 34, 35, 38, 40, 44, 45, 47, 49, 50, 52, 56, 58, 59, 60, 61, 66, 67, 68, 69, 72, 73, 76, 77, 82, 83, 85, 87, 88, 90, 92, 93, 96, 97, 98, 99, 100, 101, 102, 105, 106, 107, 108, 110, 113, 115, 117, 119, 121, 127, 130, 131, 132, 138, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 153, 154, 158, 159, 162, 163, 164, 166, 167, 168, 169, 170, 171, 173, 174, 178, 179, 180, 182, 183, 184, 186, 188, 190, 191, 192, 193, 194, 198, 203, 205, 207, 209, 210, 211, 214, 218, 221, 222, 224, 225, 226, 227, 228, 229, 230

In [167]:
from collections import defaultdict
def first_classification(classified_subgraph_index, pseudo_class_label, subgraph, density_subgraph, ratio) : 
    dense_classified_subgraph_index = [density_subgraph[i] for i in classified_subgraph_index]
    sorted_density = sorted(dense_classified_subgraph_index, reverse=True)
    rank = int(ratio*len(density_subgraph))

    M = sorted_density[rank-1]

    classification = defaultdict(list)
    for i, index in enumerate(classified_subgraph_index) : 
        if density_subgraph[index] < M : continue
        x_index = list(np.where(subgraph[index] == 1)[0])
        label = pseudo_class_label[i]
    
        classification[label] = classification[label] + x_index
    return classification

def check_performance(classification, original_label) : 
    score = defaultdict(list) 
    all_score = 0 
    all_count = 0 
    for i in sorted(list(classification.keys())) : 
        x_index = classification[i] 
        num_x = len(x_index)
        count = 0 
        for index in x_index :
            if original_label[index] == i : count += 1 
        
        i_score = count/num_x
        all_score += count
        all_count += num_x
        score[i] = [num_x, i_score]
    
    all_score = all_score/all_count
    
    return all_count, all_score, score




In [168]:
check = first_classification(classified_subgraph_index, pseudo_class_label, subgraph, density_subgraph, 0.02)
num_classification, score, dic_score = check_performance(check, original_label)

print(num_classification)
print(score)
print(dic_score)


7716
0.9612493519958528
defaultdict(<class 'list'>, {1: [3026, 0.9619960343688037], 2: [1290, 0.9604651162790697], 3: [1235, 0.928744939271255], 6: [1157, 0.9723422644770959], 7: [1008, 0.9871031746031746]})


In [169]:
# dest_dir 위치에 결과 기록하기. 입력할 결과들을 입력값으로 넣음 
def log(dest_dir, episode_id, sample_method,  label_dataset_label, num_classification, ratio, accuracy):
    # log file은 dest_dir 위치에 log.csv를 두기 위한 주소이다. 
    log_file = os.path.join(dest_dir, 'log.csv')

    # 주소가 정확하지 않을  해당 위치에 파일이 존재하지 않을 때, log_rows를 다음과 같이 정한다. 
    if not os.path.exists(log_file):
        log_rows = [['Episode Id','Sample Method','Labeled Pool', 'Num of classification', 'Ratio', 'Accuracy']]
    # 파일이 존재할 때에는 데이터를 처리해서 불러온다. 
    else:
        log_rows = np.genfromtxt(log_file, delimiter=',', dtype=str, encoding='utf-8').tolist()

    # episod_id, sample_mthod, sample_time 등의 값을 추가한다. 
    log_rows.append([episode_id,sample_method, len(label_dataset_label), num_classification, ratio, accuracy])
    
    # 데이터를 저장한다. 파일이 없다면 새로 만든다. 
    np.savetxt(log_file,log_rows,'%s,%s,%s,%s,%s,%s',delimiter=',')

In [171]:
output_dir = "output/"
dataset_name = "MNIST"


dest_dir = os.path.join(output_dir, dataset_name)
episode_id = 0
sample_method = "ae_coreset"
num_classification = num_classification
ratio = 0.01
accuracy = score


if not os.path.exists(output_dir):
    os.mkdir(output_dir)
if not os.path.exists(dest_dir):
    os.mkdir(dest_dir)

now = datetime.now()
dest_dir_name = str(now.year) + str(now.month) + str(now.day) + str(now.hour) + str(now.minute) + str(now.second)
dest_dir_name = os.path.join(dest_dir, dest_dir_name)

if not os.path.exists(dest_dir_name):
    os.mkdir(dest_dir_name)
save_path = os.path.join(dest_dir_name,'init.pth')

In [172]:
log(dest_dir_name, episode_id, sample_method, labeled_dataset_label, num_classification, ratio, accuracy)


In [176]:
import numpy as np 
a = list(np.arange(0.001,1, 0.001))
print(a)

[0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009000000000000001, 0.010000000000000002, 0.011, 0.012, 0.013000000000000001, 0.014000000000000002, 0.015, 0.016, 0.017, 0.018000000000000002, 0.019000000000000003, 0.02, 0.021, 0.022000000000000002, 0.023, 0.024, 0.025, 0.026000000000000002, 0.027000000000000003, 0.028, 0.029, 0.030000000000000002, 0.031, 0.032, 0.033, 0.034, 0.035, 0.036000000000000004, 0.037000000000000005, 0.038, 0.039, 0.04, 0.041, 0.042, 0.043000000000000003, 0.044000000000000004, 0.045, 0.046, 0.047, 0.048, 0.049, 0.05, 0.051000000000000004, 0.052000000000000005, 0.053000000000000005, 0.054, 0.055, 0.056, 0.057, 0.058, 0.059000000000000004, 0.060000000000000005, 0.061, 0.062, 0.063, 0.064, 0.065, 0.066, 0.067, 0.068, 0.069, 0.07, 0.07100000000000001, 0.07200000000000001, 0.07300000000000001, 0.074, 0.075, 0.076, 0.077, 0.078, 0.079, 0.08, 0.081, 0.082, 0.083, 0.084, 0.085, 0.08600000000000001, 0.08700000000000001, 0.08800000000000001, 0.089, 0.09, 0.091,