In [1]:
import numpy as np
from util import hierarchical_cluster, construct_dendrogram, add_clusters_to_nodes
import json

In [2]:
# Where the exported prediction results live; the folder is given relative to
# the directory the notebook is run from.
relative_folder = "./cifar10_model"
predictions_filename = "prediction_results-cifar10-resnet50-test.json"
def import_predictions(filepath):
    """Read a JSON file and return its decoded contents.

    Parameters
    ----------
    filepath : str or path-like
        Path of the JSON file to open in text read mode.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file.
    """
    with open(filepath, "r") as handle:
        return json.load(handle)

# Load the parsed JSON prediction results from the configured folder and filename.
resnet50_cifar10_data = import_predictions(f"{relative_folder}/{predictions_filename}")

In [3]:
# Show which top-level sections the predictions file contains.
print(resnet50_cifar10_data.keys())

# Pull out the fields used by the rest of the notebook. Only "test_instances"
# is read in the cells visible here; "train_instances" is left untouched.
dataset = resnet50_cifar10_data["dataset"]
classes = resnet50_cifar10_data["classes"]
model = resnet50_cifar10_data["model_name"]
instances = resnet50_cifar10_data["test_instances"]

# Summarize what was loaded.
print(model, dataset, classes, sep=", ")
print(f"{len(instances)} instances")

dict_keys(['dataset', 'model_name', 'model_info', 'classes', 'test_instances', 'train_instances'])
resnet50, cifar10, ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
10000 instances


In [4]:
def extract_features(instances):
    """Collect the ``"features"`` entry of each instance into one NumPy array.

    Parameters
    ----------
    instances : iterable of mapping
        Each item must provide a ``"features"`` key.

    Returns
    -------
    numpy.ndarray
        Array built from the feature entries, in the order of ``instances``.
    """
    feature_rows = []
    for record in instances:
        feature_rows.append(record["features"])
    return np.array(feature_rows)
        
# Build the feature matrix, then inspect the keys available on an instance and
# the resulting matrix shape.
features = extract_features(instances)
print(instances[0].keys())
print(features.shape)

dict_keys(['index', 'filename', 'true_class', 'predicted_class', 'prediction_scores', 'features', 'features_2'])
(10000, 512)


In [5]:
# Cluster the feature vectors with the project helper, requesting method "ward".
# NOTE(review): presumably Ward-linkage agglomerative clustering; confirm in
# util.hierarchical_cluster.
clusters = hierarchical_cluster(features, method="ward")

In [6]:
# Turn the clustering result into a dendrogram object.
# NOTE(review): connect_parent=True presumably stores a "parent" link on each
# node (a "parent" key is listed on the root later on); confirm in
# util.construct_dendrogram.
dendrogram = construct_dendrogram(clusters, connect_parent=True)

In [7]:
# Sanity check: the tree must have exactly one leaf per loaded instance.
assert len(dendrogram.leaves) == len(instances), "leaves should match the amount of instances"

In [8]:
# Attach to each node a "cluster" entry listing the instance indices that
# belong to the cluster the node represents.
# NOTE(review): described from the original comment and the later reads of
# node["cluster"]; confirm in util.add_clusters_to_nodes.
add_clusters_to_nodes(dendrogram)

In [9]:
# Check how many instance indices the root's cluster holds (expected: all of them).
len(dendrogram.root["cluster"])

10000

In [10]:
# Record on each node how many instances its cluster contains.
def add_node_count(node):
    """Store the length of ``node["cluster"]`` under ``node["node_count"]``."""
    members = node["cluster"]
    node["node_count"] = len(members)
    
# Run add_node_count through the dendrogram's for_each_node helper.
# NOTE(review): presumably this visits every node of the tree; confirm in the
# dendrogram class.
dendrogram.for_each_node(add_node_count)

In [11]:
# Check the count stored on the root node.
dendrogram.root["node_count"]

10000

In [12]:
# Enrich the leaves (each leaf refers to one instance via "instance_index")
# with extra per-instance information.
def add_info_to_leaves(dendrogram, instances, callback):
    """Call ``callback(leaf, instance)`` for every leaf of the dendrogram.

    Parameters
    ----------
    dendrogram : object
        Must expose an iterable ``leaves`` attribute; each leaf must provide
        an ``"instance_index"`` key.
    instances : sequence
        Indexed with each leaf's ``"instance_index"`` to find its instance.
    callback : callable
        Invoked as ``callback(leaf, instance)``; its return value is ignored.
    """
    for leaf in dendrogram.leaves:
        callback(leaf, instances[leaf["instance_index"]])

def add_filename_pred_and_true(leaf, instance):
    """Copy the filename, predicted class and true class from instance to leaf.

    Parameters
    ----------
    leaf : mutable mapping
        Receives the ``"filename"``, ``"predicted_class"`` and ``"true_class"``
        entries.
    instance : mapping
        Must provide those three keys.
    """
    copied_keys = ("filename", "predicted_class", "true_class")
    # Read every value before writing any, so a missing key leaves the leaf untouched.
    values = [instance[key] for key in copied_keys]
    for key, value in zip(copied_keys, values):
        leaf[key] = value

# Copy filename, predicted class and true class from each instance onto its leaf.
add_info_to_leaves(dendrogram, instances, add_filename_pred_and_true)

In [13]:
# Annotate nodes with the number of correct predictions and the accuracy.
def add_acc(node):
    """Store ``"correct_count"`` and ``"accuracy"`` for the node's cluster.

    Each entry of ``node["cluster"]`` is used as an index into the
    notebook-level ``instances`` list. An instance counts as correct when its
    ``"predicted_class"`` equals its ``"true_class"``. Accuracy is the correct
    count divided by the cluster size.
    """
    members = node["cluster"]
    hits = sum(
        int(record["predicted_class"] == record["true_class"])
        for record in (instances[idx] for idx in members)
    )

    node["correct_count"] = hits
    node["accuracy"] = hits / len(members)

# Run add_acc through the dendrogram's for_each_node helper.
# NOTE(review): presumably this visits every node of the tree; confirm in the
# dendrogram class.
dendrogram.for_each_node(add_acc)

In [14]:
# Check the number of correct predictions recorded on the root node.
dendrogram.root["correct_count"]

9277

In [15]:
from scipy.spatial.distance import squareform, pdist

In [16]:
def pairwise_distance(features):
    """Return the square matrix of pairwise distances between rows of features.

    ``pdist`` is called with its default metric and yields the condensed
    distance vector; ``squareform`` expands that vector into the full
    symmetric matrix.
    """
    return squareform(pdist(features))

In [17]:
# Full pairwise distance matrix between all feature vectors.
# NOTE(review): this is a dense n x n array; for the 10000 instances reported
# above that is about 800 MB if the values are float64 (confirm the dtype).
dists = pairwise_distance(features)

In [140]:
# Check: the distance from the first instance to itself.
dists[0][0]

0.0

In [28]:
def compute_top_similar(dists, k=100):
    """Return, for every row, the indices of its ``k`` nearest other items.

    Parameters
    ----------
    dists : array-like, shape (n, n)
        Pairwise distance matrix; ``dists[i, j]`` is the distance between
        item ``i`` and item ``j``.
    k : int, default 100
        Number of neighbours to keep per row. If fewer than ``k`` other items
        exist, all of them are returned.

    Returns
    -------
    numpy.ndarray of int, shape (n, min(k, n - 1))
        Row ``i`` holds the indices of the items closest to item ``i``,
        nearest first. Item ``i`` itself is never included.

    Notes
    -----
    The previous implementation dropped the first column of the argsort
    result, assuming it was always the item itself. When another item is at
    distance 0 (duplicate feature vectors) that column may be the other item,
    so the item ended up listed as its own neighbour. The item's own index is
    now removed explicitly, and a stable sort makes ties resolve by ascending
    index.
    """
    dists = np.asarray(dists)
    order = np.argsort(dists, axis=1, kind="stable")
    n_rows, n_cols = order.shape

    if n_rows != n_cols or n_rows == 0:
        # Not a self-distance matrix (or empty): there is no "self" column to
        # identify, so keep the historical behaviour of skipping column 0.
        return order[:, 1:k + 1]

    # Each row of `order` is a permutation of 0..n-1, so exactly one entry per
    # row equals the row's own index; masking it out leaves n-1 columns.
    not_self = order != np.arange(n_rows)[:, np.newaxis]
    neighbours = order[not_self].reshape(n_rows, n_cols - 1)
    return neighbours[:, :k]

In [29]:
# Neighbour indices for every instance (default k=100), then their shape.
similar = compute_top_similar(dists)
print(similar.shape)

(10000, 100)


In [30]:
# Show the neighbour indices computed for the first instance.
print(similar[0])

[8271 4826 2720 2029  586 8204 4674 1033  950 5822 7207 9961 2787  115
 4422 5943 8072 1662 5381 4900 8467  367 9087 6225 3539 4637 1088 3819
 8019 1612 1524 9647 8818  464 2243  106 8077 7910 7255 6575 7865 8201
 3810 5100 2991 5535 8680 3594 2823 8336 5204 7275 6710 7228 1877 6196
 9380 7879 7221 1611 2684 9926 2498 8224 8299 7574  573 2173 2054 6938
 7291 2144  256 3067    8 2107 6627 5161 1269 5575 2178 4933 2410 7211
 5313 9460 8601 5033 1848 7569 9089 9142 7110 7598  336  103 7268 3231
 7212 2529]


In [31]:
def add_top_similar_attr(top_similar_matrix, id_transform = lambda i: i):
    """Build a leaf callback that stores an instance's neighbour list.

    Parameters
    ----------
    top_similar_matrix : indexable
        Indexed with the transformed instance index; the selected row must
        provide ``tolist()``.
    id_transform : callable, optional
        Maps ``instance["index"]`` to the row to read. Identity by default.

    Returns
    -------
    callable
        ``callback(leaf, instance)`` that sets ``leaf["similar"]`` to the
        selected row converted to a plain list.
    """
    def _add_attr(leaf, instance):
        row = top_similar_matrix[id_transform(instance["index"])]
        leaf["similar"] = row.tolist()

    return _add_attr
    
    
# Store each instance's neighbour list on its leaf under "similar".
# NOTE(review): rows of `similar` follow the order of `instances`, but the row
# is selected by instance["index"]; this assumes that value equals the
# instance's position in `instances`. Confirm against the predictions file.
add_info_to_leaves(dendrogram, instances, add_top_similar_attr(similar))

In [33]:
# Check the length of the neighbour list stored on the first leaf.
len(dendrogram.leaves[0]["similar"])

100

In [34]:
# List the attributes currently present on the root node.
dendrogram.root.keys()

dict_keys(['leaf', 'node_index', 'parent', 'children', 'cluster', 'node_count', 'correct_count', 'accuracy'])

In [37]:
# Export the tree together with the class names as JSON.
# NOTE(review): remove_attr presumably drops the "parent" and "cluster" entries
# from the nodes before serialization; confirm in the dendrogram class. If it
# mutates the dendrogram in place, earlier cells that read node["cluster"]
# cannot be re-run without rebuilding the dendrogram first.
dendrogram.remove_attr("parent", "cluster")
# open(..., "w") does not create missing directories: the output folder must
# already exist, and an existing file at this path is overwritten.
with open("../cifar10/clusters/cifar10_resnet50.json", "w") as outfile:
    data_export = {"tree": dendrogram.dict_format, "classes": classes}
    json.dump(data_export, outfile)