In [1]:
import numpy as np
from util import hierarchical_cluster, construct_dendrogram, add_clusters_to_nodes
import json

In [4]:
relative_folder = "./image_data"
predictions_filename = "prediction_results-cifar10-resnet50-test.json"
def import_predictions(filepath):
    data = None
    with open(filepath, "r") as input_file:
        data = json.load(input_file)
    return data

resnet50_cifar10_data = import_predictions(f"{relative_folder}/{predictions_filename}")

In [22]:
print(resnet50_cifar10_data.keys())

dataset = resnet50_cifar10_data["dataset"]
classes = resnet50_cifar10_data["classes"]
model = resnet50_cifar10_data["model_name"]
instances = resnet50_cifar10_data["test_instances"]

print(model, dataset, classes, sep=", ")
print(f"{len(instances)} instances")

dict_keys(['dataset', 'model_name', 'model_info', 'classes', 'test_instances', 'train_instances'])
resnet50, cifar10, ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
10000 instances


In [47]:
def extract_features(instances):
	combined = np.array([instance["features"] for instance in instances])
	return combined
        
features = extract_features(instances)
print(instances[0].keys())
print(features.shape)

dict_keys(['index', 'filename', 'true_class', 'predicted_class', 'prediction_scores', 'features', 'features_2'])
(10000, 512)


In [26]:
clusters = hierarchical_cluster(features, method="ward")

In [38]:
dendrogram = construct_dendrogram(clusters, connect_parent=True)

In [39]:
assert len(dendrogram.leaves) == len(instances), "leaves should match the amount of instances"

In [41]:
# add the list of instance index to each node representing each cluster
add_clusters_to_nodes(dendrogram)

In [44]:
len(dendrogram.root["cluster"])

10000

In [45]:
# now I want to add counts
def add_node_count(node):
	node["node_count"] = len(node["cluster"])
    
dendrogram.for_each_node(add_node_count)

In [46]:
dendrogram.root["node_count"]

10000

In [50]:
# now I want to add extra information on the leaves (correspond to an image)
def add_info_to_leaves(dendrogram, instances, callback):
    for leaf in dendrogram.leaves:
        id = leaf["instance_index"]
        instance = instances[id]

        callback(leaf, instance) 

def add_filename_pred_and_true(leaf, instance):
    pred_key = "predicted_class"
    true_key = "true_class"
    file_key = "filename"

    filename = instance[file_key]
    pred = instance[pred_key]
    true = instance[true_key]

    leaf[file_key] = filename
    leaf[pred_key] = pred
    leaf[true_key] = true

add_info_to_leaves(dendrogram, instances, add_filename_pred_and_true)

In [52]:
# add accuracy and correct count to the nodes
def add_acc(node):
    correct_count = 0
    for id in node["cluster"]:
        instance = instances[id]
        correct_count += int(instance["predicted_class"] == instance["true_class"])

    node["correct_count"] = correct_count
    node["accuracy"] = correct_count / len(node["cluster"])

dendrogram.for_each_node(add_acc)

In [54]:
dendrogram.root["correct_count"]

9277

In [None]:
# add similar list to leaves
add_info_to_leaves(dendrogram, instances)

In [55]:
from scipy.spatial.distance import squareform, pdist

In [56]:
dists = pdist(features)
dists = squareform(dists)
print(dists.shape)

(10000, 10000)


In [138]:
def pairwise_distance(features):
    dists = pdist(features)
    dists = squareform(dists)
    return dists

In [139]:
dists = pairwise_distance(features)

In [140]:
dists[0][0]

0.0

In [141]:
def compute_top_similar(dists, k=10):
	similar = np.argpartition(dists, kth=k, axis=1)[:k]
	return similar

In [142]:
similar = compute_top_similar(dists)
print(similar.shape)

(10, 10000)


In [143]:
similar = similar.T


In [150]:
k = 10
t = 6471 
idx_unsorted = np.argpartition(dists[t], kth=k)[:k]
idx_sorted = idx_unsorted[np.argsort(dists[6471][idx_unsorted])]
print(idx_sorted)

[6471   64 5039 8802 1873 6459 3477 5157 1931 3427]


In [151]:
idx_unsorted = np.argpartition(dists, kth=k, axis=1)[:k]
idx_sorted = idx_unsorted[np.argsort(dists[:, idx_unsorted], axis=1)]

KeyboardInterrupt: 

In [None]:
idx = np.argsort(dis)