In [122]:
from adc.loader import *
from adc.distance_calculator import *
import math
import pandas as pd

In [63]:
# get index->label mapping
calc = DistanceCalculator(num_samples=60000, batch_size=100, model_name='name')
calc._get_original_samples()
labels_mapping_dict = calc.labels # key: sample index, value: label
possible_labels = list(set(calc.labels.values()))

In [74]:
# load data
data_tuples = {
    'linear_mse': ('linear_autoencoder_MSE_original_distances.pkl', 'linear_autoencoder_MSE_encoded_distances.pkl'),
    'linear_l1': ('linear_autoencoder_L1_original_distances.pkl', 'linear_autoencoder_L1_encoded_distances.pkl'),
    'relu_mse': ('relu_autoencoder_MSE_original_distances.pkl', 'relu_autoencoder_MSE_encoded_distances.pkl'),
    'relu_l1': ('relu_autoencoder_L1_original_distances.pkl', 'relu_autoencoder_L1_encoded_distances.pkl')
    }

def get_data(data_tuples, model_name):
    original = load_pickle(data_tuples[model_name][0])
    encoded = load_pickle(data_tuples[model_name][1])
    return original, encoded

In [124]:
def compute_score(original, encoded, dist_type='cityblock', size=20, samples_keys=list(range(60000))):
    # index based score
    inter_len_avg = 0
    for k in samples_keys:
        encoded_set = set(encoded[dist_type][0][k][:size])
        original_set = set(original[dist_type][0][k][:size])
        size = len(original_set)
        inter = encoded_set.intersection(original_set)
        inter_len = len(inter) / size
        inter_len_avg += inter_len
    inter_len_avg = inter_len_avg / len(samples_keys)
    return inter_len_avg

In [132]:
sizes = [5, 10, 20, 50]

### 1 - Does auto encoder keep original close samples (original vs. encoded) ?

In [134]:
tuples_list = []
for model_name in data_tuples.keys():
    orig, encoded = get_data(data_tuples, model_name)
    for dist_type in ['cityblock', 'euclidean']:
        scores_at_sizes = []
        for i,s in enumerate(sizes):
            score = compute_score(orig, encoded, dist_type=dist_type, size=s)
            scores_at_sizes.append(score)
        result_lists = [model_name, dist_type] + scores_at_sizes
        result_tuple = tuple(result_lists)
        tuples_list.append(result_tuple)
cols_sizes = ['score@%d' % s for s in sizes]
cols = ['model_name', 'distance_type'] + cols_sizes
df = pd.DataFrame.from_records(tuples_list, columns=cols)

In [135]:
df

Unnamed: 0,model_name,distance_type,score@5,score@10,score@20,score@50
0,linear_mse,cityblock,0.004993,0.008343,0.013582,0.02558
1,linear_mse,euclidean,0.00512,0.008242,0.013722,0.025741
2,linear_l1,cityblock,0.002263,0.003748,0.0063,0.012188
3,linear_l1,euclidean,0.002233,0.003678,0.006278,0.012261
4,relu_mse,cityblock,0.027783,0.043683,0.066798,0.1113
5,relu_mse,euclidean,0.027767,0.043457,0.067046,0.11222
6,relu_l1,cityblock,0.023263,0.037283,0.058863,0.101125
7,relu_l1,euclidean,0.02234,0.036687,0.057701,0.100684


In [43]:
def find_label_indices(label, labels_mapping_dict):
    indices = []
    for index, l in labels_mapping_dict.items():
        if l == label:
            indices.append(index)
    return indices

In [55]:
def compute_score_per_label(label, original, encoded, dist_type='cityblock', size=20, labels_mapping_dict=labels_mapping_dict):
    indices = find_label_indices(label, labels_mapping_dict)
    return compute_score(original, encoded, dist_type=dist_type, size=size, samples_keys=indices)

### 1 - (per label)

(size=20 only)

In [140]:
# per label
tuples_list = []
for model_name in data_tuples.keys():
    orig, encoded = get_data(data_tuples, model_name)
    for dist_type in ['cityblock', 'euclidean']:
        scores_at_labels = []
        for l in possible_labels:
            score = compute_score_per_label(l, orig, encoded, dist_type=dist_type)  # size=20
            scores_at_labels.append(score)
        result_list = [model_name, dist_type] + scores_at_labels
        result_tuple = tuple(result_list)
        tuples_list.append(result_tuple)

cols_labels = ['score@label-%d' % l for l in possible_labels]
cols = ['model_name', 'distance_type'] + cols_labels
df_per_label = pd.DataFrame.from_records(tuples_list, columns=cols)

In [141]:
df_per_label

Unnamed: 0,model_name,distance_type,score@label-0,score@label-1,score@label-2,score@label-3,score@label-4,score@label-5,score@label-6,score@label-7,score@label-8,score@label-9
0,linear_mse,cityblock,0.015963,0.047605,0.009114,0.009224,0.008268,0.004842,0.007325,0.014246,0.004256,0.009497
1,linear_mse,euclidean,0.015955,0.04779,0.0095,0.009452,0.008388,0.005294,0.007452,0.014046,0.004298,0.009607
2,linear_l1,cityblock,0.014207,0.009092,0.005623,0.005929,0.003817,0.00511,0.003819,0.005164,0.00423,0.005547
3,linear_l1,euclidean,0.014174,0.009322,0.005438,0.005749,0.003706,0.005054,0.003768,0.005124,0.004495,0.005455
4,relu_mse,cityblock,0.092124,0.098524,0.060255,0.051615,0.056804,0.058024,0.077585,0.065044,0.044095,0.059077
5,relu_mse,euclidean,0.092529,0.095802,0.061522,0.053009,0.056967,0.059804,0.076943,0.065523,0.045197,0.058825
6,relu_l1,cityblock,0.086907,0.10907,0.046165,0.03877,0.040842,0.05035,0.066492,0.071381,0.031781,0.03878
7,relu_l1,euclidean,0.086839,0.102484,0.046215,0.038705,0.039601,0.052131,0.065149,0.068428,0.032106,0.038334


### 2 - Do these close samples have the same label? (scoring)

For each size - calculate for each sample index the ratio of similar labels (instead of indices)

- improvement_metric: sum(original_similar_rate - encoded_similar_rate) / n
- similarity_metric: sum(|original_similar_rate - encoded_similar_rate|) / n

In [118]:
def compute_label_based_score(original, encoded, dist_type='cityblock', size=20, samples_keys=list(range(60000))):
    # label based score
    improvement_metric = 0
    similarity_metric = 0
    for k in samples_keys:
        curr_encoded = encoded[dist_type][2][k][:size]
        curr_original = original[dist_type][2][k][:size]
        original_label = labels_mapping_dict[k]
        original_similar_rate = curr_encoded.count(original_label) / size
        encoded_similar_rate = curr_original.count(original_label) / size
        improvement_metric += original_similar_rate - encoded_similar_rate
        similarity_metric += abs(improvement_metric)
    improvement_metric = improvement_metric / len(samples_keys)
    similarity_metric = similarity_metric / len(samples_keys)
    return improvement_metric, similarity_metric

In [120]:
for model_name in data_tuples.keys():
    print("Model name: %s" % model_name)
    orig, encoded = get_data(data_tuples, model_name)
    for dist_type in ['cityblock', 'euclidean']:
        print("Distance type: %s" % dist_type)
        for s in [5, 10, 20, 50]:
            score = compute_label_based_score(orig, encoded, dist_type=dist_type, size=s)
            result_tuple = (s, score)
            print(result_tuple)

Model name: linear_mse
Distance type: cityblock
(5, (-0.5728833333332348, 17165.17939333335))
(10, (-0.5610000000000152, 16812.870913333354))
(20, (-0.5461516666666607, 16357.784285000014))
(50, (-0.5197356666666877, 15560.709961666706))
Distance type: euclidean
(5, (-0.5809899999999056, 17412.84671666676))
(10, (-0.5696716666666851, 17070.359688333323))
(20, (-0.5567358333333284, 16674.147549999972))
(50, (-0.5322596666666869, 15936.713578666757))
Model name: linear_l1
Distance type: cityblock
(5, (-0.6486899999999376, 19448.891600000163))
(10, (-0.6372600000000461, 19109.606999999996))
(20, (-0.6222608333333267, 18647.227241666817))
(50, (-0.5950913333333439, 17826.588405000017))
Distance type: euclidean
(5, (-0.656716666666616, 19695.73616666666))
(10, (-0.6463816666667129, 19380.141048333488))
(20, (-0.6328633333333221, 18970.77561999999))
(50, (-0.6075270000000014, 18202.644361333478))
Model name: relu_mse
Distance type: cityblock
(5, (-0.1979666666666762, 5985.549330000097))
(10,

In [142]:
tuples_list_imp, tuples_list_sim = [], []
for model_name in data_tuples.keys():
    orig, encoded = get_data(data_tuples, model_name)
    for dist_type in ['cityblock', 'euclidean']:
        improvement_at_sizes = []
        sim_at_sizes = []
        for i, s in enumerate(sizes):
            score = compute_label_based_score(orig, encoded, dist_type=dist_type, size=s)
            improvement, similarity = score
            improvement_at_sizes.append(improvement)
            sim_at_sizes.append(similarity)
        
        result_list_imp = [model_name, dist_type] + improvement_at_sizes
        result_tuple_imp = tuple(result_list_imp)
        tuples_list_imp.append(result_tuple_imp)
        
        result_list_sim = [model_name, dist_type] + sim_at_sizes
        result_tuple_sim = tuple(result_list_sim)
        tuples_list_sim.append(result_tuple_sim)
        

cols_labels = ['score@%d' % s for s in sizes]
cols = ['model_name', 'distance_type'] + cols_labels

df_imp = pd.DataFrame.from_records(tuples_list_imp, columns=cols)
df_sim = pd.DataFrame.from_records(tuples_list_sim, columns=cols)

In [143]:
# improvement
df_imp

Unnamed: 0,model_name,distance_type,score@5,score@10,score@20,score@50
0,linear_mse,cityblock,-0.572883,-0.561,-0.546152,-0.519736
1,linear_mse,euclidean,-0.58099,-0.569672,-0.556736,-0.53226
2,linear_l1,cityblock,-0.64869,-0.63726,-0.622261,-0.595091
3,linear_l1,euclidean,-0.656717,-0.646382,-0.632863,-0.607527
4,relu_mse,cityblock,-0.197967,-0.187535,-0.172286,-0.146728
5,relu_mse,euclidean,-0.205923,-0.19669,-0.182925,-0.159048
6,relu_l1,cityblock,-0.251017,-0.240657,-0.225848,-0.200101
7,relu_l1,euclidean,-0.25936,-0.24982,-0.236222,-0.212447


In [144]:
# similarity
df_sim

Unnamed: 0,model_name,distance_type,score@5,score@10,score@20,score@50
0,linear_mse,cityblock,17165.179393,16812.870913,16357.784285,15560.709962
1,linear_mse,euclidean,17412.846717,17070.359688,16674.14755,15936.713579
2,linear_l1,cityblock,19448.8916,19109.607,18647.227242,17826.588405
3,linear_l1,euclidean,19695.736167,19380.141048,18970.77562,18202.644361
4,relu_mse,cityblock,5985.54933,5665.248672,5193.3695,4417.015791
5,relu_mse,euclidean,6226.396673,5932.24659,5512.591616,4787.708799
6,relu_l1,cityblock,7564.997933,7256.798622,6800.113689,6017.723394
7,relu_l1,euclidean,7817.239087,7527.115652,7114.144236,6392.120924
