In [2]:
import pandas as pd

device = 'cpu'
train_dataset = pd.read_pickle('data/digitWdb_train.pkl.gz')
train_data = []
for i in range(len(train_dataset)):
    dic = {}
    c = 0
    for layer in train_dataset.iloc[i]['layers']:
        if layer['name'] == 'Conv2D' or layer['name'] == 'Dense':
            dic[f'layer_{c}'] = layer['weights']
            dic[f'layer_{c+1}'] = layer['bias']
            c += 2
    train_data.append(dic)

test_dataset = data = pd.read_pickle('data/digitWdb_test.pkl.gz')
test_data = []
for i in range(len(test_dataset)):
    dic = {}
    c = 0
    for layer in test_dataset.iloc[i]['layers']:
        if layer['name'] == 'Conv2D' or layer['name'] == 'Dense':
            dic[f'layer_{c}'] = layer['weights']
            dic[f'layer_{c+1}'] = layer['bias']
            c += 2
    test_data.append(dic)

In [3]:
import numpy as np

labels = {
    'layer_0': 'conv1',
    'layer_1': 'bias1',
    'layer_2': 'conv2',
    'layer_3': 'bias2',
    'layer_4': 'conv3',
    'layer_5': 'bias3',
    'layer_6': 'dense1',
    'layer_7': 'bias4',
    'layer_8': 'dense2',
    'layer_9': 'bias5'
}

def data_distance(x1, x2):
    return np.mean(np.abs(x1 - x2))


In [4]:
import time

train_diff = np.zeros((11, 8000, 8000))
test_diff = np.zeros((11, 2000, 2000))
inter_diff = np.zeros((11, 2000, 8000))

layer_weights = []
for layer in labels:
    layer_weights.append(np.prod(train_data[0][layer].shape))
layer_weights /= np.sum(layer_weights)

for k1 in range(len(train_data)-1):
    start_time = time.time()
    d1 = train_data[k1]
    for k2 in range(k1+1, len(train_data)):
        if k1 != k2:
            d2 = train_data[k2]
            for i, layer in enumerate(labels):
                train_diff[i, k1, k2] = data_distance(d1[layer], d2[layer])
                train_diff[i, k2, k1] = train_diff[i, k1, k2]
                train_diff[10, k1, k2] += train_diff[i, k1, k2] * layer_weights[i]
    print(f"TRAIN {k1+1}/{len(train_data)} ({time.time() - start_time:.2f}s)")

for k1 in range(len(test_data)-1):
    start_time = time.time()
    d1 = test_data[k1]
    for k2 in range(k1+1, len(test_data)):
        if k1 != k2:
            d2 = test_data[k2]
            for i, layer in enumerate(labels):
                test_diff[i, k1, k2] = data_distance(d1[layer], d2[layer])
                test_diff[i, k2, k1] = test_diff[i, k1, k2]
                test_diff[10, k1, k2] += test_diff[i, k1, k2] * layer_weights[i]
    print(f"TEST {k1+1}/{len(test_data)} ({time.time() - start_time:.2f}s)")

for k1 in range(len(test_data)):
    start_time = time.time()
    d1 = test_data[k1]
    for k2 in range(len(train_data)):
        d2 = train_data[k2]
        for i, layer in enumerate(labels):
            inter_diff[i, k1, k2] = data_distance(d1[layer], d2[layer])
            inter_diff[10, k1, k2] += inter_diff[i, k1, k2] * layer_weights[i]
    print(f"TEST {k1+1}/{len(test_data)} ({time.time() - start_time:.2f}s)")


TRAIN 1/8000 (1.05s)
TRAIN 2/8000 (0.95s)
TRAIN 3/8000 (1.01s)
TRAIN 4/8000 (0.95s)
TRAIN 5/8000 (0.93s)
TRAIN 6/8000 (0.95s)
TRAIN 7/8000 (0.94s)
TRAIN 8/8000 (0.94s)
TRAIN 9/8000 (0.97s)
TRAIN 10/8000 (0.93s)
TRAIN 11/8000 (0.93s)
TRAIN 12/8000 (0.93s)
TRAIN 13/8000 (0.94s)
TRAIN 14/8000 (0.94s)
TRAIN 15/8000 (0.94s)
TRAIN 16/8000 (0.94s)
TRAIN 17/8000 (0.94s)
TRAIN 18/8000 (0.94s)
TRAIN 19/8000 (0.95s)
TRAIN 20/8000 (0.99s)
TRAIN 21/8000 (1.00s)
TRAIN 22/8000 (1.01s)
TRAIN 23/8000 (1.00s)
TRAIN 24/8000 (0.97s)
TRAIN 25/8000 (1.01s)
TRAIN 26/8000 (0.99s)
TRAIN 27/8000 (0.99s)
TRAIN 28/8000 (0.99s)
TRAIN 29/8000 (0.99s)
TRAIN 30/8000 (1.00s)
TRAIN 31/8000 (1.00s)
TRAIN 32/8000 (0.99s)
TRAIN 33/8000 (1.00s)
TRAIN 34/8000 (0.99s)
TRAIN 35/8000 (1.00s)
TRAIN 36/8000 (1.00s)
TRAIN 37/8000 (1.00s)
TRAIN 38/8000 (1.00s)
TRAIN 39/8000 (1.01s)
TRAIN 40/8000 (0.96s)
TRAIN 41/8000 (0.94s)
TRAIN 42/8000 (0.94s)
TRAIN 43/8000 (0.93s)
TRAIN 44/8000 (0.94s)
TRAIN 45/8000 (0.98s)
TRAIN 46/8000 (1.00

In [11]:

import matplotlib.pyplot as plt

def box_plot(data, label):
    fig = plt.figure(figsize=(12, len(data)+2))
    ax = fig.add_subplot(1, 1, 1)
    ax.boxplot(data, vert=False, labels=label)
    plt.savefig("test.png")
    # plt.show()
    plt.close(fig)


train_d = train_diff.copy()
test_d = test_diff.copy()
inter_d = inter_diff.copy()

train_d = np.sort(train_d, axis=-1)
test_d = np.sort(test_d, axis=-1)
inter_d = np.sort(inter_d, axis=-1)

print(np.mean(train_d[10, :, 1]), np.mean(test_d[10, :, 1]), np.mean(inter_d[10, :, 1]))

box_plot([train_d[10, :, 1], test_d[10, :, 1], inter_d[10, :, 0]], ['train-train', 'test-test', 'test-train'])

1.3370339961017529e-05 5.434959024202328e-05 0.22877840804524108
