# Fixing predict

In [1]:
import numpy as np
import pandas as pd
import sys

# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files from a trusted source.
embeddings = pd.read_pickle("embeddings.pkl")
whales = np.load('raw_predictions.npy')

# Row index at which each run of identical consecutive Ids begins, followed by
# a closing len(ids) entry, so run k occupies rows starts[k]:starts[k + 1].
ids = embeddings['Id'].values
# Compare every Id with its predecessor in one vectorised pass; a mismatch at
# position j of the comparison means a new run starts at row j + 1.
run_breaks = np.flatnonzero(ids[1:] != ids[:-1]) + 1
# Also works for an empty frame (yields [0, 0]) instead of failing on ids[0].
starts = np.concatenate(([0], run_breaks, [len(ids)]))

In [2]:
# Strip the label column and keep only the raw values as a NumPy array.
embeddings = embeddings.drop(columns=['Id']).values
# Uninitialised result matrix: one row per `whales` entry and len(starts)
# columns; column 0 is reserved for the new_whale class.
mean_dist = np.empty(shape=(len(whales), len(starts)), dtype=np.float64)
mean_dist[:, 0] = sys.maxsize  # new_whale class: constant

In [459]:
# Original note: there were computation mistakes before this step.
# Centre both arrays on the per-dimension mean taken over all of their rows.
mean_emb = np.vstack((embeddings, whales)).mean(axis=0)
# In-place subtraction: both arrays are modified, no copies are created.
np.subtract(whales, mean_emb, out=whales)
np.subtract(embeddings, mean_emb, out=embeddings)

--- predict we want to fix ---

In [460]:
# Mean distance between every `whales` row and the embeddings of each class,
# calculated chunk by chunk to prevent RAM OOM.
class_offset = 1  # to compensate new_whale (class 0) - first column in mean_dist
embeddings_offset = 0
splitted_starts = [starts[:len(starts) // 2 + 1], starts[len(starts) // 2:]]
# Debug run: only the first chunk is processed. Iterate over `splitted_starts`
# instead to cover every chunk.
for chunk_starts in [splitted_starts[0]]:
    # Rebase the boundaries into a NEW array. The chunks are views into
    # `starts` that share their boundary element, so an in-place `-=` would
    # modify `starts` and the other chunk. A dedicated loop name also keeps
    # the global `starts` from being overwritten by the loop.
    local_starts = chunk_starts - embeddings_offset
    curr_embeddings = embeddings[local_starts[0]:local_starts[-1]]

    concat = np.concatenate((curr_embeddings, whales), axis=0)

    # Squared Euclidean distances from the Gram matrix:
    # |a - b|^2 = |a|^2 - 2 a.b + |b|^2
    prod = np.dot(concat, np.transpose(concat))
    sq_norms = np.reshape(np.diag(prod), (-1, 1))

    dist = sq_norms - 2.0 * prod + np.transpose(sq_norms)
    # Keep only the block with `whales` rows and `curr_embeddings` columns.
    dist = dist[curr_embeddings.shape[0]:, :curr_embeddings.shape[0]]
    # Clamp negative values to zero before taking the square root.
    dist = np.sqrt(np.maximum(dist, 0.0))

    print(curr_embeddings.shape)
    print(concat.shape)
    print(prod.shape)
    print(sq_norms.shape)
    print(dist.shape)

    # Mean distance to the first class of the chunk, computed once and reused.
    i = 0
    res1 = np.mean(dist[:, local_starts[i]:local_starts[i + 1]], axis=1)
    print(res1)

(7885, 128)
(15845, 128)
(15845, 15845)
(15845, 1)
(7960, 7885)
[0.01175699 0.0137426  0.01540905 ... 0.01161361 0.01867516 0.01478532]


--- same predict but only for class 1 ---

In [461]:
# Same distance computation, restricted to the single class at position p of
# the first chunk. The class slice is taken directly rather than through a
# one-element `for starts in ...` loop, which overwrote the global `starts`.
p = 0
class_start, class_stop = splitted_starts[0][p], splitted_starts[0][p + 1]
curr_embeddings = embeddings[class_start:class_stop]

concat = np.concatenate((curr_embeddings, whales), axis=0)

# Squared Euclidean distances from the Gram matrix:
# |a - b|^2 = |a|^2 - 2 a.b + |b|^2
prod = np.dot(concat, np.transpose(concat))
sq_norms = np.reshape(np.diag(prod), (-1, 1))

dist = sq_norms - 2.0 * prod + np.transpose(sq_norms)
# Keep only the block with `whales` rows and `curr_embeddings` columns.
dist = dist[curr_embeddings.shape[0]:, :curr_embeddings.shape[0]]
# Clamp negative values to zero before taking the square root.
dist = np.sqrt(np.maximum(dist, 0.0))

print(curr_embeddings.shape)
print(concat.shape)
print(prod.shape)
print(sq_norms.shape)
print(dist.shape)

# Mean over the class's embeddings, computed once and reused for the print.
res2 = np.mean(dist, axis=1)
print(res2)

(1, 128)
(7961, 128)
(7961, 7961)
(7961, 1)
(7960, 1)
[0.01175699 0.0137426  0.01540905 ... 0.01161345 0.01867506 0.01478532]


--- same but computed directly with np.linalg.norm (a scipy cdist check follows below) ---

In [469]:
# Reference value for the class at position p of the first chunk: the mean
# Euclidean distance from every `whales` row to the class's embeddings,
# computed directly from the differences (no Gram matrix).
p = 0
class_embeddings = embeddings[splitted_starts[0][p]:splitted_starts[0][p + 1]]
# Broadcast to (n_whales, n_class_embeddings, dim) so that every embedding of
# the class contributes; using only class_embeddings[0] would ignore the rest
# whenever a class has more than one embedding.
diffs = whales[:, np.newaxis, :] - class_embeddings[np.newaxis, :, :]
res3 = np.linalg.norm(diffs, axis=2).mean(axis=1)

In [470]:
# Display res3 (distances computed directly with np.linalg.norm).
res3

array([0.01175668, 0.01374262, 0.01540905, ..., 0.01161356, 0.01867518,
       0.01478527], dtype=float32)

In [471]:
# Display res1 (mean distances from the chunked Gram-matrix computation).
res1

array([0.01175699, 0.0137426 , 0.01540905, ..., 0.01161361, 0.01867516,
       0.01478532], dtype=float32)

In [472]:
# Number of entries where res1 and res2 differ by more than 1e-5.
# The absolute value makes the check two-sided, so entries where res2 is the
# larger of the two are counted as well.
np.count_nonzero(np.abs(res1 - res2) > 0.00001)

0

In [473]:
# Number of entries where res1 and res3 differ by more than 1e-5.
# The absolute value makes the check two-sided, so entries where res3 is the
# larger of the two are counted as well.
np.count_nonzero(np.abs(res1 - res3) > 0.00001)

0

In [474]:
from scipy.spatial import distance

# Cross-check with scipy: Euclidean distance from every `whales` row to each
# of the first 10 rows of `embeddings`.
dist = distance.cdist(XA=whales, XB=embeddings[:10], metric='euclidean')

In [475]:
# First column of the cdist result: distance from every `whales` row to
# embeddings[0].
dist[:, 0]

array([0.01175668, 0.01374262, 0.01540905, ..., 0.01161356, 0.01867518,
       0.01478527])

It's ok now!

# Checking whether the embeddings have collapsed

In [1]:
import numpy as np
import pandas as pd
import sys

# Load the stored embeddings without their 'Id' column, plus the raw
# predictions, as plain NumPy arrays.
embeddings = pd.read_pickle("embeddings.pkl").drop(columns=['Id']).values
whales = np.load('raw_predictions.npy')

# Per-dimension mean taken over all rows of both arrays together.
mean_emb = np.vstack((embeddings, whales)).mean(axis=0)

In [2]:
# Display the per-dimension mean of `embeddings` and `whales` combined.
mean_emb

array([   35.185368 ,   588.4183   ,   146.53278  ,  -516.9568   ,
       -1095.1044   ,   847.2858   ,   863.03076  ,   260.9789   ,
         799.50104  ,   101.32626  ,  -543.7229   ,   140.85608  ,
          34.792057 ,  -893.01306  ,  -282.2807   ,   598.01526  ,
       -1771.527    ,   769.5286   ,  -663.92316  ,  -235.6551   ,
         -27.250784 ,    23.282343 ,  -527.0979   ,  -228.61838  ,
         369.523    ,  -466.30685  ,   195.08673  ,  -455.1446   ,
        -315.57278  ,     7.9976273,  1817.505    ,   794.0997   ,
        -330.05298  ,   229.52522  ,  -425.38358  ,   927.7555   ,
         479.65186  ,   383.99796  , -1108.0277   ,  -748.52856  ,
        -701.66327  ,   213.79466  ,   354.54498  ,  -427.03995  ,
         687.25934  ,  -405.96417  ,  -449.22244  ,  -330.4601   ,
         924.6689   ,  -319.75583  ,   590.80035  ,    58.154484 ,
         -71.59812  ,   535.1442   ,   238.516    ,  -888.861    ,
          28.24076  ,   586.5179   ,  1430.151    ,  -671.7427

In [14]:
# Shape of the index array of `embeddings` entries lying more than 0.05 away
# from the mean. The absolute value makes the check two-sided, so entries
# below the mean are counted as well as entries above it.
np.where(np.abs(embeddings - mean_emb) > 0.05)[0].shape

(0,)

In [13]:
# Shape of the index array of `whales` entries lying more than 0.05 away from
# the mean. The absolute value makes the check two-sided, so entries below
# the mean are counted as well as entries above it.
np.where(np.abs(whales - mean_emb) > 0.05)[0].shape

(0,)