In [40]:
import numpy as np
from os import walk, listdir
from os.path import isfile, join
import pickle as pk

from sklearn.decomposition import PCA


In [4]:
def fit_transform_embeds(embeds):
    pca = PCA(n_components=2)
    pca.fit(embeds)
    pk.dump(pca, open("pca_transformer.pkl", "wb"))

    pcas = pca.transform(embeds)
    np.save('unbalanced_results/tr_pcas.npy', pcas)
    

In [47]:
def transform_embeds(model_name, embeds, init, load = True):
    # if load is True: load the compiled PCA model on training set,
    # otherwise build new transformer only on test data
    if load:
        pca = pk.load(open(f'projection_models/{model_name}.pkl', 'rb'))
    else:
        pca = PCA(n_components=2)
        pca.fit(embeds)
    
    #save projection model
    pk.dump(pca, open("projection_models/pca_test_transformer.pkl", "wb"))
    
    #save projected embeddings
    pcas = pca.transform(embeds)
    np.save(f'pca_projected_embdes/{init}_pcas.npy', pcas)

In [43]:
#Load all test file names

In [44]:
test_path = 'test_data'
gfiles = []
test_dirs = listdir(test_path)
for tdirs in test_dirs:
    gfiles.extend([join(test_path, tdirs, f) for f in listdir(join(test_path, tdirs))])

In [45]:
# Load all test sets embedding and construct projection space

In [48]:
embeds = []
for embed in gfiles:
    embeds.append(np.load(embed))

transform_embeds('', np.array(embeds), 'test_sets', load = False)

In [75]:
def find_similarities(pca_list, fnames, fun=1):
    t_pcas = np.load(f'pca_projected_embdes/{pca_list}.npy')
    
    total_distance = []
    for t_embed, t_fname in zip(t_pcas, fnames):
        
        if fun ==1 :
            dist = abs(t_embed - t_pcas).sum(axis=1)
        if fun ==2 :
            dist = np.square(t_embed - t_pcas).sum(axis=1)
        if fun ==3 :
            dist = np.sqrt(np.square(t_embed - t_pcas).sum(axis=1))
            
        total_distance.append(dist)

    # df = pd.DataFrame(np.array(total_distance))
    # df.columns = t_fnames
    # df.index = t_fnames
    # df.to_csv("distance.csv", index=True, header=True)
    
    return np.array(total_distance)


In [71]:
distances = find_similarities(pca_list='test_sets_pcas', fnames=gfiles, fun=1)
n_candidates = 10+1
candidate = 1

candidate_distances = distances[n_candidates]
min_locs = np.argsort(candidate_distances)

similars_names = np.array(gfiles)[min_locs[1:n_candidates]]
similars_dist = candidate_distances[min_locs[1:n_candidates]]

print(similars_names)
print(similars_dist)

['test_data/sanofi_v1/data_PMP_0020.npy'
 'test_data/sanofi_v2/data_PMP_0020.npy'
 'test_data/sanofi_v2/data_PMP_0013.npy'
 'test_data/sanofi_v2/data_PMP_0014.npy'
 'test_data/sanofi_v2/data_PMP_0016.npy'
 'test_data/sanofi_v2/data_PMP_0015.npy'
 'test_data/sanofi_v1/data_BETA_CNC_0003.npy'
 'test_data/sanofi_v1/data_BETA_PMP_0001.npy'
 'test_data/sanofi_v2/data_PMP_0010 bis.npy'
 'test_data/sanofi_v2/data_PMP_0009.npy']
[1.03947124 1.03947267 1.98496412 1.98496417 1.98496418 1.98496418
 2.94707614 3.93913181 5.94540238 6.08913283]


In [74]:
distances = find_similarities(pca_list='test_sets_pcas', fnames=gfiles, fun=2)
n_candidates = 10+1
candidate = 1

candidate_distances = distances[n_candidates]
min_locs = np.argsort(candidate_distances)

similars_names = np.array(gfiles)[min_locs[1:n_candidates]]
similars_dist = candidate_distances[min_locs[1:n_candidates]]

print(similars_names)
print(similars_dist)

['test_data/sanofi_v1/data_PMP_0020.npy'
 'test_data/sanofi_v2/data_PMP_0020.npy'
 'test_data/sanofi_v2/data_PMP_0013.npy'
 'test_data/sanofi_v2/data_PMP_0014.npy'
 'test_data/sanofi_v2/data_PMP_0015.npy'
 'test_data/sanofi_v2/data_PMP_0016.npy'
 'test_data/sanofi_v1/data_BETA_CNC_0003.npy'
 'test_data/sanofi_v1/data_BETA_PMP_0001.npy'
 'test_data/sanofi_v1/data_FLT_0002 rev 1.npy'
 'test_data/sanofi_v2/data_PMP_0048_draft.npy']
[ 0.61703601  0.61703785  2.84942288  2.84942293  2.84942297  2.84942297
  7.07631692 11.25967478 24.5529229  24.78779001]
