In [1]:
import gensim
import logging
import itertools
import math
import glob
import numpy as np
from gensim.models import Word2Vec
import duckdb
import re
import os
import time
import random
from skbio.stats.distance import DistanceMatrix
from scipy.spatial.distance import correlation

In [None]:
'''

This notebook is used to experiment with distance matrices and make sure they 
work before I moive them to a .py file

'''

### Util : Extract vectors and matrices from rand_rep

In [2]:
evo_npy         = "/Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/distances/evo/rand_rep_distance_matrix.npy"

#
# extract vocab vector and matrix from npy file
#                
def extract_evo_matrix_vector_files(npy_file_name):
    print(f"- loading {npy_file_name}")
    npy_f               = open(npy_file_name, 'rb')
    dist_matrix_norm    = np.load(npy_f) #loads second matrix
    vocab_vector        = np.load(npy_f)
    
    return vocab_vector, dist_matrix_norm


# ------------------- GET PFAM IDS FROM EVO VECTOR ----------------------------

#
# vectors in evo matrix have format K1SVA3.1/50-86|PF02829
# need to extract these
#
def extract_evo_pfam_ids(evo_vector):
    pfam_ids = []
    for item in evo_vector:
        #print(f"searching in {item}")
        pfam_search  = re.search("\|(PF.*)", item)
        pfam_id       = pfam_search.group(1)
        #print(f"found {pfam_id}")
        pfam_ids.append(pfam_id)
    return pfam_ids

  pfam_search  = re.search("\|(PF.*)", item)


In [3]:
evo_vocab_vector, evo_dist_matrix = extract_evo_matrix_vector_files(evo_npy)
evo_vocab   = extract_evo_pfam_ids(evo_vocab_vector)
print(f"- rand_rep extracted. number of pfams : {len(evo_vocab)} matrix shape: {evo_dist_matrix.shape}")

- loading /Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/distances/evo/rand_rep_distance_matrix.npy
- rand_rep extracted. number of pfams : 20651 matrix shape: (20651, 20651)


### Calculate distance matrices - use sklearn

In [4]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import euclidean_distances

def get_distances(model_path):
    model           = Word2Vec.load(model_path)
    word_vectors    = model.wv
    
    print('- w2v word vectors shape:', word_vectors.vectors.shape)
    
    # grab the pfam words
    pfams = []
    vocab           = model.wv.key_to_index
    for i, word in enumerate(vocab):
        pfams.append(word)

    # get cosine and normalise
    print('- calculating cosine distances')
    cosine_distance_matrix = cosine_distances(word_vectors.vectors, word_vectors.vectors)
    non_diag_elements       = cosine_distance_matrix[np.triu_indices_from(cosine_distance_matrix, k=1)]
    max_value               = np.max(np.abs(non_diag_elements))
    cosine_distance_matrix_norm = cosine_distance_matrix / max_value
    np.fill_diagonal(cosine_distance_matrix_norm, 0)
    
    # get euclidean and normalise
    print('- calculating euclidean distances')
    euclidean_distance_matrix   = euclidean_distances(word_vectors.vectors, word_vectors.vectors)
    non_diag_elements           = euclidean_distance_matrix[np.triu_indices_from(euclidean_distance_matrix, k=1)]
    max_value                   = np.max(np.abs(non_diag_elements))
    euclidean_distance_matrix_norm = euclidean_distance_matrix / max_value
    np.fill_diagonal(euclidean_distance_matrix_norm, 0)
    
    return pfams, euclidean_distance_matrix, cosine_distance_matrix, euclidean_distance_matrix_norm, cosine_distance_matrix_norm

### RUN ME: Test distances

In [5]:
model_dir       = "/Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/cbow/"
model_name      = 'w2v_20240911_cbow_mc1_w3_v10'
model_path      = model_dir+model_name+'.model'
model           = Word2Vec.load(model_path)

#
# ---------- get distance matrices
#
w2v_vocab, w2v_euc_dist_matrix, w2v_cos_dist_matrix, w2v_euc_dist_matrix_n, w2v_cos_dist_matrix_n = get_distances(model_path)
#
#

- w2v word vectors shape: (15485, 10)
- calculating cosine distances
- calculating euclidean distances


In [6]:
# ---------------------------------------------------------------------------------------------
#           Confirmed many times (last time being 17/09 that distance calcs are correct)
# ---------------------------------------------------------------------------------------------

#
# ----------- check that the w2v distance matrices are correct
#

# randomly select 2 pfams
word_vectors    = model.wv
rnd_pfam_1_idx  = random.randint(0, len(w2v_vocab))
rnd_pfam_1      = w2v_vocab[rnd_pfam_1_idx]
rnd_pfam_2_idx  = random.randint(0, len(w2v_vocab))
rnd_pfam_2      = w2v_vocab[rnd_pfam_2_idx]

# get their vectors
pfam_1_vector = model.wv[rnd_pfam_1]
pfam_2_vector = model.wv[rnd_pfam_2]

print(f"Testing distances for : {rnd_pfam_1} against {rnd_pfam_2}\n")
print(f"Vector 1:\n{pfam_1_vector}\nVector 2:\n{pfam_2_vector}")

# get distances for those vectors directly
w2v_euc_dist        = np.linalg.norm(pfam_1_vector - pfam_2_vector)
cosine_similarity   = np.dot(pfam_1_vector, pfam_2_vector) / (np.linalg.norm(pfam_1_vector) * np.linalg.norm(pfam_2_vector))
w2v_cos_dist        = 1 - cosine_similarity

# get the distances from the matrices
print(f"\n - Euc Distance calculated directly : {w2v_euc_dist} \n - Euc Distance in matrix :{w2v_euc_dist_matrix[rnd_pfam_1_idx, rnd_pfam_2_idx]}")
print(f"\n - Cos Distance calculated directly : {w2v_cos_dist} \n - Cos Distance in matrix :{w2v_cos_dist_matrix[rnd_pfam_1_idx, rnd_pfam_2_idx]}")

#
# ----------- check that converting distance matrices to skbio.stats.distance.DistanceMatrices stil gives the same results
#
print('\nChecking conversion to Distance Matrix...')
w2v_distance_matrix_fl  = w2v_euc_dist_matrix.astype(np.float64)
w2v_vocab_np            = np.array(w2v_vocab)
evo_vocab_np            = np.array(evo_vocab)
            
# not entirely sure I need new matrices as already have one
w2v_dist_matrix_new = DistanceMatrix(w2v_distance_matrix_fl, ids=w2v_vocab_np)
evo_dist_matrix_new = DistanceMatrix(evo_dist_matrix, ids=evo_vocab_np)

print(' - w2v old v new:', w2v_distance_matrix_fl[rnd_pfam_1_idx, rnd_pfam_2_idx], ':', w2v_dist_matrix_new.data[rnd_pfam_1_idx, rnd_pfam_2_idx])
print(' - evo old v new:', evo_dist_matrix[rnd_pfam_1_idx, rnd_pfam_2_idx], ':', evo_dist_matrix_new.data[rnd_pfam_1_idx, rnd_pfam_2_idx])


Testing distances for : PF14736 against PF17987

Vector 1:
[-1.4850833  -0.49669972  1.571958    0.84423685 -0.16456014  0.7867039
  3.2310567   0.7853137   1.8917574  -1.1023983 ]
Vector 2:
[ 0.91540474  0.28253043  1.1064982   1.5303342   0.7691935  -0.8112851
  2.2297175   1.5800493   1.4424465  -0.74132717]

 - Euc Distance calculated directly : 3.528303623199463 
 - Euc Distance in matrix :3.528303623199463

 - Cos Distance calculated directly : 0.31845784187316895 
 - Cos Distance in matrix :0.3184579014778137

Checking conversion to Distance Matrix...
 - w2v old v new: 3.528303623199463 : 3.528303623199463
 - evo old v new: 0.9660087719298246 : 0.9660087719298246


In [7]:
# ---------------------------------------------------------------------------------------------
#           Check if resizing matrices has any impact
#
# 17 Sept 2024 : They work as expected
# ---------------------------------------------------------------------------------------------


print('\nChecking if I reduce the size of the Evo Matrix, does it keep things in order...')
print(f" - Current matrix shapes. \t w2v: {w2v_distance_matrix_fl.shape} evo : {evo_dist_matrix.shape}")

try:
    # create a new evo matrix to only have the pfams that it shares with the w2v one
    new_evo         = evo_dist_matrix_new.filter(w2v_vocab_np, False)
    new_evo_ids     = new_evo.ids
    # create a new w2v matrix to only have the pfams that it shares with the new evo one!
    new_w2v         = w2v_dist_matrix_new.filter(new_evo_ids, False)
    new_w2v_ids     = new_w2v.ids
except Exception as e:
    print('Error', e)

print(f" - New matrix shapes. \t w2v: {new_w2v.shape} evo : {new_evo.shape}")



Checking if I reduce the size of the Evo Matrix, does it keep things in order...
 - Current matrix shapes. 	 w2v: (15485, 15485) evo : (20651, 20651)
 - New matrix shapes. 	 w2v: (15040, 15040) evo : (15040, 15040)


'\ncorrcoeff = np.corrcoef(new_evo, new_w2v)\nprint(corrcoeff)\n'

In [14]:
# ---------------------------------------------------------------------------------------------
#           Check that distances are stil correct based upon re-ordered matrices
#
# 17 Sept 2024 : They work as expected
# ---------------------------------------------------------------------------------------------

idx_1 = random.randint(0, len(new_w2v_ids))
idx_2 = random.randint(0, len(new_w2v_ids))

# the position of pfam ids should be the same
print('Checking that positions in vocabs are the same.')
print(' - ', new_w2v_ids[idx_1], new_evo_ids[idx_1])
print(' - ', new_w2v_ids[idx_2], new_evo_ids[idx_2])

pf_1 = new_w2v_ids[idx_1]
pf_2 = new_evo_ids[idx_2]

print('\nRandom pfams to compare :',pf_1, pf_2)

# find indices of pfam items in original matrices and their distance entries
orig_w2v_idx_1 = w2v_vocab.index(pf_1)
orig_w2v_idx_2 = w2v_vocab.index(pf_2)
orig_evo_idx_1 = evo_vocab.index(pf_1)
orig_evo_idx_2 = evo_vocab.index(pf_2)

orig_w2v_dist  = w2v_euc_dist_matrix[orig_w2v_idx_1, orig_w2v_idx_2]
orig_evo_dist   = evo_dist_matrix[orig_evo_idx_1, orig_evo_idx_2]

print(' - dist entry in orig w2v matrix :', orig_w2v_dist)
print(' - dist entry in new w2v matrix :', new_w2v[idx_1, idx_2])

print(' - dist entry in orig evo matrix :', orig_evo_dist)
print(' - dist entry in new evo matrix :', new_evo[idx_1, idx_2])




'''
print(new_evo.data.shape)
print(new_w2v.data.shape)

print(new_evo[idx_1, idx_2])
print(new_w2v[idx_1, idx_2])


from scipy.spatial.distance import is_valid_dm
print(is_valid_dm(new_evo))
print(is_valid_dm(new_w2v))

#correlation = correlation(new_evo, new_w2v)
#print(correlation)

# the matrix is actually stored in the data attribute of Distance Matrix
try:
    corrcoeff = np.corrcoef(new_evo.data, new_w2v.data)
    mean_corr = np.mean(corrcoeff)
    print(mean_corr)
except Exception as e:
    print('Error ', e)
    '''


Checking that positions in vocabs are the same.
 -  PF00939 PF00939
 -  PF11627 PF11627

Random pfams to compare : PF00939 PF11627
 - dist entry in orig w2v matrix : 11.363613
 - dist entry in new w2v matrix : 11.36361312866211
 - dist entry in orig evo matrix : 0.9703947368421053
 - dist entry in new evo matrix : 0.9703947368421053


"\nprint(new_evo.data.shape)\nprint(new_w2v.data.shape)\n\nprint(new_evo[idx_1, idx_2])\nprint(new_w2v[idx_1, idx_2])\n\n\nfrom scipy.spatial.distance import is_valid_dm\nprint(is_valid_dm(new_evo))\nprint(is_valid_dm(new_w2v))\n\n#correlation = correlation(new_evo, new_w2v)\n#print(correlation)\n\n# the matrix is actually stored in the data attribute of Distance Matrix\ntry:\n    corrcoeff = np.corrcoef(new_evo.data, new_w2v.data)\n    mean_corr = np.mean(corrcoeff)\n    print(mean_corr)\nexcept Exception as e:\n    print('Error ', e)\n    "

### Matrix Correlations

In [20]:
# ---------------------------------------------------------------------------------------------
#           Perform simple matrix correlation
#
# 17 Sept 2024 : 
# ---------------------------------------------------------------------------------------------

# To get the correlation need to work with normalised distance matrices
# w2v_euc_dist_matrix_n, w2v_cos_dist_matrix_n

#
# -------------------------- Convert normalised w2v and evo to scikitbio.DistanceMatrix -------------------
#
from skbio.stats.distance import DistanceMatrix

print('\nConverting to Distance Matrix ahead of correlation and distance calculations...')
w2v_euc_dist_matrix_n_fl   = w2v_euc_dist_matrix_n.astype(np.float64)

w2v_vocab_np            = np.array(w2v_vocab)
evo_vocab_np            = np.array(evo_vocab)
            
# Convert existing matrices to DistanceMatrix
w2v_dist_matrix_new = DistanceMatrix(w2v_euc_dist_matrix_n_fl, ids=w2v_vocab_np)
evo_dist_matrix_new = DistanceMatrix(evo_dist_matrix, ids=evo_vocab_np)

#
# ------------------ Modify to have only shared pfam ids (have checked above that this works) -------------------
#
print(f" - Current matrix shapes. \t w2v: {w2v_distance_matrix_fl.shape} evo : {evo_dist_matrix.shape}")

try:
    # create a new evo matrix to only have the pfams that it shares with the w2v one
    new_evo         = evo_dist_matrix_new.filter(w2v_vocab_np, False)
    new_evo_ids     = new_evo.ids
    # create a new w2v matrix to only have the pfams that it shares with the new evo one!
    new_w2v         = w2v_dist_matrix_new.filter(new_evo_ids, False)
    new_w2v_ids     = new_w2v.ids
except Exception as e:
    print('Error', e)

print(f" - New matrix shapes. \t w2v: {new_w2v.shape} evo : {new_evo.shape}")


Converting to Distance Matrix ahead of correlation and distance calculations...
 - Current matrix shapes. 	 w2v: (15485, 15485) evo : (20651, 20651)
 - New matrix shapes. 	 w2v: (15040, 15040) evo : (15040, 15040)


In [16]:
# With new matrices, can get the correlations
try:
    corrcoeff = np.corrcoef(new_w2v.data, new_evo.data)
    mean_corr = np.mean(corrcoeff)
    print(mean_corr)
except Exception as e:
    print('Error ', e)

0.3481976741193307


In [50]:
# ---------------- Standard correlation comparison
#
# also try with scipy - need to extract data as 1D
#
from scipy.spatial.distance import correlation
from skbio.stats.distance import mantel

# extract in condesed form - basically a 1D array
new_w2v_condensed = new_w2v.condensed_form()
new_evo_condensed = new_evo.condensed_form()
print(len(new_w2v_condensed), len(new_evo_condensed))
my_correlation = correlation(new_w2v_condensed, new_evo_condensed)
print(' - scipy correlation:', my_correlation)

# flatten
new_w2v_flat = new_w2v.data.flatten()
new_evo_flat = new_evo.data.flatten()
my_correlation_flat = correlation(new_w2v_flat, new_evo_flat)
print(' - scipy correlation (flat):', my_correlation_flat)

# mantel
n=50
corr_coeff, p_value, num = mantel(new_w2v, new_evo, permutations=n, strict=False)
print(f"- mantel test corr : {round(corr_coeff,4)} p_val : {round(p_value,4)} num: {num}.\n")

113093280 113093280
 - scipy correlation: 0.9562341407339019
 - scipy correlation (flat): 0.9544681116068903
- mantel test corr : 0.0438 p_val : 0.0196 num: 15040.



In [49]:
# condensing the matrix reoves the diagonals and only takes the upper (or lower triangle)
# flattening it flattens the entire matrix and returns all variables

print('w2v shape:', new_w2v.data.shape)
print('w2v condensed length:', len(new_w2v_condensed))
print('w2v flattened length:', len(new_w2v_flat))

print('---------------------')
print(new_w2v_condensed[0])
print(new_w2v_flat[0])
print(new_w2v[0,0])

print('---------------------')
print(new_w2v_condensed[0])
print(new_w2v_flat[1])
print(new_w2v[0,1])

print('---------------------')
print(new_w2v_condensed[1])
print(new_w2v_flat[2])
print(new_w2v[0,2])

print('---------------------')
print(new_w2v_condensed[15029])
print(new_w2v_flat[15030])
print(new_w2v[0,15030])

print('---------------------')
print(new_w2v_condensed[15039])
print(new_w2v_flat[15042])
print(new_w2v[1,2])

print('---------------------')
print(new_w2v_condensed[15035])
print(new_w2v_condensed[15036])
print(new_w2v_condensed[15037])
print(new_w2v_condensed[15038])
print(new_w2v_condensed[15039])
print(new_w2v_condensed[15040])
print(new_w2v_flat[15040])
print(new_w2v[1,0])

w2v shape: (15040, 15040)
w2v condensed length: 113093280
w2v flattened length: 226201600
---------------------
0.5567417740821838
0.0
0.0
---------------------
0.5567417740821838
0.5567417740821838
0.5567417740821838
---------------------
0.5339210629463196
0.5339210629463196
0.5339210629463196
---------------------
0.4893765151500702
0.4893765151500702
0.4893765151500702
---------------------
0.3729321360588074
0.3729321360588074
0.3729321360588074
---------------------
0.48700857162475586
0.4913928806781769
0.4821048676967621
0.4885183274745941
0.3729321360588074
0.3890213072299957
0.5567417740821838
0.5567417740821838


In [21]:
# -------------- Mantel test comparision
from skbio.stats.distance import mantel

n=50
corr_coeff, p_value, num = mantel(new_w2v, new_evo, permutations=n, strict=False)

print(f"- mantel test corr : {round(corr_coeff,4)} p_val : {round(p_value,4)} num: {num}.\n")


- mantel test corr : 0.0438 p_val : 0.0196 num: 15040.



### Try to resize randrep

In [43]:
w2v_distance_matrix_fl = euclidean_distance_matrix.astype(np.float64)
w2v_vocab_np = np.array(w2v_vocab)
evo_vocab_np = np.array(evo_vocab)
            
# not entirely sure I need new matrices as alreaady have one
w2v_dist_matrix = DistanceMatrix(w2v_distance_matrix_fl, ids=w2v_vocab_np)
evo_dist_matrix = DistanceMatrix(evo_dist_matrix, ids=evo_vocab_np)

## --------- OLDER STUFF

In [4]:
from pathlib import Path

model_dir       = "/Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/"
def find_files(directory):
    files_info = []
    # Traverse the directory recursively
    for file_path in Path(directory).rglob(f'*model'):
        if file_path.is_file():  # Check if it's a file
            filename = file_path.stem  # Get the filename without extension
            file_extension = file_path.suffix  # Get the file extension
            files_info.append((str(file_path), filename, file_extension))
    
    return files_info


models_info = find_files(model_dir)

for model_info in models_info:
    print(f"Model Path: {model_info[0]}, \tModel name: {model_info[1]}, Extension: {model_info[2]}")


Model Path: /Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/skip/w2v_20240912_skip_mc8_w13_v50.model, 	Model name: w2v_20240912_skip_mc8_w13_v50, Extension: .model
Model Path: /Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/skip/w2v_20240911_sg1_mc1_w8_v25.model, 	Model name: w2v_20240911_sg1_mc1_w8_v25, Extension: .model
Model Path: /Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/skip/w2v_20240912_skip_mc8_w13_v5.model, 	Model name: w2v_20240912_skip_mc8_w13_v5, Extension: .model
Model Path: /Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/skip/w2v_20240910_sg1_mc8_w44_v5.model, 	Model name: w2v_20240910_sg1_mc8_w44_v5, Extension: .model
Model Path: /Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/skip/w2v_20240912_skip_mc8_w21_v10.model, 	Model name: w2v_20240912_skip_mc8_w21_v10, Extension: .model
Model Path: /Users/patrick/dev/ucl/word2vec/comp_0158_msc_project/data/models/skip/w2v_20240911_skip

### Calculate distances for pfam entries in a model

In [35]:
def calculate_distances(model_dir, model_name, vocab_dir):
    vocab_file = vocab_dir+model_name+'_vocab.txt'
    
    # get pfam ids from the models vocab
    #print(f"Encoding vocab file: {vocab_file}")
    model = Word2Vec.load(model_dir+model_name+'.model')
    pfam_ids = []
    with open(vocab_file, 'r') as vf:
        for line in vf:
            line = line.rstrip()
            line =  line.lstrip()
            if line.startswith('PF'):
                pfam_ids.append(line)
                #encoding = model.wv[line]
                #print(f"Encoding :{line}: {encoding}")
    #print(pfam_ids)
    vf.close()
    
    # calculate matrix size and initialise
    num_entries = len(pfam_ids)
    distance_matrix = np.zeros((num_entries, num_entries))
    
    # create empty distance matrix
    #print(f"Calculating distances for {num_entries} pfam ids under model {model_name}")
    error_count = 0
    success_count = 0
    s = time.time()
    for i in range(num_entries):
        for j in range(i+1, num_entries):
            pfam_1 = pfam_ids[i]
            pfam_2 = pfam_ids[j]
            try:
                v1 = model.wv[pfam_1]
                v2 = model.wv[pfam_2]
                distance = np.linalg.norm(v1 - v2)
                distance_matrix[i][j] = distance
                success_count +=1
            except Exception as e: # a bit convoluted, but want to print out the missing pfam
                #print(f"Exception calculating {pfam_1} to {pfam_2} : {e.args[0]}")
                missing = re.search("Key '(.*)' not", e.args[0] )
                print(missing.group(1))
                #of.write(missing.group(1) + '\n')
                error_count +=1
                continue
    # close the output file
    #of.close()
    
    output_name = vocab_dir+model_name+"_dist"
    np.save(output_name, distance_matrix)
    e = time.time()
    print(f"distance matrix computed for model: {model_name}. num words: {num_entries}. time: {round(e-s,2)}s. success: {success_count} fail: {error_count} output: {output_name}.npy")
    

In [68]:
# get distances for one model
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

# get vocab for a particular file
model_name = "w2v_20240811_v5_w5_mc3"
#calculate_distances(model_dir, model_name, vocab_dir)

In [69]:
# get distances for all models
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

file_list = glob.glob(os.path.join(model_dir, '*.model'))
for file_path in file_list:
    model_name_s  = re.search("(w2v_.*)\.model", file_path)
    model_name         = model_name_s.group(1)
    #calculate_distances(model_dir, model_name, vocab_dir)


  model_name_s  = re.search("(w2v_.*)\.model", file_path)


### Get vocab for a model for pfam words only

In [45]:
def get_model_vocab(vocab_dir, model_name):
    # extracts pfam ids from the vecotr

    pfam_ids = []
    vocab_file = vocab_dir+model_name+'_vocab.txt'
    
    with open(vocab_file, 'r') as vf:
        for line in vf:
            line = line.rstrip()
            line =  line.lstrip()
            if line.startswith('PF'):
                pfam_ids.append(line)
    vf.close()
    
    np_pfam_ids = np.array(pfam_ids)
    
    return pfam_ids, np_pfam_ids

### REMOVE ROWS/COLS FROM TARGET IF NOT IN SOUCRE

In [75]:
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

# models to compare - ulitmately we want a new target model with only entries that are in the soure as well
source_model_name = "w2v_20240811_v5_w5_mc5"
target_model_name = "w2v_20240811_v5_w5_mc3"

# get te vocab list for both models we wish to compare
source_vocab, np_source_vocab = get_model_vocab(vocab_dir, source_model_name)
target_vocab, np_target_vocab = get_model_vocab(vocab_dir, target_model_name)

print(f"Model {source_model_name} vocab size: {len(source_vocab)}.")
print(f"Model {target_model_name} vocab size: {len(target_vocab)}.")

# create a True/False mask with True where the item from the source array is in the target array
mask = np.isin(np_target_vocab, np_source_vocab)
print(f"Shared model words: {mask.sum()}.")

# load the target matrix
distance_matrix_name = vocab_dir+target_model_name+'_dist.npy'
print(f"Loading target dist matrix {distance_matrix_name}")
dist_matrix = np.load(distance_matrix_name)


# remove non-common rows/columns from the target matrix

mask                = np.isin(np_target_vocab, np_source_vocab)
dist_matrix_subset  = dist_matrix[np.ix_(mask, mask)]

print(f"Target dist matrix reduced from {dist_matrix.shape} to {dist_matrix_subset.shape}")

Model w2v_20240811_v5_w5_mc5 vocab size: 12802.
Model w2v_20240811_v5_w5_mc3 vocab size: 13529.
Shared model words: 12802.
Loading target dist matrix /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/w2v_20240811_v5_w5_mc3_dist.npy
Target dist matrix reduced from (13529, 13529) to (12802, 12802)


In [66]:
# sample code to test the use of masks
# compare two vectors and find the indices in the second for common entries
source_v = np.array(['a', 'b', 'f'])
target_v = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g'])

mask = np.isin(target_v, source_v)
print(mask)


# apply this mask to a target matrix - the mask should be applied to ros and columns
target_matrix = np.array([  [11, 12, 13, 14, 15, 16], \
                            [21, 22, 23, 24, 25, 26], \
                            [31, 32, 33, 34, 35, 36], \
                            [41, 42, 43, 44, 45, 46], \
                            [51, 52, 53, 54, 55, 56], \
                            [61, 62, 63, 64, 55, 66]])

# new matrix has the dimensions corresponding to the number of shared items in the two original vectors
# the entries in the matrix are the row/column entries that had 'True' in the mask
target_subset = target_matrix[np.ix_(mask, mask)]
print(target_subset)


[ True  True False False False  True False]
[[11 12 16]
 [21 22 26]
 [61 62 66]]


In [51]:
# sample code to test the use of masks - this modifies the target to only include
# elements that are in the source, but doe not reorder them - see next section

# source and target vectors - need to get the items in target that are only in source
source_v = np.array(['a', 'g', 'c'])
target_v = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# this tells us whcih items in target are also in source
mask = np.isin(target_v, source_v)
print(mask)

# create a target matrix to test
target_matrix = np.array([  [11, 12, 13, 14, 15, 16, 17], \
                            [21, 22, 23, 24, 25, 26, 27], \
                            [31, 32, 33, 34, 35, 36, 37], \
                            [41, 42, 43, 44, 45, 46, 47], \
                            [51, 52, 53, 54, 55, 56, 57], \
                            [61, 62, 63, 64, 55, 66, 67], \
                            [71, 72, 73, 74, 75, 76, 77]])

# apply the mask to rows and columns
target_subset = target_matrix[np.ix_(mask, mask)]

# print output
print(target_subset)


[ True False  True False False False  True]
[[11 13 17]
 [31 33 37]
 [71 73 77]]


In [3]:
# this code reduces the target matrix to only have entries according to the 
# common elements between source and target vectors. it also reorders
# the target matrix so that the order of its rows and columns matches
# those of the source

# source and target vectors of elements - for example if source has 'd' 'a' in that order
# we need to find 'd' and 'a' in the target vector and the reorder the matrix so that
# a) it only contains rows/cols in the source vector
# b) those rows and cols appear in the same order
source_v = np.array(['d', 'b'])
target_v = np.array(['a', 'b', 'c', 'd'])

target_matrix = np.array([  [10, 11, 12, 13],
                            [20, 21, 22, 23],
                            [30, 31, 32, 33],
                            [40, 41, 42, 43]])

reorder_indices = []
for item in source_v:
    index = np.where(target_v == item)[0]
    reorder_indices.append(index[0])

print('indices needed in target matrix and in new order', reorder_indices, '\n')

reordered_matrix = target_matrix[reorder_indices, :]
reordered_matrix = reordered_matrix[:, reorder_indices]

# Print the result
print('\n', reordered_matrix)

indices needed in target matrix and in new order [3, 1] 


 [[43 41]
 [23 21]]


In [36]:
import numpy as np

# Original matrix
matrix = np.array([[10, 11, 12, 13],
                   [20, 21, 22, 23],
                   [30, 31, 32, 33],
                   [40, 41, 42, 43]])

# Indices to reorder the rows and columns
# current row 2 becomes first, row 0 comes second, row 1 comes last  
reorder_indices = [1, 3]

# Reorder rows
reordered_matrix = matrix[reorder_indices, :]
reordered_matrix = reordered_matrix[:, reorder_indices]

# Print the result
print('\n', reordered_matrix)


reorder_indices = [3, 1]

# Reorder rows
reordered_matrix = matrix[reorder_indices, :]
reordered_matrix = reordered_matrix[:, reorder_indices]

# Print the result
print('\n', reordered_matrix)



 [[21 23]
 [41 43]]

 [[43 41]
 [23 21]]
