In [2]:
import gensim
import logging
import itertools
import math
import glob
import numpy as np
from gensim.models import Word2Vec
import duckdb
import re
import os
import time

#### ALL THIS CODE IS NOW IN w2vdistancetools.py AND CALLED FROM model_orchestrator.py

In [2]:
db_string = "/Users/patrick/dev/ucl/comp0158_mscproject/database/w2v_20240731_test.db"

### Calculate distances for pfam entries in a model

In [35]:
def calculate_distances(model_dir, model_name, vocab_dir):
    vocab_file = vocab_dir+model_name+'_vocab.txt'
    
    # get pfam ids from the models vocab
    #print(f"Encoding vocab file: {vocab_file}")
    model = Word2Vec.load(model_dir+model_name+'.model')
    pfam_ids = []
    with open(vocab_file, 'r') as vf:
        for line in vf:
            line = line.rstrip()
            line =  line.lstrip()
            if line.startswith('PF'):
                pfam_ids.append(line)
                #encoding = model.wv[line]
                #print(f"Encoding :{line}: {encoding}")
    #print(pfam_ids)
    vf.close()
    
    # calculate matrix size and initialise
    num_entries = len(pfam_ids)
    distance_matrix = np.zeros((num_entries, num_entries))
    
    # create empty distance matrix
    #print(f"Calculating distances for {num_entries} pfam ids under model {model_name}")
    error_count = 0
    success_count = 0
    s = time.time()
    for i in range(num_entries):
        for j in range(i+1, num_entries):
            pfam_1 = pfam_ids[i]
            pfam_2 = pfam_ids[j]
            try:
                v1 = model.wv[pfam_1]
                v2 = model.wv[pfam_2]
                distance = np.linalg.norm(v1 - v2)
                distance_matrix[i][j] = distance
                success_count +=1
            except Exception as e: # a bit convoluted, but want to print out the missing pfam
                #print(f"Exception calculating {pfam_1} to {pfam_2} : {e.args[0]}")
                missing = re.search("Key '(.*)' not", e.args[0] )
                print(missing.group(1))
                #of.write(missing.group(1) + '\n')
                error_count +=1
                continue
    # close the output file
    #of.close()
    
    output_name = vocab_dir+model_name+"_dist"
    np.save(output_name, distance_matrix)
    e = time.time()
    print(f"distance matrix computed for model: {model_name}. num words: {num_entries}. time: {round(e-s,2)}s. success: {success_count} fail: {error_count} output: {output_name}.npy")
    

In [68]:
# get distances for one model
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

# get vocab for a particular file
model_name = "w2v_20240811_v5_w5_mc3"
#calculate_distances(model_dir, model_name, vocab_dir)

In [69]:
# get distances for all models
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

file_list = glob.glob(os.path.join(model_dir, '*.model'))
for file_path in file_list:
    model_name_s  = re.search("(w2v_.*)\.model", file_path)
    model_name         = model_name_s.group(1)
    #calculate_distances(model_dir, model_name, vocab_dir)


  model_name_s  = re.search("(w2v_.*)\.model", file_path)


### Get vocab for a model for pfam words only

In [45]:
def get_model_vocab(vocab_dir, model_name):
    # extracts pfam ids from the vecotr

    pfam_ids = []
    vocab_file = vocab_dir+model_name+'_vocab.txt'
    
    with open(vocab_file, 'r') as vf:
        for line in vf:
            line = line.rstrip()
            line =  line.lstrip()
            if line.startswith('PF'):
                pfam_ids.append(line)
    vf.close()
    
    np_pfam_ids = np.array(pfam_ids)
    
    return pfam_ids, np_pfam_ids

### REMOVE ROWS/COLS FROM TARGET IF NOT IN SOUCRE

In [75]:
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

# models to compare - ulitmately we want a new target model with only entries that are in the soure as well
source_model_name = "w2v_20240811_v5_w5_mc5"
target_model_name = "w2v_20240811_v5_w5_mc3"

# get te vocab list for both models we wish to compare
source_vocab, np_source_vocab = get_model_vocab(vocab_dir, source_model_name)
target_vocab, np_target_vocab = get_model_vocab(vocab_dir, target_model_name)

print(f"Model {source_model_name} vocab size: {len(source_vocab)}.")
print(f"Model {target_model_name} vocab size: {len(target_vocab)}.")

# create a True/False mask with True where the item from the source array is in the target array
mask = np.isin(np_target_vocab, np_source_vocab)
print(f"Shared model words: {mask.sum()}.")

# load the target matrix
distance_matrix_name = vocab_dir+target_model_name+'_dist.npy'
print(f"Loading target dist matrix {distance_matrix_name}")
dist_matrix = np.load(distance_matrix_name)


# remove non-common rows/columns from the target matrix

mask                = np.isin(np_target_vocab, np_source_vocab)
dist_matrix_subset  = dist_matrix[np.ix_(mask, mask)]

print(f"Target dist matrix reduced from {dist_matrix.shape} to {dist_matrix_subset.shape}")

Model w2v_20240811_v5_w5_mc5 vocab size: 12802.
Model w2v_20240811_v5_w5_mc3 vocab size: 13529.
Shared model words: 12802.
Loading target dist matrix /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/w2v_20240811_v5_w5_mc3_dist.npy
Target dist matrix reduced from (13529, 13529) to (12802, 12802)


In [66]:
# sample code to test the use of masks
# compare two vectors and find the indices in the second for common entries
source_v = np.array(['a', 'b', 'f'])
target_v = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g'])

mask = np.isin(target_v, source_v)
print(mask)


# apply this mask to a target matrix - the mask should be applied to ros and columns
target_matrix = np.array([  [11, 12, 13, 14, 15, 16], \
                            [21, 22, 23, 24, 25, 26], \
                            [31, 32, 33, 34, 35, 36], \
                            [41, 42, 43, 44, 45, 46], \
                            [51, 52, 53, 54, 55, 56], \
                            [61, 62, 63, 64, 55, 66]])

# new matrix has the dimensions corresponding to the number of shared items in the two original vectors
# the entries in the matrix are the row/column entries that had 'True' in the mask
target_subset = target_matrix[np.ix_(mask, mask)]
print(target_subset)


[ True  True False False False  True False]
[[11 12 16]
 [21 22 26]
 [61 62 66]]


In [51]:
# sample code to test the use of masks - this modifies the target to only include
# elements that are in the source, but doe not reorder them - see next section

# source and target vectors - need to get the items in target that are only in source
source_v = np.array(['a', 'g', 'c'])
target_v = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# this tells us whcih items in target are also in source
mask = np.isin(target_v, source_v)
print(mask)

# create a target matrix to test
target_matrix = np.array([  [11, 12, 13, 14, 15, 16, 17], \
                            [21, 22, 23, 24, 25, 26, 27], \
                            [31, 32, 33, 34, 35, 36, 37], \
                            [41, 42, 43, 44, 45, 46, 47], \
                            [51, 52, 53, 54, 55, 56, 57], \
                            [61, 62, 63, 64, 55, 66, 67], \
                            [71, 72, 73, 74, 75, 76, 77]])

# apply the mask to rows and columns
target_subset = target_matrix[np.ix_(mask, mask)]

# print output
print(target_subset)


[ True False  True False False False  True]
[[11 13 17]
 [31 33 37]
 [71 73 77]]


In [56]:
# this code reduces the target matrix to only have entries according to the 
# common elements between source and target vectors. it also reorders
# the target matrix so that the order of its rows and columns matches
# those of the source

# source and target vectors of elements - for example if source has 'd' 'a' in that order
# we need to find 'd' and 'a' in the target vector and the reorder the matrix so that
# a) it only contains rows/cols in the source vector
# b) those rows and cols appear in the same order
source_v = np.array(['d', 'b'])
target_v = np.array(['a', 'b', 'c', 'd'])

target_matrix = np.array([  [10, 11, 12, 13],
                            [20, 21, 22, 23],
                            [30, 31, 32, 33],
                            [40, 41, 42, 43]])

reorder_indices = []
for item in source_v:
    index = np.where(target_v == item)[0]
    reorder_indices.append(index[0])

print('indices needed in target matrix and in new order', reorder_indices, '\n')

reordered_matrix = target_matrix[reorder_indices, :]
reordered_matrix = reordered_matrix[:, reorder_indices]

# Print the result
print('\n', reordered_matrix)

indices needed in target matrix and in new order [3, 1] 


 [[43 41]
 [23 21]]


In [36]:
import numpy as np

# Original matrix
matrix = np.array([[10, 11, 12, 13],
                   [20, 21, 22, 23],
                   [30, 31, 32, 33],
                   [40, 41, 42, 43]])

# Indices to reorder the rows and columns
# current row 2 becomes first, row 0 comes second, row 1 comes last  
reorder_indices = [1, 3]

# Reorder rows
reordered_matrix = matrix[reorder_indices, :]
reordered_matrix = reordered_matrix[:, reorder_indices]

# Print the result
print('\n', reordered_matrix)


reorder_indices = [3, 1]

# Reorder rows
reordered_matrix = matrix[reorder_indices, :]
reordered_matrix = reordered_matrix[:, reorder_indices]

# Print the result
print('\n', reordered_matrix)



 [[21 23]
 [41 43]]

 [[43 41]
 [23 21]]
