In [37]:
import gensim
import logging
import itertools
import math
import glob
import numpy as np
from gensim.models import Word2Vec
import duckdb
import re
import os
import time

In [2]:
db_string = "/Users/patrick/dev/ucl/comp0158_mscproject/database/w2v_20240731_test.db"

#### Create NUMPY Distance Matrix

In [3]:
# Takes a model and loops through a list of pfam ids and outputs a npy file

def calculate_distances_numpy(model, output_folder):
    
    model_id_search  = re.search("\/(w2v_.*)\.model", model)
    model_id         = model_id_search.group(1)
    
    # get pfam ids - could make this query more complex at some point
    pfam_ids = []
    con         = duckdb.connect(database=db_string)           
    
    missing_pfams_file = output_folder+model_id+"_missing_pfams.txt"
    of = open(missing_pfams_file, "w")
    
    #
    # First try with eukaryotic thatI have extracted
    #
    results     = con.execute("SELECT COUNTER, PFAM_ID FROM W2V_PFAM_E ORDER BY COUNTER ").fetchall()
    
    con.close()
    for res in results:
        pfam_ids.append(res[1])
    
    # load the model
    model = Word2Vec.load(model)

    # calculate matrix size and initialise
    num_entries = len(pfam_ids)
    distance_matrix = np.zeros((num_entries, num_entries))
    
    # create empty distance matrix
    print(f"Calculating distances for {num_entries} pfam ids under model {model_id}")
    error_count = 0
    success_count = 0
    for i in range(num_entries):
        for j in range(i+1, num_entries):
            pfam_1 = pfam_ids[i]
            pfam_2 = pfam_ids[j]
            try:
                v1 = model.wv[pfam_1]
                v2 = model.wv[pfam_2]
                distance = np.linalg.norm(v1 - v2)
                distance_matrix[i][j] = distance
                success_count +=1
            except Exception as e: # a bit convoluted, but want to print out the missing pfam
                #print(f"Exception calculating {pfam_1} to {pfam_2} : {e.args[0]}")
                missing = re.search("Key '(.*)' not", e.args[0] )
                #print(missing.group(1))
                of.write(missing.group(1) + '\n')
                error_count +=1
                continue
    # close the output file
    of.close()
    output_name = output_folder+model_id+"_dist"
    np.save(output_name, distance_matrix)
    print(f"Distances complete for model {model_id}. success: {success_count} fail: {error_count} output: {output_name}.npy")


  model_id_search  = re.search("\/(w2v_.*)\.model", model)


In [None]:
model           = "/Users/patrick/dev/ucl/comp0158_mscproject/data/models/w2v_20240810_v5_w2.model"
output_folder   = "/Users/patrick/dev/ucl/comp0158_mscproject/data/analysis/"
calculate_distances_numpy(model, output_folder)

### New

In [35]:
def encode_pfams(model_dir, model_name, vocab_dir):
    vocab_file = vocab_dir+model_name+'_vocab.txt'
    
    # get pfam ids from the models vocab
    #print(f"Encoding vocab file: {vocab_file}")
    model = Word2Vec.load(model_dir+model_name+'.model')
    pfam_ids = []
    with open(vocab_file, 'r') as vf:
        for line in vf:
            line = line.rstrip()
            line =  line.lstrip()
            if line.startswith('PF'):
                pfam_ids.append(line)
                #encoding = model.wv[line]
                #print(f"Encoding :{line}: {encoding}")
    #print(pfam_ids)
    vf.close()
    
    # calculate matrix size and initialise
    num_entries = len(pfam_ids)
    distance_matrix = np.zeros((num_entries, num_entries))
    
    # create empty distance matrix
    #print(f"Calculating distances for {num_entries} pfam ids under model {model_name}")
    error_count = 0
    success_count = 0
    s = time.time()
    for i in range(num_entries):
        for j in range(i+1, num_entries):
            pfam_1 = pfam_ids[i]
            pfam_2 = pfam_ids[j]
            try:
                v1 = model.wv[pfam_1]
                v2 = model.wv[pfam_2]
                distance = np.linalg.norm(v1 - v2)
                distance_matrix[i][j] = distance
                success_count +=1
            except Exception as e: # a bit convoluted, but want to print out the missing pfam
                #print(f"Exception calculating {pfam_1} to {pfam_2} : {e.args[0]}")
                missing = re.search("Key '(.*)' not", e.args[0] )
                print(missing.group(1))
                #of.write(missing.group(1) + '\n')
                error_count +=1
                continue
    # close the output file
    #of.close()
    
    output_name = vocab_dir+model_name+"_dist"
    np.save(output_name, distance_matrix)
    e = time.time()
    print(f"distance matrix computed for model: {model_name}. num words: {num_entries}. time: {round(e-s,2)}s. success: {success_count} fail: {error_count} output: {output_name}.npy")
    

In [None]:
# get vocab for one model
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

# get vocab for a particular file
model_name = "w2v_20240811_v5_w5_mc3"
#encode_pfams(model_dir, model_name, vocab_dir)

In [38]:
# get distances for all models
vocab_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/"
model_dir="/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/"

file_list = glob.glob(os.path.join(model_dir, '*.model'))
for file_path in file_list:
    model_name_s  = re.search("(w2v_.*)\.model", file_path)
    model_name         = model_name_s.group(1)
    encode_pfams(model_dir, model_name, vocab_dir)


  model_name_s  = re.search("(w2v_.*)\.model", file_path)


distance matrix computed for model: w2v_20240811_v5_w10_mc5. num words: 12802. time: 438.42s. success: 81939201 fail: 0 output: /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/w2v_20240811_v5_w10_mc5_dist.npy
distance matrix computed for model: w2v_20240811_v5_w10_mc3. num words: 13529. time: 489.65s. success: 91510156 fail: 0 output: /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/w2v_20240811_v5_w10_mc3_dist.npy
distance matrix computed for model: w2v_20240811_v10_w5_mc3. num words: 13529. time: 491.76s. success: 91510156 fail: 0 output: /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/w2v_20240811_v10_w5_mc3_dist.npy
distance matrix computed for model: w2v_20240811_v10_w10_mc5. num words: 12802. time: 435.7s. success: 81939201 fail: 0 output: /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/models/vocab/w2v_20240811_v10_w10_mc5_dist.npy
distance matrix computed for model: w2v_20240811_v5_w5_mc5. num wor