In [1]:
import gensim
import logging
import itertools
import math
import glob
import numpy as np
from gensim.models import Word2Vec
import duckdb
import re
import os

In [2]:
db_string = "/Users/patrick/dev/ucl/comp0158_mscproject/database/w2v_20240731_test.db"

#### Create NUMPY Distance Matrix

In [3]:
# Takes a model and loops through a list of pfam ids and outputs a npy file

def calculate_distances_numpy(model, output_folder):
    
    model_id_search  = re.search("\/(w2v_.*)\.model", model)
    model_id         = model_id_search.group(1)
    
    # get pfam ids - could make this query more complex at some point
    pfam_ids = []
    con         = duckdb.connect(database=db_string)           
    
    missing_pfams_file = output_folder+model_id+"_missing_pfams.txt"
    of = open(missing_pfams_file, "w")
    
    #
    # First try with eukaryotic thatI have extracted
    #
    results     = con.execute("SELECT COUNTER, PFAM_ID FROM W2V_PFAM_E ORDER BY COUNTER ").fetchall()
    
    con.close()
    for res in results:
        pfam_ids.append(res[1])
    
    # load the model
    model = Word2Vec.load(model)

    # calculate matrix size and initialise
    num_entries = len(pfam_ids)
    distance_matrix = np.zeros((num_entries, num_entries))
    
    # create empty distance matrix
    print(f"Calculating distances for {num_entries} pfam ids under model {model_id}")
    error_count = 0
    success_count = 0
    for i in range(num_entries):
        for j in range(i+1, num_entries):
            pfam_1 = pfam_ids[i]
            pfam_2 = pfam_ids[j]
            try:
                v1 = model.wv[pfam_1]
                v2 = model.wv[pfam_2]
                distance = np.linalg.norm(v1 - v2)
                distance_matrix[i][j] = distance
                success_count +=1
            except Exception as e: # a bit convoluted, but want to print out the missing pfam
                #print(f"Exception calculating {pfam_1} to {pfam_2} : {e.args[0]}")
                missing = re.search("Key '(.*)' not", e.args[0] )
                #print(missing.group(1))
                of.write(missing.group(1) + '\n')
                error_count +=1
                continue
    # close the output file
    of.close()
    output_name = output_folder+model_id+"_dist"
    np.save(output_name, distance_matrix)
    print(f"Distances complete for model {model_id}. success: {success_count} fail: {error_count} output: {output_name}.npy")


  model_id_search  = re.search("\/(w2v_.*)\.model", model)


In [4]:
    
model           = "/Users/patrick/dev/ucl/comp0158_mscproject/data/models/w2v_20240810_v5_w2.model"
output_folder   = "/Users/patrick/dev/ucl/comp0158_mscproject/data/analysis/"

calculate_distances_numpy(model, output_folder)

Calculating distances for 15577 pfam ids under model w2v_20240810_v5_w2


  model_id_search  = re.search("\/(w2v_.*)\.model", model)


KeyboardInterrupt: 

## Scratchpad area

In [8]:
# check if a pfam entry is in a model
model = Word2Vec.load("/Users/patrick/dev/ucl/comp0158_mscproject/data/models/w2v_20240810_v5_w5.model")

#pfam_id = 'PF19687'     # present in model
#pfam_id = 'PF01257'    # not present in model - but shouldn't be
#pfam_id = 'PF00424'     # not present in model - but should be
pfam_id = 'PF14033'     # not present in model - but should be

try:
    print(model.wv[pfam_id])
except Exception as e:
    print(e)

[-10.612822   -0.6756388   4.104623    0.3811658   0.3067643]


In [11]:
from gensim import models

model_file   = Word2Vec.load("/Users/patrick/dev/ucl/comp0158_mscproject/data/models/w2v_20240809_v5_w5.model")

pfam_id = 'PF19687'

try:
    #my_model = Word2Vec.load(model_file)
    
    my_model = models.Word2Vec.load_word2vec_format(model_file, binary=True)

    print(my_model.wv[pfam_id])
except Exception as e:
    print(e)


type object 'Word2Vec' has no attribute 'load_word2vec_format'
