In [31]:
import gensim
import logging
import itertools
import math
import glob
import numpy as np
from gensim.models import Word2Vec
import duckdb
import re
import os

In [8]:
db_string = "/Users/patrick/dev/ucl/comp0158_mscproject/database/w2v_20240731_test.db"

In [69]:
#
# Calcualtes distances between paris of pfam ids
# Assumes pfam ids are in a database - but can esily be changed to work with a list
# Outputs a dat file containin al distances:
#
# pfam_from | pfam_to | model_id |distance
#
def calculate_distances(model, output_file, output_folder):
    
    model_id_search  = re.search("\/(w2v_.*)\.model", model)
    model_id         = model_id_search.group(1)
    #print('Currrent model id for distance calculation:', model_id)
    
    if output_file is None:
        output_file = output_folder+model_id+'_distances_e_.dat'
    
    # open a file to append output to
    output      = open(output_file, "a")
    
    # get pfam ids - could make this query more complex at some point
    pfam_ids = []
    con         = duckdb.connect(database=db_string)           
    results     = con.execute("SELECT COUNTER, PFAM_ID, STRIPPED_PFAM_ID FROM W2V_PFAM_E ORDER BY COUNTER ").fetchall()
    con.close()
    for res in results:
        pfam_ids.append(res[1])
    
    # load the model
    #print('loading model:', model)
    model = Word2Vec.load(model)

    # calculate distances
    num_entries = len(pfam_ids)
    print(f"Calculating distances for {num_entries} pfam ids under model {model_id} output to {output_file}")
    error_count = 0
    success_count = 0
    for i in range(num_entries):
        for j in range(i+1, num_entries):
            pfam_1 = pfam_ids[i]
            pfam_2 = pfam_ids[j]
            try:
                v1 = model.wv[pfam_1]
                v2 = model.wv[pfam_2]
                distance = np.linalg.norm(v1 - v2)
                dist_line = "|".join([pfam_1, pfam_2, model_id, str(distance)])
                #print(dist_line)
                output.write(dist_line + '\n')
                success_count +=1
            except Exception as e:
                #print(f"Exception calculating {pfam_1} to {pfam_2} {e}")
                error_count +=1
                continue
    # close the output file
    print(f"Distances complete for model {model_id}. success: {success_count} fail: {error_count}")
    output.close()  

  model_id_search  = re.search("\/(w2v_.*)\.model", model)


In [70]:
#
# parses a directory to build up sentences to create a model
#

models_dir    = "/Users/patrick/dev/ucl/comp0158_mscproject/models/"
output_folder = "/Users/patrick/dev/ucl/comp0158_mscproject/data/analysis/"
output_file   = "/Users/patrick/dev/ucl/comp0158_mscproject/data/analysis/distances_all_models_20240806.dat"

def calc_dist_all_models(models_dir, output_file):
    # find the files in the target directory
    #print('Searching for corpi files in:', corpus_dir)
    file_list = glob.glob(os.path.join(models_dir, '*.model'))
    
    # initialise
    s = time.time()

    # parse each corpus file to build up the sentences
    for file_path in file_list:
        #with open(file_path, 'r') as file:
        print(f'calculating distances for model: {file_path}')
        calculate_distances(file_path, None, output_folder)
            
    # time check
    e = time.time()
    print(f"Overall distance time {e - s}" )
    
calc_dist_all_models(models_dir, output_file)

calculating distances for model: /Users/patrick/dev/ucl/comp0158_mscproject/models/w2v_20240806_vs20_w5.model
Calculating distances for 15577 pfam ids under model w2v_20240806_vs20_w5 output to /Users/patrick/dev/ucl/comp0158_mscproject/data/analysis/w2v_20240806_vs20_w5_distances_e_.dat
Distances complete for model w2v_20240806_vs20_w5. success: 83288871 fail: 38024805
calculating distances for model: /Users/patrick/dev/ucl/comp0158_mscproject/models/w2v_20240806_vs5_w15.model
Calculating distances for 15577 pfam ids under model w2v_20240806_vs5_w15 output to /Users/patrick/dev/ucl/comp0158_mscproject/data/analysis/w2v_20240806_vs5_w15_distances_e_.dat
Distances complete for model w2v_20240806_vs5_w15. success: 83288871 fail: 38024805
calculating distances for model: /Users/patrick/dev/ucl/comp0158_mscproject/models/w2v_20240806_vs15_w20.model
Calculating distances for 15577 pfam ids under model w2v_20240806_vs15_w20 output to /Users/patrick/dev/ucl/comp0158_mscproject/data/analysis/w

In [59]:
model = Word2Vec.load("/Users/patrick/dev/ucl/comp0158_mscproject/models/test/w2v_20240806_vs5_w5.model")

models_dir    = "/Users/patrick/dev/ucl/comp0158_mscproject/models/"
file_list = glob.glob(os.path.join(models_dir, '*.model'))

# parse each corpus file to build up the sentences
for file_path in file_list:
    with open(file_path, 'r') as file:
        print(f'calculating distances for model: {file_path}')
        model = Word2Vec.load(file_path)
        try:
            print(model.wv['PF19687'])
        except Exception as e:
            print(f"Not present in {file_path}, e")
            continue
        


calculating distances for model: /Users/patrick/dev/ucl/comp0158_mscproject/models/w2v_20240806_vs20_w5.model
[ 2.9066389   3.5860233   4.6226587  -0.963702    2.6453066  -1.3669956
  6.3653183  -0.42857066  1.3111105   0.47835028  0.13663353 -0.29985735
 -1.3757349  -5.241645    0.57515144  5.6618195  -1.5268751   2.5389545
  2.0167305  -0.819868  ]
calculating distances for model: /Users/patrick/dev/ucl/comp0158_mscproject/models/w2v_20240806_vs5_w15.model
[  0.85574126   5.801419   -10.04899     -1.3114487  -12.985388  ]
calculating distances for model: /Users/patrick/dev/ucl/comp0158_mscproject/models/w2v_20240806_vs15_w20.model
[ 1.9627786   3.7906158  -3.2945726  -3.7155714  -2.663767   -0.93325514
  2.9249983   2.4891574   2.440626    2.0488322   4.792761   -4.138822
 -0.479754   -0.2977415  -1.2628495 ]
calculating distances for model: /Users/patrick/dev/ucl/comp0158_mscproject/models/w2v_20240805_vs10_w10.model
[-2.2945354  -5.3206587   1.2935404  -2.6940527   3.2355013   7.56