In [1]:
import re
import numpy as np
import duckdb
import requests

# Path of the DuckDB database file; passed to duckdb.connect() by the clan lookup functions below.
# NOTE(review): this is an absolute path on one machine - consider making it configurable.
db_string = "/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/database/w2v_20240731_test.db"

In [3]:
#
# Query interpro for clan id associated with a pfam id. returns 'undef' if there is no match
#
def get_interpro_clan(pfam_id):
    """Query the InterPro REST API for the clan accession of a pfam entry.

    Args:
        pfam_id: pfam accession, e.g. 'PF00001'. It is inserted into the
            request URL as-is.

    Returns:
        The clan accession (e.g. 'CL0192') found in the entry's ``set_info``
        object, 'undef' if the response contains no such object, or
        'undef_err' if the request failed or returned a non-200 status.
    """
    url = f"https://www.ebi.ac.uk/interpro/api/entry/pfam/{pfam_id}/"

    try:
        # The timeout stops a single unresponsive request from hanging a batch run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to retrieve data for {pfam_id}. Request error: {e}")
        return 'undef_err'

    if response.status_code != 200:
        print(f"Failed to retrieve data for {pfam_id}. Status code: {response.status_code}")
        return 'undef_err'

    # The response is expected to contain: set_info":{"accession":"CL0192","name":"GPCR_A"}
    # Raw string avoids the invalid "\{" escape. "[^}]*?" keeps the match inside
    # the set_info object, so a clan id appearing later in the payload is never
    # picked up by mistake.
    token_match = re.search(r'set_info":\{"acc[^}]*?(CL[0-9]+)"', response.text)
    if token_match:
        return token_match.group(1)
    return 'undef'

  token_match = re.search("set_info\":\{\"acc.*(CL[0-9]+)\"", content)


In [16]:
#
# Look up the clan id for a single pfam id in the W2V_RAND_REP_CLAN table
#
def get_clan_for_rand_rep_pfams(pfam_id):
    """Look up the clan id for one pfam id in the W2V_RAND_REP_CLAN table.

    Opens its own connection to the database at ``db_string`` and always
    closes it before returning.

    Args:
        pfam_id: pfam accession to look up (a single id, not a list).

    Returns:
        The CLAN_ID of the first matching row, or None if there is no row
        for ``pfam_id`` or the query fails (a message is printed in both cases).
    """
    con = duckdb.connect(database=db_string)

    try:
        # Bind the id as a parameter rather than formatting it into the SQL text.
        results = con.execute(
            "SELECT CLAN_ID FROM W2V_RAND_REP_CLAN WHERE PFAM_ID = ?",
            [str(pfam_id)],
        ).fetchall()

        if not results:
            print('get_clan_for_rand_rep_pfams() error',
                  f"No clan entry for {pfam_id} - add the entry to the DB table W2V_RAND_REP_CLAN")
            return None

        return results[0][0]
    except Exception as e:
        print('get_clan_for_rand_rep_pfams() error', e)
        return None
    finally:
        # Single close point for the success, not-found and error paths.
        con.close()
    
#
# Look up clans for a list of pfam ids: W2V_PFAM_CLAN_MC1 first, falling back to W2V_RAND_REP_CLAN
#
def get_clans_for_pfams(pfam_ids):
    """Resolve a clan id for each pfam id in ``pfam_ids``.

    Each id is looked up in W2V_PFAM_CLAN_MC1 first; if that table has no row,
    get_clan_for_rand_rep_pfams() is used as a fallback. Ids whose clan is
    still undefined ('undef', or None when the fallback lookup fails) are
    reported and skipped, so the returned list can be shorter than
    ``pfam_ids`` and is not index-aligned with it.

    Args:
        pfam_ids: iterable of pfam accessions.

    Returns:
        List of clan ids in input order (undefined ones omitted), or None if
        a database error interrupts the loop (a message is printed).
    """
    con = duckdb.connect(database=db_string)
    clans = []

    try:
        # loop through each pfam, find its clan and build up the list of clans
        for i, pfam_id in enumerate(pfam_ids):
            # Bind the id as a parameter rather than formatting it into the SQL text.
            results = con.execute(
                "SELECT CLAN_ID FROM W2V_PFAM_CLAN_MC1 WHERE PFAM_ID = ?",
                [str(pfam_id)],
            ).fetchall()

            if results:
                clans.append(results[0][0])
                continue

            # Not in the main table: fall back to the rand_rep clan table.
            clan_id = get_clan_for_rand_rep_pfams(pfam_id)
            if clan_id is None or clan_id == 'undef':
                # None means the fallback lookup itself failed; never store it as a clan.
                print(f"item {i} for {pfam_id} undefined clan")
            else:
                clans.append(clan_id)
    except Exception as e:
        print('get_clans_for_pfams() error', e)
        return None
    finally:
        # Single close point for both the success and error paths.
        con.close()

    return clans


# vectors in evo matrix have format K1SVA3.1/50-86|PF02829
# need to extract these
#
def extract_evo_pfam_ids(evo_vector):
    """Extract the pfam id from each entry of an evo vocab vector.

    Entries have the format 'K1SVA3.1/50-86|PF02829'; everything from the
    first '|PF' onwards (excluding the '|') is taken as the pfam id.

    Args:
        evo_vector: iterable of strings in the format above.

    Returns:
        List of extracted pfam ids in input order. Entries without a '|PF'
        marker are reported with a printed message and skipped.
    """
    # Raw string: "\|" in a normal string literal is an invalid escape sequence.
    # Compiled once here instead of on every loop iteration.
    pfam_pattern = re.compile(r"\|(PF.*)")

    pfam_ids = []
    for item in evo_vector:
        pfam_search = pfam_pattern.search(item)
        if pfam_search is None:
            print(f"No pfam found for {item}")
            continue
        pfam_ids.append(pfam_search.group(1))

    return pfam_ids

#
# extract vocab vector and matrix from npy file
#                
def extract_evo_matrix_vector_files(npy_file_name):
    """Load the distance matrix and the vocab vector from one .npy file.

    Two arrays are read back to back from the same file: the first is
    treated as the normalised distance matrix, the second as the vocab vector.

    Args:
        npy_file_name: path of the .npy file to read.

    Returns:
        Tuple ``(vocab_vector, dist_matrix_norm)`` - note the order is the
        reverse of the order in which the arrays are stored.
    """
    # The context manager closes the file even if either load fails.
    with open(npy_file_name, 'rb') as npy_f:
        dist_matrix_norm = np.load(npy_f)  # first array in the file
        vocab_vector = np.load(npy_f)      # second array in the file

    return vocab_vector, dist_matrix_norm

#
# returns the list of pfam ids extracted from the evo vector, plus the distance matrix
#
def get_rand_rep_from_npy(npy_file_name):
    """Load an evo .npy file and return its pfam ids together with its matrix.

    Args:
        npy_file_name: path of the .npy file to read.

    Returns:
        Tuple ``(evo_pfam_ids, matrix)``: the pfam ids extracted from the
        file's vocab vector, and the matrix loaded from the same file.
    """
    print(f"\n- Extracting from {npy_file_name}")

    # Load both arrays, then reduce the vocab entries to bare pfam ids.
    vocab, distances = extract_evo_matrix_vector_files(npy_file_name)
    evo_pfam_ids = extract_evo_pfam_ids(vocab)

    print(f"- Extracted {len(evo_pfam_ids)} pfams from {npy_file_name}\n")
    return evo_pfam_ids, distances



  pfam_search  = re.search("\|(PF.*)", item)


In [17]:
#
# get list of pfams and related clans from evo
#
# NOTE(review): absolute path on one machine - consider making it configurable.
rand_rep_file = '/Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/distances/evo/rand_rep_distance_matrix.npy'
# pfam ids extracted from the file's vocab vector, plus the matrix stored alongside it
rand_rep_pfams, rand_rep_matrix = get_rand_rep_from_npy(rand_rep_file)
# pfams with an undefined clan are skipped, so this list can be shorter than rand_rep_pfams;
# get_clans_for_pfams() returns None on a database error
rand_rep_clans = get_clans_for_pfams(rand_rep_pfams)

# sanity check of the sizes: number of pfams, number of clans, matrix shape
# (len() raises TypeError here if rand_rep_clans is None)
print(f"{len(rand_rep_pfams)}, {len(rand_rep_clans)}, {rand_rep_matrix.shape}")


- Extracting from /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/distances/evo/rand_rep_distance_matrix.npy
- Extracted 20651 pfams from /Users/patrick/dev/ucl/word2vec/COMP_0158_MSC_PROJECT/data/distances/evo/rand_rep_distance_matrix.npy

item 13 for PF16620 undefined clan
item 36 for PF17644 undefined clan
item 56 for PF18671 undefined clan
item 101 for PF21585 undefined clan
item 182 for PF14156 undefined clan
item 196 for PF09819 undefined clan
item 236 for PF16873 undefined clan
item 238 for PF14253 undefined clan
item 240 for PF18863 undefined clan
item 241 for PF18865 undefined clan
item 242 for PF18864 undefined clan
item 251 for PF18277 undefined clan
item 252 for PF07280 undefined clan
item 262 for PF05528 undefined clan
item 267 for PF16671 undefined clan
item 282 for PF03421 undefined clan
item 301 for PF05054 undefined clan
item 302 for PF07138 undefined clan
item 303 for PF06648 undefined clan
item 304 for PF05815 undefined clan
item 305 for PF06856 undefined

: 