### Overview


This notebook contains various scripts to assist with preparing distance matrices for comparison with the evolutionary distances <br>


## SETUP AND TEST

In [1]:
import duckdb
import time
import re
#
# TODO - SET THIS STRING TO WHERE YOU WANT THE DB TO STORE ITS DATA
#
# This is the path of the DuckDB database file; the cells below pass it to
# duckdb.connect(database=db_string) each time they open a connection.
#
db_string = "/Users/patrick/dev/ucl/comp0158_mscproject/database/w2v_20240731_test.db"


#### FIND UNIQUE EUKARYOTIC PFAM

In [None]:
# STEP 1 - GET A LIST OF PFAM TOKENS THAT ARE ONLY RELEVANT TO EUKARYOTIC PROTEINS BY JOINING ACROSS TABLES
#
# W2V_TOKEN was created from protein2ipr.dat and thus has loads of non-eukaryotic entries that are not relevant. 
# This script creates a list of pfam entries for eukaryotic proteins only by joining across tables:
#
# - W2V_PROTEIN_UREF100_E (has eukaryotic proteins in it and that is what was used to create the corpus)
# - W2V_TOKEN   (has come from protein2ipr.dat and has many more pfam entries)
#
# W2V_PROTEIN_UREF100_E only has eukaryotic proteins in it and that is what was used to create the corpus
#
# For performance reasons, this script outputs to a number of separate files in 'chunks' - I used 500k on a Mac


# output directory for each 'chunk of pfam ids'
# NOTE: this value is a path *prefix*, not a directory - get_eukaryotic_pfams
# appends the iteration number and ".txt" to it to build each chunk's file name.
output_file_root = "/Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/tmp/eukaryotic_pfam_smart_not_unique_"

def get_eukaryotic_pfams(start_pos, end_pos, iteration):
    """Write the PFAM tokens for one chunk of eukaryotic proteins to a text file.

    Selects the rows of W2V_PROTEIN_UREF100_E whose COUNTER lies in
    [start_pos, end_pos), inner-joins them to W2V_TOKEN on UNIPROT_ID, keeps
    only rows where W2V_TOKEN.TYPE = 'PFAM', and writes each TOKEN on its own
    line to ``output_file_root + str(iteration) + ".txt"``. Tokens are written
    as returned by the query (no de-duplication happens here).

    Args:
        start_pos: inclusive lower bound on COUNTER for this chunk.
        end_pos: exclusive upper bound on COUNTER for this chunk.
        iteration: chunk index; used in the output file name and in log lines.

    Returns:
        None. If the query raises, the error is printed and the function
        returns early, leaving an empty output file for that chunk.
    """
    s = time.time()

    output_file = output_file_root + str(iteration) + ".txt"

    # The `with` block and the `finally` clause guarantee that both the file
    # handle and the database connection are released on every exit path
    # (connect failure, query failure, write failure). A connection that is
    # left open keeps the database file locked for later cells.
    with open(output_file, "w") as of:
        con = duckdb.connect(database=db_string)
        try:
            try:
                results = con.execute(f"SELECT W2V_PROTEIN_UREF100_E.UNIPROT_ID, W2V_TOKEN.TOKEN FROM ( SELECT UNIPROT_ID FROM W2V_PROTEIN_UREF100_E WHERE COUNTER >= {start_pos} and COUNTER < {end_pos}) AS W2V_PROTEIN_UREF100_E INNER JOIN W2V_TOKEN AS W2V_TOKEN ON W2V_PROTEIN_UREF100_E.UNIPROT_ID = W2V_TOKEN.UNIPROT_ID WHERE W2V_TOKEN.TYPE = 'PFAM' ").fetchall()
            except Exception as e:
                # Best-effort: report the failed chunk and let the caller carry on.
                print(f"Error on iteration {iteration}, {e}, closing finr {output_file}")
                return
            e1 = time.time()

            # Each result row is (UNIPROT_ID, TOKEN); only the token is written.
            for res in results:
                of.write(res[1] + '\n')
            e2 = time.time()
        finally:
            con.close()

    print(f"iteration {iteration} from {start_pos} to {end_pos}. query took {e1-s}s, overall took {e2-s}s")

# Chunking parameters for the STEP 1 export.
num_eukaryotic = 95272305
chunk_size = 500000  # how many rows to return per chunk
iterations = (num_eukaryotic // chunk_size) + 1

# COUNTER window [start_pos, end_pos) of the first chunk.
start_pos = 0
end_pos = chunk_size

print(iterations, 'required.')

# Export one chunk per iteration, sliding the window forward by chunk_size.
for i in range(iterations):
    get_eukaryotic_pfams(start_pos, end_pos, i)
    start_pos, end_pos = end_pos, end_pos + chunk_size

In [None]:
# STEP 2 - COMBINE OUTPUTS FROM STEP 1

# Use the shell script : 'combine_pfam_list.sh' to combine the output from step 1 into a single list
#
# INPUT: Individual chunked files from step 1
# OUTPUT : A single file with only eukaryotic pfam entries 
#
# chunked files : /Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/tmp
# unique eukaryotic pfams: /Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/unique_eukaryotic_pfam.txt
#
# there is an interim file created by this script before the unique list is extracted

In [None]:
#
# Creating a list of pfam ids without the 'PF' - this will allow me to store them in a 
# numpy array. Also going to save to a database
#
def create_pfam_analysis_list(
    pfam_file="/Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/unique_eukaryotic_pfam.txt",
    output_file="/Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/unique_eukaryotic_pfam_ids.txt",
):
    """Convert a list of PFAM ids into a pipe-delimited ``counter|digits|id`` file.

    Each non-blank input line is stripped and written to ``output_file`` as
    ``<line number>|<digits after 'PF'>|<stripped line>``; the same text is
    also printed. Line numbers are 1-based positions in the input file, so a
    skipped blank line leaves a gap in the counter.

    NOTE: the same function is defined again in the STEP 3 cell further down
    this notebook; one of the two definitions can be removed.

    Args:
        pfam_file: path of the input file, one PFAM id per line.
        output_file: path of the file to (over)write.

    Raises:
        ValueError: if a non-blank line contains no ``PF`` marker.
    """
    # Open the input first so a missing input file does not truncate an
    # existing output file; `with` closes both handles on any exit path.
    with open(pfam_file, "r") as pfam_data, open(output_file, "w") as output:
        for line_number, line in enumerate(pfam_data, start=1):
            line = line.strip()
            if not line:
                # e.g. a trailing empty line - nothing to convert
                continue

            my_search = re.search("PF([0-9]*)", line)
            if my_search is None:
                raise ValueError(
                    f"line {line_number} of {pfam_file} has no PFAM id: {line!r}"
                )
            pf_root = my_search.group(1)

            buffer = "|".join([str(line_number), pf_root, line])
            print(buffer)
            output.write(buffer + '\n')

# Run the conversion now: reads unique_eukaryotic_pfam.txt and (over)writes
# unique_eukaryotic_pfam_ids.txt.
create_pfam_analysis_list()

In [None]:
# STEP 3 - Create output file with ids
#
# Creating a list of pfam ids without the 'PF' - this will allow me to store them in a 
# numpy array. Also going to save to a database
#
def create_pfam_analysis_list(
    pfam_file="/Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/unique_eukaryotic_pfam.txt",
    output_file="/Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/unique_eukaryotic_pfam_ids.txt",
):
    """Convert a list of PFAM ids into a pipe-delimited ``counter|digits|id`` file.

    Each non-blank input line is stripped and written to ``output_file`` as
    ``<line number>|<digits after 'PF'>|<stripped line>``; the same text is
    also printed. Line numbers are 1-based positions in the input file, so a
    skipped blank line leaves a gap in the counter.

    NOTE: this redefines a function of the same name from an earlier cell in
    this notebook; one of the two definitions can be removed.

    Args:
        pfam_file: path of the input file, one PFAM id per line.
        output_file: path of the file to (over)write.

    Raises:
        ValueError: if a non-blank line contains no ``PF`` marker.
    """
    # Open the input first so a missing input file does not truncate an
    # existing output file; `with` closes both handles on any exit path.
    with open(pfam_file, "r") as pfam_data, open(output_file, "w") as output:
        for line_number, line in enumerate(pfam_data, start=1):
            line = line.strip()
            if not line:
                # e.g. a trailing empty line - nothing to convert
                continue

            my_search = re.search("PF([0-9]*)", line)
            if my_search is None:
                raise ValueError(
                    f"line {line_number} of {pfam_file} has no PFAM id: {line!r}"
                )
            pf_root = my_search.group(1)

            buffer = "|".join([str(line_number), pf_root, line])
            print(buffer)
            output.write(buffer + '\n')

# Run the conversion now: reads unique_eukaryotic_pfam.txt and (over)writes
# unique_eukaryotic_pfam_ids.txt.
create_pfam_analysis_list()

In [25]:
#
# Creates a new table with only eukaryotic PFAM entries - these will be encoded for every model
#
# File produced by create_pfam_analysis_list: one "COUNTER|STRIPPED_PFAM_ID|PFAM_ID" row per line.
output_file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/unique_eukaryotic_pfam_ids.txt"

# Build the statement from output_file so the path is defined in one place only.
create_sql = (
    "CREATE TABLE W2V_PFAM_E AS SELECT * FROM read_csv_auto('" + output_file + "', "
    "columns={'COUNTER' :'USMALLINT', 'STRIPPED_PFAM_ID': 'USMALLINT', 'PFAM_ID': 'VARCHAR'})"
)

con = duckdb.connect(database=db_string)
try:
    # Fails if W2V_PFAM_E already exists; the finally clause still releases
    # the connection so the database file is not left locked.
    con.execute(create_sql)
finally:
    con.close()

In [3]:
# Sanity check: how many rows were loaded into W2V_PFAM_E.
con = duckdb.connect(database=db_string)
try:
    count = con.execute("SELECT COUNT(*) FROM W2V_PFAM_E").fetchall()
    print(count[0])
finally:
    # Always release the connection, even if the query fails.
    con.close()

(15577,)


In [5]:
# Show the rows of W2V_PFAM_E with COUNTER between 1 and 10.
con = duckdb.connect(database=db_string)
try:
    results = con.execute("SELECT COUNTER, PFAM_ID FROM W2V_PFAM_E WHERE COUNTER >0 AND COUNTER <= 10").fetchall()
    for res in results:
        print(res)
finally:
    # Always release the connection, even if the query fails.
    con.close()

(1, 'PF00002')
(2, 'PF00003')
(3, 'PF00004')
(4, 'PF00005')
(5, 'PF00006')
(6, 'PF00007')
(7, 'PF00008')
(8, 'PF00009')
(9, 'PF00010')
(10, 'PF00011')


#### Create table for list of EVO PFAM IDS

In [10]:
# Load evo_pfam_ids.dat into a new table W2V_EVO_PFAM (COUNTER, PFAM_ID).
con = duckdb.connect(database=db_string)
try:
    # Fails if W2V_EVO_PFAM already exists; the finally clause still releases
    # the connection so the database file is not left locked.
    con.execute("CREATE TABLE W2V_EVO_PFAM AS SELECT * FROM read_csv_auto('/Users/patrick/dev/ucl/comp0158_mscproject/code/evolutionary/evo_pfam_ids.dat', columns={'COUNTER' :'USMALLINT', 'PFAM_ID': 'VARCHAR'})")
finally:
    con.close()

In [12]:
# Sanity check: how many rows were loaded into W2V_EVO_PFAM.
con = duckdb.connect(database=db_string)
try:
    count = con.execute("SELECT COUNT(*) FROM W2V_EVO_PFAM").fetchall()
    print(count)
finally:
    # Always release the connection, even if the query fails.
    con.close()

[(20651,)]


## UTILITIES

#### Search for PFAM and PROTEIN ENTRIES

In [26]:
# test that W2V_TOKEN has all pfam and disorder entries
# 1445577   : Colletotrichum fioriniae PJ7
# 10116     : Rattus norvegicus
con = duckdb.connect(database=db_string)
try:
    # 1. Test - find a protein with pfam entries
    #    - Both of these work
    #protein_id = "A0A009GYB3" # this is prob not eukaryotic
    protein_id = "A0A010PZJ8"

    #tokens = con.execute("SELECT * FROM W2V_TOKEN WHERE UNIPROT_ID = 'A0A009GYB3'").fetchall()
    #tokens = con.execute("SELECT * FROM W2V_TOKEN WHERE UNIPROT_ID = (?)", [protein_id] ).fetchall()

    # 2. Find that same protein in W2V_PROTEIN
    # doesn't work - possibly because the pfam entries are from all proteins whereas W2V_PROTEIN only
    # has TrEMBL Eukaryotic proteins
    #tokens = con.execute("SELECT * FROM W2V_PROTEIN WHERE UNIPROT_ID = 'A0A009GYB3'").fetchall()
    tokens = con.execute("SELECT * FROM W2V_PROTEIN WHERE UNIPROT_ID = (?)", [protein_id]).fetchall()
    print('W2V_PROTEIN', tokens)

    # doesn't work
    #tokens = con.execute("SELECT * FROM W2V_TOKEN WHERE UNIPROT_ID = (?)", ['protein_id']).fetchall()

    # none of these work - is the protein A0A009GYB3 in UniRef??
    # tokens = con.execute("SELECT * FROM W2V_PROTEIN_UNIREF_100_ALL_TAX WHERE UNIPROT_ID = 'A0A009GYB3'").fetchall()
    tokens = con.execute("SELECT * FROM W2V_PROTEIN_UREF100_E WHERE UNIPROT_ID = (?)", [protein_id]).fetchall()
    # tokens = con.execute("SELECT * FROM W2V_PROTEIN_UNIREF_100_ALL_TAX WHERE UNIPROT_ID = (?)", [protein_id]).fetchall()
    # grep "A0A009GYB3" uniref100_tax_20240801.dat > returns nothing

    print('W2V_PROTEIN_UREF100_E', tokens)
finally:
    # Always release the connection, even if one of the lookups fails,
    # so the database file is not left locked.
    con.close()

W2V_PROTEIN [('A0A010PZJ8', 1, 494)]
W2V_PROTEIN_UREF100_E [('UniRef100', 'A0A010PZJ8', 493, 1, 494, 1, 1445577, 'Colletotrichum fioriniae PJ7')]


In [40]:
# test that W2V_TOKEN has all pfam and disorder entries
# 1445577   : Colletotrichum fioriniae PJ7
# 10116     : Rattus norvegicus
# Look up tax id 1445577 in the taxonomy category and taxonomy name tables.
con = duckdb.connect(database=db_string)
try:
    tokens = con.execute("SELECT * FROM W2V_TAX_CAT WHERE ID=(?)", ['1445577']).fetchall()
    print(tokens)
    tokens = con.execute("SELECT * FROM W2V_TAX_NAME WHERE TAX_ID=(?)", ['1445577']).fetchall()
    print(tokens)
finally:
    # Always release the connection, even if one of the lookups fails.
    con.close()

[('E', '710243', '1445577')]
[('1445577', 'Colletotrichum fioriniae PJ7')]


#### Drop Table

In [16]:
# Utility: drop a table. Replace X with the name of the table to remove.
con = duckdb.connect(database=db_string)
try:
    # Raises if table X does not exist; the finally clause makes sure the
    # connection is still closed so the database file is not left locked.
    con.execute("DROP TABLE X")
finally:
    con.close()

In [7]:
# Utility: list the tables in the database.
con = duckdb.connect(database=db_string)
try:
    tables = con.execute("SHOW TABLES").fetchall()
    print(tables)
finally:
    # Always release the connection, even if the query fails.
    con.close()

[('W2V_PFAM_E',), ('W2V_PROTEIN',), ('W2V_PROTEIN_UNIREF_100_ALL_TAX',), ('W2V_PROTEIN_UREF100_E',), ('W2V_TAX_CAT',), ('W2V_TAX_NAME',), ('W2V_TOKEN',)]


In [8]:
# Utility: show the column definitions of W2V_PFAM_E.
con = duckdb.connect(database=db_string)
try:
    tables = con.execute("DESCRIBE W2V_PFAM_E").fetchall()
    print(tables)
finally:
    # Always release the connection, even if the query fails.
    con.close()

[('COUNTER', 'USMALLINT', 'YES', None, None, None), ('STRIPPED_PFAM_ID', 'USMALLINT', 'YES', None, None, None), ('PFAM_ID', 'VARCHAR', 'YES', None, None, None)]


#### Unlock database

In [21]:
import duckdb
import os

# this doesn't seem to work....
def is_locked():
    """Return True if a file named ``<db_string>.lock`` exists on disk.

    NOTE(review): the accompanying notes say this check does not seem to
    work. Presumably DuckDB does not create such a ``.lock`` sidecar file,
    which would make this always return False - confirm.
    """
    candidate = f'{db_string}.lock'
    found = os.path.exists(candidate)
    return found

is_locked()

# ... but this does from a command prompt
#fuser database/proteins.db
#
# (shell command, not Python - run it in a terminal, not in this cell)
# fuser /Users/patrick/dev/ucl/comp0158_mscproject/database/w2v_20240731_test.db
#
# then kill -9 <id if there is one list>

False

: 