In [None]:
import argparse
from Bio import SeqIO
import re
import csv
import duckdb

# internal representation
from protein_metadata import ProteinSentence
from protein_metadata import ProteinWord
from protein_db import ProteinDB

# Input data locations (plain path strings, not open file handles)
protein_fasta_file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_10M.fasta"
interpro_file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat"

# Project database helper; used further down to create tables and indices
pdb = ProteinDB()

#### grep

In [None]:
# InterPro input read by grep_interpro. Alternatives that can be swapped in:
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat"
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr_pfam.dat"      # protein2ipr with only pfam entries
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr_pfam_20M.dat"  # protein2ipr with only pfam entries - 20M lines only
# Active choice: protein2ipr with only pfam entries, for a single protein
file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/A8KBH6_ipr_pfam.dat"

#
# greps for an id in interpro dat file
# For A8KBH6, this will match A0A01A8KBH6 and A8KBH6
# Time to parse the full protein2ipr.dat for A8KBH6 : 22min 56s
#
def grep_interpro(id, path=None):
    """Print every line of an InterPro .dat file whose leading accession matches `id`.

    A line matches when it starts with any run of the characters 'A' and
    '0'-'9' followed by `id`, so searching for 'A8KBH6' reports both 'A8KBH6'
    and 'A0A01A8KBH6'.

    Args:
        id: Accession text to look for. It is matched literally. (The name
            shadows the builtin `id`; it is kept so existing calls still work.)
        path: File to scan. Defaults to the notebook-level `file` variable.
    """
    if path is None:
        path = file

    # Build the pattern once, not once per line, and escape the id so that any
    # regex metacharacters in it are not interpreted as pattern syntax.
    pattern = re.compile("^[A0A0-9]*" + re.escape(id))

    with open(path, 'r') as input_file:
        for line in input_file:
            if pattern.search(line):
                print('Matched:', id, 'in line:', line.strip())
                
# Search for the short accession and for a longer accession that ends with it
for accession in ('A8KBH6', 'A0A1A8KBH6'):
    grep_interpro(accession)

# DATA ABSTRACTION AND LOADING INTO DB


## 1. TAXONOMY

#### Taxonomy names

In [None]:
import csv

# Note that the default names.dmp file is not formatted very well and didn't parse easily
# I had to use Perl to reformat it (perl_taxonomy.pl) and then imported the result here
# I could probably just as easily have used python
#
# 4,057,294 lines read into DB in 50mins 
#
file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/taxonomy/names_fixed.dmp"

con = duckdb.connect(database=ProteinDB.db_string)
try:
    #con.execute("DROP TABLE TAX_NAME")

    # CREATE TABLE raises if TAX_NAME already exists, which also stops a rerun
    # of this cell from inserting every row a second time.
    con.execute(" \
    CREATE TABLE TAX_NAME (\
                TAX_ID VARCHAR,\
                NAME VARCHAR,\
                UNIQUE_NAME VARCHAR,\
                NAME_CLASS VARCHAR)" \
                )

    with open(file, mode='r') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='|')
        line_count = 0
        skipped_count = 0
        for row in csv_reader:
            # Blank or truncated lines do not have the four expected fields
            if len(row) < 4:
                skipped_count += 1
                continue

            # `tax_id` rather than `id`: a module-level `id` would shadow the
            # builtin for every later cell in the notebook.
            tax_id, name, unique_name, name_class = row[:4]

            con.execute("INSERT INTO TAX_NAME (TAX_ID, NAME, UNIQUE_NAME, NAME_CLASS) VALUES(?,?,?,?)", (tax_id, name, unique_name, name_class))

            line_count += 1
        print(f'Processed {line_count} lines.')
        if skipped_count:
            print(f'Skipped {skipped_count} lines with fewer than 4 fields.')
finally:
    # Close the connection even if the file is missing or an insert fails
    con.close()

In [None]:
# Add the TAX_ID index after the data load
tax_name_index_sql = "CREATE INDEX tn_txid_idx ON TAX_NAME(TAX_ID)"
con = duckdb.connect(database=ProteinDB.db_string)
con.execute(tax_name_index_sql)
con.close()

In [None]:
# check data: row count plus a small sample of rows
con = duckdb.connect(database=ProteinDB.db_string)
try:
    count = con.execute("SELECT COUNT(*) FROM TAX_NAME").fetchall()
    print('Number of rows', count)
    # LIMIT keeps the whole table from being pulled into memory just to show
    # 10 rows; iterating the result also copes with fewer than 10 rows.
    res = con.execute("SELECT * FROM TAX_NAME LIMIT 10").fetchall()
    for row in res:
        print(row)
finally:
    con.close()

### Taxonomy categories

In [None]:
# drop table
# The DROP statement is disabled, so as written this cell only opens and
# closes a connection. Uncomment it to remove TAX_CAT before recreating it.
con = duckdb.connect(database=ProteinDB.db_string)
#con.execute("DROP TABLE TAX_CAT")
con.close()

In [None]:
# create table
# TAX_CAT has three text columns (TYPE, SPECIES_ID, CAT_ID).
# The statement raises if the table already exists.
con = duckdb.connect(database=ProteinDB.db_string)
con.execute(" \
    CREATE TABLE TAX_CAT (\
                TYPE VARCHAR,\
                SPECIES_ID VARCHAR,\
                CAT_ID VARCHAR)" \
                )
con.close()

In [None]:
# load categories
import csv

file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/taxonomy/categories.dmp"

con = duckdb.connect(database=ProteinDB.db_string)
try:
    with open(file, mode='r') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\t')
        line_count = 0
        skipped_count = 0
        for row in csv_reader:
            # Blank or truncated lines do not have the three expected fields
            if len(row) < 3:
                skipped_count += 1
                continue

            # `cat_type` rather than `type`: a module-level `type` would shadow
            # the builtin for every later cell in the notebook.
            cat_type, species, cat = row[:3]

            con.execute("INSERT INTO TAX_CAT (TYPE, SPECIES_ID, CAT_ID) VALUES(?,?,?)", (cat_type, species, cat))

            line_count += 1
        print(f'Processed {line_count} lines.')
        if skipped_count:
            print(f'Skipped {skipped_count} lines with fewer than 3 fields.')
finally:
    # Single close, reached on success and on failure
    con.close()

# This appears to work but doesn't for some reason
#con.execute(f'''
#    COPY TAX_CAT FROM '{file}' (AUTO_DETECT TRUE)
#''')

In [None]:
# Index TAX_CAT on CAT_ID
tax_cat_index_sql = "CREATE INDEX TCAT_CATID_IDX ON TAX_CAT(CAT_ID)"
con = duckdb.connect(database=ProteinDB.db_string)
con.execute(tax_cat_index_sql)
con.close()

In [None]:
# Count the TAX_CAT rows whose TYPE is 'E'
con = duckdb.connect(database=ProteinDB.db_string)
try:
    count = con.execute("SELECT COUNT(*) FROM TAX_CAT WHERE TYPE='E'").fetchall()
    #result = con.execute("SELECT * FROM TAX_CAT where SPECIES_ID='2759'").fetchall()
    print(count)
    #print(result)
    #for i in range(10):
    #    print(result[i])
finally:
    # The original left this connection open (its close() was commented out)
    con.close()

## 2. UNIREF100 FASTA

In [None]:

# Creates the PROTEIN table through the ProteinDB helper.
# `con` is only used by the disabled DROP statements below; with those
# commented out, the connection is simply opened and closed around the helper call.
con = duckdb.connect(database=ProteinDB.db_string)
#con.execute("DROP TABLE PROTEIN")
#con.execute("DROP INDEX PROT_ID_X PROTEIN")
pdb.create_protein_table()
con.close()


#### Read proteins into db

In [None]:
#
# Reads fasta files into db
# 100k lines takes : 2min
# 1M : 20min
# 10M : 3hrs 20min ? (started 13:44)
# 
# UniRef100.fasta has 1,355,591,115 lines for which there are 
# 12,814,583 entries created in 342m
# without an index, a query on select * followed by selecting a particular item took 55.9s
# index took 5.5s to apply
# same query after index took 
#
def parse_fasta_to_db(dom_type):
    """Load UniRef100 FASTA records into the PROTEIN table, one row per record.

    The accession is taken from the record name; the short description,
    taxonomy name, TaxID and RepID are extracted from the description line
    with regular expressions.

    Args:
        dom_type: Value written unchanged to the DOM_TYPE column of every row.

    Records whose name contains no UniRef100 accession are reported and
    skipped. Description fields that cannot be found are stored as empty
    strings (previously a missing field raised AttributeError, and a missing
    taxonomy name silently reused the previous record's value).
    """
    path        = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_100M.fasta"

    # Raw strings: "\s" inside a normal string literal is an invalid escape
    # sequence. The patterns are compiled once rather than once per record.
    uniprot_re      = re.compile(r"UniRef100_([A-Z0-9]+)") # modified for UniRef100
    tax_id_re       = re.compile(r"TaxID=([0-9]+)")
    #tax_name_re     = "Tax=([a-zA-Z0-9\s]+)\s"
    tax_name_re     = re.compile(r"Tax=(.+)[\s\t]TaxID")
    name_re         = re.compile(r"([a-zA-Z0-9\s-]+)n=")
    rep_id_re       = re.compile(r"RepID=(.*)")
    seq_span_re     = re.compile(r'.{3,}')                 # modified for UniRef100 (original: r'\.{3,}')

    PROCESS_LIMIT   = -1 # number of records to process, set to -1 to ignore
    OUTPUT_LIMIT    = 10000  # determines how often to print a progress message

    def first_group(pattern, text):
        """Return group(1) of the first match of `pattern` in `text`, or "" if there is none."""
        found = pattern.search(text)
        return found.group(1) if found is not None else ""

    line_number = 0
    con = duckdb.connect(database=ProteinDB.db_string)
    try:
        for record in SeqIO.parse(path, "fasta"):

            # ------- extract data from name -----------

            # this removes the name from the description and removes commas
            raw_desc = record.description.replace(record.name+" ", "")
            raw_desc = raw_desc.replace(",", "")

            # extracts the id from the name; a row without an id is useless
            id_match = uniprot_re.search(record.name)
            if id_match is None:
                print('No UniRef100 id, skipping :', record.name)
                continue
            uniprot_id = id_match.group(1)

            short_desc  = first_group(name_re, raw_desc)
            rep_id      = first_group(rep_id_re, raw_desc)
            tax_id      = first_group(tax_id_re, raw_desc)

            # Reset per record so a miss never carries over the previous name
            tax_name = ""
            tax_nm_res = tax_name_re.search(raw_desc)
            if tax_nm_res is not None:
                tax_name = tax_nm_res.group(1)
            else:
                print ('No tax name :', raw_desc )

            # -------- check for termination ------------
            #
            if line_number > 0 and line_number % OUTPUT_LIMIT == 0:
                print(line_number, 'lines processed.....')
            if PROCESS_LIMIT != -1:
                if line_number == PROCESS_LIMIT:
                    print('Last entry:', uniprot_id)
                if line_number >= PROCESS_LIMIT:
                    print('Processing limit reached %s stopping' % (PROCESS_LIMIT))
                    break
            line_number += 1
            # ------------------------------------

            # -------- get length ------------
            #
            # Each match is a run of at least 3 characters; start/end keep the
            # values of the last match and stay 0 when nothing matches.
            # NOTE(review): end is m.end()+1, i.e. one past the 1-based last
            # position - left unchanged so new rows match rows already loaded.
            start = 0
            end = 0
            for m in seq_span_re.finditer(str(record.seq)):
                start = str(m.start()+1)
                end = str(m.end()+1)

            con.execute("INSERT INTO PROTEIN (UNIPROT_ID, SHORT_DESCRIPTION, TAX_NAME, TAX_ID, DOM_TYPE, REP_ID, START_POS, END_POS) VALUES(?,?,?,?,?,?,?,?)", (uniprot_id, short_desc, tax_name, tax_id, dom_type, rep_id, start, end))
    finally:
        # Close the connection even if parsing or an insert fails part-way
        con.close()

# Runs the load as soon as the cell executes; "LowComplexity" is stored as DOM_TYPE.
parse_fasta_to_db("LowComplexity")
# NOTE(review): parse_file is only defined inside a disabled (string-literal) cell further down.
#parse_file(file, "CoiledCoil")


In [None]:
def parse_eukaryotic_fasta(dom_type):
    """Print the id and sequence span of UniRef100 records whose TaxID is 2759.

    Nothing is written to the database (the insert is disabled).

    Args:
        dom_type: Accepted for symmetry with parse_fasta_to_db; not used here.

    NOTE(review): only records whose TaxID is exactly '2759' are reported;
    records carrying any other TaxID are ignored - confirm this is the
    intended definition of "eukaryotic".
    """
    path        = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_100M.fasta"

    # Compiled once rather than once per record
    uniprot_re      = re.compile(r"UniRef100_([A-Z0-9]+)") # modified for UniRef100
    tax_id_re       = re.compile(r"TaxID=([0-9]+)")
    seq_span_re     = re.compile(r'.{3,}')                 # modified for UniRef100 (original: r'\.{3,}')

    PROCESS_LIMIT   = 10000000 # number of records to process, set to -1 to ignore
    OUTPUT_LIMIT    = 1000000  # determines how often to print a progress message

    line_number = 0

    #con = duckdb.connect(database=ProteinDB.db_string)

    for record in SeqIO.parse(path, "fasta"):

        # -------- check for termination ------------
        #
        if line_number > 0 and line_number % OUTPUT_LIMIT == 0:
            print(line_number, 'lines processed.....')
        if PROCESS_LIMIT != -1 and line_number >= PROCESS_LIMIT:
            print('Processing limit reached %s stopping' % (PROCESS_LIMIT))
            break
        line_number += 1

        # ------- if eukaryotic.... -----------

        # A record without a TaxID cannot match; previously this raised AttributeError
        tax_id_res = tax_id_re.search(record.description)
        if tax_id_res is None:
            continue
        tax_id = tax_id_res.group(1)
        if tax_id != '2759':
            continue

        # extracts the id from the name
        uniprot_res = uniprot_re.search(record.name)
        if uniprot_res is None:
            print('No UniRef100 id, skipping :', record.name)
            continue
        uniprot_id = uniprot_res.group(1)

        # -------- get length ------------
        # start/end keep the values of the last run of at least 3 characters
        # and stay 0 when nothing matches.
        start = 0
        end = 0
        for m in seq_span_re.finditer(str(record.seq)):
            start = str(m.start()+1)
            end = str(m.end()+1)

        print('id %s taxonomy %s start %s end %s ' % (uniprot_id, tax_id, start, end))

        #con.execute("INSERT INTO PROTEIN (UNIPROT_ID, SHORT_DESCRIPTION, TAX_NAME, TAX_ID, DOM_TYPE, REP_ID, START_POS, END_POS) VALUES##(?,?,?,?,?,?,?,?)", (uniprot_id, short_desc, tax_name, tax_id, dom_type, rep_id, start, end))

    #con.close()

# Runs the scan as soon as the cell executes; the argument is not used by the function.
parse_eukaryotic_fasta("LowComplexity")
# NOTE(review): parse_file is only defined inside a disabled (string-literal) cell further down.
#parse_file(file, "CoiledCoil")

In [None]:
# Build the PROTEIN indices via the ProteinDB helper (run after the load above).
pdb.create_protein_indices()

In [None]:
# test the connection worked: look up one known protein
con = duckdb.connect(database=ProteinDB.db_string)
try:
    #count = con.execute("SELECT COUNT(*) FROM PROTEIN").fetchall()
    #result = con.execute("SELECT * FROM PROTEIN").fetchall()
    result = con.execute("SELECT * FROM PROTEIN WHERE PROTEIN.UNIPROT_ID = 'A0JP26'").fetchall()
    print(result)

    #print(result[1500859])
    #for i in range(10):
    #    print(result[i])
finally:
    # The original close() sat inside a disabled string block, so the
    # connection was never closed.
    con.close()

## TrEMBL

In [None]:
#
# PARSE TREMBL FASTA 
#
def parse_trembl_fasta(dom_type):
    """Print the PROTEIN_WORD rows found for each of the first TrEMBL FASTA records.

    The accession is read from the record name (the text between 'tr|' and the
    next '|'), and PROTEIN_WORD is queried for it. Ids with at least one row
    are printed together with their rows. Nothing is written to the database.

    Args:
        dom_type: Not used by this function.
    """
    path        = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniprot_trembl_10M.fasta"
    # Raw string: "\|" in a normal string literal is an invalid escape sequence
    uniprot_re  = re.compile(r"tr\|([A-Z0-9]+)\|")

    PROCESS_LIMIT = 10000
    record_count = 0

    con = duckdb.connect(database=ProteinDB.db_string)
    try:
        for record in SeqIO.parse(path, "fasta"):

            # -------- check for termination ------------
            #
            if PROCESS_LIMIT != -1 and record_count >= PROCESS_LIMIT:
                print('Last entry:', record.name)
                print('Processing limit reached %s stopping' % (PROCESS_LIMIT))
                break
            record_count += 1
            # ------------------------------------

            # Skip names that do not have the tr|ID| shape instead of crashing
            result = uniprot_re.search(record.name)
            if result is None:
                print('No TrEMBL id, skipping :', record.name)
                continue
            uniprot_id = result.group(1)

            # fetchall() returns a list, so truthiness covers the "no rows" case
            pfam_res = con.execute("SELECT * FROM PROTEIN_WORD WHERE UNIPROT_ID = ?", [uniprot_id]).fetchall()
            if pfam_res:
                print(uniprot_id)
                print(pfam_res)

            #con.execute("INSERT INTO PROTEIN (UNIPROT_ID, SHORT_DESCRIPTION, TAX_NAME, TAX_ID, DOM_TYPE, REP_ID, START_POS, END_POS) VALUES##(?,?,?,?,?,?,?,?)", (uniprot_id, short_desc, tax_name, tax_id, dom_type, rep_id, start, end))
    finally:
        # The original never closed this connection (its close() was commented out)
        con.close()

# Runs the lookup as soon as the cell executes; the argument is not used by the function.
parse_trembl_fasta("LowComplexity")
# NOTE(review): parse_trembl_fasta takes a single argument, so this disabled call would fail as written.
#parse_trembl_fasta(file, "CoiledCoil")

## 3. PFAM

#### Parse protein2ipr.dat into database

In [None]:
# Recreate PROTEIN_WORD from scratch
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # IF EXISTS so the cell also runs on a fresh database where the table
    # has not been created yet
    con.execute("DROP TABLE IF EXISTS PROTEIN_WORD")
    #con.execute("DROP INDEX WORD_PROT_ID_X")
    pdb.create_word_table()
finally:
    con.close()

In [None]:
#10M lines 10000000 : 20s
#MAX_LINES = 10000000

# Settings read by create_pfam_word (as globals)
limit = True  # if True, only parses MAX_COUNT lines; not read by create_pfam_word as written
MAX_COUNT = 10_000_000
OUTPUT_LIMIT = 10_000

# ------------------------------------------------------------------------------------------
# 26 June 2024 - creating a file
# parses protein2ipr.dat for entries with 'PFNNN' and outputs those lines to a separate file
# protein2ipr.dat       : 98.7GB,   1,355,591,115 entries
# protein2ipr_new.dat   : 20.73GB,  298,766,058 entries
# parsing time          : 23mins
#
# 01 July - Loading direct into DB Table (PROTEIN_WORD)
# Parses protein2ipr.dat, extracts key info and writes it to DB
# 2M in 10mins
# 7.5M 40min
# 10M in 50min > only produces 2.2M rows
#
# last entry processed : A0A074Z763
# ----------------------------------------------------------------------------------------

# NOTE: this name shadows the builtin input(); create_pfam_word reads it as a global
input = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat"

#
# parse an Interpro file and grep out those with pfam domains
# Parsing the full protein2ipr.dat file in python took: 23mins 40s
#
def create_pfam_word():
    """Load the Pfam matches found in the InterPro file into PROTEIN_WORD.

    Reads the file named by the notebook-level `input` variable line by line.
    Every line that carries an IPR accession, a 'PF<digits>' field and two
    numeric position fields produces one PROTEIN_WORD row with WORD_TYPE
    'PFAM'. Lines that do not match are ignored.

    Uses the notebook-level MAX_COUNT (number of lines to read, -1 for no
    limit) and OUTPUT_LIMIT (how often to print progress).
    """
    # Compiled once rather than once per line
    pfam_re = re.compile("([A0A0-9]*[a-zA-Z0-9]+)\\tIPR[0-9]+\\t.*\\t(PF[0-9]+)\\t([0-9]+)\\t([0-9]+)")
    item_type = "PFAM"

    uniprot_id = ""
    con = duckdb.connect(database=ProteinDB.db_string)
    try:
        with open(input, 'r') as input_file:
            for line_number, line in enumerate(input_file):

                # -------- check for termination ------------
                #
                if line_number > 0 and line_number % OUTPUT_LIMIT == 0:
                    print(line_number, 'lines processed.....')
                # line_number is 0-based, so >= stops after exactly MAX_COUNT
                # lines (the previous > read one line more than the limit)
                if MAX_COUNT != -1 and line_number >= MAX_COUNT:
                    print('Processing limit reached %s stopping. Last entry was %s' % (MAX_COUNT, uniprot_id))
                    break
                # ------------------------------------

                match = pfam_re.search(line)
                if match is None:
                    continue

                uniprot_id, pfam_word, start, end = match.groups()

                con.execute("INSERT INTO PROTEIN_WORD (UNIPROT_ID, WORD_TYPE, REF_ID, START_POS, END_POS) VALUES(?,?,?,?,?)", (uniprot_id, item_type, pfam_word, start, end))
    finally:
        # Close the connection even if the file is missing or an insert fails
        con.close()

# Runs the Pfam load as soon as the cell executes.
create_pfam_word()

In [None]:
# Build the Pfam indices via the ProteinDB helper (run after the load above).
pdb.create_pfam_indices()

In [None]:
# test the connection worked: count the rows in PROTEIN_WORD
con = duckdb.connect(database=ProteinDB.db_string)
try:
    #id = 'A0A059X392'
    #id = 'A0A009GV07' # this has multiple pfam entries
    #id = 'A0A009GMU8'
    #id = 'AA0A074Z763'
    #result = con.execute("SELECT * FROM PROTEIN_WORD WHERE UNIPROT_ID = ?", [id]).fetchall()

    result = con.execute("SELECT COUNT(*) FROM PROTEIN_WORD").fetchall()
    print(result)
finally:
    # The original left this connection open
    con.close()

#### Query pfam entry

In [None]:
def qurery_pfam(uniprot_id):
    """Print all PROTEIN_WORD rows stored for one UniProt id.

    Args:
        uniprot_id: Value matched against the UNIPROT_ID column.
    """
    con = duckdb.connect(database=ProteinDB.db_string)
    try:
        # Bind the parameter itself; the original passed the global/builtin
        # `id`, so the argument was ignored.
        result = con.execute("SELECT * FROM PROTEIN_WORD WHERE UNIPROT_ID = ?", [uniprot_id]).fetchall()
    finally:
        con.close()
    print(result)

## 4. DISORDERED REGIONS

In [None]:
#
# THIS WILL ERROR IF YOU ONLY USE A PARTIAL FILE
#

# This works directly on the uncompressed .gz file
# No space on laptop for fully extracted extra.xml
#

# Used this command to extract first 10000 lines into a separate file:
#
# zgrep . -m 10000 data/disordered/extra.xml.gz > data/disordered/extra.10000.xml
#

import xml.etree.ElementTree as ElementTree

file = '/Users/patrick/dev/ucl/comp0158_mscproject/data/disordered/extra.10000.xml'

word_type = "DISORDER"

con = duckdb.connect(database=ProteinDB.db_string)
try:
    # Stream the document rather than loading it all at once
    context = iter(ElementTree.iterparse(file, events=("start", "end")))

    # The first event delivers the root element
    event, root = next(context)

    for event, protein in context:
        if event == "end" and protein.tag == "protein":
            for match in protein:
                # .get(): a child without a dbname attribute is skipped
                # instead of raising KeyError
                if 'MOBIDBLT' in match.attrib.get('dbname', ''):
                    for coords in match:
                        uniprot_id = protein.attrib['id']
                        start_pos = coords.attrib['start']
                        end_pos = coords.attrib['end']
                        con.execute("INSERT INTO PROTEIN_WORD (UNIPROT_ID, WORD_TYPE, START_POS, END_POS) VALUES(?,?,?,?)", (uniprot_id, word_type, start_pos, end_pos))
            # Drop the elements processed so far so memory use stays flat
            root.clear()
except ElementTree.ParseError as err:
    # A file cut off part-way (such as an extract of the first N lines) is not
    # well-formed XML, so the parser raises once it reaches the cut. Rows
    # inserted before that point are kept.
    print('XML parsing stopped early:', err)
finally:
    # Previously skipped whenever parsing raised
    con.close()

In [None]:
# test the connection worked: count and sample the DISORDER rows
con = duckdb.connect(database=ProteinDB.db_string)
try:
    word_type = "DISORDER"
    count = con.execute("SELECT COUNT(*) FROM PROTEIN_WORD WHERE WORD_TYPE = ?", [word_type]).fetchall()
    print(count)

    # LIMIT avoids fetching every row just to show 10
    result = con.execute("SELECT * FROM PROTEIN_WORD WHERE WORD_TYPE = ? LIMIT 10", [word_type]).fetchall()
    # Print `result`; the original printed `res`, a variable left over from
    # the TAX_NAME check cell
    for row in result:
        print(row)
finally:
    con.close()


In [None]:
# Remove every DISORDER row from PROTEIN_WORD
delete_disorder_sql = "DELETE FROM PROTEIN_WORD WHERE WORD_TYPE = ?"
word_type = "DISORDER"
con = duckdb.connect(database=ProteinDB.db_string)
result = con.execute(delete_disorder_sql, [word_type])
con.close()

# <font color="yellow">WORD SENTENCES</font>

#### Taxonomy Categories

In [None]:
# Test: show a sample of the TYPE 'E' rows in TAX_CAT
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # LIMIT avoids fetching every matching row just to show 10; iterating
    # the result also copes with fewer than 10 rows
    cat_result = con.execute("SELECT * FROM TAX_CAT WHERE TYPE = 'E' LIMIT 10").fetchall()
    for row in cat_result:
        print(row)
finally:
    con.close()


#### Taxonomy Names

In [None]:
# Show a sample of TAX_NAME rows
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # LIMIT avoids pulling the whole table into memory just to show 10 rows
    name_result = con.execute("SELECT * FROM TAX_NAME LIMIT 10").fetchall()
    for row in name_result:
        print(row)
finally:
    con.close()

#### Taxonomy Categories

In [None]:
# Relies on `name_result` from the "Taxonomy Names" cell having been run first.
# Slicing prints up to 10 rows without failing when there are fewer.
for row in name_result[:10]:
    print(row)

In [None]:
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # Sample of the TYPE 'E' rows; LIMIT avoids fetching them all to show 10
    cat_result = con.execute("SELECT * FROM TAX_CAT WHERE TYPE='E' LIMIT 10").fetchall()
    for row in cat_result:
        print(row)

    #item_id = '10492'
    item_id = '2777'

    # Names recorded for this taxonomy id
    name_result = con.execute("SELECT * FROM TAX_NAME WHERE TAX_ID = ?", [item_id]).fetchall()
    print(name_result)

    # Category rows whose SPECIES_ID is this id
    cat_result = con.execute("SELECT * FROM TAX_CAT WHERE TAX_CAT.SPECIES_ID = ?", [item_id]).fetchall()
    print(cat_result)
finally:
    con.close()

In [None]:
# Fetch the PROTEIN_WORD rows for a fixed list of UniProt ids
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # Single-id variant, kept for reference (ids that have been tried):
    #   'A0A059X392', 'A0A009GV07' (this has multiple pfam entries), 'A0A009GMU8', 'AA0A074Z763'
    #pfam_result = con.execute("SELECT * FROM PROTEIN_WORD WHERE UNIPROT_ID = ?", [uniprot_id]).fetchall()

    pfam_result = con.execute("SELECT * FROM PROTEIN_WORD WHERE UNIPROT_ID IN ('A0A009GV07','Q6GZX4', 'Q6GZX3', 'Q197F8', 'Q197F7','Q6GZX2', 'Q6GZX1', 'Q197F5','Q6GZX0','Q91G88','Q6GZW8')").fetchall()

    print(pfam_result)
finally:
    # The original left this connection open
    con.close()

### Query proteins

In [None]:
# Row count and a small sample of the PROTEIN table
con = duckdb.connect(database=ProteinDB.db_string)
try:
    prot_count= con.execute("SELECT COUNT(*) FROM PROTEIN").fetchall()
    print('Num proteins in DB:', prot_count)

    print('First 10:\n')
    # LIMIT avoids pulling every protein row into memory just to show 10
    prot_result = con.execute("SELECT * FROM PROTEIN LIMIT 10").fetchall()
    for row in prot_result:
        print(row)
finally:
    con.close()


#### Query pfam entries

In [None]:
# Fetch the PROTEIN_WORD rows for a single UniProt id
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # `uniprot_id` rather than `id`, which would shadow the builtin
    uniprot_id = 'A0A009GV07' # this has multiple pfam entries
    #uniprot_id = '09GV07'

    pfam_result = con.execute("SELECT * FROM PROTEIN_WORD WHERE UNIPROT_ID = ?", [uniprot_id]).fetchall()

    #pfam_result = con.execute("SELECT * FROM PROTEIN_WORD WHERE UNIPROT_ID IN ('A0A009GV07','Q6GZX4', 'Q6GZX3', 'Q197F8', 'Q197F7','Q6GZX2', 'Q6GZX1', 'Q197F5','Q6GZX0','Q91G88','Q6GZW8')").fetchall()

    print(pfam_result)
finally:
    # The original left this connection open
    con.close()

#### Query disorder regions

In [None]:
# Show a sample of the DISORDER rows in PROTEIN_WORD
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # LIMIT avoids fetching every row just to show 10; iterating the result
    # also copes with fewer than 10 rows
    disorder_result = con.execute("SELECT * FROM PROTEIN_WORD WHERE WORD_TYPE='DISORDER' LIMIT 10").fetchall()
    for row in disorder_result:
        print(row)
finally:
    con.close()


In [None]:
# Fetch the PROTEIN row(s) for a single UniProt id
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # `uniprot_id` rather than `id`, which would shadow the builtin
    uniprot_id = 'A0A059X392'
    #uniprot_id = 'A0A009GV07' # this has multiple pfam entries
    #uniprot_id = 'A0A009GMU8'
    #uniprot_id = 'AA0A074Z763'
    prot_result = con.execute("SELECT * FROM PROTEIN WHERE UNIPROT_ID = ?", [uniprot_id]).fetchall()
    print(prot_result)
finally:
    # The original left this connection open
    con.close()

# --------- OLD STUFF -----------------

### OLD : UNIREF FASTA

#### deprecated : Parse UniRef100 FASTA 1 > Check for ProteinSentences

In [None]:
'''
# 500k is enough for matches to be found
MAX_LINES = 10000000
matched = []
not_matched = []

#
# parse a fasta file to get protein ids
# for uniref, these ids are the characters after UniRef100_
# TODO: Check if ids are proteins or protein clusters
#
def parse_fasta():
    with open(protein_fasta_file, 'r') as input_file:
        for line_number, line in enumerate(input_file):
            
            if line_number > MAX_LINES:  # line_number starts at 0.
                break
            #print('Processing :', line)
            # note that the raw interpro file is tab delimited between fields
            match = re.search("UniRef100_([A-Z0-9]+) ", line)
            if match is not None:
                id = match.group(1)

                if id in sentences.keys():
                    #print(line_number, 'Found sentence for protein :', id, sentences[id].text)
                    if id not in matched:
                        matched.append(id)
                else:
                    not_matched.append(id)
parse_fasta()
'''

#### deprecated : Parse UniRef100 FASTA 2 > Extract id, length, taxonomy

In [None]:
'''
# 500k is enough for matches to be found
MAX_LINES = 1000
matched = []
not_matched = []

file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_500M.fasta"

#
# parse a fasta file to get protein ids
# for uniref, these ids are the characters after UniRef100_
# TODO: Check if ids are proteins or protein clusters
#
def parse_fasta_2():
    matchline = ""
    id = ""
    match_inprogress = False
    
    with open(file, 'r') as input_file:
        for line_number, line in enumerate(input_file):
            
            if line_number > MAX_LINES:  # line_number starts at 0.
                break

            match = re.search(">UniRef100_([A-Z0-9]+).*TaxID=([0-9]+).*", line)
            
            #match = re.search(">UniRef100_([A-Z0-9]+)", line) # works
            
            if match is not None:
                if(id != ""):
                    print(id, '\t', len(matchline), '\t', tax_id, '\t', matchline)
                matchline = ""
                id = match.group(1)
                tax_id = match.group(2)
                continue
            else:
                matchline += line.strip()
parse_fasta_2()
'''

In [None]:
# `matched` and `not_matched` are only created by the deprecated parse_fasta
# cell, which is currently disabled (wrapped in a string). Report that instead
# of failing with NameError when the notebook is run top to bottom.
try:
    print(len(matched),'Matched proteins:\n', matched)
    print(len(not_matched), 'Unmatched proteins:\n',not_matched)
except NameError:
    print('matched / not_matched are not defined - the deprecated parse_fasta cell has not been run.')

#### deprecated : Parse UniRef100 FASTA 3 > Modified parse_masked_regions from Daniel - outputs masked_regions.dat

In [None]:
'''
from Bio import SeqIO
import re

# re_string = "\|(.+)\|" # original version
re_string = "UniRef100_([A-Z0-9]+)" # modified for UniRef100

input = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_10M.fasta"
output = "/Users/patrick/dev/ucl/comp0158_mscproject/data/masked_regions.dat"

def parse_file(path, dom_type):
    output_file = open(output, "w")
    for record in SeqIO.parse(path, "fasta"):
        # record.name = UniRef100_Q6GZX3
        # record.description = UniRef100_Q6GZX3 Putative transc....
        
        # this removes the name from the description removes commas
        raw_desc = record.description.replace(record.name+" ", "")
        raw_desc = raw_desc.replace(",", "")
        
        # extracts the id from the name
        result = re.search(re_string, record.name)
        uniprot_id = result.group(1)
        
        # loops throgh the sequence 3 dots at a time - not sure why
        # is this supposed to return a line for each collection of at least 3 sequence characters?
        # sequence characters?
        #for m in re.finditer(r'\.{3,}', str(record.seq)):          # original version
        for m in re.finditer(r'.{3,}', str(record.seq)):            # modified for UniRef100
            print(uniprot_id+"\tIPRXXXXXX\t"+raw_desc+"\t"+dom_type+"\t" + str(m.start()+1)+"\t"+str(m.end()+1))
            #output_file.write(uniprot_id+"\tIPRXXXXXX\t"+raw_desc+"\t"+dom_type+"\t" + str(m.start()+1)+"\t"+str(m.end()+1) + '\n')
        # return()
    output_file.close()

parse_file(input, "LowComplexity")
#parse_file(file, "CoiledCoil")
'''


#### DEPRECATED - Read UniRef proteins into DB

In [None]:
'''
#
def parse_fasta_to_db(dom_type):
    
    path = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_10M.fasta"
    re_string = "UniRef100_([A-Z0-9]+)" # modified for UniRef100
    
    PROCESS_LIMIT   = 1000 # number of lines to process, set to -1 to ignore
    OUTPUT_LIMIT    = 100  # determines how often to print a progress message
    
    line_number = 0
    count = 0
    con = duckdb.connect(database=ProteinDB.db_string) 
    
    for record in SeqIO.parse(path, "fasta"):
        
        # this removes the name from the description and removes commas
        raw_desc = record.description.replace(record.name+" ", "")
        raw_desc = raw_desc.replace(",", "")
        
        # extracts the id from the name
        result = re.search(re_string, record.name)
        uniprot_id = result.group(1)
        
        # check if want to output progress
        if ((line_number // OUTPUT_LIMIT > 0) and (line_number % OUTPUT_LIMIT) == 0):
            count += 1
            print(count * OUTPUT_LIMIT, 'lines processed.....')
                
        # check if we've hit the line limit
        if(PROCESS_LIMIT != -1):
            if line_number == PROCESS_LIMIT:
                print('Last entry:', uniprot_id)
            if line_number > PROCESS_LIMIT:
                print('Processing limit reached %s stopping' % (PROCESS_LIMIT))
                break
        
        line_number += 1
        
        # loops throgh the sequence 3 dots at a time - not sure why? Is this supposed to 
        # return a line for each collection of at least 3 sequence characters?
        #for m in re.finditer(r'\.{3,}', str(record.seq)):          # original version
        for m in re.finditer(r'.{3,}', str(record.seq)):            # modified for UniRef100
            #print(uniprot_id+"\tIPRXXXXXX\t"+raw_desc+"\t"+dom_type+"\t" + str(m.start()+1)+"\t"+str(m.end()+1))
            ipr="IPRXXXXXX"
            start = str(m.start()+1)
            end = str(m.end()+1)
            con.execute("INSERT INTO PROTEIN_SENTENCE (UNIPROT_ID, IPR, DESCRIPTION, DOM_TYPE, START_POS, END_POS) VALUES(?,?,?,?,?,?,)", (uniprot_id, ipr, raw_desc, dom_type, start, end))
            #output_file.write(uniprot_id+"\tIPRXXXXXX\t"+raw_desc+"\t"+dom_type+"\t" + str(m.start()+1)+"\t"+str(m.end()+1) + '\n')
        # return()
    con.close()

parse_fasta_to_db("LowComplexity")
#parse_file(file, "CoiledCoil")
'''

In [None]:
def parse_fasta_to_db_tax(dom_type):
    """Debug variant of parse_fasta_to_db: print the fields parsed from each record.

    Parses the first PROCESS_LIMIT records of the UniRef100 FASTA file and
    prints the accession, short name, description, taxonomy name, TaxID and
    sequence span. Nothing is written to the database (the insert is disabled).

    Args:
        dom_type: Not used by this function.

    Records whose name contains no UniRef100 accession are reported and
    skipped. Description fields that cannot be found are printed as empty
    strings.
    """
    path        = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_10M.fasta"

    # Raw strings: "\s" inside a normal string literal is an invalid escape
    # sequence. The patterns are compiled once rather than once per record.
    re_string       = re.compile(r"UniRef100_([A-Z0-9]+)") # modified for UniRef100
    tax_id_string   = re.compile(r"TaxID=([0-9]+)")
    tax_string      = re.compile(r"Tax=([a-zA-Z0-9\s]+)TaxID")
    name_string     = re.compile(r"([a-zA-Z0-9\s-]+)n=")
    seq_span_re     = re.compile(r'.{3,}')                 # modified for UniRef100 (original: r'\.{3,}')

    PROCESS_LIMIT   = 100 # number of records to process, set to -1 to ignore
    OUTPUT_LIMIT    = 100  # determines how often to print a progress message

    def first_group(pattern, text):
        """Return group(1) of the first match of `pattern` in `text`, or "" if there is none."""
        found = pattern.search(text)
        return found.group(1) if found is not None else ""

    line_number = 0
    # The connection is opened as before although no statement is run on it
    con = duckdb.connect(database=ProteinDB.db_string)
    try:
        for record in SeqIO.parse(path, "fasta"):

            # ------- extract data from name -----------

            # this removes the name from the description and removes commas
            raw_desc = record.description.replace(record.name+" ", "")
            raw_desc = raw_desc.replace(",", "")

            # extracts the id from the name
            id_match = re_string.search(record.name)
            if id_match is None:
                print('No UniRef100 id, skipping :', record.name)
                continue
            uniprot_id = id_match.group(1)

            short_name  = first_group(name_string, raw_desc)
            tax_id      = first_group(tax_id_string, raw_desc)

            # Reset per record so a miss never carries over the previous name
            tax_name = ""
            tax_nm_res = tax_string.search(raw_desc)
            if tax_nm_res is not None:
                tax_name = tax_nm_res.group(1)
            else:
                print ('No tax name :', raw_desc )

            # -------- check for termination ------------
            #
            if line_number > 0 and line_number % OUTPUT_LIMIT == 0:
                print(line_number, 'lines processed.....')
            # >= (as in parse_fasta_to_db) stops after exactly PROCESS_LIMIT
            # records; the previous > handled one record more than the limit
            if PROCESS_LIMIT != -1 and line_number >= PROCESS_LIMIT:
                print('Last entry:', uniprot_id)
                print('Processing limit reached %s stopping' % (PROCESS_LIMIT))
                break
            line_number += 1
            # ------------------------------------

            # -------- get length ------------
            #
            # start/end keep the values of the last run of at least 3
            # characters and stay 0 when nothing matches.
            start = 0
            end = 0
            for m in seq_span_re.finditer(str(record.seq)):
                start = str(m.start()+1)
                end = str(m.end()+1)
                print('start :', start)
                print('end :', end, '\n')
                #con.execute("INSERT INTO PROTEIN_SENTENCE (UNIPROT_ID, IPR, DESCRIPTION, DOM_TYPE, START_POS, END_POS) VALUES(?,?,?,?,?,?,)", (uniprot_id, ipr, raw_desc, dom_type, start, end))

            print('full name :', record.name)
            print('uniprot_id :', uniprot_id)
            print('short name :', short_name)
            print('desc :', raw_desc)
            print('tax name :', tax_name)
            print('tax id :', tax_id, '\n')
            print('start :', start)
            print('end :', end, '\n')
    finally:
        # Close the connection even if parsing fails part-way
        con.close()

# Runs the debug parse as soon as the cell executes; the argument is not used by the function.
parse_fasta_to_db_tax("LowComplexity")
# NOTE(review): parse_file is only defined inside a disabled (string-literal) cell above.
#parse_file(file, "CoiledCoil")

In [None]:
# test the connection worked: look up one id in PROTEIN_SENTENCE
# NOTE(review): no live cell in this notebook creates PROTEIN_SENTENCE - confirm the table exists.
con = duckdb.connect(database=ProteinDB.db_string)
try:
    # `uniprot_id` rather than `id`, which would shadow the builtin
    #uniprot_id = 'A0A059X392'
    uniprot_id = 'Q99L13' # this has multiple pfam entries
    result = con.execute("SELECT * FROM PROTEIN_SENTENCE WHERE UNIPROT_ID = ?", [uniprot_id]).fetchall()
    print(result)
finally:
    # The original left this connection open
    con.close()