In [None]:
import argparse
from Bio import SeqIO
import re
import csv

# Input file locations (plain path strings, not open file handles).
protein_fasta_file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_10M.fasta"
interpro_file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat"

### File Examples

#### Uniref100 Fasta
>\>UniRef100_Q197F3 Uncharacterized protein 007R n=1 Tax=Invertebrate iridescent virus 3 TaxID=345201 RepID=007R_IIV3
MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV
YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL


#### Protein2ipr.dat
>A0A000	IPR004839	Aminotransferase, class I/classII	PF00155	41	381<br>
A0A000	IPR010961	Tetrapyrrole biosynthesis, 5-aminolevulinic acid synthase	TIGR01821	12	391<br>
A0A000	IPR015421	Pyridoxal phosphate-dependent transferase, major domain	G3DSA:3.40.640.10	48	288<br>
A0A000	IPR015422	Pyridoxal phosphate-dependent transferase, small domain	G3DSA:3.90.1150.10	36	378<br>
A0A000	IPR015424	Pyridoxal phosphate-dependent transferase	SSF53383	9	389<br>
A0A000	IPR050087	8-amino-7-oxononanoate synthase class-II	PTHR13693	34	382<br>
A0A001	IPR003439	ABC transporter-like, ATP-binding domain	PF00005	361	503<br>

#### Explanation
An ID starting with A0A0 means it is a UniProt entry that has not yet been verified.

### Data Preparation - Unix

1. Extract pfam entries from protein2ipr.dat

awk '{FS="\t"}{print $1, " ", $2, " ", $4}' /Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat | awk '$3 ~ /PF[0-9]./ {print $0}' > protein2ipr_pfam2.dat

2. Disorder regions
- Use parse_match_complete.py
- This parses /Volumes/My Passport/downloads/match_complete.xml, which is 500GB, and searches for MobiDB entries
- This script took about 6 hours to run

- BUT -

- MobiDB does not appear as a dbname, thus use the below to find possible values:


grep "dbname=" /Volumes/My\ Passport/downloads/match_complete.xml | awk '{FS="dbname="}{print $2}' | awk '{print $1}' | sort |uniq -c

NOTE THAT THIS GREP TOOK ABOUT 8 HRS TO RUN ON MY LAPTOP:

Output:
1 "ANTIFAM"
254194862 "CATHGENE3D"
100323516 "CDD"
25444276 "HAMAP"
   1 "INTERPRO"
51495327 "NCBIFAM"
167725277 "PANTHER"
278089257 "PFAM"
20659779 "PIRSF"
38902426 "PRINTS"
94353952 "PROFILE"
49021231 "PROSITE"
4722014 "SFLD"
69964999 "SMART"
209693119 "SSF"
   1 dbname="SFLD"

In [None]:
# Candidate InterPro inputs for grep_interpro; exactly one `file` assignment is left active.
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat";
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr_pfam2.dat"; # first pfam extract, with text stripped out
file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr_new.dat"; # new pfam extract, full lines kept
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/A8KBH6_ipr_pfam.dat";

#
# greps for an id in interpro dat file
# For A8KBH6, this will match A0A01A8KBH6 and A8KBH6
# Time to parse the full protein2ipr.dat for A8KBH6 : 22min 56s
#
def grep_interpro(id, path=None):
    """Print every line of an InterPro file whose start matches ``id``.

    A line matches when it begins with zero or more characters drawn from
    ``A`` and ``0``-``9`` followed by ``id``. Searching for ``A8KBH6``
    therefore also reports lines starting with ``A0A1A8KBH6``. Nothing is
    required after ``id``, so a longer accession that merely begins with it
    is reported too.

    Args:
        id: Accession text to look for. It is matched literally; regex
            metacharacters in it have no special meaning.
        path: File to scan. When omitted, the module-level ``file`` is used
            with whatever value it holds at call time.

    Returns:
        None. Each hit is printed as ``Matched: <id> in line: <line>``.
    """
    source_path = file if path is None else path

    # Built once, outside the loop: the pattern does not depend on the line.
    # re.escape keeps characters such as '.' in the id from acting as regex
    # syntax. "[A0-9]" is the same character class as the earlier "[A0A0-9]".
    pattern = re.compile("^[A0-9]*" + re.escape(id))

    with open(source_path, 'r') as input_file:
        for line in input_file:
            if pattern.search(line):
                print('Matched:', id, 'in line:', line.strip())

In [None]:
# Look up both forms of the accession; each call reads the selected file from the start.
grep_interpro('A8KBH6')
grep_interpro('A0A1A8KBH6')

#### Protein Sentence Class

In [None]:
#
# Protein Word
#
class ProteinWord:
    """One annotated token ("word") placed on a protein.

    The four constructor arguments are stored unchanged as attributes:
      type  -- category label of the word
      text  -- the word itself
      start -- start position
      end   -- end position
    """

    def __init__(self, type, text, start, end):
        self.type, self.text = type, text
        self.start, self.end = start, end

    def __str__(self):
        # The rendered form begins with a single space.
        return f' {self.type}, {self.text}, {self.start}, {self.end}'

    # repr() renders exactly like str().
    __repr__ = __str__
    
#
# Protein Sentence
#
class ProteinSentence:
    """The ordered words collected for one protein id.

    Attributes:
      uniprot_id -- id the sentence belongs to
      words      -- word objects in the order they were added
      text       -- the ``text`` of every word so far, joined with commas
    """

    def __init__(self, uniprot_id, word):
        # A sentence always starts with one word.
        self.uniprot_id = uniprot_id
        self.words = [word]
        self.text = word.text

    def add_word(self, word):
        """Append ``word`` and extend the comma-separated ``text``."""
        self.words += [word]
        self.text += ',' + word.text

    def __str__(self):
        # The rendered form begins with a single space.
        return f' {self.uniprot_id}: {self.text}'

    # repr() renders exactly like str().
    __repr__ = __str__
        

#### Parse PFAM entries from Interpro file

In [None]:
# Timing note: 10M lines (10000000) took about 20s
#MAX_LINES = 10000000

limit       = True # meant to switch the MAX_COUNT cap below on or off
MAX_COUNT   = 25000000 # maximum number of matching lines to write out

# ------------------------------------------------------------------------------------------
# 26 June 2024
# Parses protein2ipr.dat for entries containing 'PFNNN' and outputs those lines to a separate file
# protein2ipr.dat       : 98.7GB,   1,355,591,115 entries
# protein2ipr_new.dat   : 20.73GB,  298,766,058 entries
# parsing time          : 23mins
# ----------------------------------------------------------------------------------------

# NOTE: the name `input` shadows Python's builtin input() for the rest of the session.
input = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat"
output = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr_new_25M.dat"

#
# parse an Interpro file and grep out those with pfam domains
# Parsing the full protein2ipr.dat file in python took: 23mins 40s
#
def create_pfam_interpro(input_path=None, output_path=None, max_count=None):
    """Copy the lines of an InterPro file that contain a Pfam accession.

    A line is kept when ``PF`` followed by at least one digit occurs anywhere
    in it. Kept lines are written unchanged, in input order.

    Args:
        input_path: File to read. Defaults to the module-level ``input``.
        output_path: File to (over)write. Defaults to the module-level
            ``output``.
        max_count: Maximum number of lines to write. When omitted, the
            module-level ``MAX_COUNT`` applies if the module-level ``limit``
            flag is truthy; if ``limit`` is falsy the whole input is processed.

    Returns:
        None. When the cap is hit, ``<cap> limit reached, breaking.`` is
        printed and reading stops.
    """
    source_path = input if input_path is None else input_path
    target_path = output if output_path is None else output_path
    if max_count is None and limit:
        max_count = MAX_COUNT

    # Compiled once; the pattern is the same for every line.
    # NOTE(review): this matches "PF<digits>" anywhere in the line, not only
    # in the tab-delimited signature column.
    pfam_pattern = re.compile("PF[0-9]+")
    match_count = 0

    # Both handles are managed by `with`, so the output file is flushed and
    # closed even if reading or writing fails part-way through.
    with open(source_path, 'r') as input_file, open(target_path, "w") as output_file:
        for line in input_file:
            if pfam_pattern.search(line) is None:
                continue

            match_count += 1
            if max_count is not None and match_count > max_count:
                print(max_count, 'limit reached, breaking.')
                break

            output_file.write(line)

# Runs the extraction as soon as this cell executes and overwrites the `output` file.
create_pfam_interpro()

#### Parse Interpro file to create initial sentence classes with PFAM entries

In [None]:
# Timing note: 10M lines (10000000) took about 20s

limit = True # if true, stop reading once MAX_LINES is exceeded
MAX_LINES = 1000000

# InterPro input for parse_interpro; exactly one `file` assignment is left active.
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat"; # full file
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr_new.dat"; # pfam only
file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr_new_25M.dat"; # pfam only
#file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/A8KBH6_ipr_pfam.dat"; # as above but only entries for A8KBH6 (pfam only)

# Maps protein id -> ProteinSentence; reset to empty every time this cell runs.
sentences = {}

#
# parse an Interpro file and create ProteinSentences and Words
#
# TODO: Confirm id starting A0A0
def parse_interpro(path=None, max_lines=None):
    """Build ProteinSentence objects from the Pfam rows of an InterPro file.

    Each line is expected to be tab-delimited. A line is used when it contains
    ``<id> TAB IPR<digits> TAB ... TAB PF<digits> TAB <start> TAB <end>``.
    For every such line a ``ProteinWord('pfam', ...)`` is created and added to
    the module-level ``sentences`` dict under ``<id>``, creating the
    ProteinSentence on first sight of that id. ``start`` and ``end`` are passed
    on as the strings captured from the line.

    Args:
        path: File to read. Defaults to the module-level ``file`` as set at
            call time.
        max_lines: Maximum number of lines to read. When omitted, the
            module-level ``MAX_LINES`` applies if the module-level ``limit``
            flag is truthy; if ``limit`` is falsy the whole file is read.

    Returns:
        None. Results accumulate in ``sentences``; progress is printed after
        every 100000 lines read.
    """
    progress_every = 100000

    source_path = file if path is None else path
    if max_lines is None and limit:
        max_lines = MAX_LINES

    # Compiled once, outside the loop. The pattern is not anchored, so the id
    # group is the leftmost alphanumeric run that is followed by TAB + "IPR".
    row_pattern = re.compile(
        r"([A0A0-9]*[a-zA-Z0-9]+)\tIPR[0-9]+\t.*\t(PF[0-9]+)\t([0-9]+)\t([0-9]+)"
    )

    with open(source_path, 'r') as input_file:
        # Count from 1 so the cap and the progress message both talk about
        # "lines read so far": exactly max_lines lines are processed, and the
        # message is only printed once that many lines have really been read.
        for lines_read, line in enumerate(input_file, start=1):
            if max_lines is not None and lines_read > max_lines:
                break

            match = row_pattern.search(line)
            if match is not None:
                protein_id, pfam_word, start, end = match.groups()
                word = ProteinWord('pfam', pfam_word, start, end)

                if protein_id in sentences:
                    sentences[protein_id].add_word(word)
                else:
                    sentences[protein_id] = ProteinSentence(protein_id, word)

            if lines_read % progress_every == 0:
                print(lines_read, 'lines processed.....')
# Populate `sentences` from the InterPro file selected above.
parse_interpro()


In [None]:
# Number of protein ids that received at least one word.
print(len(sentences))
#print(sentences.keys())
#print(sentences.keys()[0:10])
#print(sentences['A0A002'])
#print(sentences['A8KBH6'])
#print(sentences['A0A1A8KBH6'])
# Spot-check one id; raises KeyError if that id was not among the parsed lines.
print(sentences['A0A009GV07'])

#### Parse fasta entries and look up Interpro sentences

In [None]:
# Line cap for the FASTA scan. An earlier note says about 500k lines is enough
# for matches to be found; the cap is currently set to 10M.
MAX_LINES = 10000000
# Ids found in the FASTA file that do / do not have an entry in `sentences`.
matched = []
not_matched = []

#
# parse a fasta file to get protein ids
# for uniref, these ids are the characters after UniRef100_
# TODO: Check if ids are proteins or protein clusters
#
def parse_fasta(path=None, max_lines=None):
    """Sort the UniRef100 ids of a FASTA file by whether a sentence exists.

    Every line containing ``UniRef100_<ID>`` followed by a space contributes
    ``<ID>`` (upper-case letters and digits). Ids present in the module-level
    ``sentences`` are appended to the module-level ``matched`` list, once each;
    all other ids are appended to ``not_matched`` (without de-duplication).

    Args:
        path: FASTA file to read. Defaults to the module-level
            ``protein_fasta_file``.
        max_lines: Maximum number of lines to read. Defaults to the
            module-level ``MAX_LINES``.

    Returns:
        None. Results accumulate in ``matched`` and ``not_matched``.
    """
    source_path = protein_fasta_file if path is None else path
    line_cap = MAX_LINES if max_lines is None else max_lines

    # Compiled once; only header lines carry the "UniRef100_" prefix.
    header_pattern = re.compile("UniRef100_([A-Z0-9]+) ")

    # Set mirror of `matched` so the duplicate check is O(1) per id instead of
    # a scan over a list that can grow to millions of entries.
    already_matched = set(matched)

    with open(source_path, 'r') as input_file:
        for line_number, line in enumerate(input_file):
            # line_number is 0-based, so ">=" reads exactly line_cap lines.
            if line_number >= line_cap:
                break

            match = header_pattern.search(line)
            if match is None:
                continue

            protein_id = match.group(1)
            if protein_id in sentences:
                if protein_id not in already_matched:
                    already_matched.add(protein_id)
                    matched.append(protein_id)
            else:
                not_matched.append(protein_id)
# Fill `matched` / `not_matched` by scanning the FASTA file.
parse_fasta()

In [None]:
# Line cap for parse_fasta_2. This rebinds the MAX_LINES used by earlier cells.
MAX_LINES = 1000

# `matched` / `not_matched` are deliberately NOT reset here: parse_fasta_2 does
# not use them, and clearing them would discard the parse_fasta results before
# the reporting cell prints them.

# NOTE: this rebinds the module-level `file` that the InterPro cells also read.
file = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_500M.fasta"

#
# parse a fasta file to get protein ids
# for uniref, these ids are the characters after UniRef100_
# TODO: Check if ids are proteins or protein clusters
#
def parse_fasta_2(path=None, max_lines=None):
    """Print one summary line per record of a UniRef100 FASTA file.

    A record starts at a header line matching
    ``>UniRef100_<ID> ... TaxID=<digits>``; every following non-header line is
    stripped and concatenated into that record's sequence. For each record
    the id, sequence length, tax id and sequence are printed, separated by
    ``' \\t '`` (print's space separator around each tab).

    The last record of the file is printed when the end of the file is
    reached. If reading stops early because of the line cap, the record in
    progress is not printed, since its sequence may be incomplete.

    Args:
        path: FASTA file to read. Defaults to the module-level ``file`` as
            set at call time.
        max_lines: Maximum number of lines to read. Defaults to the
            module-level ``MAX_LINES``.

    Returns:
        None.
    """
    source_path = file if path is None else path
    line_cap = MAX_LINES if max_lines is None else max_lines

    # NOTE(review): a header line without "TaxID=" does not match and is
    # therefore appended to the current sequence like any other line.
    header_pattern = re.compile(">UniRef100_([A-Z0-9]+).*TaxID=([0-9]+).*")

    current_id = ""
    tax_id = ""
    # Sequence chunks are collected in a list and joined once per record.
    sequence_parts = []

    with open(source_path, 'r') as input_file:
        for line_number, line in enumerate(input_file):
            # line_number is 0-based, so ">=" reads exactly line_cap lines.
            if line_number >= line_cap:
                break

            match = header_pattern.search(line)
            if match is None:
                sequence_parts.append(line.strip())
                continue

            # A new header closes the previous record, if there was one.
            if current_id != "":
                sequence = "".join(sequence_parts)
                print(current_id, '\t', len(sequence), '\t', tax_id, '\t', sequence)
            current_id, tax_id = match.groups()
            sequence_parts = []
        else:
            # End of file reached without hitting the cap: flush the final
            # record, which has no following header to trigger its output.
            if current_id != "":
                sequence = "".join(sequence_parts)
                print(current_id, '\t', len(sequence), '\t', tax_id, '\t', sequence)
# Print the per-record summary for the FASTA file currently bound to `file`.
parse_fasta_2()

In [None]:
# Report the counts followed by the complete id lists (output can be very long).
print(len(matched),'Matched proteins:\n', matched)
print(len(not_matched), 'Unmatched proteins:\n',not_matched)

#### Modified parse_masked_regions from Daniel

In [3]:
from Bio import SeqIO
import re

# Pattern that extracts the protein id (group 1) from a record name.
# re_string = "\|(.+)\|" # original version: id between two '|' characters
re_string = "UniRef100_([A-Z0-9]+)" # modified for UniRef100

# NOTE: `input` shadows Python's builtin input(); both names also rebind the
# `input` / `output` paths set by the Pfam extraction cell.
input = "/Users/patrick/dev/ucl/comp0158_mscproject/data/uniref100_10M.fasta"
output = "/Users/patrick/dev/ucl/comp0158_mscproject/data/masked_regions.dat"

def parse_file(path, dom_type, output_path=None, region_pattern=r'\.{3,}'):
    """Write one InterPro-style row per masked region found in a FASTA file.

    For every record, each non-overlapping match of ``region_pattern`` in the
    sequence produces a tab-separated row:
    ``<id>  IPRXXXXXX  <description>  <dom_type>  <start>  <end>``.
    ``<id>`` is group 1 of the module-level ``re_string`` applied to the record
    name; ``<description>`` is the record description with the leading name
    and all commas removed.

    The default pattern finds runs of three or more literal ``.`` characters
    (presumably how masked residues are written in the input; confirm against
    the masking tool). An unescaped ``.{3,}`` matches any characters, so it
    would report each whole sequence as one region; pass
    ``region_pattern=r'.{3,}'`` only if that is really wanted.

    Args:
        path: FASTA file to read with ``SeqIO.parse``.
        dom_type: Label written in the fourth column of every row.
        output_path: File to (over)write. Defaults to the module-level
            ``output``.
        region_pattern: Regular expression that marks a region in the
            sequence string.

    Returns:
        None.

    Raises:
        ValueError: If a record name does not match ``re_string``.
    """
    target_path = output if output_path is None else output_path
    id_pattern = re.compile(re_string)
    region_re = re.compile(region_pattern)

    # `with` guarantees the output file is closed even if parsing fails.
    with open(target_path, "w") as output_file:
        for record in SeqIO.parse(path, "fasta"):
            # record.name        = UniRef100_Q6GZX3
            # record.description = UniRef100_Q6GZX3 Putative transc....
            # Drop the name from the description, then drop commas.
            raw_desc = record.description.replace(record.name + " ", "").replace(",", "")

            result = id_pattern.search(record.name)
            if result is None:
                raise ValueError(
                    "record name %r does not match id pattern %r" % (record.name, re_string)
                )
            uniprot_id = result.group(1)

            for m in region_re.finditer(str(record.seq)):
                # m.start() is 0-based, so +1 gives the 1-based start.
                # NOTE(review): m.end() is already one past the last matched
                # character, so m.end()+1 is two past it in 0-based terms.
                # Kept as in the original; confirm the intended end convention.
                output_file.write(uniprot_id + "\tIPRXXXXXX\t" + raw_desc + "\t" + dom_type + "\t" +
                                  str(m.start() + 1) + "\t" + str(m.end() + 1) + '\n')

# Write the rows for the FASTA file bound to `input`, labelled "LowComplexity".
parse_file(input, "LowComplexity")
#parse_file(file, "CoiledCoil")
