In [8]:
import argparse
from Bio import SeqIO
import re
import csv

# Input file locations. These are plain path strings, not open file handles;
# the parsing functions open them on demand.
# NOTE(review): the directory is an absolute path on one machine - change
# DATA_DIR here when running elsewhere.
DATA_DIR = "/Users/patrick/dev/ucl/comp0158_mscproject/data"
protein_fasta_file = DATA_DIR + "/uniref100_10M.fasta"
interpro_file = DATA_DIR + "/protein2ipr.dat"

### File Examples

#### Uniref100 Fasta
>\>UniRef100_Q197F3 Uncharacterized protein 007R n=1 Tax=Invertebrate iridescent virus 3 TaxID=345201 RepID=007R_IIV3
MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV
YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL


#### Protein2ipr.dat
>A0A000	IPR004839	Aminotransferase, class I/classII	PF00155	41	381<br>
A0A000	IPR010961	Tetrapyrrole biosynthesis, 5-aminolevulinic acid synthase	TIGR01821	12	391<br>
A0A000	IPR015421	Pyridoxal phosphate-dependent transferase, major domain	G3DSA:3.40.640.10	48	288<br>
A0A000	IPR015422	Pyridoxal phosphate-dependent transferase, small domain	G3DSA:3.90.1150.10	36	378<br>
A0A000	IPR015424	Pyridoxal phosphate-dependent transferase	SSF53383	9	389<br>
A0A000	IPR050087	8-amino-7-oxononanoate synthase class-II	PTHR13693	34	382<br>
A0A001	IPR003439	ABC transporter-like, ATP-binding domain	PF00005	361	503<br>

#### Explanation
An accession starting with `A0A0` means the entry is in UniProt but has not yet been verified.

### Data Preparation - Unix

1. Extract pfam entries from protein2ipr.dat

awk '{FS="\t"}{print $1, " ", $2, " ", $4}' /Users/patrick/dev/ucl/comp0158_mscproject/data/protein2ipr.dat | awk '$3 ~ /PF[0-9]./ {print $0}' > protein2ipr_pfam2.dat

2. Disorder regions
- Use parse_match_complete.py
- This parses /Volumes/My Passport/downloads/match_complete.xml, which is 500GB, and searches for MobiDB entries
- This script took about 6 hours to run

- BUT -

- MobiDB does not appear as a dbname, thus use the below to find possible values:


grep "dbname=" /Volumes/My\ Passport/downloads/match_complete.xml | awk '{FS="dbname="}{print $2}' | awk '{print $1}' | sort |uniq -c

NOTE THAT THIS GREP TOOK ABOUT 8 HRS TO RUN ON MY LAPTOP:

Output:
1 "ANTIFAM"
254194862 "CATHGENE3D"
100323516 "CDD"
25444276 "HAMAP"
   1 "INTERPRO"
51495327 "NCBIFAM"
167725277 "PANTHER"
278089257 "PFAM"
20659779 "PIRSF"
38902426 "PRINTS"
94353952 "PROFILE"
49021231 "PROSITE"
4722014 "SFLD"
69964999 "SMART"
209693119 "SSF"
   1 dbname="SFLD"

In [141]:
#
# Protein Word
#
class ProteinWord:
    """A single word of a protein sentence.

    The four constructor arguments are stored, unmodified, as attributes of
    the same name: ``type``, ``text``, ``start`` and ``end``.
    """

    def __init__(self, type, text, start, end):
        # `type` hides the builtin inside this method only; the parameter
        # name is kept because it is part of the constructor's signature.
        self.type, self.text = type, text
        self.start, self.end = start, end

#
# Protein Sentence
#
class ProteinSentence:
    """The words recorded for one protein, plus their comma-joined text.

    Attributes:
        uniprot_id: the id the sentence was created for.
        words: the word objects, in the order they were added.
        text: the ``text`` of every word so far, separated by ','.
    """

    def __init__(self, uniprot_id, word):
        """Start a sentence for ``uniprot_id`` that holds only ``word``."""
        self.uniprot_id = uniprot_id
        self.words = [word]
        self.text = word.text

    def add_word(self, word):
        """Append ``word`` and extend ``text`` with a ',' and the word's text."""
        self.words.append(word)
        self.text = ','.join((self.text, word.text))

    def pretty_print(self):
        """Print the protein id, then the text of each word on its own line."""
        print('Protein id:', self.uniprot_id)
        for member in self.words:
            print(member.text)
        
        

In [149]:
# Cap on how many input lines a parser reads before it stops.
MAX_LINES = 10 ** 7

# Maps a protein id to the sentence built for that sequence.
sentences = dict()

def parse_interpro(interpro_path=None, sentence_store=None, max_lines=None):
    """Collect the Pfam hits of each protein from a protein2ipr-style file.

    Every line of the form
    ``<id> TAB IPRnnn TAB <description> TAB PFnnn TAB <start> TAB <end>``
    becomes a ``ProteinWord('pfam', ...)``. Words are grouped per protein id
    into ``ProteinSentence`` objects. Lines that do not match are ignored.

    Args:
        interpro_path: file to read; defaults to the module-level
            ``interpro_file``.
        sentence_store: dict (protein id -> ProteinSentence) that is updated
            in place; defaults to the module-level ``sentences``.
        max_lines: maximum number of lines to read; defaults to the
            module-level ``MAX_LINES``.

    Returns:
        None. Results are written into ``sentence_store``.
    """
    # Resolve defaults at call time so a bare parse_interpro() still uses the
    # notebook's globals.
    if interpro_path is None:
        interpro_path = interpro_file
    if sentence_store is None:
        sentence_store = sentences
    if max_lines is None:
        max_lines = MAX_LINES

    # The raw InterPro file is tab delimited. The tab in front of the PF
    # accession pins it to the start of its column, so an accession that
    # merely *ends* in "PF<digits>" cannot be mistaken for a Pfam hit.
    pfam_row = re.compile(
        r"^([a-zA-Z0-9]+)\tIPR[0-9]+\t.*\t(PF[0-9]+)\t([0-9]+)\t([0-9]+)"
    )

    with open(interpro_path, 'r') as input_file:
        for line_number, line in enumerate(input_file):
            # line_number starts at 0, so ">=" reads exactly max_lines lines
            # (a plain ">" would read one line too many).
            if line_number >= max_lines:
                break

            match = pfam_row.search(line)
            if match is None:
                continue

            # start/end stay as the matched strings; they are not converted
            # to int here.
            protein_id, pfam_word, start, end = match.groups()
            word = ProteinWord('pfam', pfam_word, start, end)

            # Extend the existing sentence, or start a new one for this id.
            if protein_id in sentence_store:
                sentence_store[protein_id].add_word(word)
            else:
                sentence_store[protein_id] = ProteinSentence(protein_id, word)
# Run the InterPro parse; this fills the module-level `sentences` dict.
parse_interpro()


In [150]:
# Sanity check: how many proteins have a sentence, and what one looks like.
print(len(sentences))
print(sentences['A0A002'].text)

1386244
PF00005,PF00664


In [154]:
# Cap on how many FASTA lines are scanned.
MAX_LINES = 10 ** 7

def parse_fasta(fasta_path=None, sentence_lookup=None, max_lines=None):
    """Print the stored sentence of each UniRef100 entry found in a FASTA file.

    Only header lines (those starting with '>') are inspected. The id is the
    text between ``UniRef100_`` and the following space. For each id that is
    a key of ``sentence_lookup``, one line is printed:
    ``Found sentence for protein : <id> <sentence text>``.

    Args:
        fasta_path: file to read; defaults to the module-level
            ``protein_fasta_file``.
        sentence_lookup: mapping of protein id to an object with a ``text``
            attribute; defaults to the module-level ``sentences``.
        max_lines: maximum number of lines to read (headers and sequence
            lines both count); defaults to the module-level ``MAX_LINES``.

    Returns:
        None. Matches are reported on stdout only.
    """
    # Resolve defaults at call time so a bare parse_fasta() still uses the
    # notebook's globals.
    if fasta_path is None:
        fasta_path = protein_fasta_file
    if sentence_lookup is None:
        sentence_lookup = sentences
    if max_lines is None:
        max_lines = MAX_LINES

    header_id = re.compile("UniRef100_([A-Z0-9]+) ")

    with open(fasta_path, 'r') as input_file:
        for line_number, line in enumerate(input_file):
            # line_number starts at 0, so ">=" reads exactly max_lines lines
            # (a plain ">" would read one line too many).
            if line_number >= max_lines:
                break

            # Sequence lines carry no id; skip them without running the regex.
            if not line.startswith('>'):
                continue

            match = header_id.search(line)
            if match is None:
                continue

            protein_id = match.group(1)
            if protein_id in sentence_lookup:
                print('Found sentence for protein :', protein_id, sentence_lookup[protein_id].text)
# Scan the FASTA file and print every protein that already has a sentence.
parse_fasta()


Found sentence for protein : A0A072VHJ1 PF00909
Found sentence for protein : A0A068Q6B2 PF03906
Found sentence for protein : A0A067XH53 PF01474
Found sentence for protein : A0A067XGX8 PF01474
Found sentence for protein : A0A072U307 PF02298
Found sentence for protein : A0A044RE18 PF00082,PF01483,PF16470
Found sentence for protein : A0A068ACU9 PF12146
Found sentence for protein : A0A068A9T2 PF00067
Found sentence for protein : A0A068AA98 PF00067
Found sentence for protein : A0A068AA78 PF00067
Found sentence for protein : A0A068ACU3 PF00067
Found sentence for protein : A0A068ABB7 PF08240,PF08659,PF00109,PF02801,PF00698,PF16197,PF14765,PF21089
Found sentence for protein : A0A061FMF5 PF03492
Found sentence for protein : A0A061FKL9 PF03492
Found sentence for protein : A0A061FTC2 PF03492
Found sentence for protein : A0A061FLA2 PF03492
Found sentence for protein : A0A061FKM4 PF03492
Found sentence for protein : A0A068Q721 PF00067
Found sentence for protein : A0A068Q609 PF00067
Found sentence f