In [1]:
# Dependencies
import modules.blast as blast
import modules.uniprot as uniprot
import modules.msa as msa
import pandas as pd
import numpy as np
import re
import time

In [2]:
# Constants
# Paths (relative to the notebook) of every file written by the cells below
BLAST_MATCHES_PATH = 'data/blast.csv'  # Blast matches dataset
BLAST_FASTA_PATH = 'data/blast.fasta'  # Blast matched proteins fasta file
MSA_FASTA_PATH = 'data/msa.fasta'  # Multiple sequence alignment file
HUMAN_CSV_PATH = 'data/human.csv'  # Test set csv
HUMAN_IN_PFAM_CSV_PATH = 'data/human_in_pfam.csv'  # Human proteins flagged as members of the reference Pfam family
HUMAN_NO_PFAM_CSV_PATH = 'data/human_no_pfam.csv'  # Human proteins not flagged as members of the reference Pfam family
HUMAN_FASTA_PATH = 'data/human.fasta'  # Test set fasta
GO_CSV_PATH = 'data/go.csv'  # GO terms dataset
FAMILIES_CSV_PATH = 'data/families.csv'  # Pfam families dataset

## BLAST search

Execute a BLAST search using the EBI RESTful web service.

In [3]:
# Start a BLAST job
# Submits the query sequence; `job_id` is what the next cell polls for results.
status, job_id, _ = blast.run_job(email='damiano.clementel@studenti.unipd.it',
                              sequence='VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP',
                              params={'database': 'uniref90',
                                      'matrix': 'BLOSUM62',
                                      'alignments': 1000, 
                                      'gapalign': True, 
                                      'exp': '1e-3'})

# Stop here if the submission failed: without a valid job id the polling
# cell below has nothing to wait for (a falsy `status` is treated as an
# error everywhere else in this notebook).
if not status:
    raise RuntimeError('BLAST job submission failed: {!r}'.format(job_id))

# Check output
print('Started job: {:s}'.format(job_id))

Started job: ncbiblast-R20200207-143952-0648-1220177-p1m


In [4]:
# Define results container (stays None if the job fails or cannot be retrieved)
job_result = None
# To reuse an earlier job instead of the one just submitted, set its id here:
# job_id = 'ncbiblast-R20200206-172705-0918-98731783-p1m'

# Job states after which polling is pointless.
# NOTE(review): assumes blast.get_job_status returns the raw EBI status string,
# as the RUNNING / FINISHED values logged below suggest - confirm.
FAILED_JOB_STATES = ('ERROR', 'FAILURE', 'NOT_FOUND')

# Retrieve BLAST job results
while True:
    # Make request for job status
    status, job_status, _ = blast.get_job_status(job_id)
    # Check error before logging: on a failed request `job_status` may not
    # hold a printable status string
    if not status:
        print('Error: could not retrieve job status')  # LOG
        break
    print('Job status: {:s}'.format(job_status))  # LOG
    # Check if job has finished running
    if job_status == 'FINISHED':
        # Retrieve results
        status, job_result, _ = blast.get_job_result(job_id)
        print('Job exited with status: {:s}'.format(job_status))  # LOG
        # Do not hand a failed download over to the next cells
        if not status:
            print('Error: could not retrieve job result')  # LOG
            job_result = None
        break  # Exit loop
    # Stop polling when the job can never reach FINISHED
    if job_status in FAILED_JOB_STATES:
        print('Job exited with status: {:s}'.format(job_status))  # LOG
        break
    # Wait 10 seconds before making another call
    time.sleep(10)

Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: FINISHED
Job exited with status: FINISHED


In [5]:
# Wrap the BLAST job result in a pandas DataFrame and persist it as csv
matches = pd.DataFrame(data=job_result)
matches.to_csv(
    BLAST_MATCHES_PATH,
    index=False,  # Row index is not part of the dataset
)

# Preview the first rows
matches.head()

Unnamed: 0,score,bits,expectation,identity,positives,gaps,strand,pattern_seq,match_seq,match_start,match_end,database,id,ac,description
0,182,74.7146,1e-15,100.0,100.0,0,none/none,VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP,VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP,11,41,UR90,UniRef90_P43582,WW,WW domain-containing protein WWM1 n=9 Tax=Sacc...
1,175,72.0182,1.1e-14,93.5,100.0,0,none/none,VPSGWKAVFDDEYQTW+YVDLSTN+SQWEPP,VPSGWKAVFDDEYQTWFYVDLSTNNSQWEPP,11,41,UR90,UniRef90_J8Q8J2,Wwm1p,Wwm1p n=1 Tax=Saccharomyces arboricola (strain...
2,173,71.2478,1.8e-14,93.5,100.0,0,none/none,VPSGWKAVFDDEYQTW+YV+LSTNSSQWEPP,VPSGWKAVFDDEYQTWFYVNLSTNSSQWEPP,11,41,UR90,UniRef90_J5RH20,WWM1-like,WWM1-like protein n=2 Tax=Saccharomyces TaxID=...
3,170,70.0922,4.7e-14,93.5,96.8,0,none/none,VPSGWKAVFDDEYQTW+YVDLSTNSSQWE P,VPSGWKAVFDDEYQTWFYVDLSTNSSQWEAP,11,41,UR90,UniRef90_A0A0L8RJY2,WWM1-like,WWM1-like protein n=1 Tax=Saccharomyces eubaya...
4,158,65.4698,2.8e-12,80.6,93.5,0,none/none,VP GWKAVFDDEY+TW+YV+L+TN SQWEPP,VPKGWKAVFDDEYKTWFYVNLATNQSQWEPP,11,41,UR90,UniRef90_A0A212M9M4,WW,WW domain-containing protein n=4 Tax=Zygosacch...


## Create FASTA file from BLAST

In [6]:
# Save full fasta file

# Get fasta file first: opening the output file truncates it, so it must not
# happen before the download is known to have succeeded
status, blast_fasta, _ = uniprot.get_proteins(matches.id.tolist(), batch_size=100)
if not status:
    raise RuntimeError('Could not retrieve fasta entries for BLAST matches')

# Write fasta file to disk
with open(BLAST_FASTA_PATH, 'w') as blast_fasta_file:
    blast_fasta_file.write(blast_fasta)

In [7]:
# Check number of retrieved fasta entries
# An entry is a line starting with '>'; counting every '>' character would
# also count the ones appearing inside a header description.
blast_fasta_len = sum(line.startswith('>') for line in blast_fasta.split('\n'))  # Number of entries in fasta file
matches_len = matches.shape[0]  # Number of rows in dataframe

assert  blast_fasta_len == matches_len, 'Fasta file and dataset lengths do not coincide'

## Execute multiple sequence alignment

In [8]:
# Make multiple sequence alignment

# Create MSA job
status, job_id, _ = msa.run_clustalo(email='damiano.clementel@studenti.unipd.it', sequence=blast_fasta)

# Job states after which polling is pointless.
# NOTE(review): assumes msa.get_job_status returns the raw EBI status string,
# as the RUNNING / FINISHED values logged below suggest - confirm.
msa_failed_states = ('ERROR', 'FAILURE', 'NOT_FOUND')
# Set only when the job reaches FINISHED; drives the result download below
msa_finished = False

# Wait until MSA job finishes
while True:
    # Check response status
    status, job_status, _ = msa.get_job_status(job_id, algorithm=msa.CLUSTALO)
    # A falsy status means the request itself failed
    if not status:
        print('Error: job exited')
        break
    # Check if job has finished, then go on
    print('Job status: {:s}'.format(job_status))
    if job_status == 'FINISHED':
        msa_finished = True
        break
    # Stop polling when the job can never reach FINISHED
    if job_status in msa_failed_states:
        print('Error: job exited')
        break
    # Add delay
    time.sleep(3)
print()

# Get results
if msa_finished:
    # Retrieve result as fasta
    status, msa_fasta, _ = msa.get_job_result(job_id, algorithm=msa.CLUSTALO)
    if status:
        # Save msa to file (opened only after a successful download, so a
        # failed request cannot truncate a previous alignment)
        with open(MSA_FASTA_PATH, 'w') as msa_fasta_file:
            msa_fasta_file.write(msa_fasta)
        # Show fasta file head
        print('Retrieved fasta file')
        print(msa_fasta[:2000])
    else:
        print('Error: could not retrieve job result')

Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: FINISHED

Retrieved fasta file
>UniRef90_P43582 WW domain-containing protein WWM1 n=9 Tax=Saccharomyces TaxID=4930 RepID=WWM1_YEAST
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
-----------------------------------------------

In [9]:
# Check length of retrieved fasta
# Count the entries (lines starting with '>') of the alignment itself and
# compare them with the number of BLAST matches computed in the earlier check.
msa_fasta_len = sum(line.startswith('>') for line in msa_fasta.split('\n'))  # Number of entries in MSA fasta file

assert msa_fasta_len == matches_len, 'Alignments fasta file and dataset lengths do not coincide'

## Create a dataset of human proteins

In [10]:
# Retrieve dataset of human proteins

# Columns requested from UniProt, joined into one comma-separated parameter
human_columns = ['id', 'entry name', 'reviewed', 'protein names', 'genes', 'length', 'go', 'database(Pfam)']
human_table_params = {
    'compress': 'no',
    'columns': ','.join(human_columns),
    'sort': 'score',
    'format': 'tab',
}

# Run the query; the response body is kept in `table`
status, table, _ = uniprot.make_query(
    'reviewed:yes AND organism:"Homo sapiens (Human) [9606]"',
    params=human_table_params,
)

In [26]:
# Create DataFrame object

# Get rows and header row
# Blank lines are dropped instead of slicing off the last element, so the
# final record is kept even when the response has no trailing newline
rows = [row for row in table.split('\n') if row]
if not rows:
    raise ValueError('Human proteins table is empty')
header = rows[0].split('\t')

# Instantiate new dataframe (every cell is kept as a string)
human_ds = pd.DataFrame([row.split('\t') for row in rows[1:]], columns=header)
human_ds.to_csv(HUMAN_CSV_PATH, index=False)
human_ds.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length,Gene ontology (GO),Cross-reference (Pfam)
0,Q9Y263,PLAP_HUMAN,reviewed,Phospholipase A-2-activating protein (PLA2P) (...,PLAA PLAP,795,cell [GO:0005623]; cell junction [GO:0030054];...,PF09070;PF08324;PF00400;
1,Q96RE7,NACC1_HUMAN,reviewed,Nucleus accumbens-associated protein 1 (NAC-1)...,NACC1 BTBD14B NAC1,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,PF10523;PF00651;
2,O43312,MTSS1_HUMAN,reviewed,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,MTSS1 KIAA0429 MIM,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,PF08397;PF02205;
3,Q9NP80,PLPL8_HUMAN,reviewed,Calcium-independent phospholipase A2-gamma (EC...,PNPLA8 IPLA22 IPLA2G BM-043,782,endoplasmic reticulum membrane [GO:0005789]; G...,PF01734;
4,Q15319,PO4F3_HUMAN,reviewed,"POU domain, class 4, transcription factor 3 (B...",POU4F3 BRN3C,338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,PF00046;PF00157;


In [12]:
# Add boolean column: is given protein in given PF00397 family?

# Define reference pfam
pfam = 'PF00397'
# Define matching pattern: the accession must be a whole ';'-terminated item,
# i.e. preceded by the start of the string or by another ';'. The accession is
# escaped so it is always matched literally.
pattern = r'(?:^|;)\s*{:s}\s*;'.format(re.escape(pfam))

# Get families column
families = human_ds['Cross-reference (Pfam)'] 
# Add "is in family?" column
human_ds[pfam] = families.apply(lambda x: re.search(pattern, str(x)) is not None)
human_ds.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length,Gene ontology (GO),Cross-reference (Pfam),PF00397
0,Q9Y263,PLAP_HUMAN,reviewed,Phospholipase A-2-activating protein (PLA2P) (...,PLAA PLAP,795,cell [GO:0005623]; cell junction [GO:0030054];...,PF09070;PF08324;PF00400;,False
1,Q96RE7,NACC1_HUMAN,reviewed,Nucleus accumbens-associated protein 1 (NAC-1)...,NACC1 BTBD14B NAC1,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,PF10523;PF00651;,False
2,O43312,MTSS1_HUMAN,reviewed,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,MTSS1 KIAA0429 MIM,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,PF08397;PF02205;,False
3,Q9NP80,PLPL8_HUMAN,reviewed,Calcium-independent phospholipase A2-gamma (EC...,PNPLA8 IPLA22 IPLA2G BM-043,782,endoplasmic reticulum membrane [GO:0005789]; G...,PF01734;,False
4,Q15319,PO4F3_HUMAN,reviewed,"POU domain, class 4, transcription factor 3 (B...",POU4F3 BRN3C,338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,PF00046;PF00157;,False


In [13]:
# Create gene ontology dataset

# Pattern matching one bracketed GO identifier, e.g. "[GO:0005623]"
go_id_pattern = r'\[(GO:.+?)\]'

# One (entry, GO id, GO description) record per GO term of each protein
go_records = []

# Iterate through each human dataset row
for entry, go_str in zip(human_ds['Entry'], human_ds['Gene ontology (GO)']):
    entry, go_str = str(entry), str(go_str)
    # Terms are separated by '; '. Id and description are taken from the same
    # term, so a term without an id cannot shift the pairing of later terms.
    for go_term in re.split('; ', go_str):
        go_descr = re.sub(go_id_pattern, '', go_term).strip()
        for go_id in re.findall(go_id_pattern, go_term):
            # Store the id without its 'GO:' prefix
            go_records.append((entry, re.sub('GO:', '', go_id), go_descr))

# Turn gene ontology records into Pandas DataFrame object
go_ds = pd.DataFrame(go_records, columns=['Entry', 'GO id', 'GO descr'])
go_ds.to_csv(GO_CSV_PATH, index=False)
go_ds.head(5)

Unnamed: 0,Entry,GO id,GO descr
0,Q9Y263,5623,cell
1,Q9Y263,30054,cell junction
2,Q9Y263,5737,cytoplasm
3,Q9Y263,70062,extracellular exosome
4,Q9Y263,5634,nucleus


In [14]:
# Create protein family dataset

# Parallel columns of the long-format table: one item per (protein, family)
family_entries, family_ids = [], []

# Walk the human dataset one protein at a time
for idx, protein in human_ds.iterrows():
    accession = protein['Entry']
    # Pfam cross-references are ';'-separated; empty items are discarded
    pfam_field = str(protein['Cross-reference (Pfam)'])
    for pfam_id in pfam_field.split(';'):
        if pfam_id == '':
            continue
        family_entries.append(accession)
        family_ids.append(pfam_id)

# Assemble the Pandas DataFrame object
families_ds = pd.DataFrame({'Entry': family_entries, 'Pfam': family_ids})
families_ds.to_csv(FAMILIES_CSV_PATH, index=False)  # Save to disk
families_ds.head()

Unnamed: 0,Entry,Pfam
0,Q9Y263,PF09070
1,Q9Y263,PF08324
2,Q9Y263,PF00400
3,Q96RE7,PF10523
4,Q96RE7,PF00651


In [22]:
# Keep only the rows whose PF00397 flag is True, then save them as csv
in_family_mask = human_ds['PF00397']
human_in_pfam_ds = human_ds.loc[in_family_mask]
human_in_pfam_ds.to_csv(HUMAN_IN_PFAM_CSV_PATH, index=False)
human_in_pfam_ds.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length,Gene ontology (GO),Cross-reference (Pfam),PF00397
428,Q9BTA9,WAC_HUMAN,reviewed,WW domain-containing adapter protein with coil...,WAC KIAA1844,647,nuclear speck [GO:0016607]; nucleoplasm [GO:00...,PF00397;,True
871,Q9NZC7,WWOX_HUMAN,reviewed,WW domain-containing oxidoreductase (EC 1.1.1....,WWOX FOR SDR41C1 WOX1,414,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,PF00106;PF00397;,True
966,Q9GZV5,WWTR1_HUMAN,reviewed,WW domain-containing transcription regulator p...,WWTR1 TAZ,400,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,PF00397;,True
2246,Q8N3X1,FNBP4_HUMAN,reviewed,Formin-binding protein 4 (Formin-binding prote...,FNBP4 FBP30 KIAA1014,1017,nuclear speck [GO:0016607],PF00397;,True
2780,O15428,PINL_HUMAN,reviewed,Putative PIN1-like protein (Peptidylprolyl cis...,PIN1P1 PIN1L,100,cytosol [GO:0005829]; nucleus [GO:0005634]; pe...,PF00397;,True


In [16]:
# Keep only the rows whose PF00397 flag is False, then save them as csv
out_of_family_mask = ~human_ds['PF00397']
human_no_pfam_ds = human_ds.loc[out_of_family_mask]
human_no_pfam_ds.to_csv(HUMAN_NO_PFAM_CSV_PATH, index=False)
human_no_pfam_ds.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length,Gene ontology (GO),Cross-reference (Pfam),PF00397
0,Q9Y263,PLAP_HUMAN,reviewed,Phospholipase A-2-activating protein (PLA2P) (...,PLAA PLAP,795,cell [GO:0005623]; cell junction [GO:0030054];...,PF09070;PF08324;PF00400;,False
1,Q96RE7,NACC1_HUMAN,reviewed,Nucleus accumbens-associated protein 1 (NAC-1)...,NACC1 BTBD14B NAC1,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,PF10523;PF00651;,False
2,O43312,MTSS1_HUMAN,reviewed,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,MTSS1 KIAA0429 MIM,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,PF08397;PF02205;,False
3,Q9NP80,PLPL8_HUMAN,reviewed,Calcium-independent phospholipase A2-gamma (EC...,PNPLA8 IPLA22 IPLA2G BM-043,782,endoplasmic reticulum membrane [GO:0005789]; G...,PF01734;,False
4,Q15319,PO4F3_HUMAN,reviewed,"POU domain, class 4, transcription factor 3 (B...",POU4F3 BRN3C,338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,PF00046;PF00157;,False
...,...,...,...,...,...,...,...,...,...
20363,Q9NXW2,DJB12_HUMAN,reviewed,DnaJ homolog subfamily B member 12,DNAJB12,375,endoplasmic reticulum [GO:0005783]; endoplasmi...,PF00226;PF09320;,False
20364,O95395,GCNT3_HUMAN,reviewed,"Beta-1,3-galactosyl-O-glycosyl-glycoprotein be...",GCNT3,438,extracellular exosome [GO:0070062]; Golgi memb...,PF02485;,False
20365,Q9Y238,DLEC1_HUMAN,reviewed,Deleted in lung and esophageal cancer protein ...,DLEC1 DLC1,1755,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,,False
20366,Q8TDM6,DLG5_HUMAN,reviewed,Disks large homolog 5 (Discs large protein P-d...,DLG5 KIAA0583 PDLG,1919,cell junction [GO:0030054]; cell-cell adherens...,PF00625;PF00595;PF17820;PF04822;,False


In [17]:
# Download fasta file
# Same query as the human proteins table, requested in fasta format
human_fasta_params = {
    'compress': 'no',
    'sort': 'score',
    'format': 'fasta',
}
status, human_fasta, _ = uniprot.make_query(
    'reviewed:yes AND organism:"Homo sapiens (Human) [9606]"',
    params=human_fasta_params,
)

In [18]:
# Show the first 250 characters of the retrieved fasta file
human_fasta_preview = human_fasta[:250]
print(human_fasta_preview)

>sp|Q9Y263|PLAP_HUMAN Phospholipase A-2-activating protein OS=Homo sapiens OX=9606 GN=PLAA PE=1 SV=2
MTSGATRYRLSCSLRGHELDVRGLVCCAYPPGAFVSVSRDRTTRLWAPDSPNRSFTEMHC
MSGHSNFVSCVCIIPSSDIYPHGLIATGGNDHNICIFSLDSPMPLYILKGHKNTVCSLSS
GKFGTLLSGSWDTTAKVWLNDKCMMTL


In [19]:
# Write the human fasta text to disk, replacing any previous file
with open(HUMAN_FASTA_PATH, mode='w') as fasta_out:
    fasta_out.write(human_fasta)