In [1]:
# Dependencies
import modules.blast as blast
import modules.uniprot as uniprot
import modules.msa as msa
import pandas as pd
import time

In [2]:
# Paths of the files written by this notebook

# BLAST matches dataset (csv)
BLAST_MATCHES_PATH = 'data/blast.csv'
# Sequences of the BLAST matched proteins (fasta)
BLAST_FASTA_PATH = 'data/blast.fasta'
# Multiple sequence alignment (fasta)
MSA_FASTA_PATH = 'data/msa.fasta'
# Human proteins test set (csv)
HUMAN_CSV_PATH = 'data/human.csv'
# Human proteins test set (fasta)
HUMAN_FASTA_PATH = 'data/human.fasta'

## BLAST search

Execute a BLAST search using the EBI RESTful web service.

In [3]:
# Submit the query sequence as a new BLAST job
status, job_id, _ = blast.run_job(
    email='damiano.clementel@studenti.unipd.it',
    sequence='VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP',
    params=dict(
        database='uniref90',
        matrix='BLOSUM62',
        alignments=1000,
        gapalign=True,
        exp='1e-3',
    ),
)

# Log the id of the job that has just been started
print('Started job: {:s}'.format(job_id))

Started job: ncbiblast-R20200206-223016-0184-83717379-p2m


In [4]:
# Define results container (stays None if the job does not finish)
job_result = None

# Job states after which polling is pointless, since the job will never finish.
# NOTE(review): names taken from the EBI job dispatcher REST documentation;
# confirm that blast.get_job_status returns them unchanged.
failed_states = ('ERROR', 'FAILURE', 'NOT_FOUND')

# Retrieve BLAST job results
while True:
    # Make request for job status
    status, job_status, _ = blast.get_job_status(job_id)
    # Check error before logging: job_status is not guaranteed to be
    # a string when the request fails (TODO confirm)
    if not status:
        print('Error: could not retrieve job status')
        break
    print('Job status: {:s}'.format(job_status))  # LOG
    # Check if job has finished running
    if job_status == 'FINISHED':
        # Retrieve results
        status, job_result, _ = blast.get_job_result(job_id)
        print('Job exited with status: {:s}'.format(job_status))  # LOG
        break  # Exit loop
    # Stop polling when the job ended without finishing, otherwise
    # the loop would never terminate
    if job_status in failed_states:
        print('Error: job exited')
        break
    # Wait 10 seconds before making another call
    time.sleep(10)

Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: FINISHED
Job exited with status: FINISHED


In [5]:
# Build the matches table out of the job result and store it as csv
matches = pd.DataFrame(data=job_result)
matches.to_csv(path_or_buf=BLAST_MATCHES_PATH, index=False)

# Preview the first rows of the table
matches.head(n=5)

Unnamed: 0,score,bits,expectation,identity,positives,gaps,strand,pattern_seq,match_seq,match_start,match_end,database,id,ac,description
0,182,74.7146,1e-15,100.0,100.0,0,none/none,VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP,VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP,11,41,UR90,UniRef90_P43582,WW,WW domain-containing protein WWM1 n=9 Tax=Sacc...
1,175,72.0182,1.1e-14,93.5,100.0,0,none/none,VPSGWKAVFDDEYQTW+YVDLSTN+SQWEPP,VPSGWKAVFDDEYQTWFYVDLSTNNSQWEPP,11,41,UR90,UniRef90_J8Q8J2,Wwm1p,Wwm1p n=1 Tax=Saccharomyces arboricola (strain...
2,173,71.2478,1.8e-14,93.5,100.0,0,none/none,VPSGWKAVFDDEYQTW+YV+LSTNSSQWEPP,VPSGWKAVFDDEYQTWFYVNLSTNSSQWEPP,11,41,UR90,UniRef90_J5RH20,WWM1-like,WWM1-like protein n=2 Tax=Saccharomyces TaxID=...
3,170,70.0922,4.7e-14,93.5,96.8,0,none/none,VPSGWKAVFDDEYQTW+YVDLSTNSSQWE P,VPSGWKAVFDDEYQTWFYVDLSTNSSQWEAP,11,41,UR90,UniRef90_A0A0L8RJY2,WWM1-like,WWM1-like protein n=1 Tax=Saccharomyces eubaya...
4,158,65.4698,2.8e-12,80.6,93.5,0,none/none,VP GWKAVFDDEY+TW+YV+L+TN SQWEPP,VPKGWKAVFDDEYKTWFYVNLATNQSQWEPP,11,41,UR90,UniRef90_A0A212M9M4,WW,WW domain-containing protein n=4 Tax=Zygosacch...


## Create FASTA file from BLAST

In [7]:
# Get fasta file *before* opening the output file, so that a failed
# request does not leave an empty (truncated) file on disk
status, blast_fasta, _ = uniprot.get_proteins(matches.id.tolist(), batch_size=100)

# Save full fasta file
# NOTE(review): assumes a falsy status marks a failed request, as in the
# job polling cells - confirm for uniprot.get_proteins
if status:
    # Write fasta file to disk
    with open(BLAST_FASTA_PATH, 'w') as blast_fasta_file:
        blast_fasta_file.write(blast_fasta)
else:
    print('Error: could not retrieve fasta file')

In [8]:
# Check number of retrieved fasta entries

# Count header lines (the ones starting with '>') instead of '>' characters:
# a '>' showing up inside an entry description must not be counted as an entry
blast_fasta_len = sum(line.startswith('>') for line in blast_fasta.splitlines())
matches_len = matches.shape[0]  # Number of rows in dataframe

assert blast_fasta_len == matches_len, 'Fasta file and dataset lengths do not coincide'

## Execute multiple sequence alignment

In [9]:
# Make multiple sequence alignment

# Define alignment container (stays None if the job does not finish)
msa_fasta = None

# Job states after which polling is pointless, since the job will never finish.
# NOTE(review): names taken from the EBI job dispatcher REST documentation;
# confirm that msa.get_job_status returns them unchanged.
failed_states = ('ERROR', 'FAILURE', 'NOT_FOUND')

# Create MSA job
status, job_id, _ = msa.run_clustalo(email='damiano.clementel@studenti.unipd.it', sequence=blast_fasta)

# Wait until MSA job finishes
while True:
    # Check response status
    status, job_status, _ = msa.get_job_status(job_id, algorithm=msa.CLUSTALO)
    # Check if response status is 200 OK
    if not status:
        print('Error: job exited')
        break
    # Check if job has finished, then go on
    print('Job status: {:s}'.format(job_status))
    if job_status == 'FINISHED':
        break
    # Stop polling when the job ended without finishing, otherwise
    # the loop would never terminate
    if job_status in failed_states:
        print('Error: job exited')
        break
    # Add delay
    time.sleep(3)
print()

# Get results, only if the job has actually finished
if status and job_status == 'FINISHED':
    # Retrieve result as fasta
    status, msa_fasta, _ = msa.get_job_result(job_id, algorithm=msa.CLUSTALO)
    if status:
        # Save msa to file
        with open(MSA_FASTA_PATH, 'w') as msa_fasta_file:
            msa_fasta_file.write(msa_fasta)
        # Show fasta file head
        print('Retrieved fasta file')
        print(msa_fasta[:2000])
    else:
        # Do not write anything to disk if the result could not be retrieved
        print('Error: could not retrieve alignment')

Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: RUNNING
Job status: FINISHED

Retrieved fasta file
>UniRef90_P43582 WW domain-containing protein WWM1 n=9 Tax=Saccharomyces TaxID=4930 RepID=WWM1_YEAST
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
-------

In [10]:
# Check length of retrieved alignment

# Count the entries of the aligned fasta (msa_fasta, not blast_fasta) as the
# number of header lines, i.e. the ones starting with '>'
msa_fasta_len = sum(line.startswith('>') for line in msa_fasta.splitlines())

# The alignment must hold one sequence for each row of the matches dataset
assert msa_fasta_len == matches_len, 'Alignments fasta file and dataset lengths do not coincide'

## Create a dataset of human proteins

In [11]:
# Query UniProt for the human proteins, asking for a tab separated table
status, table, _ = uniprot.make_query(
    'organism:"Homo sapiens (Human) [9606]"',
    params={
        'compress': 'no',
        'columns': ','.join(('id', 'entry name', 'reviewed', 'protein names', 'genes', 'length')),
        'sort': 'score',
        'format': 'tab',
    },
)

In [12]:
# Create DataFrame object

# Get non-empty rows only: a trailing newline in the retrieved text would
# otherwise turn into a spurious last row holding no values
rows = [row for row in table.split('\n') if row.strip()]
assert rows, 'Retrieved table is empty'

# First row is the header, the following ones are the records
header = rows[0].split('\t')
records = [row.split('\t') for row in rows[1:]]

# Instantiate new dataframe and save it to disk
human_ds = pd.DataFrame(records, columns=header)
human_ds.to_csv(HUMAN_CSV_PATH, index=False)
human_ds.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length
0,Q9Y263,PLAP_HUMAN,reviewed,Phospholipase A-2-activating protein (PLA2P) (...,PLAA PLAP,795
1,Q96RE7,NACC1_HUMAN,reviewed,Nucleus accumbens-associated protein 1 (NAC-1)...,NACC1 BTBD14B NAC1,527
2,O43312,MTSS1_HUMAN,reviewed,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,MTSS1 KIAA0429 MIM,755
3,Q9NP80,PLPL8_HUMAN,reviewed,Calcium-independent phospholipase A2-gamma (EC...,PNPLA8 IPLA22 IPLA2G BM-043,782
4,Q15319,PO4F3_HUMAN,reviewed,"POU domain, class 4, transcription factor 3 (B...",POU4F3 BRN3C,338


In [13]:
# Show the first entries whose status is anything but 'reviewed'
human_ds.loc[human_ds['Status'].ne('reviewed')].head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length
26,H0YNE8,H0YNE8_HUMAN,unreviewed,Amyloid-beta A4 precursor protein-binding fami...,APBA2,27
27,F8W0M7,F8W0M7_HUMAN,unreviewed,Methionyl-tRNA synthetase 1,MARS1,95
28,G3V1D8,G3V1D8_HUMAN,unreviewed,Leucine rich repeat containing 24 (Leucine-ric...,LRRC24 hCG_1818221,510
29,B2XJG5,B2XJG5_HUMAN,unreviewed,NADH-ubiquinone oxidoreductase chain 4 (EC 7.1...,ND4,459
30,B2XHT9,B2XHT9_HUMAN,unreviewed,Cytochrome c oxidase subunit 2,COX2,227


In [14]:
# Show the first entries whose status is exactly 'reviewed'
human_ds.loc[human_ds['Status'].eq('reviewed')].head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length
0,Q9Y263,PLAP_HUMAN,reviewed,Phospholipase A-2-activating protein (PLA2P) (...,PLAA PLAP,795
1,Q96RE7,NACC1_HUMAN,reviewed,Nucleus accumbens-associated protein 1 (NAC-1)...,NACC1 BTBD14B NAC1,527
2,O43312,MTSS1_HUMAN,reviewed,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,MTSS1 KIAA0429 MIM,755
3,Q9NP80,PLPL8_HUMAN,reviewed,Calcium-independent phospholipase A2-gamma (EC...,PNPLA8 IPLA22 IPLA2G BM-043,782
4,Q15319,PO4F3_HUMAN,reviewed,"POU domain, class 4, transcription factor 3 (B...",POU4F3 BRN3C,338


In [15]:
# Run the human proteins query again, this time asking for fasta format
status, human_fasta, _ = uniprot.make_query(
    'organism:"Homo sapiens (Human) [9606]"',
    params=dict(compress='no', sort='score', format='fasta'),
)

In [16]:
# Print the first 1000 characters of the downloaded fasta as a sanity check
print(human_fasta[0:1000])

>sp|Q9Y263|PLAP_HUMAN Phospholipase A-2-activating protein OS=Homo sapiens OX=9606 GN=PLAA PE=1 SV=2
MTSGATRYRLSCSLRGHELDVRGLVCCAYPPGAFVSVSRDRTTRLWAPDSPNRSFTEMHC
MSGHSNFVSCVCIIPSSDIYPHGLIATGGNDHNICIFSLDSPMPLYILKGHKNTVCSLSS
GKFGTLLSGSWDTTAKVWLNDKCMMTLQGHTAAVWAVKILPEQGLMLTGSADKTVKLWKA
GRCERTFSGHEDCVRGLAILSETEFLSCANDASIRRWQITGECLEVYYGHTNYIYSISVF
PNCRDFVTTAEDRSLRIWKHGECAQTIRLPAQSIWCCCVLDNGDIVVGASDGIIRVFTES
EDRTASAEEIKAFEKELSHATIDSKTGDLGDINAEQLPGREHLNEPGTREGQTRLIRDGE
KVEAYQWSVSEGRWIKIGDVVGSSGANQQTSGKVLYEGKEFDYVFSIDVNEGGPSYKLPY
NTSDDPWLTAYNFLQKNDLNPMFLDQVAKFIIDNTKGQMLGLGNPSFSDPFTGGGRYVPG
SSGSSNTLPTADPFTGAGRYVPGSASMGTTMAGVDPFTGNSAYRSAASKTMNIYFPKKEA
VTFDQANPTQILGKLKELNGTAPEEKKLTEDDLILLEKILSLICNSSSEKPTVQQLQILW
KAINCPEDIVFPALDILRLSIKHPSVNENFCNEKEGAQFSSHLINLLNPKGKPANQLLAL
RTFCNCFVGQAGQKLMMSQRESLMSHAIELKSGSNKNIHIALATLALNYSVCFHKDHNIE
GKAQCLSLISTILEVVQDLEATFRLLVALGTLISDDSNAVQLAKSLGVDSQIKKYSSVSE
PAKVSECCRFILNLL
>sp|Q96RE7|NACC1_HUMAN Nucleus accumbens-associated protein 1 OS=Homo sapiens OX=9606 GN=N

In [17]:
# Write the downloaded human fasta text to its file on disk
with open(HUMAN_FASTA_PATH, mode='w') as human_fasta_file:
    human_fasta_file.write(human_fasta)