In [1]:
# Dependencies
import modules.blast as blast
import modules.uniprot as uniprot
import pandas as pd
import time

In [2]:
# Constants: output locations used by the cells below
BLAST_MATCHES_PATH = 'data/uniref90_matches.csv'  # CSV file the BLAST matches DataFrame is saved to
BLAST_FASTA_PATH = 'data/uniref90_blast.fasta'  # Fasta file the sequences fetched for the matches are written to

In [3]:
# Check available parameters
# get_parameters() returns three values: a status flag (not checked here),
# the list of parameter names and a third value which is discarded
status, params, _ = blast.get_parameters()

# Show the parameter names
print(params)

['program', 'task', 'matrix', 'alignments', 'scores', 'exp', 'dropoff', 'match_scores', 'gapopen', 'gapext', 'filter', 'seqrange', 'gapalign', 'wordsize', 'taxids', 'compstats', 'align', 'transltable', 'stype', 'sequence', 'database']


In [4]:
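# # Start a BLAST job
# # NOTE(review): this cell is kept commented out, presumably because the job was
# # already submitted (a job id is hard-coded in the next cell) - confirm before re-enabling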
# status, job_id, _ = blast.run_job(email='damiano.clementel@studenti.unipd.it',
#                               sequence='VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP',
#                               params={'database': 'uniref90',
#                                       'matrix': 'BLOSUM62',
#                                       'alignments': 1000, 
#                                       'gapalign': True, 
#                                       'exp': '1e-3'})
# 
# # Check output
# print(status, job_id)

In [5]:
# Define results container
job_result = None
job_id = 'ncbiblast-R20200205-095307-0450-62657352-p1m'

# Polling settings
POLL_INTERVAL = 10  # Seconds to wait between two status requests
POLL_TIMEOUT = 3600  # Maximum number of seconds to wait for the job to finish
# Job statuses after which the job can never reach FINISHED
# NOTE(review): assumes blast.get_job_status passes through the status strings
# of the EBI job dispatcher unchanged - confirm against modules/blast
FAILED_STATUSES = {'ERROR', 'FAILURE', 'NOT_FOUND'}

# Retrieve BLAST job results
deadline = time.monotonic() + POLL_TIMEOUT
while True:
    # Make request for job status
    status, job_status, _ = blast.get_job_status(job_id)
    # Check error before touching job_status: on a failed request it may not
    # be a string, and an explicit error is better than leaving job_result
    # silently set to None for the following cells
    if not status:
        raise RuntimeError('Could not retrieve status of job {!r}'.format(job_id))
    print('Job status: {:s}'.format(job_status))  # LOG
    # Check if job has finished running
    if job_status == 'FINISHED':
        # Retrieve results
        status, job_result, _ = blast.get_job_result(job_id)
        # NOTE(review): assumes get_job_result uses the same status convention
        # as get_job_status (falsy means error) - confirm
        if not status:
            raise RuntimeError('Could not retrieve results of job {!r}'.format(job_id))
        print('Job exited with status: {:s}'.format(job_status))  # LOG
        break  # Exit loop
    # Stop polling a job which will never finish
    if job_status in FAILED_STATUSES:
        raise RuntimeError('Job {!r} ended with status {!r}'.format(job_id, job_status))
    # Do not wait forever for a stuck job
    if time.monotonic() >= deadline:
        raise TimeoutError('Job {!r} did not finish within {:d} seconds'.format(job_id, POLL_TIMEOUT))
    # Wait before making another call
    time.sleep(POLL_INTERVAL)

Job status: FINISHED
Job exited with status: FINISHED


In [6]:
# Turn job result into pandas DataFrame object
# Guard: job_result is None when the job results were never retrieved. In that
# case pd.DataFrame(None) would yield an empty frame and the dataset on disk
# would be silently overwritten with a file holding no matches
if job_result is None:
    raise RuntimeError('No BLAST job result available: retrieve the job results first')
matches = pd.DataFrame(job_result)
# Retrieve full sequence (disabled: would issue one UniProt request per match)
# matches['full_seq'] = matches['id'].apply(lambda x: uniprot.get_protein(x)[1])
# Save dataframe to file
matches.to_csv(BLAST_MATCHES_PATH, index=False)
# Show dataframe
matches.head()

Unnamed: 0,score,bits,expectation,identity,positives,gaps,strand,pattern_seq,match_seq,match_start,match_end,database,id,ac,description
0,182,74.7146,1e-15,100.0,100.0,0,none/none,VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP,VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP,11,41,UR90,UniRef90_P43582,WW,WW domain-containing protein WWM1 n=9 Tax=Sacc...
1,175,72.0182,1.1e-14,93.5,100.0,0,none/none,VPSGWKAVFDDEYQTW+YVDLSTN+SQWEPP,VPSGWKAVFDDEYQTWFYVDLSTNNSQWEPP,11,41,UR90,UniRef90_J8Q8J2,Wwm1p,Wwm1p n=1 Tax=Saccharomyces arboricola (strain...
2,173,71.2478,1.8e-14,93.5,100.0,0,none/none,VPSGWKAVFDDEYQTW+YV+LSTNSSQWEPP,VPSGWKAVFDDEYQTWFYVNLSTNSSQWEPP,11,41,UR90,UniRef90_J5RH20,WWM1-like,WWM1-like protein n=2 Tax=Saccharomyces TaxID=...
3,170,70.0922,4.7e-14,93.5,96.8,0,none/none,VPSGWKAVFDDEYQTW+YVDLSTNSSQWE P,VPSGWKAVFDDEYQTWFYVDLSTNSSQWEAP,11,41,UR90,UniRef90_A0A0L8RJY2,WWM1-like,WWM1-like protein n=1 Tax=Saccharomyces eubaya...
4,158,65.4698,2.8e-12,80.6,93.5,0,none/none,VP GWKAVFDDEY+TW+YV+L+TN SQWEPP,VPKGWKAVFDDEYKTWFYVNLATNQSQWEPP,11,41,UR90,UniRef90_A0A212M9M4,WW,WW domain-containing protein n=4 Tax=Zygosacch...


In [11]:
# Save full fasta file
# Get fasta file first: the output file is opened only once there is something
# to write, so that a failed request does not leave a truncated file on disk
status, out, res = uniprot.get_proteins(matches['id'].tolist(), batch_size=100)
# Check error
# NOTE(review): assumes a falsy status signals a failed request, as for the
# blast helpers used above - confirm against modules/uniprot
if not status or not isinstance(out, str):
    raise RuntimeError('Could not retrieve fasta entries for the BLAST matches')
# Write fasta file to disk
with open(BLAST_FASTA_PATH, 'w') as out_file:
    out_file.write(out)

In [None]:
# Check number of retrieved fasta entries
# Each fasta entry begins with a header line starting with '>': count those
# lines instead of every '>' character, since the character may also occur
# inside an entry description and would inflate the count
fasta_len = sum(line.startswith('>') for line in out.splitlines())  # Number of entries in fasta file
matches_len = matches.shape[0]  # Number of rows in dataframe

assert fasta_len == matches_len, 'Fasta file and dataset lengths do not coincide'