In [1]:
# Dependencies
import modules.blast as blast
import modules.uniprot as uniprot
import modules.msa as msa
import pandas as pd
import numpy as np
import re
import time

In [2]:
# Constants: locations of every file this notebook reads or writes
DATA_DIR = 'data'  # Single root directory shared by all paths below
BLAST_MATCHES_PATH = DATA_DIR + '/blast.csv'  # Blast matches dataset
BLAST_FASTA_PATH = DATA_DIR + '/blast.fasta'  # Blast matched proteins fasta file
MSA_FASTA_PATH = DATA_DIR + '/msa.fasta'  # Multiple sequence alignment file
HUMAN_CSV_PATH = DATA_DIR + '/human.csv'  # Test set csv (written tab-separated)
HUMAN_IN_PFAM_CSV_PATH = DATA_DIR + '/human_in_pfam.csv'  # Human proteins flagged as in the reference Pfam family
HUMAN_NO_PFAM_CSV_PATH = DATA_DIR + '/human_no_pfam.csv'  # Human proteins not flagged as in the reference Pfam family
HUMAN_FASTA_PATH = DATA_DIR + '/human.fasta'  # Test set fasta
GO_CSV_PATH = DATA_DIR + '/go.csv'  # GO terms dataset
FAMILIES_CSV_PATH = DATA_DIR + '/families.csv'  # Pfam families dataset

## BLAST search

Run a BLAST search using the EBI RESTful web service.

In [None]:
# Submit a BLAST job for the query sequence
status, job_id, _ = blast.run_job(
    email='damiano.clementel@studenti.unipd.it',
    sequence='VPSGWKAVFDDEYQTWYYVDLSTNSSQWEPP',
    params={
        'database': 'uniref90',
        'matrix': 'BLOSUM62',
        'alignments': 1000,
        'gapalign': True,
        'exp': '1e-3',
    },
)

# Stop here when submission failed: the polling cell needs a valid job id.
# A falsy `status` is treated as failure, as in the other cells of this notebook.
if not status:
    raise RuntimeError('BLAST job submission failed: {}'.format(job_id))

# Check output
print('Started job: {:s}'.format(job_id))

In [None]:
# Results container: stays None unless the job finishes and its results are fetched
job_result = None
# To resume a previously submitted job, set its identifier here:
# job_id = 'ncbiblast-R20200206-172705-0918-98731783-p1m'

# Job states after which further polling can never succeed.
# NOTE(review): values taken from the EBI job status vocabulary — confirm that
# blast.get_job_status returns them verbatim.
failed_states = {'ERROR', 'FAILURE', 'NOT_FOUND'}

# Retrieve BLAST job results
while True:
    # Make request for job status
    status, job_status, _ = blast.get_job_status(job_id)
    # Check the request itself before formatting the returned job status
    if not status:
        print('Error: could not retrieve job status')
        break
    print('Job status: {:s}'.format(job_status))  # LOG
    # Check if job has finished running
    if job_status == 'FINISHED':
        # Retrieve results
        status, job_result, _ = blast.get_job_result(job_id)
        print('Job exited with status: {:s}'.format(job_status))  # LOG
        break  # Exit loop
    # Give up on a failed job instead of polling forever
    if job_status in failed_states:
        print('Error: job ended without results')
        break
    # Wait 10 seconds before making another call
    time.sleep(10)

In [None]:
# Tabulate the BLAST job result as a pandas DataFrame
matches = pd.DataFrame(data=job_result)

# Persist the table to disk, leaving out the positional index
matches.to_csv(BLAST_MATCHES_PATH, index=False)

# Preview the first rows
matches.head(5)

## Create FASTA file from BLAST

In [None]:
# Download the fasta text first, so that a failed request cannot leave an
# empty file behind (opening with 'w' truncates the target immediately)
status, blast_fasta, _ = uniprot.get_proteins(matches['id'].tolist(), batch_size=100)

# A falsy `status` is treated as failure, as in the other cells of this notebook
if not status:
    raise RuntimeError('Could not retrieve fasta sequences for BLAST matches')

# Save full fasta file
with open(BLAST_FASTA_PATH, 'w') as blast_fasta_file:
    blast_fasta_file.write(blast_fasta)

In [None]:
# Check number of retrieved fasta entries.
# Count header lines (those starting with '>') rather than every '>' character,
# so a '>' inside a description cannot inflate the count.
blast_fasta_len = sum(1 for line in blast_fasta.splitlines() if line.startswith('>'))
matches_len = matches.shape[0]  # Number of rows in dataframe

assert blast_fasta_len == matches_len, 'Fasta file and dataset lengths do not coincide'

## Execute multiple sequence alignment

In [None]:
# Make multiple sequence alignment

# Create MSA job
status, job_id, _ = msa.run_clustalo(email='damiano.clementel@studenti.unipd.it', sequence=blast_fasta)
if not status:
    print('Error: job submission failed')

# Job states after which further polling can never succeed.
# NOTE(review): values taken from the EBI job status vocabulary — confirm that
# msa.get_job_status returns them verbatim.
failed_states = {'ERROR', 'FAILURE', 'NOT_FOUND'}
job_status = None  # Last status reported by the service

# Wait until MSA job finishes (skipped entirely when submission failed)
while status:
    # Check response status
    status, job_status, _ = msa.get_job_status(job_id, algorithm=msa.CLUSTALO)
    # A falsy status means the status request itself failed
    if not status:
        print('Error: job exited')
        break
    print('Job status: {:s}'.format(job_status))
    # Stop polling on success and on failure states alike
    if job_status == 'FINISHED' or job_status in failed_states:
        break
    # Add delay
    time.sleep(3)
print()

# Get results only for a job that actually finished
if status and job_status == 'FINISHED':
    # Retrieve result as fasta
    status, msa_fasta, _ = msa.get_job_result(job_id, algorithm=msa.CLUSTALO)
    if status:
        # Save msa to file
        with open(MSA_FASTA_PATH, 'w') as msa_fasta_file:
            msa_fasta_file.write(msa_fasta)
        # Show fasta file head
        print('Retrieved fasta file')
        print(msa_fasta[:2000])
    else:
        print('Error: could not retrieve job result')

In [None]:
# Check length of retrieved alignment.
# Count header lines of the alignment itself (msa_fasta, not blast_fasta)
msa_fasta_len = sum(1 for line in msa_fasta.splitlines() if line.startswith('>'))

# Compare the alignment count (not the BLAST fasta count) with the matches dataset
assert msa_fasta_len == matches_len, 'Alignments fasta file and dataset lengths do not coincide'

## Create a dataset of human proteins

In [3]:
# Columns requested from UniProt, sent as one comma-separated string
human_columns = ['id', 'entry name', 'reviewed', 'protein names', 'genes', 'length', 'go',
                 'database(Pfam)', 'feature(DOMAIN EXTENT)']

# Query reviewed human proteins, asking for an uncompressed tab-separated table
status, table, _ = uniprot.make_query(
    'reviewed:yes AND organism:"Homo sapiens (Human) [9606]"',
    params={
        'compress': 'no',
        'columns': ','.join(human_columns),
        'sort': 'score',
        'format': 'tab',
    },
)

In [4]:
# Create DataFrame object from the tab-separated UniProt response

# Split into lines and drop empty ones, so the last record is kept whether or
# not the response ends with a trailing newline
rows = [row for row in table.split('\n') if row]
assert rows, 'UniProt query returned an empty table'

# First line is the header, every following line is one protein
header = rows[0].split('\t')

# Instantiate new dataframe
human_ds = pd.DataFrame([row.split('\t') for row in rows[1:]], columns=header)
human_ds.to_csv(HUMAN_CSV_PATH, sep='\t', index=False)
human_ds.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Length,Gene ontology (GO),Cross-reference (Pfam),Domain [FT]
0,Q9Y263,PLAP_HUMAN,reviewed,Phospholipase A-2-activating protein (PLA2P) (...,PLAA PLAP,795,cell [GO:0005623]; cell junction [GO:0030054];...,PF09070;PF08324;PF00400;,"DOMAIN 366..465; /note=""PFU""; /evidence=""ECO..."
1,Q96RE7,NACC1_HUMAN,reviewed,Nucleus accumbens-associated protein 1 (NAC-1)...,NACC1 BTBD14B NAC1,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,PF10523;PF00651;,"DOMAIN 30..94; /note=""BTB""; /evidence=""ECO:0..."
2,O43312,MTSS1_HUMAN,reviewed,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,MTSS1 KIAA0429 MIM,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,PF08397;PF02205;,"DOMAIN 1..250; /note=""IMD""; /evidence=""ECO:0..."
3,Q9NP80,PLPL8_HUMAN,reviewed,Calcium-independent phospholipase A2-gamma (EC...,PNPLA8 IPLA22 IPLA2G BM-043,782,endoplasmic reticulum membrane [GO:0005789]; G...,PF01734;,"DOMAIN 445..640; /note=""PNPLA""; /evidence=""E..."
4,Q15319,PO4F3_HUMAN,reviewed,"POU domain, class 4, transcription factor 3 (B...",POU4F3 BRN3C,338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,PF00046;PF00157;,"DOMAIN 179..256; /note=""POU-specific""; /evid..."


In [8]:
# Show the full, untruncated domain annotation of the first protein
print(human_ds['Domain [FT]'].iloc[0])

DOMAIN 366..465;  /note="PFU";  /evidence="ECO:0000255|PROSITE-ProRule:PRU00727"; DOMAIN 533..794;  /note="PUL";  /evidence="ECO:0000255|PROSITE-ProRule:PRU00729"


In [None]:
# Add boolean column: is given protein in given PF00397 family?

# Define reference pfam
pfam = 'PF00397'
# Match the identifier only as a whole ';'-delimited token: it must be preceded
# by the start of the string or ';' and followed by ';' or the end of the string
pattern = r'(?:^|;){:s}(?:;|$)'.format(re.escape(pfam))

# Get families column
families = human_ds['Cross-reference (Pfam)']
# Add "is in family?" column (values are stringified first, as before)
human_ds[pfam] = families.astype(str).str.contains(pattern, regex=True)
human_ds.to_csv(HUMAN_CSV_PATH, sep='\t', index=False)
human_ds.head()

In [None]:
# Build the gene ontology dataset: one row per (protein, GO term) pair

# Bracketed GO identifier, e.g. "[GO:0005623]"
go_tag = re.compile(r'\[(GO:.+?)\]')
go_records = []

# Walk the accession and GO annotation columns side by side
for accession, annotation in zip(human_ds['Entry'], human_ds['Gene ontology (GO)']):
    accession, annotation = str(accession), str(annotation)
    # Identifiers found in the annotation, in order of appearance
    term_ids = go_tag.findall(annotation)
    # Descriptions: each '; '-separated chunk with its identifier removed
    term_names = [go_tag.sub('', chunk).strip() for chunk in annotation.split('; ')]
    # Pair each identifier with the description at the same position
    for position, term_id in enumerate(term_ids):
        go_records.append((accession, term_id.replace('GO:', ''), term_names[position]))

# Turn the collected records into a DataFrame, save it and preview it
go_ds = pd.DataFrame(go_records, columns=['Entry', 'GO id', 'GO descr'])
go_ds.to_csv(GO_CSV_PATH, sep='\t', index=False)
go_ds.head(5)

In [None]:
# Build the protein family dataset: one row per (protein, Pfam family) pair

# Split every ';'-separated Pfam cross-reference and skip the empty pieces
family_records = [
    (accession, pfam_id)
    for accession, pfam_refs in zip(human_ds['Entry'], human_ds['Cross-reference (Pfam)'])
    for pfam_id in str(pfam_refs).split(';')
    if pfam_id != ''
]

# Turn the collected pairs into a DataFrame, save it and preview it
families_ds = pd.DataFrame(family_records, columns=['Entry', 'Pfam'])
families_ds.to_csv(FAMILIES_CSV_PATH, sep='\t', index=False)  # Save to disk
families_ds.head()

In [None]:
# Keep only the rows whose boolean PF00397 flag is set
in_family_mask = human_ds['PF00397']
human_in_pfam_ds = human_ds.loc[in_family_mask]

# Save the subset as a tab-separated file and preview it
human_in_pfam_ds.to_csv(HUMAN_IN_PFAM_CSV_PATH, sep='\t', index=False)
human_in_pfam_ds.head()

In [None]:
# Keep only the rows whose boolean PF00397 flag is not set
not_in_family_mask = ~human_ds['PF00397']
human_no_pfam_ds = human_ds.loc[not_in_family_mask]

# Save the subset as a tab-separated file and preview it
human_no_pfam_ds.to_csv(HUMAN_NO_PFAM_CSV_PATH, sep='\t', index=False)
human_no_pfam_ds.head()

In [None]:
# Request the same reviewed human proteins, this time as uncompressed fasta text
human_fasta_params = {
    'compress': 'no',
    'sort': 'score',
    'format': 'fasta',
}
status, human_fasta, _ = uniprot.make_query(
    'reviewed:yes AND organism:"Homo sapiens (Human) [9606]"',
    params=human_fasta_params,
)

In [None]:
# Preview the first 250 characters of the downloaded fasta text
fasta_preview = human_fasta[:250]
print(fasta_preview)

In [None]:
# Write the downloaded human fasta text to HUMAN_FASTA_PATH
with open(HUMAN_FASTA_PATH, mode='w') as fasta_out:
    fasta_out.write(human_fasta)