In [None]:
!pip install bio



# BLAST KNN Implementation

BLAST stands for Basic Local Alignment Search Tool; it is used to find regions of similarity between biological sequences.

The Biopython library provides the `Bio.Blast` module for running NCBI BLAST searches and parsing their results.

The idea behind BLAST KNN is to find homologous proteins (proteins with similar sequences) that have already been characterized and annotated with GO terms. It relies on the assumption that proteins with similar sequences have similar functions and therefore share GO terms.

**Step by Step process**

**Step 1: Sequence Alignment with BLAST:**

  **-->Input your query protein sequence:** This is the sequence for which the GO terms need to be predicted.

  **-->Run BLAST:** BLAST (Basic Local Alignment Search Tool) is used to compare the query sequence against a database of known protein sequences. Here, the non-redundant (nr) protein database from NCBI is used.

  **-->Retrieve homologous sequences:** BLAST returns a list of homologous sequences ranked by their similarity to the query sequence, using metrics such as E-values and bit-scores.


**Step 2: Collect GO Annotations:**


  **-->Identify homologous proteins with known GO terms:** From the BLAST results, identify proteins that are well-characterized and have GO term annotations.

  **-->Extract GO terms:** The GO terms for these homologous proteins can often be found in the same database from which the sequences were retrieved or through linked databases such as UniProt or NCBI Gene.


**Step 3: Assign GO Terms to the Query Sequence:**

  **-->Aggregate GO terms from homologs:** Compile the GO terms from all significant homologs. Significance is usually determined by an E-value threshold, which is set to 0.001 here. The E-value is the number of alignments with an equal or better score that would be expected to occur by chance, so smaller values indicate more significant matches. Only matches below this threshold are considered.

  **-->Score GO terms:** Optionally, score the GO terms based on the degree of similarity to the query sequence. This could involve simple counting, weighting terms by the similarity score of the homolog, or using more sophisticated statistical methods.

  **-->Predict GO terms:** Assign GO terms to the query sequence based on the aggregated information. For instance, terms that are common among the most similar homologs might be assigned to the query protein.

**Step 4: Evaluate and Refine:**

  **-->Evaluate predictions:** If you have a set of proteins with known GO annotations, you can evaluate the accuracy of your predictions against this "gold standard."

  **-->Refine the method:** Depending on the evaluation, you may adjust your method, such as changing the E-value threshold, incorporating more databases, or improving how you score and choose GO terms.

In [None]:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
import random

In [None]:
# Read the annotated protein table from disk and preview its first rows.
annotated_proteins = pd.read_csv(filepath_or_buffer='protein_data.csv')
annotated_proteins.head(n=5)

  annotated_proteins=pd.read_csv('protein_data.csv')


Unnamed: 0.1,Unnamed: 0,EntryID,sequence,organism_id,taxonomyID,term
0,0,P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...,10249,10249.0,"GO:0008152, GO:0071897, GO:0044249, GO:0006259..."
1,1,O73864,MTEYRNFLLLFITSLSVIYPCTGISWLGLTINGSSVGWNQTHHCKL...,7955,7955.0,"GO:0061371, GO:0048589, GO:0051641, GO:0048856..."
2,2,O95231,MRLSSSPPRGPQQLSSFGSVDWLSQSSCSGPTHTPRPADFSLGSLP...,9606,9606.0,"GO:0006357, GO:0010557, GO:0045935, GO:0065007..."
3,3,A0A0B4J1F4,MGGEAGADGPRGRVKSLGLVFEDESKGCYSSGETVAGHVLLEAAEP...,10090,10090.0,"GO:0008152, GO:0051234, GO:0036211, GO:0070727..."
4,4,P54366,MVETNSPPAGYTLKRSPSDLGEQQQPPRQISRSPGNTAAYHLTTAM...,7227,7227.0,"GO:0005622, GO:0043229, GO:0043226, GO:0110165..."


Example: Use the protein sequence of P20536 to identify homologous proteins using BLAST

In [None]:
# Select the query protein by EntryID and extract its amino-acid sequence as a
# plain string. Passing the filtered DataFrame itself to qblast would submit the
# frame's (truncated) text representation instead of the real sequence.
sequence_Id = 'P20536'
query_rows = annotated_proteins[annotated_proteins['EntryID'] == sequence_Id]
sequence = query_rows['sequence'].iloc[0]

# Submit a protein-protein BLAST (blastp) search against the NCBI nr database.
# This is a remote call and can take a long time to return.
result = NCBIWWW.qblast("blastp", "nr", sequence)

result

<_io.StringIO at 0x7d753da57f40>

In [None]:
# Parse the BLAST XML held in `result` (the handle returned by qblast above) into a record object.
# NOTE(review): `result` is a file-like handle; re-running this cell without re-running the
# BLAST cell presumably fails because the handle has already been read — confirm.
blast_record = NCBIXML.read(result)


Below are the homologs for the given sequence

In [None]:
# One (accession, E-value, bit score) tuple per HSP of every alignment in the record.
homologs_proteins = [
    (hit.accession, segment.expect, segment.bits)
    for hit in blast_record.alignments
    for segment in hit.hsps
]

# Rank by bit score (third tuple field), best first.
sorted_homologs = sorted(homologs_proteins, key=lambda record: record[2], reverse=True)
print(sorted_homologs)

[('5JKT_A', 7.51506e-25, 104.375), ('5JKS_A', 7.76141e-25, 104.375), ('4OD8_A', 8.73577e-25, 104.375), ('YP_232991', 9.58999e-24, 101.293), ('UPV00359', 9.79872e-24, 101.293), ('SMZ64663', 9.90479e-24, 101.293), ('ATB55385', 9.90479e-24, 101.293), ('P20536', 1.02299e-23, 101.293), ('P04303', 1.02299e-23, 101.293), ('AAX78445', 1.69105e-23, 101.293), ('WOW88425', 9.70149e-24, 100.908), ('UZL87749', 9.95016e-24, 100.908), ('ADZ29444', 1.05657e-23, 100.908), ('UZS34520', 1.13927e-23, 100.908), ('WHP54032', 1.13927e-23, 100.908), ('UZS34695', 1.21528e-23, 100.908), ('UWO73681', 1.24173e-23, 100.908), ('WRO04390', 1.24173e-23, 100.908), ('UUV50126', 1.32458e-23, 100.908), ('UZV32540', 1.35341e-23, 100.908), ('UXB90208', 1.36806e-23, 100.908), ('UZV17383', 1.39783e-23, 100.908), ('UUV52990', 1.41296e-23, 100.908), ('3NT7_A', 1.65864e-23, 100.908), ('ATB55163', 7.23763e-25, 100.523), ('UXL95541', 1.12822e-24, 100.523), ('WEW78007', 5.00448e-24, 100.523), ('UYX46851', 1.42298e-23, 100.523), ('

In [None]:
# Keep the accession of every ranked hit whose E-value is below 0.001.
accessions = [
    hit_accession
    for hit_accession, hit_e_value, _hit_bits in sorted_homologs
    if hit_e_value < 0.001
]
print(accessions)

['5JKT_A', '5JKS_A', '4OD8_A', 'YP_232991', 'UPV00359', 'SMZ64663', 'ATB55385', 'P20536', 'P04303', 'AAX78445', 'WOW88425', 'UZL87749', 'ADZ29444', 'UZS34520', 'WHP54032', 'UZS34695', 'UWO73681', 'WRO04390', 'UUV50126', 'UZV32540', 'UXB90208', 'UZV17383', 'UUV52990', '3NT7_A', 'ATB55163', 'UXL95541', 'WEW78007', 'UYX46851', 'AGR36562', 'WDO57146', 'UUV52453', 'URP85043', 'NP_536528', 'UZS35389', 'WOW73496', 'UXP42083', 'UVT70085', 'UXL62039', 'UTZ20153', 'UVT69552', 'URQ22802', 'WCS73224', 'WKW66547', 'WAB04291', 'WMP27482', 'WOW76001', 'UTZ19261', 'UWO44168', 'WEF39764', 'WVM35526']


`accessions` contains the homologs of the source protein 'P20536' that pass the E-value threshold. For simplicity, the lookup could be restricted to the top-5 homologs (`accessions[:5]`); the cell below queries every accession in the list.

Our next goal is to identify the GO terms for these homologous protein sequences.

In [None]:
import requests

def QuickGo(accession, timeout=30):
    """Fetch GO annotations for one gene product from the EBI QuickGO REST service.

    Parameters
    ----------
    accession : str
        Identifier sent as the ``geneProductId`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    list of dict or None
        One dict per returned annotation with the keys ``goId``,
        ``evidenceCode`` and ``goAspect``. ``None`` when the request fails,
        the status code is not 200, the body is not valid JSON, or no
        annotations are returned.

    Notes
    -----
    Only a single request is made; NOTE(review): if the service paginates its
    results, later pages are not fetched — confirm against the API docs.
    """
    url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    params = {'geneProductId': accession}

    # Network problems (DNS failure, refused connection, timeout) must not
    # abort the caller's loop over many accessions.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as error:
        print(f"Request failed for {accession}: {error}")
        return None

    # Non-200 answers are skipped without printing anything, as before.
    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        print("Failed to decode JSON from response.")
        print("Response status code:", response.status_code)
        print("Response text:", response.text)
        return None

    results = data.get('results')
    if not results:
        print(f"No annotations found for {accession}.")
        return None

    return [{
        'goId': result['goId'],
        'evidenceCode': result['evidenceCode'],
        'goAspect': result['goAspect']
    } for result in results]

# Look up every accession from the BLAST step and keep (accession, annotations)
# pairs only for those where QuickGo returned something.
homologs_proteins_go = []

for accession in accessions:
    annotations = QuickGo(accession)
    if not annotations:
        continue
    homologs_proteins_go.append((accession, annotations))

# Show each annotated accession followed by its list of annotations.
for annotated_accession, accession_annotations in homologs_proteins_go:
    print(annotated_accession, accession_annotations)


P20536 [{'goId': 'GO:0003677', 'evidenceCode': 'ECO:0007322', 'goAspect': 'molecular_function'}, {'goId': 'GO:0004844', 'evidenceCode': 'ECO:0000501', 'goAspect': 'molecular_function'}, {'goId': 'GO:0005515', 'evidenceCode': 'ECO:0000353', 'goAspect': 'molecular_function'}, {'goId': 'GO:0016787', 'evidenceCode': 'ECO:0007322', 'goAspect': 'molecular_function'}, {'goId': 'GO:0016799', 'evidenceCode': 'ECO:0000256', 'goAspect': 'molecular_function'}, {'goId': 'GO:0006281', 'evidenceCode': 'ECO:0000256', 'goAspect': 'biological_process'}, {'goId': 'GO:0006281', 'evidenceCode': 'ECO:0007322', 'goAspect': 'biological_process'}, {'goId': 'GO:0006974', 'evidenceCode': 'ECO:0007322', 'goAspect': 'biological_process'}, {'goId': 'GO:0039693', 'evidenceCode': 'ECO:0000314', 'goAspect': 'biological_process'}]
P04303 [{'goId': 'GO:0003677', 'evidenceCode': 'ECO:0007322', 'goAspect': 'molecular_function'}, {'goId': 'GO:0004844', 'evidenceCode': 'ECO:0000501', 'goAspect': 'molecular_function'}, {'goI

Now let us integrate all of the steps above into a single modular code block.

In [None]:
from Bio.Blast import NCBIWWW, NCBIXML
import pandas as pd
import requests

# Let's assume annotated_proteins is a DataFrame that has been defined earlier
# and has columns 'EntryID' and 'sequence'

def QuickGo(accession, timeout=30):
    """Return the GO term IDs that the EBI QuickGO REST service lists for a gene product.

    This definition replaces any earlier ``QuickGo`` in the notebook: it
    returns bare GO IDs instead of annotation dicts, and ``[]`` instead of
    ``None`` on failure.

    Parameters
    ----------
    accession : str
        Identifier sent as the ``geneProductId`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    list of str
        The ``goId`` of every returned annotation, in response order. The same
        GO ID can appear more than once when the service returns several
        annotations for it. Empty when the request fails, the status code is
        not 200, the body is not valid JSON, or there are no annotations.

    Notes
    -----
    Only a single request is made; NOTE(review): if the service paginates its
    results, later pages are not fetched — confirm against the API docs.
    """
    url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    params = {'geneProductId': accession}

    # Network problems (DNS failure, refused connection, timeout) are treated
    # like any other failed lookup so one bad request cannot abort a long run.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # For debugging: print(f"Request failed for {accession}.")
        return []

    if response.status_code != 200:
        # For debugging: print(f"Error {response.status_code} for {accession}: {response.text}")
        return []

    try:
        data = response.json()
    except ValueError:
        # For debugging: print(f"Invalid JSON response for {accession}.")
        return []

    # A missing or empty 'results' entry simply yields an empty list.
    return [result['goId'] for result in data.get('results') or []]

# `accessions` is expected to be the list of accession numbers produced by the
# earlier BLAST step. QuickGo is called once per accession, in list order;
# accessions for which it returns nothing are left out of the mapping.
homologs_proteins_go = {
    accession_id: ','.join(term_ids)
    for accession_id, term_ids in zip(accessions, map(QuickGo, accessions))
    if term_ids
}

# `homologs_proteins_go` now maps each accession number to a single
# comma-separated string of its GO terms.

# Run BLAST for each entry ID and gather the GO terms of its significant homologs.
# Limited to the first 25 entries for demonstration; remove the slice [:25] for full processing.
# Every entry triggers one remote BLAST search plus one QuickGO request per homolog,
# so this cell can take a long time.
E_VALUE_THRESHOLD = 0.001  # hits with an E-value at or above this are ignored

entry_ids = annotated_proteins['EntryID'].tolist()[:25]

# Maps each EntryID to one comma-separated string of GO terms from its homologs.
# Entries whose homologs yield no GO terms get no key.
entry_to_go_terms = {}

for entry_id in entry_ids:
    sequence = annotated_proteins.loc[annotated_proteins['EntryID'] == entry_id, 'sequence'].iloc[0]
    result = NCBIWWW.qblast("blastp", "nr", sequence)
    blast_record = NCBIXML.read(result)

    # Keep each accession once, in BLAST order, if any of its HSPs is significant.
    # An alignment with several significant HSPs would otherwise be listed (and
    # queried against QuickGO) several times, repeating its GO terms.
    homologs_proteins = []
    seen_accessions = set()
    for alignment in blast_record.alignments:
        if alignment.accession in seen_accessions:
            continue
        if any(hsp.expect < E_VALUE_THRESHOLD for hsp in alignment.hsps):
            seen_accessions.add(alignment.accession)
            homologs_proteins.append(alignment.accession)

    # Collect the terms in a list and join once, instead of growing a string.
    collected_terms = []
    for accession in homologs_proteins:
        go_terms = QuickGo(accession)
        if go_terms:
            collected_terms.extend(go_terms)

    if collected_terms:
        entry_to_go_terms[entry_id] = ','.join(collected_terms)



Inspect the GO terms retrieved for each homologous protein accession.

In [None]:
# Display the accession -> comma-separated GO terms mapping built in the previous cell.
homologs_proteins_go

{'P20536': 'GO:0003677,GO:0004844,GO:0005515,GO:0016787,GO:0016799,GO:0006281,GO:0006281,GO:0006974,GO:0039693',
 'P04303': 'GO:0003677,GO:0004844,GO:0016787,GO:0016799,GO:0006281,GO:0006281,GO:0006974'}