<a href="https://colab.research.google.com/github/dharshinikbt23-crypto/Bioinformatics-5th-sem/blob/main/NGS_for_genome_annotation_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Functional Annotation of Protein Sequences
# This workflow annotates proteins using online databases

# Install required packages
!pip install biopython requests pandas

import requests
import time
from Bio import SeqIO
from google.colab import files
import pandas as pd
from io import StringIO
import json

# Title banner for this notebook run.
banner_rule = "=" * 70
print(banner_rule, "FUNCTIONAL ANNOTATION OF PROTEIN SEQUENCES", banner_rule, sep="\n")

# Step 1: ask the user where the protein FASTA should come from.
for menu_line in (
    "\n[Step 1] Get protein sequences",
    "Choose an option:",
    "1. Upload your own FASTA file",
    "2. Download from Zenodo (https://zenodo.org/record/6861851/files/proteins.fasta)",
):
    print(menu_line)

# Surrounding whitespace is stripped so " 1 " is treated the same as "1".
raw_choice = input("Enter choice (1 or 2): ")
choice = raw_choice.strip()

# Step 1 (continued): materialise the selected input as ./proteins.fasta.
#   "1"          -> file uploaded through the Colab upload widget
#   "2"          -> public example file downloaded from Zenodo
#   anything else -> three built-in sample proteins
if choice == "1":
    print("\nUpload your protein FASTA file:")
    uploaded = files.upload()
    if not uploaded:
        # An empty mapping (e.g. the upload dialog was dismissed) used to
        # surface as a bare IndexError; fail with an actionable message.
        raise RuntimeError(
            "No file was uploaded. Re-run this cell and select a FASTA file."
        )
    # Only the first uploaded file is used; any additional files are ignored.
    fasta_file = next(iter(uploaded))
    with open('proteins.fasta', 'wb') as f:
        f.write(uploaded[fasta_file])
elif choice == "2":
    print("\nDownloading from Zenodo...")
    url = "https://zenodo.org/record/6861851/files/proteins.fasta"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Stop on 4xx/5xx instead of saving an HTTP error page as FASTA data.
    response.raise_for_status()
    with open('proteins.fasta', 'wb') as f:
        f.write(response.content)
    print("✓ Downloaded proteins.fasta")
else:
    print("Invalid choice. Using default sample sequences.")
    # Create sample protein sequences
    sample_fasta = ">sp|P04637|P53_HUMAN Cellular tumor antigen p53\n"
    sample_fasta += "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD\n"
    sample_fasta += ">sp|P01112|RASH_HUMAN GTPase HRas\n"
    sample_fasta += "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS\n"
    sample_fasta += ">sp|P00533|EGFR_HUMAN Epidermal growth factor receptor\n"
    sample_fasta += "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACIDRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPSRDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGSTAENAEYLRVAPQSSEFIGA\n"

    with open('proteins.fasta', 'w') as f:
        f.write(sample_fasta)
    print("✓ Created sample proteins.fasta")

# Load every record from proteins.fasta into memory, then print a short
# preview: at most the first three records plus a count of the remainder.
sequences = list(SeqIO.parse('proteins.fasta', 'fasta'))
loaded_total = len(sequences)
print(f"\n✓ Loaded {loaded_total} protein sequences")

preview_lines = [
    f"  {rank}. {record.id} - {len(record.seq)} amino acids"
    for rank, record in enumerate(sequences[:3], start=1)
]
not_shown = loaded_total - 3
if not_shown > 0:
    preview_lines.append(f"  ... and {not_shown} more")

for preview_line in preview_lines:
    print(preview_line)

# Step 2: Functional annotation using the UniProt REST API
print("\n[Step 2] Running functional annotation...")
print("Using UniProt BLAST API for annotation\n")

# NOTE(review): despite the message above, the lookup below is a UniProtKB
# text search on the first 50 residues of each sequence. It appears to be a
# keyword query rather than a sequence-similarity search, so raw residues may
# well return no hits. A real BLAST service would be needed for
# homology-based annotation -- confirm against the UniProt API documentation.
UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"


def _make_annotation(seq_id, length, status, protein_name,
                     gene_name='N/A', organism='N/A', go_terms='None'):
    """Build one result row using the column layout of the results table.

    The defaults are the placeholder values used for rows that could not be
    annotated (no match, API error, exception).
    """
    return {
        'Query_ID': seq_id,
        'Length': length,
        'Protein_Name': protein_name,
        'Gene_Name': gene_name,
        'Organism': organism,
        'GO_Terms': go_terms,
        'Status': status,
    }


def _extract_protein_name(entry):
    """Return the full protein name of a UniProtKB JSON entry, or 'Unknown'."""
    description = entry.get('proteinDescription', {})
    recommended = description.get('recommendedName', {}).get('fullName', {}).get('value')
    if recommended is not None:
        return recommended
    # Presumably entries without a recommendedName (e.g. unreviewed ones)
    # list their name under 'submissionNames' -- verify against the UniProt
    # JSON schema. Falls through to 'Unknown' when the key is absent.
    for submitted in description.get('submissionNames') or []:
        submitted_name = submitted.get('fullName', {}).get('value')
        if submitted_name is not None:
            return submitted_name
    return 'Unknown'


def _annotate_sequence(seq_id, sequence):
    """Query UniProtKB for one sequence and return its annotation row.

    Args:
        seq_id: FASTA record identifier, copied into 'Query_ID'.
        sequence: Amino-acid sequence as a plain string.

    Returns:
        A dict built by _make_annotation whose 'Status' is 'Annotated',
        'No match', 'API Error' or 'Error: empty sequence'.

    Raises:
        Whatever requests.get() / response.json() raise (network failures,
        timeouts, undecodable bodies); the caller records those per sequence.
    """
    length = len(sequence)
    if not sequence:
        # Nothing to search for; avoid sending an empty query to the API.
        return _make_annotation(seq_id, length, 'Error: empty sequence', 'Error')

    # Passing the query via `params` lets requests URL-encode it properly.
    response = requests.get(
        UNIPROT_SEARCH_URL,
        params={'query': sequence[:50], 'format': 'json', 'size': 1},
        timeout=10,
    )
    if response.status_code != 200:
        print(f"  HTTP {response.status_code} returned by UniProt")
        return _make_annotation(seq_id, length, 'API Error', 'Error')

    results = response.json().get('results') or []
    if not results:
        return _make_annotation(seq_id, length, 'No match', 'Not found')

    entry = results[0]
    genes = entry.get('genes') or [{}]
    gene_name = genes[0].get('geneName', {}).get('value', 'Unknown')
    organism = entry.get('organism', {}).get('scientificName', 'Unknown')

    # Collect Gene Ontology cross-references; only the first five are reported.
    go_terms = [
        xref.get('id', '')
        for xref in entry.get('uniProtKBCrossReferences', [])
        if xref.get('database') == 'GO'
    ]

    return _make_annotation(
        seq_id,
        length,
        'Annotated',
        _extract_protein_name(entry),
        gene_name=gene_name,
        organism=organism,
        go_terms=', '.join(go_terms[:5]) if go_terms else 'None',
    )


annotations = []

for seq_record in sequences:
    seq_id = seq_record.id
    sequence = str(seq_record.seq)

    print(f"Annotating: {seq_id}...")

    try:
        annotation = _annotate_sequence(seq_id, sequence)
    except Exception as e:
        # Deliberately broad: this is the per-sequence boundary, and one bad
        # response must not abort the whole batch. The failure is printed and
        # stored in the row's Status column rather than swallowed.
        print(f"  ✗ Error: {e}")
        annotation = _make_annotation(seq_id, len(sequence), f'Error: {e}', 'Error')
    else:
        # Only genuinely annotated rows get the success mark.
        marker = '✓' if annotation['Status'] == 'Annotated' else '✗'
        print(f"  {marker} {annotation['Protein_Name']}")

    annotations.append(annotation)

    # Be nice to the API: throttle after every sequence, including failures.
    time.sleep(1)

# Step 3: turn the list of per-sequence annotation dicts into a table.
print("\n[Step 3] Creating results table...")
df = pd.DataFrame(data=annotations)

# Print the whole table (without the index column) under a ruled heading.
results_rule = "=" * 70
print(f"\n{results_rule}")
print("ANNOTATION RESULTS")
print(results_rule)
rendered_table = df.to_string(index=False)
print(rendered_table)

# Step 4: write the results table to disk in two delimited formats.
print("\n[Step 4] Saving results...")

# (output path, delimiter) pairs: a comma-separated CSV and a tab-separated
# TSV (tab-separated, like eggNOG output). The index column is omitted.
for out_path, delimiter in (
    ('annotation_results.csv', ','),
    ('annotation_results.tsv', '\t'),
):
    df.to_csv(out_path, sep=delimiter, index=False)
    print(f"✓ Saved: {out_path}")

# Step 5: hand both result files to the browser through the Colab helper.
print("\n[Step 5] Download results...")
try:
    files.download('annotation_results.csv')
    files.download('annotation_results.tsv')
    print("✓ Files downloaded!")
except Exception as download_error:
    # Best-effort step: the files are already on disk, so a failed download is
    # not fatal. `except Exception` (rather than a bare `except:`) lets
    # KeyboardInterrupt/SystemExit propagate, and the reason is reported.
    print("Files saved in your workspace")
    print(f"  (browser download unavailable: {download_error})")

# Closing banner.
closing_rule = "=" * 70
print("\n" + closing_rule)
print("ANNOTATION COMPLETE!")
print(closing_rule)

# Tally the rows by outcome. "Errors" counts every status that contains the
# word 'Error', which covers both 'API Error' and 'Error: <message>' rows.
statuses = [row['Status'] for row in annotations]
annotated_count = statuses.count('Annotated')
no_match_count = statuses.count('No match')
error_count = sum('Error' in status for status in statuses)

print("\nSummary:")
print(f"  Total sequences: {len(sequences)}")
print(f"  Annotated: {annotated_count}")
print(f"  No match: {no_match_count}")
print(f"  Errors: {error_count}")

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86
FUNCTIONAL ANNOTATION OF PROTEIN SEQUENCES

[Step 1] Get protein sequences
Choose an option:
1. Upload your own FASTA file
2. Download from Zenodo (https://zenodo.org/record/6861851/files/proteins.fasta)
Enter choice (1 or 2): 2

Downloading from Zenodo...
✓ Downloaded proteins.fasta

✓ Loaded 24 protein sequences
  1. FUN_000001-T1 - 227 amino acids
  2. FUN_000002-T1 - 189 amino acids
  3. FUN_000003-T1 - 158 amino acids
  ... and 21 more

[Step 2] Running functional annotation...
Using UniProt BLAST API for annotation

Annot

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Files downloaded!

ANNOTATION COMPLETE!

Summary:
  Total sequences: 24
  Annotated: 0
  No match: 24
  Errors: 0
