In [1]:
pip install Biopython

Collecting Biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Biopython
Successfully installed Biopython-1.86


In [2]:
from Bio import ExPASy
from Bio import SwissProt
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [3]:
uniprot_id = "A0A0H3K5Y8"

handle = ExPASy.get_sprot_raw(uniprot_id)
record = SwissProt.read(handle)

In [4]:
protein_sequence = record.sequence
protein_length = len(protein_sequence)

In [5]:
analysis = ProteinAnalysis(protein_sequence)
amino_acid_composition = analysis.get_amino_acids_percent()



In [6]:
output_file = "../results/qc_summary.txt"

In [7]:
import os

os.makedirs("results", exist_ok=True)
output_file = "results/qc_summary.txt"


In [8]:
with open(output_file, "w") as f:
    f.write(f"Sequence ID: {record.accessions[0]}\n")
    f.write(f"Description: {record.description}\n")
    f.write(f"Organism: {record.organism}\n\n")

    f.write("STEP 2: SEQUENCE QUALITY & BASIC ANALYSIS\n")
    f.write("---------------------------------------\n")
    f.write(f"Protein Length: {protein_length} amino acids\n\n")

    f.write("Amino Acid Composition (%):\n")
    for aa, percent in sorted(amino_acid_composition.items()):
        f.write(f"{aa}: {percent * 100:.2f}%\n")


In [9]:
print("Step 2 completed successfully using UniProt ID.")

Step 2 completed successfully using UniProt ID.


In [10]:
with open("results/qc_summary.txt") as f:
    print(f.read())

Sequence ID: A0A0H3K5Y8
Description: SubName: Full=Similar to thiol methyltransferase 1 {ECO:0000313|EMBL:BAD79577.1};
Organism: Synechococcus sp. (strain ATCC 27144 / PCC 6301 / SAUG 1402/1) (Anacystis nidulans).

STEP 2: SEQUENCE QUALITY & BASIC ANALYSIS
---------------------------------------
Protein Length: 443 amino acids

Amino Acid Composition (%):
A: 5.19%
C: 2.48%
D: 5.64%
E: 5.19%
F: 3.61%
G: 5.42%
H: 2.48%
I: 8.35%
K: 4.97%
L: 10.38%
M: 2.03%
N: 4.97%
P: 3.16%
Q: 2.93%
R: 4.74%
S: 10.16%
T: 4.74%
V: 6.09%
W: 3.61%
Y: 3.84%



In [11]:
with open("results/qc_summary.txt") as f:
    print(f.read())

Sequence ID: A0A0H3K5Y8
Description: SubName: Full=Similar to thiol methyltransferase 1 {ECO:0000313|EMBL:BAD79577.1};
Organism: Synechococcus sp. (strain ATCC 27144 / PCC 6301 / SAUG 1402/1) (Anacystis nidulans).

STEP 2: SEQUENCE QUALITY & BASIC ANALYSIS
---------------------------------------
Protein Length: 443 amino acids

Amino Acid Composition (%):
A: 5.19%
C: 2.48%
D: 5.64%
E: 5.19%
F: 3.61%
G: 5.42%
H: 2.48%
I: 8.35%
K: 4.97%
L: 10.38%
M: 2.03%
N: 4.97%
P: 3.16%
Q: 2.93%
R: 4.74%
S: 10.16%
T: 4.74%
V: 6.09%
W: 3.61%
Y: 3.84%



In [12]:
from Bio import ExPASy, SwissProt
import os

In [13]:
uniprot_id = "A0A0H3K5Y8"
handle = ExPASy.get_sprot_raw(uniprot_id)
record = SwissProt.read(handle)

protein_sequence = record.sequence
protein_length = len(protein_sequence)

In [14]:
MIN_LENGTH = 50
AMBIGUOUS_AA = {"X", "B", "Z", "J"}

contains_ambiguous = any(aa in AMBIGUOUS_AA for aa in protein_sequence)

In [15]:
if protein_length >= MIN_LENGTH and not contains_ambiguous:
    validation_status = "PASS"
    decision = "Sequence is suitable for downstream analysis (BLAST & annotation)."
else:
    validation_status = "FAIL"
    decision = "Sequence is NOT suitable for downstream analysis."

In [16]:
os.makedirs("results", exist_ok=True)
output_file = "results/sequence_validation.txt"

In [17]:
with open(output_file, "w") as f:
    f.write("STEP 3: SEQUENCE FILTERING & VALIDATION\n")
    f.write("-------------------------------------\n")
    f.write(f"UniProt ID: {record.accessions[0]}\n")
    f.write(f"Protein Name: {record.description}\n")
    f.write(f"Organism: {record.organism}\n\n")

    f.write(f"Protein Length: {protein_length} amino acids\n")
    f.write(f"Minimum Length Required: {MIN_LENGTH} amino acids\n")
    f.write(f"Contains Ambiguous Amino Acids: {contains_ambiguous}\n\n")

    f.write(f"Validation Status: {validation_status}\n")
    f.write(f"Decision: {decision}\n")

print("Step 3 completed successfully.")
print("Validation result saved to results/sequence_validation.txt")

Step 3 completed successfully.
Validation result saved to results/sequence_validation.txt


In [18]:
with open("results/sequence_validation.txt") as f:
    print(f.read())

STEP 3: SEQUENCE FILTERING & VALIDATION
-------------------------------------
UniProt ID: A0A0H3K5Y8
Protein Name: SubName: Full=Similar to thiol methyltransferase 1 {ECO:0000313|EMBL:BAD79577.1};
Organism: Synechococcus sp. (strain ATCC 27144 / PCC 6301 / SAUG 1402/1) (Anacystis nidulans).

Protein Length: 443 amino acids
Minimum Length Required: 50 amino acids
Contains Ambiguous Amino Acids: False

Validation Status: PASS
Decision: Sequence is suitable for downstream analysis (BLAST & annotation).



In [19]:
from Bio import ExPASy, SwissProt
from Bio.Blast import NCBIWWW, NCBIXML
import os

In [20]:
# 1. Fetch UniProt protein sequence
# -----------------------------
uniprot_id = "A0A0H3K5Y8"
handle = ExPASy.get_sprot_raw(uniprot_id)
record = SwissProt.read(handle)

protein_sequence = record.sequence

In [21]:
# 2. Run BLAST (blastp)
# -----------------------------
print("Running BLAST... this may take a few minutes.")

blast_result = NCBIWWW.qblast(
    program="blastp",
    database="nr",
    sequence=protein_sequence,
    hitlist_size=5
)

Running BLAST... this may take a few minutes.


In [22]:
os.makedirs("results", exist_ok=True)
blast_xml_file = "results/blast_results.xml"

with open(blast_xml_file, "w") as f:
    f.write(blast_result.read())

blast_result.close()

print("BLAST search completed.")
print("Results saved to results/blast_results.xml")

BLAST search completed.
Results saved to results/blast_results.xml


In [23]:
# Step 4 (continued): Parse BLAST results

blast_txt_file = "results/blast_results.txt"

with open(blast_xml_file) as result_handle:
    blast_records = NCBIXML.read(result_handle)

with open(blast_txt_file, "w") as f:
    f.write("STEP 4: BLAST HOMOLOGY SEARCH RESULTS\n")
    f.write("-----------------------------------\n")
    f.write(f"Query: {record.accessions[0]} - {record.description}\n\n")

    for alignment in blast_records.alignments[:5]:
        for hsp in alignment.hsps:
            f.write(f"Hit ID: {alignment.hit_id}\n")
            f.write(f"Hit Description: {alignment.hit_def}\n")
            f.write(f"Alignment Length: {alignment.length}\n")
            f.write(f"E-value: {hsp.expect}\n")
            f.write(f"Score: {hsp.score}\n")
            f.write("-" * 50 + "\n")
            break

print("Parsed BLAST hits saved to results/blast_results.txt")


Parsed BLAST hits saved to results/blast_results.txt


In [24]:
with open("results/blast_results.txt") as f:
    print(f.read())

STEP 4: BLAST HOMOLOGY SEARCH RESULTS
-----------------------------------
Query: A0A0H3K5Y8 - SubName: Full=Similar to thiol methyltransferase 1 {ECO:0000313|EMBL:BAD79577.1};

Hit ID: ref|WP_011243699.1|
Hit Description: hypothetical protein [Synechococcus elongatus] >gb|ABB56149.1| thiol methyltransferase 1-like [Synechococcus elongatus PCC 7942 = FACHB-805] >gb|MBD2587981.1| thiol methyltransferase [Synechococcus elongatus FACHB-242] >gb|MBD2689049.1| thiol methyltransferase [Synechococcus elongatus FACHB-1061] >gb|MBD2707311.1| thiol methyltransferase [Synechococcus elongatus PCC 7942 = FACHB-805] >gb|MGL5883036.1| thiol methyltransferase [Synechococcus elongatus]
Alignment Length: 443
E-value: 0.0
Score: 2371.0
--------------------------------------------------
Hit ID: gb|NBU76043.1|
Hit Description: thiol methyltransferase [Planctomycetota bacterium]
Alignment Length: 440
E-value: 7.5417e-119
Score: 937.0
--------------------------------------------------
Hit ID: ref|WP_255104091

In [25]:
from Bio.Blast import NCBIXML
import os

In [26]:
#1. Load BLAST XML results
# -----------------------------
blast_xml_file = "results/blast_results.xml"

with open(blast_xml_file) as handle:
    blast_record = NCBIXML.read(handle)

In [27]:
# 2. Extract top BLAST hit
# -----------------------------
top_alignment = blast_record.alignments[0]
top_hsp = top_alignment.hsps[0]

hit_id = top_alignment.hit_id
hit_description = top_alignment.hit_def
alignment_length = top_alignment.length
e_value = top_hsp.expect
score = top_hsp.score
identity = (top_hsp.identities / top_hsp.align_length) * 100

In [28]:
# 3. Save functional annotation
# -----------------------------
os.makedirs("results", exist_ok=True)
output_file = "results/functional_annotation.txt"

with open(output_file, "w") as f:
    f.write("STEP 5: FUNCTIONAL ANNOTATION\n")
    f.write("------------------------------\n\n")

    f.write("Top BLAST Hit Information:\n")
    f.write(f"Hit ID: {hit_id}\n")
    f.write(f"Hit Description: {hit_description}\n")
    f.write(f"Alignment Length: {alignment_length}\n")
    f.write(f"E-value: {e_value}\n")
    f.write(f"Bit Score: {score}\n")
    f.write(f"Percent Identity: {identity:.2f}%\n\n")

    f.write("Functional Inference:\n")
    f.write("Based on the high sequence similarity to the top BLAST hit, ")
    f.write("the query protein is predicted to have ")
    f.write("a similar molecular function and biological role as the matched protein. ")
    f.write("This annotation is inferred computationally and requires experimental validation.\n")
print("Step 5 completed successfully.")
print("Functional annotation saved to results/functional_annotation.txt")

Step 5 completed successfully.
Functional annotation saved to results/functional_annotation.txt


In [29]:
with open("results/functional_annotation.txt") as f:
    print(f.read())

STEP 5: FUNCTIONAL ANNOTATION
------------------------------

Top BLAST Hit Information:
Hit ID: ref|WP_011243699.1|
Hit Description: hypothetical protein [Synechococcus elongatus] >gb|ABB56149.1| thiol methyltransferase 1-like [Synechococcus elongatus PCC 7942 = FACHB-805] >gb|MBD2587981.1| thiol methyltransferase [Synechococcus elongatus FACHB-242] >gb|MBD2689049.1| thiol methyltransferase [Synechococcus elongatus FACHB-1061] >gb|MBD2707311.1| thiol methyltransferase [Synechococcus elongatus PCC 7942 = FACHB-805] >gb|MGL5883036.1| thiol methyltransferase [Synechococcus elongatus]
Alignment Length: 443
E-value: 0.0
Bit Score: 2371.0
Percent Identity: 100.00%

Functional Inference:
Based on the high sequence similarity to the top BLAST hit, the query protein is predicted to have a similar molecular function and biological role as the matched protein. This annotation is inferred computationally and requires experimental validation.

