In [None]:

!pip install biopython


from Bio import SeqIO
from collections import Counter
import matplotlib.pyplot as plt


from google.colab import files
import zipfile
import os

# Upload the NCBI dataset archive through the Colab file picker and unpack it
# into the local "ncbi_dataset" directory.
import io

# Per the Colab documentation, files.upload() returns a dict keyed by file
# name whose values are the uploaded file contents as bytes.
uploaded = files.upload()
if not uploaded:
    raise FileNotFoundError("No file was uploaded; expected the NCBI dataset zip archive.")

# Prefer an upload whose name ends in ".zip"; otherwise fall back to the first
# uploaded entry.
zip_name = next(
    (name for name in uploaded if name.lower().endswith(".zip")),
    next(iter(uploaded)),
)

# Open the archive from the uploaded bytes instead of a hard-coded file name,
# so the cell works regardless of what the browser called the download
# (e.g. "ncbi_dataset.zip" vs "ncbi_dataset (1).zip").
with zipfile.ZipFile(io.BytesIO(uploaded[zip_name]), "r") as zip_ref:
    zip_ref.extractall("ncbi_dataset")

# GenBank flat file for the assembly inside the extracted archive.
gbff_file = "ncbi_dataset/ncbi_dataset/data/GCA_039050005.1/genomic.gbff"

# Extract the NA (neuraminidase) gene sequence: scan every record in the
# GenBank file and take the first CDS feature whose "product" qualifier
# mentions neuraminidase.
na_sequence = None
for record in SeqIO.parse(gbff_file, "genbank"):
    for feature in record.features:
        if feature.type != "CDS":
            continue
        # Features without a "product" qualifier fall back to an empty string.
        product = feature.qualifiers.get("product", [""])[0]
        # Case-insensitive so "Neuraminidase" is matched as well.
        if "neuraminidase" in product.lower():
            na_sequence = feature.location.extract(record).seq
            print(f"NA Gene Found: {product}")
            print(f"Sequence Length: {len(na_sequence)}")
            break
    # The inner `break` only leaves the feature loop; stop scanning the
    # remaining records too, so a later match cannot overwrite the result.
    if na_sequence is not None:
        break

if na_sequence is None:
    print("No neuraminidase CDS feature was found in the GenBank file.")


if na_sequence:
    # Split the sequence into complete, non-overlapping triplets starting at
    # offset 0; a trailing partial codon (1-2 bases) is dropped.
    complete_length = len(na_sequence) - len(na_sequence) % 3
    codons = [str(na_sequence[offset:offset + 3]) for offset in range(0, complete_length, 3)]
    codon_usage = Counter(codons)

    print("\nCodon Usage:")
    for triplet, occurrences in codon_usage.items():
        print(f"{triplet}: {occurrences}")

    # Shaded spans drawn over the bar chart.
    # NOTE(review): the ranges are x-axis coordinates, i.e. bar indices on the
    # categorical codon axis, not sequence positions -- confirm this is the
    # intended meaning of these region labels.
    annotations = {
        "Catalytic Domain": {"range": [5, 15], "color": "red"},
        "Binding Site": {"range": [20, 25], "color": "green"},
        "Epitope Region": {"range": [30, 40], "color": "blue"}
    }

    # Bars appear in the order each codon was first counted.
    codon_list = list(codon_usage)
    frequencies = list(codon_usage.values())

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(codon_list, frequencies, color="gray")
    for region_name, style in annotations.items():
        span_start, span_end = style["range"]
        ax.axvspan(span_start, span_end, color=style["color"], alpha=0.3, label=region_name)

    ax.set_xlabel("Codons")
    ax.set_ylabel("Frequency")
    ax.set_title("Codon Usage Frequency for 2022 NA Gene")
    # Rotate the codon labels so they do not overlap.
    ax.tick_params(axis="x", labelrotation=90)
    ax.legend()
    fig.tight_layout()
    plt.show()

# Mutation table: position -> substitution type.
mutations = {100: "synonymous", 200: "non-synonymous", 250: "non-synonymous", 400: "synonymous", 450: "non-synonymous"}
# Region name -> inclusive [start, end] position range.
functional_regions = {"Catalytic Domain": [50, 150], "Binding Site": [200, 300], "Epitope Region": [350, 450]}

# Upper bound applied to the ratios before plotting.
DN_DS_CAP = 10


def _dn_ds_ratio(dn, ds):
    """Return dn / ds, treating a zero denominator explicitly.

    With no synonymous changes the ratio is undefined: it is reported as
    infinity when there are non-synonymous changes and as 0.0 when there are
    none. This replaces an additive 1e-5 fudge factor, which skewed every
    ratio slightly (1 / 1 became 0.99999...) and turned the zero-denominator
    case into an arbitrary huge number.
    """
    if ds:
        return dn / ds
    return float("inf") if dn else 0.0


# One ratio per region, in the iteration order of `functional_regions`.
dn_ds_values = []
for region, (start, end) in functional_regions.items():
    in_region = [typ for pos, typ in mutations.items() if start <= pos <= end]
    dn = in_region.count("non-synonymous")
    ds = in_region.count("synonymous")
    dn_ds_values.append(_dn_ds_ratio(dn, ds))

# Clamp to DN_DS_CAP so undefined (infinite) ratios remain plottable.
capped_dn_ds_values = [min(value, DN_DS_CAP) for value in dn_ds_values]

# Bar chart of the capped per-region ratios, drawn on a logarithmic y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
region_names = list(functional_regions)
ax.bar(
    region_names,
    capped_dn_ds_values,
    color="skyblue",
    alpha=0.8,
    edgecolor="black",
)
ax.set_yscale("log")
ax.set_xlabel("Functional Regions")
ax.set_ylabel("Log dN/dS Ratio")
ax.set_title("Log dN/dS Ratio Across Functional Regions of NA Gene")
plt.show()


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 47.8 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.84
