In [1]:
import pandas as pd
import altair as alt
import pickle
import sys
import os
from Bio.SeqFeature import ExactPosition

# Add the parent directory of your notebook to the Python path
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'scripts'))
from Genbank import GenBankRecord


alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

There's already a reasonably complete tree of CHIKV sequences on Nextstrain (last updated in 2023). There are ~1000 sequences in the tree. However, there are ~8000 sequences on Genbank. Most of these might be incomplete.

In [5]:
all_sequences_df = pd.read_csv('../../data/sequences.acc', header=None, names=['accession'])
vienna_sequences_df = pd.read_csv('../../data/ViennaRNA_CHIKV_metadata.tsv', sep='\t').drop('accession', axis=1).rename(columns={'strain': 'accession'})

In [6]:
library_accession = "MW473668.1"
outgroup_accession = "NC_075006.1"
all_accessions = all_sequences_df.accession.unique()
vienna_accessions = vienna_sequences_df.accession.unique()
print(f"There are {len(all_accessions)} unique accessions in all_sequences_df.")
print(f"There are {len(vienna_accessions)} unique accessions in vienna_sequences_df.")
print(f"{(len(set(vienna_accessions) & set(all_accessions)) / len(set(vienna_accessions)) * 100)}% of the Vienna accessions are present in all_sequences_df.")
print(f"The library strain is in the vienna dataset: {library_accession in vienna_accessions}")

There are 8366 unique accessions in all_sequences_df.
There are 1398 unique accessions in vienna_sequences_df.
100.0% of the Vienna accessions are present in all_sequences_df.
The library strain is in the vienna dataset: True


In [7]:
with open("../../results/sequences/records.pickle", 'rb') as f:
    records = pickle.load(f)

In [9]:
metadata = pd.read_csv('../../results/sequences/metadata.csv')
metadata.head()

Unnamed: 0,accession,url,authors,title,journal,paper_link,submission,strain,organism,host,...,region,subregion,identity,feature,n_cds,feature_length,feature_translation_length,sequence,translation,feature_ambiguous
0,NC_004162.2,https://www.ncbi.nlm.nih.gov/nucleotide/NC_004...,"Khan,A.H., Morita,K., del Carmen Parquet,M., H...",Complete nucleotide sequence of chikungunya vi...,"J. Gen. Virol. 83 (Pt 12), 3075-3084 (2002)",https://pubmed.ncbi.nlm.nih.gov/12466484,Submitted (10-JAN-2003) Department of Virology...,S27-African prototype,Chikungunya virus,Unknown,...,Unknown,Unknown,88.123832,structural polyprotein,2,3747,1248,ATGGAGTTCATCCCAACCCAAACTTTTTACAACAGGAGGTACCAGC...,MEFIPTQTFYNRRYQPRPWTPRPTIQVIRPRPRPQRQAGQLAQLIS...,0
1,PV066168.1,https://www.ncbi.nlm.nih.gov/nucleotide/PV0661...,"Horthongkham,N. and Athipanyasilp,N.",Genetic characterization of chikunugunya virus...,Unpublished,,"Submitted (04-FEB-2025) Microbiology, Mahidol ...",SIMI 057,Chikungunya virus,Homo sapiens,...,Asia,South-Eastern Asia,85.321591,structural polyprotein,2,3747,1248,ATGGAGTTCATCCCAACCCAAACCTTTTACAATAGGAGGTACCAGC...,MEFIPTQTFYNRRYQPRPWTPRSTIQIIRPRPRPQRQAGQLAQLIS...,0
2,PV066169.1,https://www.ncbi.nlm.nih.gov/nucleotide/PV0661...,"Horthongkham,N. and Athipanyasilp,N.",Genetic characterization of chikunugunya virus...,Unpublished,,"Submitted (04-FEB-2025) Microbiology, Mahidol ...",SIMI 058,Chikungunya virus,Homo sapiens,...,Asia,South-Eastern Asia,85.455031,structural polyprotein,2,3747,1248,ATGGAGTTCATCCCAACCCAAACCTTTTACAATAGGAGGTACCAGC...,MEFIPTQTFYNRRYQPRPWTPRSTIQIIRPRPRPQRQAGQLAQLIS...,0
3,PV054361.1,https://www.ncbi.nlm.nih.gov/nucleotide/PV0543...,"Umair,M., Hakim,R., Jamal,Z. and Salman,M.",Direct Submission,Unpublished,,Submitted (31-JAN-2025) Department of Virology...,CHIKV-NIHPAK-03/2024,Chikungunya virus,Homo sapiens,...,Asia,Southern Asia,85.321591,structural polyprotein,2,3747,1248,ATGGAGTTCATCCCAACCCAGACTTTTTACAATAGGAGGTACCAGC...,MEFIPTQTFYNRRYQPRPWTPRSTIQIIRPRPRPQRQAGQLAQLIS...,0
4,PV054362.1,https://www.ncbi.nlm.nih.gov/nucleotide/PV0543...,"Umair,M., Hakim,R., Jamal,Z. and Salman,M.",Direct Submission,Unpublished,,Submitted (31-JAN-2025) Department of Virology...,CHIKV-NIHPAK-04/2024,Chikungunya virus,Homo sapiens,...,Asia,Southern Asia,85.321591,structural polyprotein,2,3747,1248,ATGGAGTTCATCCCAACCCAGACTTTTTACAATAGGAGGTACCAGC...,MEFIPTQTFYNRRYQPRPWTPRSTIQIIRPRPRPQRQAGQLAQLIS...,0


In [14]:
records['NC_004162.2'].coding_regions[1].check_translation()

True

In [38]:
# Loop through each row in the DataFrame and extract the accession, sequence, and translation
failed = 0
failed_links = []
failed_records = []
for row in metadata.itertuples():
    accession = row.accession
    sequence = row.sequence
    translation = row.translation

    for cds in records[accession].coding_regions:
        if cds.sequence == sequence:
            start_is_exact = isinstance(cds.feature.location.start, ExactPosition)
            end_is_exact = isinstance(cds.feature.location.end, ExactPosition)
            if not start_is_exact or not end_is_exact:
                failed += 1
                failed_links.append(row.url)
                failed_records.append(records[accession])
            # if not cds.check_translation():
            #     print(f"Warning: {accession} translation is not the same as the CDS translation")

print(f"Failed: {failed} sequences:")
for link in failed_links:
    print(link)

Failed: 681 sequences:
https://www.ncbi.nlm.nih.gov/nucleotide/PP196586.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196587.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196588.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196589.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196590.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196591.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196592.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196593.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196594.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196595.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196596.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196597.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196598.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196599.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196600.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196601.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196603.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196604.1
https://www.ncbi.nlm.nih.gov/nucleotide/PP196605.1
https://

In [35]:
failed_records[0].coding_regions[1].feature

SeqFeature(SimpleLocation(ExactPosition(7425), AfterPosition(11102), strand=1), type='CDS', qualifiers=...)

In [40]:
partial = 0
disagreed = 0
multiple = 0
start = 0
failed_accesions = []
for row in metadata.itertuples():
    accession = row.accession
    sequence = row.sequence
    translation = row.translation
    for cds in records[accession].coding_regions:
        if cds.sequence == sequence:
            start_is_exact = isinstance(cds.feature.location.start, ExactPosition)
            end_is_exact = isinstance(cds.feature.location.end, ExactPosition)
            if not start_is_exact or not end_is_exact:
                partial += 1
                failed_accesions.append(row.accession)
            elif not cds.check_translation():
                disagreed += 1
                failed_accesions.append(row.accession)
            elif len(sequence) % 3 != 0:
                multiple += 1
                failed_accesions.append(row.accession)
            elif translation[0] != 'M':
                start += 1
                failed_accesions.append(row.accession)

print(f"Failed: {failed} sequences:")
print(f"Partial: {partial} sequences:")
print(f"Disagreed: {disagreed} sequences:")
print(f"Multiple: {multiple} sequences:")
print(f"Start: {start} sequences:")

Failed: 681 sequences:
Partial: 681 sequences:
Disagreed: 0 sequences:
Multiple: 0 sequences:
Start: 0 sequences:


In [3]:
x = "ATGCGGGTCTAG"
x[-3:]

'TAG'

In [17]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


In [18]:
alignment = SeqIO.to_dict(SeqIO.parse("../../results/alignments/protein.fa", "fasta"))
sequences = SeqIO.to_dict(SeqIO.parse("../../results/sequences/nucleotide.fa", "fasta"))

In [19]:
assert (sequences.keys() == alignment.keys()), "Keys in sequences and alignment dictionaries don't match"

In [20]:
# Iterate through protein alignments and creating codon alignments
codons = {}
for accession, translation in alignment.items():
    sequence = sequences[accession]
    codon_sequence = ""
    codon_index = 0
    for amino_acid in translation:
        if amino_acid == "-":
            codon_sequence += "---"
        else:
            codon_sequence += sequence[codon_index:codon_index + 3]
            codon_index += 3
    codons[accession] = codon_sequence

In [29]:
set([len(seq) for seq in codons.values()])

{3744}

In [24]:
# Write the codon alignments to a file
with open("../../results/alignments/codon.fa", "w") as handle:
    SeqIO.write([seq for seq in codons.values()], handle, "fasta")

In [2]:
library_accession = "MW473668.1"

In [4]:
library_seq = GenBankRecord(library_accession).fetch()

In [8]:
polyprotein = library_seq.coding_regions[1]


In [16]:
gbk = polyprotein.fetch_protein_record()

In [17]:
gbk

SeqRecord(seq=Seq('MEFIPTQTFYNRRYQPRPWTPRPTIQVIRPRPRPQRKAGQLAQLISAVNKLTMR...SRH'), id='UFI00980.1', name='UFI00980', description='structural polyprotein [Chikungunya virus]', dbxrefs=[])

In [7]:
import pickle
import sys
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation, Seq
# Add the parent directory of your notebook to the Python path
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'scripts'))
from Genbank import GenBankRecord


In [8]:
library_accession = "MW473668.1"

In [9]:
record = GenBankRecord(library_accession).fetch()

In [10]:
for cds in record.coding_regions:
    if cds.product == 'structural polyprotein':
        polyprotein = cds
        break

In [11]:
start = int(cds.feature.location.start)
end = int(cds.feature.location.end)
strand = cds.feature.location.strand

In [12]:
peptides = []
for feature in record.record.features:
    if feature.type == "mat_peptide":
        if feature.location.start >= start and feature.location.end <= end and feature.location.strand == strand:
            peptides.append(feature)
peptides

[SeqFeature(SimpleLocation(ExactPosition(7540), ExactPosition(8323), strand=1), type='mat_peptide', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(8323), ExactPosition(8515), strand=1), type='mat_peptide', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(8515), ExactPosition(9784), strand=1), type='mat_peptide', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(9784), ExactPosition(9967), strand=1), type='mat_peptide', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(9967), ExactPosition(11284), strand=1), type='mat_peptide', qualifiers=...)]

In [60]:
def genbank_from_cds(accession, product, name, include_mat_peptides=True):
    """
    Make a new GenBank reference for a subset of the original GenBank record.

    Parameters
    ----------
    accession : str
        The accession of the reference GenBank record.
    product : str
        The name of the CDS product to extract.
    name : str
        The name of the new GenBank record.
    include_mat_peptides : bool, optional
        Whether to include the mat_peptide features in the new record. Default is True.
    """
    # Fetch the GenBank record
    reference = GenBankRecord(accession).fetch()
    # Initialize a list of features for the new record
    features = []
    # Find the main CDS with the specified product
    for cds in reference.coding_regions:
        if cds.product == product:
            features.append(cds.feature)
    if len(features) == 0:
            raise ValueError(f"CDS with product {product} not found in record {accession}")
    # Get position information for the coding sequence
    start = int(features[0].location.start)
    end = int(features[0].location.end)
    strand = features[0].location.strand
    length = end - start
    # Collect a list of mature peptide features within the cds region
    if include_mat_peptides:
        for feature in reference.record.features:
            if feature.type == "mat_peptide":
                if feature.location.start >= start and feature.location.end <= end:
                    features.append(feature)
    # Create a new GenBank record
    record = SeqRecord(
        features[0].location.extract(reference.record).seq, 
        id=name,
        name=name,
        description="",
        annotations={
            "molecule_type": reference.record.annotations["molecule_type"]
        }
    )
    # Add the source feature to the new record
    source = (
        SeqFeature(
            FeatureLocation(start=0, end=length, strand=strand),
            type="source",
            qualifiers=features[0].qualifiers
        )
    )
    source.qualifiers["product"] = name
    record.features.append(source)
    # Add the other features to the new record
    for i, feature in enumerate(features):
        relative_start = feature.location.start - start
        relative_end = feature.location.end - start
        if i == 0:
            gene_name = name
            translation = feature.extract(reference.record.seq).translate(to_stop=True)
            feature.qualifiers["product"] = gene_name
            feature.qualifiers["gene"] = gene_name
            feature.qualifiers["locus_tag"] = gene_name
        else:
            gene_name = feature.qualifiers.get("product", [""])[0]
            translation = feature.extract(reference.record.seq).translate(to_stop=True)
            feature.qualifiers["gene"] = gene_name
            feature.qualifiers["locus_tag"] = gene_name
        if "translation" not in feature.qualifiers:
            feature.qualifiers["translation"] = [str(translation)]
        cds = SeqFeature(FeatureLocation(start=relative_start, end=relative_end), type="CDS", qualifiers=feature.qualifiers)
        record.features.append(cds)

    return record

In [61]:
rec = genbank_from_cds("MW473668.1", "structural polyprotein", "CHIKVgp2", include_mat_peptides=True)

In [62]:
# Save the record as a GenBank file
with open("./library.gb", "w") as handle:
    SeqIO.write(rec, handle, "genbank")