# Analyze CHIKV Passaging Mutations

This notebook converts the nucleotide positions of the passaging mutations reported in [this paper](https://www.nature.com/articles/s41598-018-34561-x#additional-information) into the reference amino acid numbering that we used for our DMS library.

In [108]:
import pandas as pd
import numpy as np
from Bio import Entrez, SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature

In [109]:
# Get the GenBank record (accession EU224268) for the strain used in the study
Entrez.email = "email@email.com"
handle = Entrez.efetch(db="nucleotide", id="EU224268", rettype="gb", retmode="text")
try:
    # The response holds a single record, so SeqIO.read (not parse) is used
    strain = SeqIO.read(handle, "genbank")
finally:
    # Release the network handle even if parsing the record fails
    handle.close()
print(strain)

ID: EU224268.1
Name: EU224268
Description: Cloning vector pCHIKV-LR ic, complete sequence
Number of features: 8
/molecule_type=DNA
/topology=circular
/data_file_division=SYN
/date=24-NOV-2007
/accessions=['EU224268']
/sequence_version=1
/keywords=['']
/source=Cloning vector pCHIKV-LR ic
/organism=Cloning vector pCHIKV-LR ic
/taxonomy=['other sequences', 'artificial sequences', 'vectors']
/references=[Reference(title='Infectious clones of Chikungunya virus (La Reunion isolate) for vector competence studies', ...), Reference(title='Direct Submission', ...)]
Seq('ATGGCTGCGTGAGACACACGTAGCCTACCAGTTTCTTACTGCTCTACTCTGCAA...TAG')


In [110]:
# Get the 1-indexed nucleotide start of the structural polyprotein CDS
gene_start = None
for feature in strain.features:
    if feature.type != "CDS":
        continue
    # A CDS may lack a "product" qualifier; fall back to a placeholder so such
    # features are skipped instead of raising a TypeError on None[0].
    products = feature.qualifiers.get("product") or [""]
    if products[0] == 'structural polyprotein':
        # Biopython feature locations are 0-based, so add 1 to get the
        # 1-indexed start; int() yields a plain integer for later arithmetic.
        gene_start = int(feature.location.start) + 1

# Fail here rather than letting later cells hit a NameError or reuse a stale value
if gene_start is None:
    raise ValueError("No CDS with product 'structural polyprotein' found in the record")

In [111]:
# Load the passaging mutations and translate each nucleotide position (POS)
# into a 1-indexed codon position within the polyprotein.
passaging_mutations = pd.read_csv('mutations.csv')
# Nucleotide offset of each mutation from the first base of the gene
nt_offset = passaging_mutations['POS'] - gene_start
# Three nucleotides per codon; floor-divide, then shift to 1-indexed
passaging_mutations['AA_POS'] = nt_offset.floordiv(3).add(1)
passaging_mutations.head()

Unnamed: 0,POS,Passage,Region,REF,ALT,AA_REF,AA_ALT,FREQ,Sample,Species,AA_POS
0,8549,51,E2,A,G,K,R,0.04,A20-1,aegypti,328
1,8549,15,E2,A,G,K,R,0.25,U4-1,albopictus,328
2,8564,15,E2,T,C,V,A,0.03,U4-1,albopictus,333
3,8566,50,E2,T,C,Y,H,0.48,C6-1,albopictus,334
4,8566,25,E2,T,C,Y,H,0.25,C6-1,albopictus,334


In [112]:
# Map each polyprotein position (AA_POS) onto the sequential numbering of the
# DMS data. Same arithmetic as (AA_POS - 262 + 1) + 1, with the offsets named.
# NOTE(review): 262 is presumably the polyprotein position at which the DMS
# region begins, and the trailing +1 an extra leading site in the library -
# confirm against the library design.
region_first_aa = 262
extra_leading_sites = 1
position_in_region = passaging_mutations['AA_POS'] - region_first_aa + 1
passaging_mutations['sequential_site'] = position_in_region + extra_leading_sites
passaging_mutations.head()

Unnamed: 0,POS,Passage,Region,REF,ALT,AA_REF,AA_ALT,FREQ,Sample,Species,AA_POS,sequential_site
0,8549,51,E2,A,G,K,R,0.04,A20-1,aegypti,328,68
1,8549,15,E2,A,G,K,R,0.25,U4-1,albopictus,328,68
2,8564,15,E2,T,C,V,A,0.03,U4-1,albopictus,333,73
3,8566,50,E2,T,C,Y,H,0.48,C6-1,albopictus,334,74
4,8566,25,E2,T,C,Y,H,0.25,C6-1,albopictus,334,74


In [113]:
# Attach the DMS reference numbering to the passaging mutations
sitemap = pd.read_csv('../../../data/site_numbering_map.csv')

# Left-join on sequential_site so every mutation row is kept.
# Only the two site-map columns that are needed take part in the merge, so any
# other column it shares with the mutations table cannot be renamed with
# _x/_y suffixes. validate='many_to_one' makes pandas raise a MergeError if a
# sequential_site is repeated in the site map, instead of silently
# duplicating mutation rows.
columns_to_keep = ['reference_site', 'AA_REF', 'AA_ALT', 'FREQ', 'Species', 'Passage', 'Sample']
passaging_mutations = passaging_mutations.merge(
    sitemap[['sequential_site', 'reference_site']],
    on='sequential_site',
    how='left',
    validate='many_to_one',
)[columns_to_keep]
passaging_mutations.head()

Unnamed: 0,reference_site,AA_REF,AA_ALT,FREQ,Species,Passage,Sample
0,3(E2),K,R,0.04,aegypti,51,A20-1
1,3(E2),K,R,0.25,albopictus,15,U4-1
2,8(E2),V,A,0.03,albopictus,15,U4-1
3,9(E2),Y,H,0.48,albopictus,50,C6-1
4,9(E2),Y,H,0.25,albopictus,25,C6-1


In [115]:
# Save the renumbered passaging mutations to CSV, leaving out the row index
passaging_mutations.to_csv(path_or_buf='passaging_mutations.csv', index=False)