In [104]:
import pandas as pd

# Analyze mutations from [Morley et al., 2018](https://academic.oup.com/ve/article/4/1/vey012/5032629)

[Morley et al., 2018](https://academic.oup.com/ve/article/4/1/vey012/5032629) engineered an attenuated CHIKV strain with a large (258 nt) deletion in the 3′UTR that lowered infectivity in mosquito cells (Aag2) and maintained infectivity in primate cells (Vero). The original strain is the SL07 CHIKV strain of the Indian Ocean lineage.

The study has three conditions:

1. CHIKV passaged in Vero cells for 25 rounds.
2. CHIKV passaged in Aag2 cells for 7 rounds — the attenuated strain struggles to grow in mosquito cells.
3. CHIKV passaged alternately between Vero and Aag2 cells for 25 rounds. Round 24 is in Vero cells and round 25 is in Aag2 cells.

They isolated genomic viral RNA from ninety-seven virus samples (ancestral virus stock and twenty-four samples each from Vero Passage 25, Alternating Passage 24, Alternating Passage 25, and Aag2 Passage 7). They used virus specific primers to amplify overlapping PCR fragments 1.5–2.1 kb in length. Samples were sequenced in a single lane via paired-end, 75-bp read Illumina HiSeq 2500.

Nearly all of the passaging conditions improved fitness on **both** cell lines:
> In summary, passaged virus populations generally showed improved fitness on both cell lines. Passage on Aag2 cells resulted in high correlated fitness gains on both cell lines, while passage on Vero cells resulted in high fitness gains on Vero cells but only modest gains on Aag2 cells. Alternating passage resulted in high fitness gains on both cell lines.

In [105]:
# Load the variant calls and the run-level metadata for PRJNA437311 from local CSV files
all_variants_df = pd.read_csv("../results/summary/all_variants.csv").assign(
    # Variant label = reference site followed by the variant amino acid
    variant=lambda df: df['reference_site'].astype(str) + df['variant_amino_acid']
)
# Restrict the variant table to runs from Morley et al., 2018
PRJNA437311_variants_df = all_variants_df[all_variants_df['BioProject'] == 'PRJNA437311']
# Keep only the metadata fields used downstream and rename them to match the variant table
metadata_columns_to_keep = ['Run', 'Library Name', 'passage_history', 'lab_host']
PRJNA437311_metadata_df = (
    pd.read_csv("./PRJNA437311_metadata.csv")
    .loc[:, metadata_columns_to_keep]
    .rename(columns={'Run': 'Accession', 'Library Name': 'Sample'})
)
# Left join on the run accession so every variant row is kept
passaging_variants_df = pd.merge(
    PRJNA437311_variants_df,
    PRJNA437311_metadata_df,
    how='left',
    on='Accession',
)

In [106]:
# Compare the runs that contribute at least one variant row against the runs
# listed in the metadata, so the "all runs" statement is checked rather than assumed.
runs_with_variants = set(passaging_variants_df['Accession'].to_list())
runs_in_metadata = set(PRJNA437311_metadata_df['Accession'].to_list())
runs_without_variants = runs_in_metadata - runs_with_variants
if not runs_without_variants:
    print(f"All {len(runs_with_variants)} runs in PRJNA437311 have variants in the dataset.")
else:
    # Some metadata runs never appear in the variant table; list them explicitly
    print(
        f"{len(runs_with_variants)} of {len(runs_in_metadata)} runs in PRJNA437311 have variants in the dataset; "
        f"runs without variants: {sorted(map(str, runs_without_variants))}"
    )

All 97 runs in PRJNA437311 have variants in the dataset.


In [107]:
# Collapse the verbose passage descriptions into short condition labels
simplify_passaging_history = {
    'single passage on BHK-21 cells': 'Stock',
    '7 passages on Aag2 cells': 'Aag2',
    '25 passages on Vero cells': 'Vero',
    '24 passages alternating Aag2 cells and Vero cells, isolated from Vero cell passage': 'Alt_Vero',
    '25 passages alternating Aag2 cells and Vero cells, isolated from Aag2 cell passage': 'Alt_Aag2',
}
# Any passage_history value absent from the mapping becomes NaN in 'Passage'
passage_labels = passaging_variants_df['passage_history'].map(simplify_passaging_history)
passaging_variants_df['Passage'] = passage_labels

In [108]:
# Derive the population replicate label from the sample name:
# take the second '_'-separated token and strip the word 'Population' (any case).
sample_name_tokens = passaging_variants_df['Sample'].str.split('_')
population_token = sample_name_tokens.str.get(1)
passaging_variants_df['Population'] = population_token.str.replace('Population', '', case=False)

In [109]:
# Load the per-mutation DMS measurements and label each row with the same
# site + amino-acid key used for the passaging variants, keeping only the
# per-condition columns and the key.
dms_columns_to_keep = ['293T_Mxra8', 'C636', '293T_TIM1', '293T_Mxra8 minus C636', 'variant']
dms_mut_diffs_df = (
    pd.read_csv("../../results/compare_cell_entry/mut_diffs.csv")
    .assign(variant=lambda df: df['site'].astype(str) + df['mutant'])
    .loc[:, dms_columns_to_keep]
)
# Left join: passaging variants without a matching DMS row get NaN in the DMS columns
passaging_variants_df = pd.merge(
    passaging_variants_df,
    dms_mut_diffs_df,
    how='left',
    on='variant',
)

In [110]:
# Variants observed in the 'ancestral' stock virus sample
stock_variants_df = passaging_variants_df.query("Passage == 'Stock'")
# Variants with frequency > 0.95 in the stock are treated as 'fixed'
fixed_variants = stock_variants_df.query("variant_frequency > 0.95")['variant'].unique()
# Print the count explicitly: a bare expression that is not the last statement
# of a cell is evaluated and discarded, so the message was never shown.
print(f"There are {len(fixed_variants)} fixed variants in the stock virus relative to our library strain.")
# Exclude the fixed variants from the passaging variants dataframe
passaging_variants_df = passaging_variants_df.query("variant not in @fixed_variants")

## Which mutations arise in each passaging condition?

In [111]:
def _format_frequencies(frequencies):
    """Render a group's variant frequencies as '(0.123, 0.456)' with three decimals each."""
    return "(" + ", ".join(f"{val:.3f}" for val in frequencies) + ")"


# One summary row per (passage condition, variant, DMS values) combination, holding
# the formatted list of observed frequencies and the number of rows in the group.
# NOTE(review): groupby's default dropna=True skips rows with NaN in any key, so
# variants lacking DMS values after the left join are left out — confirm this is intended.
summary_group_keys = ['Passage', 'variant', 'C636', '293T_Mxra8', '293T_TIM1', '293T_Mxra8 minus C636']
passaging_variants_summary = (
    passaging_variants_df
    .groupby(summary_group_keys)
    .agg(
        variant_frequencies=('variant_frequency', _format_frequencies),
        count=('variant_frequency', 'size'),
    )
    .reset_index()
)

### Variants in the Stock that weren't fixed in the Stock

In [112]:
# Stock rows only, variants seen in the most rows first
(
    passaging_variants_summary
    .loc[passaging_variants_summary['Passage'] == 'Stock']
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Passage,variant,C636,293T_Mxra8,293T_TIM1,293T_Mxra8 minus C636,variant_frequencies,count
0,Stock,157(E2)V,0.507,0.2,0.218,-0.307,(0.388),1
1,Stock,164(E2)T,-0.109,-0.123,-0.041,-0.014,(0.817),1


### Variants in Aag2 cells after 7 passages

In [113]:
# Aag2 rows only, variants seen in the most rows first
(
    passaging_variants_summary
    .loc[passaging_variants_summary['Passage'] == 'Aag2']
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Passage,variant,C636,293T_Mxra8,293T_TIM1,293T_Mxra8 minus C636,variant_frequencies,count
0,Aag2,340(E1)L,-1.055,-0.567,-0.862,0.488,"(0.919, 0.121, 0.618, 0.629)",4
1,Aag2,220(E1)L,-0.255,-0.228,-0.119,0.027,"(0.076, 0.083, 0.067)",3
2,Aag2,157(E2)V,0.507,0.2,0.218,-0.307,"(0.222, 0.422)",2
3,Aag2,164(E2)T,-0.109,-0.123,-0.041,-0.014,"(0.757, 0.561)",2
4,Aag2,386(E2)M,-1.051,-0.68,-0.594,0.371,"(0.227, 0.281)",2
5,Aag2,392(E1)E,-7.227,-7.521,-4.931,0.0,"(0.056, 0.181)",2
6,Aag2,47(6K)I,-0.253,-0.398,-0.197,-0.145,"(0.102, 0.051)",2
7,Aag2,110(E2)A,-0.28,0.021,-0.465,0.301,(0.064),1
8,Aag2,421(E1)A,-4.434,-1.8,-1.545,2.634,(0.076),1
9,Aag2,75(E1)N,-1.153,-0.527,-0.507,0.626,(0.059),1


### Variants in Vero cells after 25 passages

In [114]:
# Vero rows only, variants seen in the most rows first
(
    passaging_variants_summary
    .loc[passaging_variants_summary['Passage'] == 'Vero']
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Passage,variant,C636,293T_Mxra8,293T_TIM1,293T_Mxra8 minus C636,variant_frequencies,count
0,Vero,73(E2)Y,-6.611,-4.467,-4.192,0.533,"(0.395, 0.129, 0.057, 0.927, 0.089)",5
1,Vero,221(E2)R,0.19,0.235,0.229,0.045,"(0.676, 0.385)",2
2,Vero,60(E2)N,-0.732,0.003,-0.301,0.735,"(0.116, 0.138)",2
3,Vero,265(E2)I,-2.055,-1.443,-0.994,0.612,"(0.082, 0.171)",2
4,Vero,43(6K)R,-0.069,-0.046,-0.085,0.022,"(0.988, 0.996)",2
5,Vero,55(E2)R,-7.438,-7.208,-7.242,0.0,"(0.151, 0.989)",2
6,Vero,146(E2)R,-3.74,-3.74,-2.551,0.0,"(0.751, 0.451)",2
7,Vero,64(E2)R,-2.87,-0.714,-0.741,2.156,"(0.314, 0.063)",2
8,Vero,79(E2)V,-6.969,-7.242,-6.109,0.0,"(0.924, 0.991)",2
9,Vero,63(E2)V,-7.033,-7.323,-5.773,0.0,(0.891),1


### Variants in Vero cells after 24 alternating passages between Vero and Aag2

In [115]:
# Alt_Vero rows only, variants seen in the most rows first
(
    passaging_variants_summary
    .loc[passaging_variants_summary['Passage'] == 'Alt_Vero']
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Passage,variant,C636,293T_Mxra8,293T_TIM1,293T_Mxra8 minus C636,variant_frequencies,count
0,Alt_Vero,55(E2)R,-7.438,-7.208,-7.242,0.0,"(0.061, 0.120, 0.308)",3
1,Alt_Vero,66(E1)V,0.015,-0.088,-0.036,-0.103,"(0.085, 0.439)",2
2,Alt_Vero,213(E2)I,0.229,0.094,0.123,-0.135,"(0.104, 0.065)",2
3,Alt_Vero,60(E2)N,-0.732,0.003,-0.301,0.735,"(0.104, 0.253)",2
4,Alt_Vero,13(E2)G,-4.665,-2.332,-1.897,2.333,(0.077),1
5,Alt_Vero,385(E1)E,0.091,0.133,-0.085,0.042,(0.355),1
6,Alt_Vero,70(E2)T,-5.958,-4.937,-3.547,0.063,(0.978),1
7,Alt_Vero,62(E2)R,-2.791,-0.419,-0.797,2.372,(0.228),1
8,Alt_Vero,48(E3)N,-1.214,-0.838,-0.517,0.376,(0.442),1
9,Alt_Vero,47(6K)I,-0.253,-0.398,-0.197,-0.145,(0.055),1


### Variants in Aag2 cells after 25 alternating passages between Vero and Aag2

In [116]:
# Alt_Aag2 rows only, variants seen in the most rows first
(
    passaging_variants_summary
    .loc[passaging_variants_summary['Passage'] == 'Alt_Aag2']
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Passage,variant,C636,293T_Mxra8,293T_TIM1,293T_Mxra8 minus C636,variant_frequencies,count
0,Alt_Aag2,55(E2)R,-7.438,-7.208,-7.242,0.0,"(0.078, 0.195, 0.422, 0.065)",4
1,Alt_Aag2,391(E1)Q,-0.791,-0.635,-1.142,0.156,"(0.182, 0.086, 0.075)",3
2,Alt_Aag2,392(E1)E,-7.227,-7.521,-4.931,0.0,"(0.056, 0.072, 0.107)",3
3,Alt_Aag2,47(6K)I,-0.253,-0.398,-0.197,-0.145,"(0.060, 0.072, 0.051)",3
4,Alt_Aag2,66(E1)V,0.015,-0.088,-0.036,-0.103,"(0.091, 0.459)",2
5,Alt_Aag2,60(E2)N,-0.732,0.003,-0.301,0.735,"(0.210, 0.137)",2
6,Alt_Aag2,112(E1)K,-3.935,-4.966,-5.174,-1.031,(0.076),1
7,Alt_Aag2,46(E1)H,-7.418,-7.605,-7.582,0.0,(0.050),1
8,Alt_Aag2,389(E1)K,-3.447,-5.166,-5.232,-1.553,(0.061),1
9,Alt_Aag2,419(E1)M,-0.006,0.079,0.035,0.086,(0.120),1


In [118]:
# Persist the summary, ordered by passage label and then by count (both descending)
summary_for_export = (
    passaging_variants_summary
    .sort_values(by=['Passage', 'count'], ascending=False)
    .reset_index(drop=True)
)
summary_for_export.to_csv("PRJNA437311_mutation_summary.csv", index=False)