# Filter `ivar` variants

I used `ivar` to identify variants relative to the sequence of the DMS library strain [MW473668](https://www.ncbi.nlm.nih.gov/nuccore/MW473668). In this notebook, I filter the variants to keep only the non-synonymous (amino-acid-changing) mutations in the CHIKV E glycoprotein.

In [155]:
import os
import pandas as pd
from Bio import SeqIO

In [156]:
# Load the raw ivar variant table produced upstream of this notebook
variants_csv_path = "../../results/variants/variants.ivar.csv"
raw_variants = pd.read_csv(variants_csv_path)

Only keep the variants with at least 100 reads at the site, a minimum frequency of 5%, a significant p-value (≤ 0.05), and a location in the 'structural polyprotein' feature, which encodes the CHIKV E protein.

In [157]:
# Thresholds and target feature for the variant filter
min_depth = 100
min_freq = 0.05
pvalue = 0.05
feature = "structural polyprotein:"

# One boolean mask per criterion; a variant must satisfy all four to be kept
in_feature = raw_variants['GFF_FEATURE'] == feature
has_min_depth = raw_variants['TOTAL_DP'] >= min_depth
has_min_freq = raw_variants['ALT_FREQ'] >= min_freq
is_significant = raw_variants['PVAL'] <= pvalue
filtered_variants = raw_variants[in_feature & has_min_depth & has_min_freq & is_significant]

Convert the amino-acid numbering from polyprotein coordinates so that it's relative to the start of each region (C, E3, E2, 6K, E1) of the structural polyprotein.

In [158]:
# Regions in the polyprotein
# Each key is a region name and each value is its inclusive [start, end]
# amino-acid range in 1-based polyprotein numbering, as consumed by
# map_peptide_and_position (which tests start <= position <= end).
# NOTE(review): the boundaries are presumably taken from the MW473668
# annotation -- confirm against the GFF used for variant calling.
polyprotein_numbering = {
    "C": [1, 261],
    "E3": [262, 325],
    "E2": [326, 748],
    "6K": [749, 809],
    "E1": [810, 1248],
}
# Translate a polyprotein amino-acid position into (region, position within region)
def map_peptide_and_position(pos_aa, polyprotein_numbering):
    """Locate a polyprotein position inside one of the named regions.

    Parameters
    ----------
    pos_aa
        Amino-acid position in polyprotein numbering; may be missing (NaN/None).
    polyprotein_numbering
        Mapping of region name to an inclusive ``[start, end]`` pair.

    Returns
    -------
    tuple
        ``(region_name, position_within_region)`` for the first region (in
        mapping order) whose range contains ``pos_aa``, where the position is
        1-based. ``(None, None)`` when ``pos_aa`` is missing or lies outside
        every range.
    """
    if pd.isna(pos_aa):
        return None, None

    # Lazily yield a result for each region containing the position; only the
    # first one is ever consumed.
    candidates = (
        (name, pos_aa - first + 1)  # +1 keeps the result 1-based
        for name, (first, last) in polyprotein_numbering.items()
        if first <= pos_aa <= last
    )
    return next(candidates, (None, None))
# Apply the function to create both columns.
# The (peptide, position) tuples are collected in a plain list instead of
# expanding a pd.Series per row: Series.apply returns an empty Series (not a
# two-column frame) when no variants pass the filters, which makes the
# two-column assignment raise "Columns must be same length as key". Building
# one pd.Series per row is also slow on large tables.
filtered_variants = filtered_variants.copy()  # work on a copy, not a view of raw_variants
mapped_positions = [
    map_peptide_and_position(pos_aa, polyprotein_numbering)
    for pos_aa in filtered_variants['POS_AA']
]
filtered_variants['peptide'] = [peptide for peptide, _ in mapped_positions]
filtered_variants['peptide_position'] = [position for _, position in mapped_positions]

Only keep non-synonymous mutations in the subunits of the E protein.

In [159]:
# Columns that are not carried into the summary tables
columns_to_drop = ["REGION", "GFF_FEATURE", "POS_AA", "PASS", "ALT_QUAL", "REF_QUAL"]
# Regions whose variants are retained
regions_to_keep = ["E1", "E2", "6K", "E3"]

# Drop the unneeded columns and give the remaining ones clearer names
filtered_variants = filtered_variants.drop(columns=columns_to_drop).rename(
    columns={
        "ALT_FREQ": "frequency",
        "TOTAL_DP": "depth",
        "PVAL": "p_value",
        "peptide": "region",
        "peptide_position": "region_position",
    }
)
# Nullable integer dtype, so rows without a mapped position stay missing
filtered_variants['region_position'] = filtered_variants['region_position'].astype('Int64')

# Keep rows that are in a retained region AND change the amino acid
# (non-synonymous: REF_AA != ALT_AA)
in_kept_region = filtered_variants['region'].isin(regions_to_keep)
is_nonsynonymous = filtered_variants['REF_AA'] != filtered_variants['ALT_AA']
filtered_variants = filtered_variants[in_kept_region & is_nonsynonymous]
filtered_variants.head()

Unnamed: 0,POS,REF,ALT,REF_DP,REF_RV,ALT_DP,ALT_RV,frequency,depth,p_value,REF_CODON,REF_AA,ALT_CODON,ALT_AA,Accession,region,region_position
270,9618,T,C,0,0,185,52,1.0,185.0,5.63859e-131,GTG,V,GCG,A,SRR7613172,E2,368
271,9626,G,T,1,0,190,52,0.994764,191.0,1.84862e-132,GTG,V,TTG,L,SRR7613172,E2,371
272,9644,G,A,0,0,373,104,0.997326,374.0,1.42144e-260,GTA,V,ATA,I,SRR7613172,E2,377
276,9842,C,A,8,3,7122,3359,0.997898,7137.0,0.0,CTG,L,ATG,M,SRR7613172,6K,20
277,9845,C,T,1717,939,5524,2489,0.755884,7308.0,0.0,CAA,Q,TAA,*,SRR7613172,6K,21


Get the 'fixed' variants. These are variants with a frequency greater than `1 - minimum frequency`: if the reference and alternative alleles were swapped, the resulting minor allele would fall below our `minimum frequency`, so they cannot be reported as minor alleles.

In [160]:
# A variant counts as fixed when its frequency exceeds 1 - min_freq
fixed_threshold = 1 - min_freq
is_fixed = filtered_variants['frequency'] > fixed_threshold
fixed_variants = filtered_variants[is_fixed].reset_index(drop=True)
fixed_variants.head()

Unnamed: 0,POS,REF,ALT,REF_DP,REF_RV,ALT_DP,ALT_RV,frequency,depth,p_value,REF_CODON,REF_AA,ALT_CODON,ALT_AA,Accession,region,region_position
0,9618,T,C,0,0,185,52,1.0,185.0,5.63859e-131,GTG,V,GCG,A,SRR7613172,E2,368
1,9626,G,T,1,0,190,52,0.994764,191.0,1.84862e-132,GTG,V,TTG,L,SRR7613172,E2,371
2,9644,G,A,0,0,373,104,0.997326,374.0,1.42144e-260,GTA,V,ATA,I,SRR7613172,E2,377
3,9842,C,A,8,3,7122,3359,0.997898,7137.0,0.0,CTG,L,ATG,M,SRR7613172,6K,20
4,9909,T,G,28,13,8577,4230,0.995705,8614.0,0.0,TTT,F,TGT,C,SRR7613172,6K,42


Convert the remaining variants with a frequency > 50% into 'minor alleles' by reversing the reference and alternative columns and subtracting the allele frequency from 1.

In [161]:
# Everything that did not end up in fixed_variants is a candidate minor variant
is_not_fixed = filtered_variants['frequency'] <= (1 - min_freq)
not_fixed_variants = filtered_variants[is_not_fixed].reset_index(drop=True)

# Partition on the 50% boundary: above it the ALT allele is the majority
major_alleles = not_fixed_variants[not_fixed_variants['frequency'] > 0.5].copy()
minor_alleles = not_fixed_variants[not_fixed_variants['frequency'] <= 0.5].copy()

if not major_alleles.empty:
    # Report the frequency of the other (former reference) allele instead.
    # NOTE(review): 1 - frequency equals the former reference allele's
    # frequency only when no third allele was observed at the site; the
    # swapped ALT_DP / depth can differ slightly -- confirm this is acceptable.
    major_alleles['frequency'] = 1 - major_alleles['frequency']

    # Map every REF_* column to its ALT_* counterpart (when one exists) and
    # vice versa, together with the bare REF / ALT columns.
    present_columns = set(major_alleles.columns)
    swap_mapping = {'REF': 'ALT', 'ALT': 'REF'}
    for ref_col in major_alleles.columns:
        if not ref_col.startswith('REF_'):
            continue
        alt_col = ref_col.replace('REF_', 'ALT_')
        if alt_col in present_columns:
            swap_mapping[ref_col] = alt_col
            swap_mapping[alt_col] = ref_col

    # Renaming the headers swaps the data; re-selecting the columns restores
    # the original column order.
    major_alleles = major_alleles.rename(columns=swap_mapping)[not_fixed_variants.columns]

# Combine the untouched minor alleles with the converted major alleles
joined_df = pd.concat([minor_alleles, major_alleles], ignore_index=True)

# Order the combined table by genomic position
minor_variants_df = joined_df.sort_values('POS').reset_index(drop=True)
minor_variants_df.head()


Unnamed: 0,POS,REF,ALT,REF_DP,REF_RV,ALT_DP,ALT_RV,frequency,depth,p_value,REF_CODON,REF_AA,ALT_CODON,ALT_AA,Accession,region,region_position
0,8324,C,A,385,219,20,10,0.054054,407.0,2.1728299999999998e-254,CGT,R,AGT,S,SRR13963679,E3,1
1,8327,C,A,1260,507,78,77,0.058252,1339.0,4.2759100000000005e-17,CTT,L,ATT,I,SRR12789644,E3,2
2,8327,C,A,1278,552,74,73,0.054734,1352.0,4.22466e-30,CTT,L,ATT,I,SRR12789651,E3,2
3,8327,C,A,1316,527,86,84,0.061297,1403.0,2.0459e-35,CTT,L,ATT,I,SRR12789645,E3,2
4,8327,C,A,1374,574,77,75,0.052957,1454.0,4.54883e-16,CTT,L,ATT,I,SRR12789646,E3,2


Save each of these tables to the summary directory to be tracked by `git`.

In [162]:
# Make a 'summary' directory in results if it doesn't exist.
# exist_ok=True replaces the separate existence check, so there is no window
# between checking for the directory and creating it.
summary_dir = "../../results/summary"
os.makedirs(summary_dir, exist_ok=True)
# Save the minor variants to a CSV file
minor_variants_csv_path = os.path.join(summary_dir, "minor_variants.csv")
minor_variants_df.to_csv(minor_variants_csv_path, index=False)
# Save the fixed variants to a CSV file
fixed_variants_csv_path = os.path.join(summary_dir, "fixed_variants.csv")
fixed_variants.to_csv(fixed_variants_csv_path, index=False)
# Save the filtered variants (fixed and non-fixed together) to a CSV file
filtered_variants_csv_path = os.path.join(summary_dir, "all_variants.csv")
filtered_variants.to_csv(filtered_variants_csv_path, index=False)