# Create primers for mutations in Usher data

This notebook takes Usher data, compares it to GISAID variant tables, filters it for higher-frequency changes, and creates primers for all Usher sites that pass the filter.

In [None]:
import pandas as pd
from plotnine import *
from Bio.SeqUtils import MeltingTemp as mt
import statistics

In [None]:
# Everything below is handed over by the Snakemake rule that runs this notebook.
smk_input = snakemake.input
smk_output = snakemake.output

# input tables (paths at this point; they are loaded with pd.read_csv further down)
usher_df = smk_input.usher_df
reference_lookup = smk_input.reference_lookup
gisaid_mutations = smk_input.gisaid_mutations

# inputs passed on to the primer design script
codon_table = smk_input.codon_table
spike_extended = smk_input.spike_extended
create_primers_srcipt = smk_input.create_primers_srcipt

# minimum n_mutations_to a mutation needs in order to be kept
usher_count_filter = snakemake.params.usher_count_filter

# output files
new_usher_df_out = smk_output.new_usher_df_out
usher_primers = smk_output.usher_primers

In [None]:
# Manual settings for running this notebook outside of Snakemake: uncomment to use.
# The three tables are given as paths (not DataFrames) because the next cell
# passes them to pd.read_csv.
# usher_df = "../results/spike_mutcounts.csv"
# reference_lookup = "../reference_sequences/reference_sequence_position_lookup.csv"
# gisaid_mutations = "../results/spike_alignment_counts.csv"
# usher_count_filter = 24
# codon_table = "../reference_sequences/homo_codon_freq_del.csv"
# spike_extended = "../reference_sequences/Omicron_BA.1_extended_ends_for_primers.txt"
# create_primers_srcipt = '../scripts/create_primers_del.py'

# new_usher_df_out = '../results/variant_usher_mutations.csv'
# usher_primers = '../results/usher_primers.csv'

In [None]:
# Replace each path with the table loaded from it.
usher_df, reference_lookup, gisaid_mutations = (
    pd.read_csv(csv_path)
    for csv_path in (usher_df, reference_lookup, gisaid_mutations)
)

In [None]:
# Collapse repeated (site, amino_acid) rows into a single row whose
# n_mutations_to is the sum over the duplicates; the keys stay ordinary columns.
usher_df = (
    usher_df
    .groupby(['site', 'amino_acid'], as_index=False)
    .agg({'n_mutations_to': 'sum'})
)


## Compare GISAID and Usher data

In [None]:
# Bring the GISAID table in line with the Usher column names: drop the wildtype
# column, rename mutant -> amino_acid and count -> alignment_counts, order by site.
gisaid_mutations = gisaid_mutations.drop(columns=['wildtype'])
gisaid_mutations = (
    gisaid_mutations
    .rename(columns={"mutant": "amino_acid", "count": "alignment_counts"})
    .sort_values(by=['site'])
    .reset_index(drop=True)
)
# write deletions as '-' instead of 'del'; regex=False keeps this a literal
# replacement regardless of the pandas version's default
gisaid_mutations['amino_acid'] = gisaid_mutations['amino_acid'].str.replace('del', '-', regex=False)
# keep only single-character states (one amino acid or one '-'); longer entries are dropped
gisaid_mutations = gisaid_mutations[gisaid_mutations['amino_acid'].str.len() == 1]


First check whether all mutations in Usher are present in the GISAID data, as Usher should be a subset of GISAID.

In [None]:
# Left-join the GISAID mutations onto the reference lookup, matching
# (site, amino_acid) against (parent_pos, parent_seq). The indicator column
# records whether each GISAID row found a partner in the lookup.
new_df = pd.merge(
    gisaid_mutations,
    reference_lookup,
    how='left',
    left_on=['site', 'amino_acid'],
    right_on=['parent_pos', 'parent_seq'],
    indicator=True,
)

# Rows without a partner are amino acids the lookup's parent sequence does not
# already carry at that site; only those are kept.
is_unmatched = new_df['_merge'] == 'left_only'
new_df_noWU = new_df.loc[is_unmatched]
new_gisaid_mutations = new_df_noWU[['site', 'amino_acid', 'alignment_counts']]
new_gisaid_mutations

In [None]:
# Attach the Usher counts to the GISAID mutations, matching on (site, amino_acid).
new_gisaid_mutations = new_gisaid_mutations.merge(
    usher_df,
    how='left',
    on=['site', 'amino_acid'],
)

# Keep only the mutations that have an Usher count, ordered by site.
new_gisaid_mutations = (
    new_gisaid_mutations
    .dropna(subset=['n_mutations_to'])
    .sort_values(by=['site'], ignore_index=True)
)
new_gisaid_mutations

Now plot correlation between usher and GISAID data

In [None]:
# Scatter of GISAID alignment counts against Usher mutation counts,
# with both axes on a log10 scale.
p = (
    ggplot(new_gisaid_mutations)
    + aes('alignment_counts', 'n_mutations_to')
    + geom_point(alpha=0.3)
    + scale_x_log10()
    + scale_y_log10()
)
# Evaluate the plot object itself so the notebook renders it; a bare `p.draw`
# only names the method without calling it.
p

Now look at which mutations in usher are not present in GISAID

In [None]:
# Tag every row of both tables with a "<site><amino_acid>" key so they can be compared.
for table in (usher_df, gisaid_mutations):
    table["site_mutations"] = table["site"].astype(str) + table["amino_acid"]

# Usher mutations whose key never occurs in the GISAID table, most frequent first.
absent_from_gisaid = ~usher_df["site_mutations"].isin(gisaid_mutations["site_mutations"])
notInGISAID = usher_df.loc[absent_from_gisaid]
notInGISAID.sort_values(by=['n_mutations_to'], ascending=False)

Now plot counts for usher sites not present in GISAID

In [None]:
# Histogram (12 bins) of the Usher counts of mutations that are missing from GISAID.
p = (
    ggplot(notInGISAID)
    + aes('n_mutations_to')
    + geom_histogram(bins=12)
)
# Evaluate the plot object so the notebook renders it; a bare `p.draw` does not call the method.
p

In [None]:
# Bar chart of the number of mutations per site, for the GISAID mutations
# that also have an Usher count.
p = (
    ggplot(new_gisaid_mutations)
    + aes('site')
    + geom_bar()
)
# Evaluate the plot object so the notebook renders it; a bare `p.draw` does not call the method.
p

## Filter usher dataset

We want to filter usher data for more frequent mutations

In [None]:
# Left-join the Usher mutations onto the reference lookup, matching
# (site, amino_acid) against (parent_pos, parent_seq).
new_usher_df = pd.merge(
    usher_df,
    reference_lookup,
    how='left',
    left_on=['site', 'amino_acid'],
    right_on=['parent_pos', 'parent_seq'],
    indicator=True,
)

# Keep mutations with no partner in the lookup, plus matched rows whose
# variant_sig is 'Yes'.
keep_row = (new_usher_df['_merge'] == 'left_only') | (new_usher_df['variant_sig'] == 'Yes')
new_new_usher_df_noRef = new_usher_df.loc[keep_row]
new_usher_df = new_new_usher_df_noRef[['site', 'amino_acid', 'n_mutations_to']]


In [None]:
# Join the lookup again, this time on position only (site vs parent_pos), to
# pull in the lookup columns for every remaining mutation.
new_usher_df = new_usher_df.merge(
    reference_lookup,
    how='left',
    left_on=['site'],
    right_on=['parent_pos'],
)

# Sites with no row in the lookup come back with a missing parent_pos; drop
# them and order the rest by site.
new_usher_df = (
    new_usher_df
    .dropna(subset=['parent_pos'])
    .sort_values(by=['site'], ignore_index=True)
)
new_usher_df

In [None]:
# Histogram of Usher mutation counts before the count filter, x axis limited to 0-200.
p = (
    ggplot(new_usher_df)
    + aes('n_mutations_to')
    + geom_histogram(color='white', bins=100)
    + xlim(0, 200)
)

# ggsave(plot = p, filename = 'results/primer_tm_dist.png')
# Evaluate the plot object so the notebook renders it; a bare `p.draw` does not call the method.
p

In [None]:
# keep only mutations seen at least <usher_count_filter> times; rarer ones are dropped
is_frequent = new_usher_df['n_mutations_to'].ge(usher_count_filter)
new_usher_df = new_usher_df[is_frequent]
len(new_usher_df)

In [None]:
# Histogram of Usher mutation counts after the count filter, x axis limited to 0-200.
p = (
    ggplot(new_usher_df)
    + aes('n_mutations_to')
    + geom_histogram(color='white', bins=100)
    + xlim(0, 200)
)

# ggsave(plot = p, filename = 'results/primer_tm_dist.png')
# Evaluate the plot object so the notebook renders it; a bare `p.draw` does not call the method.
p

Now drop the columns that we don't need and relabel the positions using the variant numbering.

In [None]:
# Build the output table in variant numbering: variant_pos becomes the site
# column and amino_acid becomes mutant; n_mutations_to keeps its name.
new_usher_df = (
    new_usher_df[['variant_pos', 'amino_acid', 'n_mutations_to']]
    .rename(columns={"variant_pos": "site", "amino_acid": "mutant"})
)
# NOTE(review): variant_pos arrives via a left merge, so it may be float-typed
# and written as e.g. "501.0" — confirm the primer script accepts that.
new_usher_df.to_csv(new_usher_df_out, index=False)
# show the table last so the notebook actually renders it
new_usher_df

## Make usher primers

In [None]:
# Run the primer design script on the table written above.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handled safely, and a non-zero exit status
# stops the notebook here instead of surfacing later as a missing or stale
# primer file.
import subprocess
import sys

primer_cmd = [
    sys.executable,  # same interpreter/environment as the notebook kernel
    str(create_primers_srcipt),
    str(spike_extended),
    str(new_usher_df_out),
    str(codon_table),
    "variant_usher",
    str(usher_primers),
    "--minprimertm", "60.5",
    "--maxprimertm", "61.5",
]
primer_run = subprocess.run(primer_cmd, capture_output=True, text=True)
# echo the script's output into the notebook, as the shell escape used to do
print(primer_run.stdout, end="")
print(primer_run.stderr, end="", file=sys.stderr)
primer_run.check_returncode()

## Check usher primers

In [None]:
# Load the primer table written by the design script. Column names are supplied
# here, so the first line of the file is read as data rather than as a header.
header_list = ["primer_name", "seq"]
variant_usher_primers = pd.read_csv(usher_primers, header=None, names=header_list)
variant_usher_primers

In [None]:
# Annotate every primer with its nearest-neighbor melting temperature (rounded
# to two decimals) and its length. Working column-wise on `seq` avoids the
# row-wise DataFrame.apply, which returns a DataFrame instead of a Series when
# the table is empty.
primer_seqs = variant_usher_primers['seq']
variant_usher_primers['Tm'] = (
    primer_seqs
    .map(lambda seq: float('%0.2f' % mt.Tm_NN(seq, strict=False)))
    .astype('float')
)
variant_usher_primers['length'] = primer_seqs.map(len)
variant_usher_primers

In [None]:
# Histogram (30 bins) of primer lengths.
p = (
    ggplot(variant_usher_primers)
    + aes('length')
    + geom_histogram(color='white', bins=30)
)

# Evaluate the plot object so the notebook renders it; a bare `p.draw` does not call the method.
p

In [None]:
# Histogram (30 bins) of primer melting temperatures, x axis limited to 58-72.
p = (
    ggplot(variant_usher_primers)
    + aes('Tm')
    + geom_histogram(color='white', bins=30)
    + xlim(58, 72)
)

# Evaluate the plot object so the notebook renders it; a bare `p.draw` does not call the method.
p

In [None]:
# Population variance of the primer melting temperatures.
tm_values = variant_usher_primers['Tm'].tolist()
statistics.pvariance(tm_values)