# Intrahost analysis in SARS-CoV-2 re-sequencing samples

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/galaxyproject/SARS-CoV-2/blob/master/data/ipynb/intrahost.ipynb)

## Run these first

The three sections below install dependencies, load auxiliary datasets, and define helper functions, so run them first!

### Requirements

In [None]:
!pip install -U pandasql

Collecting pandasql
  Downloading https://files.pythonhosted.org/packages/6b/c4/ee4096ffa2eeeca0c749b26f0371bd26aa5c8b611c43de99a4f86d3de0a7/pandasql-0.7.3.tar.gz
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
  Created wheel for pandasql: filename=pandasql-0.7.3-cp37-none-any.whl size=26820 sha256=767c3136b6fc6e57fdd0a5b5742b9ba823198a9bc280b04e232eca84412a62ef
  Stored in directory: /root/.cache/pip/wheels/53/6c/18/b87a2e5fa8a82e9c026311de56210b8d1c01846e18a9607fc9
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [None]:
import pandas as pd
import numpy as np
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query string `q` through pandasql's sqldf, resolving table
    names against this notebook's global namespace, and return the result."""
    return sqldf(q, globals())

### Aux datasets

In [None]:
gnm_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz'
gnm_file = gnm_url.split('/')[-1]

In [None]:
# Get the SARS-CoV-2 RefSeq genome from NCBI as a gzipped FASTA file (.fna.gz).
# The download and decompression are skipped when the decompressed file
# (gnm_file without its trailing '.gz') is already present.
import os.path
from os import path
if not path.exists(gnm_file[:-3]):
    !wget -nc {gnm_url}
    !gunzip {gnm_file}
else:
    print('File {} is already here\nDoing nothing!'.format(gnm_file))

--2021-03-24 15:51:40--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 130.14.250.11, 2607:f220:41e:250::10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9591 (9.4K) [application/x-gzip]
Saving to: ‘GCF_009858895.2_ASM985889v3_genomic.fna.gz’


2021-03-24 15:51:40 (104 MB/s) - ‘GCF_009858895.2_ASM985889v3_genomic.fna.gz’ saved [9591/9591]



In [None]:
# Read the decompressed FASTA file and glue all non-header lines
# (those not starting with '>') into one sequence string.
with open(gnm_file[:-3],'r') as fasta:
    seq = "".join(record.rstrip() for record in fasta if not record.startswith('>'))

In [None]:
annot = {
    'start': [    265,    805,   2719,   8554,   10054,  10972,  11842,  12091,  12685,   13024,  13441,      16236,  18039,   19620,    20658,   13441,    21562,   25392, 26244, 26522, 27201,   27393,   27755,  27893, 28273, 29557],
    'end':   [    805,   2719,   8554,  10054,   10972,  11842,  12091,  12685,  13024,   13441,  16236,      18039,  19620,   20658,    21552,   13480,    25384,   26220, 26472, 27191, 27387,   27759,   27887,  28259, 29533, 29674 ], 
    'func':  ['leader', 'nsp2', 'nsp3', 'nsp4', '3Cpro', 'nsp6', 'nsp7', 'nsp8', 'nsp9', 'nsp10', 'RdRp', 'helicase', 'ExoN', 'endoR', 'MethTr', 'nsp11', 'S', 'orf3a',   'E',   'M', 'orf6', 'orf7a', 'orf7b', 'orf8',  'N', 'orf10'], 
    }

In [None]:
# Annotation track used for plotting: one row per feature, ordered by start.
gene_track = (
    pd.DataFrame.from_dict(annot)[['start','end','func']]
    .sort_values(by=['start'])
    .reset_index()
)
# Alternate consecutive features between two lanes: (top, bottom) is (0, -1) or (1, 0).
gene_track['top'] = gene_track.index % 2
gene_track['bottom'] = gene_track['top'] - 1
# Lane 0 is drawn in red, lane 1 in blue.
gene_track['color'] = np.where(gene_track['top'] == 0, 'red', 'blue')

In [None]:
voc_url = 'https://github.com/galaxyproject/SARS-CoV-2/raw/master/data/voc/voc.tsv.gz'

In [None]:
# Sites under selection URL
sel_url = 'https://github.com/galaxyproject/SARS-CoV-2/raw/master/data/selection/selection.tsv.gz'
sel = pd.read_csv(sel_url,sep='\t')

### Functions

In [None]:
# Validation function for checking against genome

def check_against_genome(seq,df,fields,name,debug=False):
    """Validate reference alleles of a variant table against the genome.

    For every row of `df`, the slice of `seq` that starts at the row's
    position and is as long as the row's reference allele is compared with
    that reference allele. Positions are used directly as string indices,
    so they must already be 0-based.

    Parameters
    ----------
    seq : str
        Genome sequence.
    df : pandas.DataFrame
        Table holding the sites to validate. It is not modified.
    fields : list of str
        Names of the two columns holding position and reference allele,
        in that order (e.g., ['POS','REF']).
    name : str
        Label copied into the returned summary.
    debug : bool
        When exactly True, return the list of problematic sites instead of
        the summary.

    Returns
    -------
    dict or list
        By default a dict with keys 'name', 'good', 'bad' and 'num_sites'.
        With debug=True, a list of [pos, ref] pairs that did not match.
    """
    wrong = []
    good = 0
    for pos, ref in df[fields].to_numpy():
        if seq[pos:pos+len(ref)] == ref:
            good += 1
        else:
            wrong.append([pos, ref])
    if debug is True:
        return wrong
    return {
        'name': name,
        'good': good,
        'bad': len(wrong),
        'num_sites': len(df)
        }

In [None]:
class coordinateError(Exception):
    """Raised when variant coordinates cannot be verified against the genome.

    The payload passed to the constructor is kept in `data`, and str() of the
    exception is the repr() of that payload.
    """
    def __init__(self, data):
        # Forward the payload to Exception so that `args` is populated;
        # copy and pickle rebuild the exception from `args`.
        super().__init__(data)
        self.data = data
    def __str__(self):
        return repr(self.data)

In [None]:
# Computing poisson expectations
import math

def ps(l,s,N):
    """Tabulate Poisson-expected counts N * exp(-l) * l**k / k! for k = 0, 1, 2, ...

    Terms are produced in order of k and the table ends at the first term
    that is not greater than 0.1.

    Parameters
    ----------
    l : number
        Poisson rate (lambda).
    s : any
        Not used by the computation; kept so existing calls keep working.
    N : number
        Scale factor applied to every probability.

    Returns
    -------
    dict
        {'samples': [k, ...], 'N': [expected count for k, ...]}
    """
    zero_prob = math.exp(-l)
    weight = 1          # running value of l**k / k!
    table = {'samples':[],'N':[]}
    k = 0
    while True:
        expected = zero_prob*weight*N
        if not expected > 0.1:
            break
        table['samples'].append(k)
        table['N'].append(expected)
        k += 1
        weight *= l/k
    return table

In [None]:
# Histogram generation

def create_hist(df,col,bins=100):
    """Bin the values of df[col] with numpy.histogram and return the result as a table.

    The returned DataFrame has one row per bin with the columns:
    `col` (count in the bin), 'left' and 'right' (bin edges) and 'interval'
    (a "<left> to <right>" label with both edges formatted as integers).
    """
    counts, bin_edges = np.histogram(df[col],bins)
    table = pd.DataFrame({col: counts, "left": bin_edges[:-1], "right": bin_edges[1:]})
    labels = []
    for lower, upper in zip(table["left"], table["right"]):
        labels.append("%d to %d" % (lower, upper))
    table["interval"] = labels
    return table

In [None]:
# Simple counts
def site_stats(df):
    """Return a 3-tuple of simple counts for a variant table:
    (number of rows, number of distinct (POS, ALT, REF) combinations,
    number of distinct values in the 'Sample' column)."""
    distinct_sites = df.groupby(['POS','ALT','REF']).groups
    return (len(df), len(distinct_sites), df['Sample'].nunique())

In [None]:
# Plotting AF distribution for a particular site

import scipy.stats as stats
from bokeh.models import Rect

# non-parametric pdf
def site_example(site):
    """Plot the allele-frequency (AF) distribution observed at one genome position.

    Reads the notebook-level `var` table, keeps the rows whose POS equals
    `site`, fits a Gaussian kernel density estimate to their AF values and
    shows a Bokeh figure with the density curve (orange) plus one thin red
    tick per observed AF value.

    Parameters
    ----------
    site
        Value matched against var['POS'].

    Returns
    -------
    None
        The figure is displayed with bokeh's show().
    """
    af = var[var['POS']==site]['AF']
    observations = af.values.ravel()
    # Use the public scipy.stats.gaussian_kde entry point; reaching the class
    # through the scipy.stats.kde submodule is deprecated.
    nparam_density = stats.gaussian_kde(observations)
    x = np.linspace(0, 1, 100)
    pdf = nparam_density(x)
    plot = figure(
        plot_height = 200, plot_width = 800,
        title = 'Site {} ({} Samples)'.format(site,len(af)),
        x_axis_label = 'AF',
        y_axis_label = "Density",
        x_range=Range1d(start=0, end=1,bounds=(0, 1)),
        y_range=Range1d(start=0, end=5,bounds=(0, 30)),
        toolbar_location="below"
        )

    # Density curve over the [0, 1] AF range
    plot.line(
        x=x,
        y=pdf,
        line_color='orange',
        line_width=5
        )
    # One narrow rectangle per observation, centred on y=0
    ticks = ColumnDataSource({'tick':observations})
    glyph = Rect(
        x='tick',
        y=0,
        width=0.0001,
        height=1,
        line_color='red',
        line_alpha=.5,
        fill_alpha =.5)
    plot.add_glyph(ticks,glyph)
    show(plot)

## Ingesting and validating

- Translate `FUNCLASS` names
- Convert coordinates to 0-based (because VCFs are 1-based)
- Validate all sites by comparing the contents of the `REF` field to the genome

In [None]:
# URL containing output of Galaxy workflow
# These are stored here -> https://github.com/galaxyproject/SARS-CoV-2/tree/master/data
per_sample_url = 'https://github.com/galaxyproject/SARS-CoV-2/raw/master/data/var/cog_20201120_by_sample.tsv.gz'
per_variant_url = 'https://github.com/galaxyproject/SARS-CoV-2/raw/master/data/var/cog_20201120_by_var.tsv.gz'
# Name of dataset that will be used in plots and reports
dataset = 'COG-Post'

In [None]:
# Read data into Pandas dataframe
var = pd.read_csv(per_sample_url,sep='\t')

In [None]:
# Humanize FUNCLASS names
funclass_translation = {'SILENT':'Synonymous','MISSENSE':'Non-synonymous','NONSENSE':'Stop','.':'Non-coding','NONE':'Indel'}
var = var.replace({'FUNCLASS':funclass_translation})

In [None]:
# Change coordinates to 0-based
var['POS'] = var['POS']-1

In [None]:
# Shorten names of some columns
var = var.rename(columns={'countunique(change)':'unique_changes',    # Number of all samples
                          'min(AF)':'mAF',                           # Minimum AF
                          'max(AF)':'xAF',                           # Max AF
                          'countunique(FUNCLASS)':'unique_funclass', # Accession numbers of samples with AF above threshold
                          })

In [None]:
# Check against the genome:
# compare the content of every REF field with the reference sequence
# at the same (0-based) position. 'bad' should be 0.

outcome = check_against_genome(seq,var,['POS','REF'],'var')
if outcome['bad'] > 0:
    # The message names the helper and its flag exactly as they are defined,
    # so the hint can be followed as written.
    raise coordinateError("{} sites were not verified. Run check_against_genome with debug=True".format(outcome['bad']))
print(outcome)

{'name': 'var', 'good': 38919, 'bad': 0, 'num_sites': 38919}


  del sys.path[0]


In [None]:
site_stats(var)

(38919, 5760, 1818)

## Thresholding

Here we assess how common variants are (how many samples contain each variant) and compute a threshold: the number of samples that must share a variant for us to consider it in further analysis.

For thresholding we only select sites with allele frequencies between 5% and 50% because these are more likely to be erroneous than sites with allele frequencies between 50% and 100%.

In [None]:
# For each genome position count how many unique samples contain a variant
# at that position with AF <= 0.5.
# A single groupby pass replaces one full scan of `var` per genome position.
# Assumes 'Sample' has no missing values (nunique skips them) - TODO confirm.
low_af_samples = var.loc[var['AF'] <= 0.5].groupby('POS')['Sample'].nunique()

# Positions with no qualifying variant get 0; positions outside
# 0 .. len(seq)-1 are dropped, matching a per-position walk over the genome.
cpb = low_af_samples.reindex(range(len(seq)), fill_value=0).tolist()

In [None]:
# Poisson lambda
l = np.sum(cpb)/len(seq)

In [None]:
print(l)

0.1502190415677357


In [None]:
# Aggregate sample counts
# How many sites are found in 1, 2, 3, 4 and so on samples
unique, counts = np.unique(cpb, return_counts=True)
bySiteCount = {'samples':unique,'N':counts}

In [None]:
# Compute Poisson estimate (see the "Functions" section above)
# NOTE: ps() does not read its second argument, so bySiteCount.values()
# has no influence on the result; only l and len(seq) are used.
estimate = ps(l,bySiteCount.values(),len(seq))

In [None]:
import bokeh.io
import bokeh.plotting
from bokeh.models import ColumnDataSource,Range1d
from bokeh.plotting import figure, show,output_file,save
bokeh.io.output_notebook()

exp = ColumnDataSource(estimate)
obs = ColumnDataSource(bySiteCount)
p = figure(plot_width=600, 
           plot_height=300,
           y_axis_type='log',
            x_axis_type='linear',

           y_axis_label='# Individual variants',
           x_axis_label='# Samples sharing a variant',
            x_range=Range1d(start=0, end=20,bounds=(0, 100)),
            y_range=Range1d(start=0, end=100000,bounds=(0, 100000)),
           
           )
p.line(y='N',x='samples',source=obs,line_color='red')
p.circle(y='N',x='samples',source=obs,line_color='red')
p.line(x='samples',y='N',source=exp,line_color='orange')
output_file("thresholding_{}.html".format(dataset))
show(p)

In [None]:
# Find the smallest sample count (> 0) at which more sites are observed than
# the Poisson model predicts.
# Expected and observed values are matched on the sample count itself, not on
# list position: bySiteCount['samples'] only holds counts that actually occur,
# so it can have gaps (e.g. 0, 1, 2, 4) and positional pairing would then
# compare the expectation for one count with the observation for another.
expected_by_count = dict(zip(estimate['samples'], estimate['N']))
cutOff = None
for n_samples, observed in zip(bySiteCount['samples'], bySiteCount['N']):
    # Counts missing from the Poisson table are treated as an expectation of 0
    expected = expected_by_count.get(int(n_samples), 0)
    if n_samples > 0 and expected < observed:
        cutOff = int(n_samples)
        print('Sample count cutOff is {}'.format(cutOff))
        break
if cutOff is None:
    # Fail here instead of leaving cutOff undefined (or stale from an earlier run)
    raise ValueError('Observed counts never exceed the Poisson expectation; no sample count cutOff found')


Sample count cutOff is 2


In [None]:
# Per-variant (POS, REF, ALT) summary: the highest AF seen in any sample and
# the number of distinct samples carrying the variant.
# Named aggregation fixes the output column names. With a list of callables the
# name is derived from the function itself, and np.max has been exposed under
# different names ('amax' / 'max') across NumPy releases, which would break the
# 'AFamax' lookups used later in the notebook.
af_by_var = (
    var.groupby(['POS','REF','ALT'])
       .agg(AFamax=('AF','max'), Samplenunique=('Sample','nunique'))
       .reset_index()
)

In [None]:
var = pd.merge(var,af_by_var, how='left', left_on=['POS','REF','ALT'], right_on=['POS','REF','ALT'])

In [None]:
# Use cutOff value to filter variants using the following logic:
# 1. If the maximum AF of a variant is < 0.5 and its sample count is at or below cutOff -> remove it
# 2. Remove ALL variants whose sample count is less than 2
var = var[~((var['AFamax']<0.5) & (var['Samplenunique']<=cutOff))]
var = var[var['Samplenunique']>1]

In [None]:
# Create site-specific table: one row per distinct variant with per-sample
# depth (DP), allele frequency (AF) and sample lists plus AF summary statistics.
# Named aggregation pins the column names (DPlist, AFlist, AFstd, AFamin,
# AFamax, AFmean, Samplelist, Samplenunique) instead of deriving them from the
# names of the NumPy callables, and the string aggregators select pandas' own
# groupby std/min/max/mean. The columns are already flat, so a later
# ''.join(col) flattening pass leaves them unchanged.
sites = (
    var.groupby(['POS','REF','ALT','FUNCLASS','TRID','GENE','AA','change'])
       .agg(
           DPlist=('DP', list),
           AFlist=('AF', list),
           AFstd=('AF', 'std'),
           AFamin=('AF', 'min'),
           AFamax=('AF', 'max'),
           AFmean=('AF', 'mean'),
           Samplelist=('Sample', list),
           Samplenunique=('Sample', 'nunique'),
       )
       .reset_index()
)

In [None]:
# Flatten column index
sites.columns = [''.join(col).strip() for col in sites.columns.values]

In [None]:
# Compute Coefficient of Variation (CoV)
sites['CoV'] = sites['AFstd']/sites['AFmean']

## Descriptive stats

In [None]:
# Numbers of variants, sites, samples:
site_stats(var)

(34813, 1795, 1818)

In [None]:
changes = sites[( sites['REF'].str.len()==1 ) & (sites['ALT'].str.len()==1)].groupby(['REF','ALT']).agg({'Samplenunique':np.sum}).reset_index()

In [None]:
changes

Unnamed: 0,REF,ALT,Samplenunique
0,A,C,79
1,A,G,2965
2,A,T,361
3,C,A,307
4,C,G,1281
5,C,T,18521
6,G,A,1318
7,G,C,1900
8,G,T,4867
9,T,A,147


In [None]:
from bokeh.palettes import Viridis256
from bokeh.models.annotations import Title,Label, LabelSet
from bokeh.transform import transform

from bokeh.models import (BasicTicker, ColorBar, ColumnDataSource,
                          LogColorMapper, PrintfTickFormatter,LinearColorMapper,ContinuousColorMapper,LogTicker)

colors_tile = list(reversed(Viridis256))
colors_font = Viridis256

source=ColumnDataSource(changes)
mapper_tile = LogColorMapper(palette=colors_tile, low=changes['Samplenunique'].min(), high=changes['Samplenunique'].max())
mapper_font = LogColorMapper(palette=colors_font, low=changes['Samplenunique'].min(), high=changes['Samplenunique'].max())

TOOLTIPS = [
            ("Count","@Samplenunique")
]
p = figure(
    plot_width=400,
    plot_height=300,
    x_range=['A','C','T','G'],
    y_range=['A','C','T','G'],
    x_axis_label = 'Reference base',
    y_axis_label = 'Alternative base',
    tooltips=TOOLTIPS,


)

p.rect(
    x='REF',
    y='ALT',
    source=source,
    width=1, 
    height=1, 
    line_color=None,
    fill_color=transform('Samplenunique', mapper_tile)
)
color_bar = ColorBar(
    color_mapper=mapper_tile, 
    location=(0, 0),
    ticker=LogTicker(),
    label_standoff=12,
    formatter=PrintfTickFormatter(format="%d")
    )
# Upper-case only the first character of the dataset label: str.capitalize()
# would also lower-case the remainder (e.g. 'COG-Post' -> 'Cog-post').
t = Title()
t.text = "Frequencies of different substitutions in {} dataset".format(dataset[:1].upper() + dataset[1:])
p.title = t
labels = LabelSet(x='REF', y='ALT', text='Samplenunique',
              x_offset=-15, y_offset=15, source=source, render_mode='canvas',text_color='white',text_font_size='7pt',)
p.add_layout(labels)

p.add_layout(color_bar, 'right')
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.ygrid.grid_line_color = None
p.xgrid.grid_line_color = None
output_file("substitutions_types_{}.html".format(dataset))
show(p)

In [None]:
import collections
colors = {'Synonymous':'green','Non-synonymous':'orange','Stop':'red','Non-coding':'blue', 'Indel':'magenta'}

srcs = {}

for func in var['FUNCLASS'].unique():
    srcs[func]=create_hist(var[var['FUNCLASS']==func],'AF')

srcs = collections.OrderedDict(sorted(srcs.items()))

plot = figure(
    plot_height = 300, 
    plot_width = 600,
    x_axis_label = 'AF',
    y_axis_label = "Count"
    )  

site_count_total = 0
unique_site_count_total = 0

for i,key in enumerate(srcs):
    site_count = len(var[var['FUNCLASS']==key])
    #unique_site_count = len(pysqldf('select (POS || REF || ALT),count(distinct (POS || REF || ALT)) as N from var where FUNCLASS = "{}" group by POS, REF, ALT order by N desc'.format(key)))
    unique_site_count = len(var[var['FUNCLASS']==key].groupby(['POS','ALT','REF']).groups)
    plot.quad(
        bottom = 0, 
        top = 'AF',
        left = "left", 
        right = "right", 
        source = srcs[key], 
        fill_color = colors[key],
        line_color = "black", 
        fill_alpha = 0.4,
        legend_label='{} (all = {}, distinct = {})'.format(key,site_count,unique_site_count)
        )
    site_count_total += site_count
    unique_site_count_total += unique_site_count
plot.legend.click_policy="hide"
plot.legend.location = "top_center"
# Upper-case only the first character of the dataset label: str.capitalize()
# would also lower-case the remainder (e.g. 'COG-Post' -> 'Cog-post').
t = Title()
t.text = "Histogram of Allele Frequencies for {} dataset (all = {}, distinct = {})".format(dataset[:1].upper() + dataset[1:],site_count_total,unique_site_count_total)
plot.title = t
output_file("af_histogram_{}.html".format(dataset))
show(plot)

In [None]:
for bounds in [[0,0.1],[0.1,0.8],[0.8,1]]:
    print(pysqldf('select Sample, count(*) as "{0}-{1}" from var where AF >= {0} and AF < {1} group by Sample '.format(bounds[0],bounds[1])).describe().loc[['count','mean','min','max']].T.to_markdown())

|       |   count |    mean |   min |   max |
|:------|--------:|--------:|------:|------:|
| 0-0.1 |     474 | 1.98734 |     1 |    17 |
|         |   count |    mean |   min |   max |
|:--------|--------:|--------:|------:|------:|
| 0.1-0.8 |    1789 | 3.77306 |     1 |    29 |
|       |   count |    mean |   min |   max |
|:------|--------:|--------:|------:|------:|
| 0.8-1 |    1818 | 14.8839 |     3 |    34 |


## Distribution of AFs across samples

In [None]:
# DataFrame for plotting AF stats.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of `sites` (the cause of pandas' SettingWithCopyWarning).
af_plot_data = sites[['POS','REF','ALT','FUNCLASS','AFmean','CoV','Samplenunique']].copy()
# Fraction of samples containing a given variant (aka population frequency)
af_plot_data['frac'] = af_plot_data['Samplenunique']/var['Sample'].nunique()
# Colour by functional class; classes missing from `colors` stay NaN
af_plot_data['color'] = af_plot_data['FUNCLASS'].map(colors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [None]:
from bokeh.layouts import row,column

af_cov_sample = ColumnDataSource(af_plot_data)

TOOLTIPS_p1 = [
    ("(mean,cov)", "(@AFmean{(0.000)}, @CoV{(0.000)})"),
    ("Position","@POS{0,0}"),
    ("Funclass",'@FUNCLASS'),
    ("# Samples","@Samplenunique")
]

TOOLTIPS_p2 = [
        ("(CoV,PF)", "(@CoV{(0.000)}, @frac{(0.000)})"),
        ("Position","@POS{0,0}"),
        ("Funclass",'@FUNCLASS'),
        ("# Samples","@Samplenunique")
]

p1 = figure(
    plot_height = 400, plot_width = 400,
    title = "AF: Mean versus CoV",
    y_axis_label = 'Coefficient of Variation for AF',
    x_axis_label = "Mean AF",
    tooltips=TOOLTIPS_p1,
    #tools='save'
    )

p1.circle(
    y='CoV',
    x='AFmean',
    color='color',
    alpha=.5,
    source=af_cov_sample,
    size=8
    )

p2 = figure(
    plot_height = 400, 
    plot_width = 400,
    title = "AF: CoV versus PF",
    x_axis_label = 'Fraction of samples containing a variant (aka PF)',
    y_axis_label = 'Coefficient of Variation for AF',
    tooltips=TOOLTIPS_p2,
    y_range=p1.y_range
    #tools='save'
    )
        
p2.circle(
    y='CoV',
    x='frac',
    line_color=None,
    fill_color='color',
    alpha=.5,
    source=af_cov_sample,
    size=8
    )
output_file("cov_vs_pf_{}.html".format(dataset))
show(column(p1,p2))

## Distribution across genome

In [None]:
for key in colors:
    sites.loc[(sites['FUNCLASS']==key),'color'] = colors[key]

In [None]:
source = ColumnDataSource(sites)
TOOLTIPS = [
    ("Position", "@POS{0,0}"),
    ("GENE","@TRID"),
    ("min(AF)","@AFamin{(0.000)}"),
    ("max(AF)","@AFamax{(0.000)}"),
    ("# samples","@Samplenunique"),
    ('AA','@AA'),
    ('REF','@REF'),
    ('ALT','@ALT')
]
p = figure(
    plot_height=300,
    plot_width=600,
    y_range=sorted(sites['FUNCLASS'].unique(),key=str.lower),
    tooltips = TOOLTIPS,
    x_range=Range1d(
        start=0, 
        end=30000,
        bounds=(0, 30000)
        )
    )
glyph_p = Rect(
    x='POS',
    y='FUNCLASS',
    width=100, 
    height='CoV',
    fill_alpha=.75,
    line_color=None,
    fill_color='color'
    )
p.add_glyph(source,glyph_p)
output_file("genomic_dist_{}.html".format(dataset))
show(p)

## Distribution across genes

In [None]:
def size_factor(l):
    """Return the mean of `l` multiplied by 20 (used to turn a mean AF into a glyph size)."""
    average = np.mean(l)
    return average*20
by_gene_fc = var.groupby(['TRID','FUNCLASS']).agg({'POS':'nunique','AF':size_factor}).reset_index()

In [None]:
from bokeh.palettes import cividis
from bokeh.models import ColorBar
from bokeh.transform import linear_cmap
from math import pi

source = ColumnDataSource(by_gene_fc)
mapper = linear_cmap(field_name='POS', palette=cividis(by_gene_fc['POS'].nunique()) ,low=min(by_gene_fc['POS'].unique()) ,high=max(by_gene_fc['POS'].unique()))

TOOLTIPS = [
    ("Distinct variants", "@POS"),
    ("mean AF","@AF")
]

p = figure(plot_height=300,
           plot_width=600,
           y_range=sorted(by_gene_fc['FUNCLASS'].unique(),key=str.lower),
           x_range=annot['func'],
           x_axis_label = 'Genes',
           title = "Variants per gene in {} dataset (circle size = average AF, color = number of distinct variants)".format(dataset),
           tools='save',
           tooltips = TOOLTIPS
           )
p.circle(x='TRID',
         color=mapper,
         y='FUNCLASS',
         size='AF',
         source=source,
         alpha=.4
         )
color_bar = ColorBar(color_mapper=mapper['transform'], width=8,  location=(0,0))
p.add_layout(color_bar, 'right')
p.xaxis.major_label_orientation = pi/2
output_file("gene_dist_{}.html".format(dataset))
show(p)

## VOC in intrahost context

In [None]:
voc = pd.read_csv(voc_url, sep='\t', header=None, names=['set','aa','position'])

In [None]:
voc_axis = pysqldf('select "set" || "/" || aa as id from voc order by "set", position')

In [None]:
voc.head()

Unnamed: 0,set,aa,position
0,B1351,P71L,26454
1,B1351,T205I,28885
2,B1351,K1655N,5227
3,B1351,D80A,21799
4,B1351,D215G,22204


In [None]:
# Subset of sites without aggregated arrays as they cause problems for pysqldf
sites_lite = sites[['POS','REF','ALT','FUNCLASS','TRID','AA','Samplenunique','AFmean','AFamin','AFamax','CoV']]

In [None]:
voc.head()

Unnamed: 0,set,aa,position
0,B1351,P71L,26454
1,B1351,T205I,28885
2,B1351,K1655N,5227
3,B1351,D80A,21799
4,B1351,D215G,22204


In [None]:
voc  = pysqldf('select voc.*, Samplenunique, CoV, AFmean,TRID from voc left join sites_lite on POS >= position and POS < position+3 order by "set",position')

In [None]:
voc = pysqldf('select  "set" || "/" || aa as id, * from voc order by "set", position')

In [None]:
from bokeh.palettes import Turbo256, linear_palette
palette = linear_palette(Turbo256,(len(annot['func'])))
gene_colors = {}
for i, gene in enumerate(annot['func']):
    gene_colors[gene] = palette[i]

In [None]:
for gene in voc['TRID'].unique():
    if gene is not None:
        voc.loc[voc['TRID']==gene,'color'] = gene_colors[gene]

In [None]:
line_ends = {}
for mut_set in voc['set'].unique():
    start = voc[ voc['set']==mut_set ]['position'].min()
    end   = voc[ voc['set']==mut_set ]['position'].max()
    a = pysqldf('select id from voc where "set" = "{}" and position = {}'.format(mut_set,start)).iloc[0][0]
    b = pysqldf('select id from voc where "set" = "{}" and position = {}'.format(mut_set,end)).iloc[0][0]
    line_ends[mut_set]=[a,b]

In [None]:
line_ends

{'A231': ['A231/F157L', 'A231/P681'],
 'B117': ['B117/T1001I', 'B117/S235F'],
 'B1351': ['B1351/K1655N', 'B1351/T205I'],
 'BLOOM': ['BLOOM/Y365', 'BLOOM/G496'],
 'P1': ['P1/S1188L', 'P1/P80R']}

In [None]:
#Making glyphs visible
size_factor = 100

In [None]:
voc['frac']=(voc['Samplenunique']/var['Sample'].nunique())*size_factor

In [None]:
voc.dropna(how='any')

Unnamed: 0,id,set,aa,position,Samplenunique,CoV,AFmean,TRID,color,frac
2,A231/Q613H,A231,Q613H,23398,2.0,0.020682,0.899425,S,#f2c83a,0.110011
3,A231/P681,A231,P681,23602,34.0,0.012589,0.945791,S,#f2c83a,1.870187
4,B117/T1001I,B117,T1001I,3265,33.0,0.015482,0.946245,nsp3,#424bb5,1.815182
5,B117/A1708D,B117,A1708D,5386,32.0,0.010993,0.936863,nsp3,#424bb5,1.760176
6,B117/I2230T,B117,I2230T,6952,33.0,0.025431,0.792387,nsp3,#424bb5,1.815182
10,B117/N501Y,B117,N501Y,23062,34.0,0.016316,0.900227,S,#f2c83a,1.870187
11,B117/A570D,B117,A570D,23269,32.0,0.010062,0.886,S,#f2c83a,1.760176
12,B117/A570D,B117,A570D,23269,2.0,1.170677,0.49177,S,#f2c83a,0.110011
13,B117/P681H,B117,P681H,23602,34.0,0.012589,0.945791,S,#f2c83a,1.870187
14,B117/T716I,B117,T716I,23707,34.0,0.165442,0.912401,S,#f2c83a,1.870187


In [None]:
from bokeh.models import Legend
from bokeh.transform import linear_cmap
from math import pi
from bokeh.transform import factor_cmap
from bokeh.models import BoxAnnotation, Toggle
from bokeh.palettes import Reds

source = ColumnDataSource(voc.dropna(how='any').rename(columns={"countunique(Sample)": "samples"}))
TOOLTIPS = [
            ("# samples","@Samplenunique"),
            ("Set","@set"),
            ("Mutation","@aa"),
            ("Position","@position{0,0}"),
            ("Product","@TRID"),
            ("Mean AF","@AFmean{0,0}")
]

# Upper-case only the first character of the dataset label: str.capitalize()
# would also lower-case the remainder (e.g. 'COG-Post' -> 'Cog-post').
voc_title = "VOCs in {} data ([min;max] = [{:.0f},{:.0f}] out of {})".format(
    dataset[:1].upper() + dataset[1:],
    voc['Samplenunique'].min(),
    voc['Samplenunique'].max(),
    var['Sample'].nunique()
    )

p = figure(plot_height=300,
    plot_width=600,
    x_range=voc['id'].unique(),
    x_axis_label = 'VOC Set/Change',
    y_axis_label = 'Coef of Var for AF',
    tooltips=TOOLTIPS,
    title = voc_title,
    y_range=Range1d(
        start=-0.5,
        end=2,
        bounds=(-.5, 2))
    )
p.add_layout(Legend(), 'right')

p.circle(x='id',
    color='color',
    y='CoV',
    radius='frac',
    source=source,
    legend_field="TRID",
    alpha=.5,
    line_color='black'
    )

# One horizontal bar per VOC set at y = -0.4, running from the set's first to
# its last mutation on the categorical x axis.
# Line thickness is the `line_width` property; the Line glyph has no `width`.
# Reds[5] provides five colours, so this supports at most five sets.
for i,mut_set in enumerate(voc['set'].unique()):
    p.line([line_ends[mut_set][0],line_ends[mut_set][1]], [-.4, -.4], line_color=Reds[5][i],line_width=10)

p.ygrid.grid_line_color = None
orientation = pi/2
p.legend.location = "top_center"
p.xaxis.major_label_orientation = pi/2
p.xaxis.major_label_text_font_size = "6pt"
p.yaxis.bounds= (0,2)
output_file("voc_{}.html".format(dataset))
show(p)

In [None]:
voc.dropna(how='any').sort_values(by=['frac'],ascending=False)

Unnamed: 0,id,set,aa,position,Samplenunique,CoV,AFmean,TRID,color,frac
56,P1/L18F,P1,L18F,21613,719.0,0.064168,0.875699,S,#f2c83a,39.548955
70,P1/E92K,P1,E92K,28166,54.0,0.013424,0.929534,orf8,#b91e01,2.970297
25,B1351/D215G,B1351,D215G,22204,41.0,0.02689,0.892518,S,#f2c83a,2.255226
65,P1/N501Y,P1,N501Y,23062,34.0,0.016316,0.900227,S,#f2c83a,1.870187
10,B117/N501Y,B117,N501Y,23062,34.0,0.016316,0.900227,S,#f2c83a,1.870187
28,B1351/N501Y,B1351,N501Y,23062,34.0,0.016316,0.900227,S,#f2c83a,1.870187
13,B117/P681H,B117,P681H,23602,34.0,0.012589,0.945791,S,#f2c83a,1.870187
14,B117/T716I,B117,T716I,23707,34.0,0.165442,0.912401,S,#f2c83a,1.870187
3,A231/P681,A231,P681,23602,34.0,0.012589,0.945791,S,#f2c83a,1.870187
4,B117/T1001I,B117,T1001I,3265,33.0,0.015482,0.946245,nsp3,#424bb5,1.815182


## Sites under selection in intrahost context

In [None]:
sel = pysqldf("select * from sel where fel_p <= 0.0001 or meme_p <= 0.0001")

In [None]:
sel.head()

Unnamed: 0,pos,gene,site,fel_p,fel_a,fel_b,meme_p,meme_a,meme_b_p,meme_b_m,freq,REF,ALT,codon,slip
0,26678,M,53,9.106369e-10,10.407975,0.0,0.666667,10.383362,95.789779,0.0,0.005273,314445,1667,TTC,0
1,26732,M,71,4.974971e-06,9.322545,0.0,0.666667,9.324877,13.987316,0.0,0.047293,301162,14950,TAC,0
2,26798,M,93,1.665335e-15,19.926007,0.0,0.666667,19.938639,0.93732,0.0,0.215041,248135,67977,CTC,0
3,26855,M,112,5.416528e-05,4.459943,0.0,0.666667,4.447241,11.443467,0.0,0.002123,315441,671,TTC,0
4,26873,M,118,2.65645e-08,13.114604,0.0,0.666667,13.128376,0.907803,0.0,0.023691,308623,7489,ATT,0


In [None]:
sites_sel = pysqldf('select sites_lite.*, freq,codon,fel_p, meme_p from sites_lite join sel on sites_lite.POS >= sel.pos and sites_lite.POS < sel.pos+3')

In [None]:
# Load per variant table
# This table contains additional information including allele frequencies for all variants
# Even if they did not pass our filters
per_var = pd.read_csv(per_variant_url,sep='\t')

In [None]:
# Convert coordinates into 0-based
per_var['POS'] = per_var['POS']-1

In [None]:
# Create dictionary with raw above 5% and below 5% counts:
# for every row of per_var, split the comma-separated 'AFs(all)' string and
# count how many allele frequencies are >= 0.05 and how many are not.
# The loop variables avoid the name `row`, which would overwrite the
# bokeh.layouts.row function imported earlier in the notebook.
sample_counts = {'POS':[],'above':[],'below':[]}
for site_pos, af_field in zip(per_var['POS'], per_var['AFs(all)']):
    sample_counts['POS'].append(site_pos)
    af_values = [float(item) for item in af_field.split(',')]
    n_above = sum(1 for value in af_values if value>=0.05)
    sample_counts['above'].append(n_above)
    sample_counts['below'].append(len(af_values)-n_above)

In [None]:
# Create data frame from above dict
sc = pd.DataFrame.from_dict(sample_counts)

In [None]:
sites_sel = pysqldf('select sites_sel.*, above,below from sites_sel left join sc on sites_sel.POS = sc.POS')

In [None]:
sites_sel.head()

Unnamed: 0,POS,REF,ALT,FUNCLASS,TRID,AA,Samplenunique,AFmean,AFamin,AFamax,CoV,freq,codon,fel_p,meme_p,above,below
0,312,C,T,Synonymous,leader,L16,15,0.763461,0.056962,0.909091,0.366996,0.043302,CTC,1.955128e-07,0.6666667,15,32
1,444,T,C,Synonymous,leader,V60,1266,0.983335,0.214249,1.0,0.035167,0.214465,GTT,3.191356e-09,0.6666667,1267,3
2,808,T,C,Non-synonymous,nsp2,Y2H,5,0.988706,0.987383,0.990969,0.001422,0.000455,TAC,3.186229e-07,8.729344e-07,5,2
3,814,C,T,Non-synonymous,nsp2,R4C,7,0.232731,0.055832,0.857494,1.239712,0.000804,CGC,1.922906e-13,7.21978e-13,7,113
4,822,C,T,Synonymous,nsp2,V6,2,0.499553,0.137966,0.861139,1.023637,0.002167,GTC,3.327888e-05,0.6666667,2,23


In [None]:
for func in sites['FUNCLASS'].unique():
    sites_sel.loc[sites_sel['FUNCLASS'] == func, 'color'] = colors[func]

In [None]:
colors

{'Indel': 'magenta',
 'Non-coding': 'blue',
 'Non-synonymous': 'orange',
 'Stop': 'red',
 'Synonymous': 'green'}

In [None]:
# Percentage of samples carrying each variant
sites_sel['frac']=(sites_sel['Samplenunique']/var['Sample'].nunique())*100
# NOTE(review): CoV is overwritten with 20x its value. Re-running this cell
# multiplies it again, and every later use of sites_sel['CoV'] sees the scaled
# number rather than the coefficient of variation itself.
sites_sel['CoV']=sites_sel['CoV']*20

In [None]:
sites_sel.head()

Unnamed: 0,POS,REF,ALT,FUNCLASS,TRID,AA,Samplenunique,AFmean,AFamin,AFamax,CoV,freq,codon,fel_p,meme_p,above,below,color,frac
0,312,C,T,Synonymous,leader,L16,15,0.763461,0.056962,0.909091,7.339917,0.043302,CTC,1.955128e-07,0.6666667,15,32,green,0.825083
1,444,T,C,Synonymous,leader,V60,1266,0.983335,0.214249,1.0,0.703332,0.214465,GTT,3.191356e-09,0.6666667,1267,3,green,69.636964
2,808,T,C,Non-synonymous,nsp2,Y2H,5,0.988706,0.987383,0.990969,0.028448,0.000455,TAC,3.186229e-07,8.729344e-07,5,2,orange,0.275028
3,814,C,T,Non-synonymous,nsp2,R4C,7,0.232731,0.055832,0.857494,24.794244,0.000804,CGC,1.922906e-13,7.21978e-13,7,113,orange,0.385039
4,822,C,T,Synonymous,nsp2,V6,2,0.499553,0.137966,0.861139,20.472744,0.002167,GTC,3.327888e-05,0.6666667,2,23,green,0.110011


In [None]:
sites_sel['frac'].max()

99.94499449944995

In [None]:
from bokeh.models import ColumnDataSource, Label, LabelSet,Range1d
from bokeh.layouts import column
import bokeh.io
import bokeh.plotting
from bokeh.models import Legend,ColumnDataSource
from bokeh.transform import linear_cmap
from math import pi
from bokeh.transform import factor_cmap
from bokeh.models import BoxAnnotation, Toggle,Range1d,Rect,Quad
from bokeh.plotting import figure, show
from bokeh.palettes import Set1_7,Reds
from bokeh.layouts import gridplot,column

bokeh.io.output_notebook()

af_cut = .8
source_var = ColumnDataSource(sites_sel[sites_sel['AFamax']<=af_cut])
gene_coord = ColumnDataSource(gene_track)
TOOLTIPS = [
            ("Position","@POS"),
            ("Funclass","@FUNCLASS"),
            ("Codon","@codon"),
            ("Amino acid",'@AA'),
            ("Samples with variant","@Samplenunique"),
            ("Gene","@TRID"),
            ("GISAID freq","@freq"),
            ("AF (min,max)","@AFamin,@AFamax")
    ]
# Main panel: variants at sites under selection along the genome.
# Only the first character of the dataset label is upper-cased:
# str.capitalize() would also lower-case the remainder
# (e.g. 'COG-Post' -> 'Cog-post').
p = figure(
    plot_height=400,
    plot_width=1200,
    x_axis_label = 'Genome coordinate',
    y_axis_label = 'Coefficient of Variation for AF',
    x_range=Range1d(start=0, end=30000,bounds=(0, 30000)),
    title='Sites under selection ({} dataset; {} AF cutoff)'.format(dataset[:1].upper() + dataset[1:],af_cut),
    tooltips=TOOLTIPS
    )
p.circle(
    x='POS',
    y='CoV',
    source=source_var,
    size="frac",
    alpha=.5,
    fill_color='color'
    )
#p.xaxis.ticker = df['POS']
p.xaxis.major_label_orientation = pi/2
p_labels = LabelSet(
    x='POS', 
    y='CoV', 
    text='POS', 
    level='glyph',
    x_offset=5, 
    y_offset=5, 
    source=source_var, 
    render_mode='canvas',
    angle=np.pi/6,
    text_font_size='6pt'
    )
p.add_layout(p_labels)

# Figure for annotation track
r = figure(
    x_range=p.x_range,
    plot_height=100,
    plot_width=1200,
    #**plot_options,
    y_axis_label='Genes',
    x_axis_label='Position in genome',
    y_range=Range1d(start=-1, end=2,bounds=(-1, 2)),
    tools='save'
    )

r_labels = LabelSet(
    x='start', 
    y='bottom', 
    text='func', 
    level='glyph',
    x_offset=0, 
    y_offset=0, 
    source=gene_coord, 
    render_mode='canvas',
    text_font_size='7pt', 
    angle=np.pi/2  
    )

genes = Quad(left="start", bottom='bottom', right='end', top='top',line_color='color',fill_color='color',fill_alpha=.5)
r.add_glyph(gene_coord, genes)
r.add_layout(r_labels)
r.yaxis.visible = False
r.ygrid.grid_line_color = None
output_file("selection_{}.html".format(dataset))
show(column(p,r))

In [None]:
print(sites_sel[(sites_sel['AFamax']<0.8) & (sites_sel['FUNCLASS']=='Non-synonymous')][['POS','REF','ALT','TRID','AA','frac','AFmean','AFamin','AFamax','codon','above','below']].sort_values(by=['frac'],ascending=False).to_markdown())

|     |   POS | REF   | ALT   | TRID   | AA     |     frac |   AFmean |   AFamin |   AFamax | codon   |   above |   below |
|----:|------:|:------|:------|:-------|:-------|---------:|---------:|---------:|---------:|:--------|--------:|--------:|
|  11 |  1463 | G     | A     | nsp2   | G220D  | 0.715072 | 0.737533 | 0.669195 | 0.767157 | GGT     |      13 |       0 |
| 172 | 22343 | G     | T     | S      | G261V  | 0.660066 | 0.703605 | 0.672658 | 0.737692 | GGT     |      12 |       4 |
| 220 | 25217 | G     | T     | S      | G1219V | 0.330033 | 0.752613 | 0.74618  | 0.756799 | GGT     |       6 |      11 |
| 157 | 21845 | C     | T     | S      | T95I   | 0.275028 | 0.466971 | 0.069175 | 0.731614 | ACT     |       5 |      54 |
| 312 | 29252 | C     | T     | N      | S327L  | 0.220022 | 0.196384 | 0.08399  | 0.359015 | TCG     |       4 |     120 |
| 305 | 29170 | C     | T     | N      | H300Y  | 0.165017 | 0.314531 | 0.076923 | 0.735849 | CAT     |       3 |      26 |
|  45 | 