In [1]:
%matplotlib widget

# Set display to the full length

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ipywidgets import interact
from ipywidgets import interactive

from matplotlib.widgets import Button
from matplotlib.text import Annotation

from Bio import SeqIO

In [2]:
# Read in variant list and genbank file with records

df = pd.read_csv('variant_list.tsv', sep='\t')
df.columns = [c if not c.startswith('EFF[*].') else c[7:] for c in df.columns]
record = SeqIO.read('NC_045512.2.genbank','genbank')

# Add a column with a textual representation of each variant
df['textual_variant'] = df['POS'].astype(str) + df['REF'] + '/' + df['ALT']

# Add a column with the delta of higest to lowest AF
min_max_af = df.groupby('textual_variant').agg({'AF': (min, max)})
af_delta = min_max_af['AF']['max'] - min_max_af['AF']['min']
df.index = df['textual_variant']
df.loc[af_delta.index, 'AF_delta'] = af_delta

In [3]:
df

Unnamed: 0_level_0,Sample,CHROM,POS,REF,ALT,DP,AF,SB,DP4,IMPACT,FUNCLASS,EFFECT,GENE,CODON,textual_variant,AF_delta
textual_variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1409C/T,SRR10903401,NC_045512,1409,C,T,126,0.039683,1,675423,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,Cat/Tat,1409C/T,0.038964
1821G/A,SRR10903401,NC_045512,1821,G,A,93,0.096774,0,483654,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,gGt/gAt,1821G/A,0.352701
1895G/A,SRR10903401,NC_045512,1895,G,A,106,0.037736,0,515122,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,Gta/Ata,1895G/A,0.000000
2407G/T,SRR10903401,NC_045512,2407,G,T,123,0.024390,0,576312,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,aaG/aaT,2407G/T,0.000000
3379A/G,SRR10903401,NC_045512,3379,A,G,121,0.024793,0,566212,LOW,SILENT,SYNONYMOUS_CODING,orf1ab,gtA/gtG,3379A/G,0.023311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29515T/A,SRR11454615,NC_045512,29515,T,A,172,0.029070,1,689414,LOW,SILENT,SYNONYMOUS_CODING,N,gcT/gcA,29515T/A,0.084854
29515T/G,SRR11454615,NC_045512,29515,T,G,172,0.029070,1,689414,LOW,SILENT,SYNONYMOUS_CODING,N,gcT/gcG,29515T/G,0.000000
29556A/G,SRR11454615,NC_045512,29556,A,G,170,0.023529,0,749222,.,.,.,.,.,29556A/G,0.022859
29573G/A,SRR11454615,NC_045512,29573,G,A,219,0.018265,0,9911522,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,ORF10,Gtt/Att,29573G/A,0.000000


In [10]:
# Define an interactive scatterplot function

DF=df
HANDLER = {}

def scatterplot_interactive(min_af=0, min_af_delta=0, min_occurence=1):
    df = DF
    df = df[(df['AF'] >= min_af)]
    by_var = df.groupby(['POS', 'REF', 'ALT']).size()
    df.index = pd.MultiIndex.from_frame(df[['POS', 'REF', 'ALT']])
    df.loc[by_var.index, 'Occurence'] = by_var;
    df = df[(df['Occurence'] >= min_occurence) & (df['AF_delta'] >= min_af_delta)]


    def scatterplot(df, ax=None):
        markers = {".":"*", "MISSENSE":"^", "NONSENSE":"X", "SILENT":"o"}
        ax = sns.scatterplot(x='POS',y='AF',data=df[df['AF']>=0.05],hue='Sample',style='FUNCLASS',legend='brief',alpha=0.6,s=500,markers=markers, ax=plt.gca(), picker=True)
        i = 0
        for feature in record.features:
            if feature.type == 'gene':
                i += 1
                gene_name = feature.qualifiers['gene'][0]
                if gene_name.startswith('ORF'):
                    gene_name=gene_name[3:]
                if i % 2 == 0:
                    color ='tomato'
                    y = 0.01
                else:
                    color ='deepskyblue'
                    y = -0.01
                plt.hlines(y,feature.location._start.position, feature.location._end.position, linewidth=20, color=color)
                plt.text(feature.location._start.position+(feature.location._end.position-feature.location._start.position)//2, y, gene_name,horizontalalignment='center',verticalalignment='center',fontsize=15)
        plt.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)
        return ax.figure

    plt.clf()
    scatterplot(df)

    # create and add an annotation object (a text label)
    def annotate(axis, text, x, y):
        text_annotation = Annotation(text, xy=(x, y), xycoords='data')
        axis.add_artist(text_annotation)


    # define the behaviour -> what happens when you pick a dot on the scatterplot by clicking close to it

    def onpick(event):
        # step 1: take the index of the dot which was picked
        if event.mouseevent.button in {'up', 'down'}:
            return
        ind = event.ind
        ax = plt.gca()

        # step 2: save the actual coordinates of the click, so we can position the text label properly
        label_pos_x = event.mouseevent.xdata
        label_pos_y = event.mouseevent.ydata

        # just in case two dots are very close, this offset will help the labels not appear one on top of each other
        offset = 0

        # if the dots are to close one to another, a list of dots clicked is returned by the matplotlib library
        for i in ind:
            # step 3: take the label for the corresponding instance of the data
            row = df.iloc[i,]
            label = "{row.POS} {row.REF}/{row.ALT}".format(row=row)

            # step 5: create and add the text annotation to the scatterplot
            annotate(
                ax,
                label,
                label_pos_x + offset,
                label_pos_y + offset
            )

            # step 6: force re-draw

            ax.figure.canvas.draw_idle()

            # alter the offset just in case there are more than one dots affected by the click
            offset += 0.05


    # connect the click handler function to the scatterplot
    fig = plt.gcf()
    if 'CID' in HANDLER:
        fig.canvas.mpl_disconnect(HANDLER['CID'])
    HANDLER['CID'] = fig.canvas.mpl_connect('pick_event', onpick)
    plt.show()

Let's have a look at the Allele Frequencies of intraindividual variants.

In [11]:
fig, ax = plt.subplots(figsize=(20, 7.5))
interactive(scatterplot_interactive,
            min_af=widgets.FloatSlider(min=0.0, max=1.0, value=0.05, step=0.01),
            min_occurence=widgets.IntSlider(min=1, step=1),
            min_af_delta=widgets.FloatSlider(min=0.0, max=1.0, value=0, step=0.01),
           )

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

interactive(children=(FloatSlider(value=0.05, description='min_af', max=1.0, step=0.01), FloatSlider(value=0.0…

There are 3 slides that control the scatterplot above. One can set the minimum allele frequecy for a variant to be plotted,
the minimum allele frequency delta (the highest allele frequency minus the lowest allele frequency observed), and the min_occurence
slider, which selects only vartiants that have been observed at least N times.

At a minimum allele frequency of 0.05, allele frequency delta of 0.5 and a minimum occurence of 3 (meaning that variant has been sampled 3 times)
we see multiple variants.

10779T/A for instance can be the major variant as well as the minor variant.

In [14]:
DELTA = 0.5


filtered_af_delta = df[df['AF_delta'] > DELTA].reset_index(drop=True).sort_values(['textual_variant', 'AF_delta'])
filtered_af_delta.style.bar(subset=['AF'],color='#d65f5f')

Unnamed: 0,Sample,CHROM,POS,REF,ALT,DP,AF,SB,DP4,IMPACT,FUNCLASS,EFFECT,GENE,CODON,textual_variant,AF_delta
6,SRR11059942,NC_045512,10024,A,T,68,0.632353,1,8171231,LOW,SILENT,SYNONYMOUS_CODING,orf1ab,ccA/ccT,10024A/T,0.612513
261,SRR11454613,NC_045512,10024,A,T,1521,0.023669,8,7727091624,LOW,SILENT,SYNONYMOUS_CODING,orf1ab,ccA/ccT,10024A/T,0.612513
269,SRR11454614,NC_045512,10024,A,T,2873,0.01984,20,148813192138,LOW,SILENT,SYNONYMOUS_CODING,orf1ab,ccA/ccT,10024A/T,0.612513
41,SRR11059947,NC_045512,1059,C,T,55521,0.000666,0,23658317602634,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,aCc/aTc,1059C/T,0.982719
87,SRR11177792,NC_045512,1059,C,T,50495,0.000733,36,25773245635521,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,aCc/aTc,1059C/T,0.982719
197,SRR11410529,NC_045512,1059,C,T,1442,0.979196,0,1119323,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,aCc/aTc,1059C/T,0.982719
242,SRR11410528,NC_045512,1059,C,T,963,0.983385,0,784179,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,aCc/aTc,1059C/T,0.982719
18,SRR11059943,NC_045512,10773,A,C,81,0.975309,0,1368,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,gAa/gCa,10773A/C,0.967969
21,SRR11059944,NC_045512,10773,A,C,1771,0.00734,9,5601122214,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,gAa/gCa,10773A/C,0.967969
47,SRR11059947,NC_045512,10773,A,C,40480,0.00919,87,1763920571235146,MODERATE,MISSENSE,NON_SYNONYMOUS_CODING,orf1ab,gAa/gCa,10773A/C,0.967969
