# Browser

> Contains the GenomeBrowser class

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp browser

In [None]:
#| hide
from nbdev.showdoc import *


In [None]:
#| export
from fastcore.basics import *

from genomenotebook.utils import (
    get_genome_annotations,
    get_gene_patches, 
    get_genes_from_annotation, 
    create_genome_browser_plot,
    get_all_glyphs)
from bokeh.models import (
    CustomJS,
    Range1d,
    ColumnDataSource,
    LabelSet, 
    TextInput,
    AutocompleteInput,
    Rect,
    Div,
    Styles,
    NumeralTickFormatter
)
from bokeh.plotting import show, figure
from bokeh.layouts import column, row

from Bio import SeqIO
from genomenotebook.js_callback_code import (
    x_range_change_callback_code, 
    search_callback_code, 
    track_callback_code,
    get_example_data_dir
)
from bokeh.io import output_notebook
import numpy as np
import pandas as pd
import os
import itertools

try: #pyBigWig cannot be installed on Windows. This might make it possible for windows users to still install
    import pyBigWig
except ImportError:
    pyBigWig = None
    
import warnings

In [None]:
#| hide
#Configure Bokeh so that the figures shown below are displayed inline in the notebook
output_notebook()

In [None]:
#| export
class GenomeBrowser:
    """Interactive genome browser for one sequence of a fasta file and its gff3 annotations.

    The constructor reads the sequence record, applies `bounds` and builds the
    Bokeh elements (gene track, sequence `Div` and optional search box) stored
    in `self.elements`. Tracks appended to `self.tracks` are displayed below
    these elements by `show`.
    """
    def __init__(self,
                 genome_path: str, #path to the fasta file of the genome sequence
                 gff_path: str, #path to the gff3 file of the annotations
                 seq_id: str = None, #id of the sequence to show for genomes with multiple contigs
                 init_pos: int = None, #initial position to display
                 init_win: int = 10000, #initial window size (max=20000)
                 bounds: tuple = None, #bounds can be specified. This helps preserve memory by not loading the whole genome if not needed.
                 show_seq: bool = True, #shows the sequence when zooming in
                 search: bool = True, #enables a search bar to lookup a gene name or a DNA sequence
                 **kwargs):
        
        self.genome_path = genome_path
        self.gff_path = gff_path
        self.rec = self._get_sequence_record(seq_id)
        self.seq_len = len(self.rec.seq) #length of the reference sequence before bounds are applied
        self._apply_bounds(bounds)
        self._set_init_pos(init_pos)

        #The initial window cannot be wider than the bounded region
        self.init_win = min(init_win,self.bounds[1]-self.bounds[0])

        self.show_seq = show_seq
        self.max_glyph_loading_range = 20000
        self.frame_width = 600

        #Extra keyword arguments are forwarded to create_genome_browser_plot
        self.elements = self._get_browser(**kwargs)
        if search:
            #The search box is placed above the gene track
            self.elements = [self._get_search_box()]+self.elements
            
        self.tracks=[]
    
    
    def _set_init_pos(self, init_pos):
        """Set `self.init_pos`, using the middle of `self.bounds` when `init_pos` is None or out of bounds.

        A warning is emitted when the requested position lies outside of the bounds.
        """
        middle = sum(self.bounds)//2
        if init_pos is None:
            self.init_pos = middle
        elif init_pos>self.bounds[1] or init_pos<self.bounds[0]:
            warnings.warn("Requested an initial position outside of the browser bounds")
            self.init_pos = middle
        else:
            self.init_pos = init_pos
    
    def _apply_bounds(self, bounds):
        """Set `self.bounds` (whole sequence when `bounds` is None) and truncate the sequence to it.

        Raises a ValueError when user supplied bounds are not of the form
        0 <= start < end, as such bounds would silently slice the wrong
        (or an empty) sequence.
        """
        if bounds is None:
            self.bounds=(0,self.seq_len)
        else:
            start, end = bounds[0], bounds[1]
            if start < 0 or start >= end:
                raise ValueError(f"bounds must satisfy 0 <= start < end, got {bounds!r}")
            self.bounds=bounds
        
        self.rec.seq=self.rec.seq[self.bounds[0]:self.bounds[1]]

    def _get_sequence_record(self, seq_id):
        """Return the record of the fasta file whose id is `seq_id` (the first record when `seq_id` is None).

        When `seq_id` is not found a warning is emitted and the last record of
        the file is returned. A ValueError is raised when the file contains no
        record at all.
        """
        records = SeqIO.parse(self.genome_path, 'fasta')
        if seq_id is None: #when no seq_id is provided we take the first element
            rec = next(records, None)
        else:
            rec = None
            rec_found=False
            for rec in records:
                if rec.id==seq_id:
                    rec_found=True
                    break
            
            if rec is not None and not rec_found:
                warnings.warn("seq_id not found in fasta file")

        if rec is None:
            #Without this check an empty file ends in an UnboundLocalError or a bare StopIteration
            raise ValueError(f"No sequence record found in fasta file: {self.genome_path}")
        return rec

    def _get_browser(self, **kwargs):
        """Build the gene track and the sequence `Div`, and return them as a list of Bokeh elements.

        Sets `self.glyph_source`, `self.loaded_range`, `self.all_glyphs`,
        `self.div`, `self.x_range` and `self.gene_track` as a side effect.
        The `Div` is only part of the returned list when `self.show_seq` is True.
        """

        #Initial view: init_win wide, centered on init_pos and clipped to the bounds
        semi_win = self.init_win / 2
        x_range = Range1d(
            max(self.bounds[0],self.init_pos - semi_win), min(self.bounds[1],self.init_pos + semi_win), 
            bounds=self.bounds, 
            max_interval=100000,
            min_interval=40
        )
        
        annotation = get_genome_annotations(self.gff_path,
                                            seq_id = self.rec.id,
                                            bounds = self.bounds)
        
        genes = get_genes_from_annotation(annotation) 
        
        #This contains the glyphs plotted by bokeh
        self.glyph_source = ColumnDataSource(get_gene_patches(genes, x_range.start, x_range.end))

        #This contains the positions of the glyphs plotted by bokeh
        self.loaded_range = ColumnDataSource({"start":[x_range.start],
                                                "end":[x_range.end], 
                                                "range":[self.max_glyph_loading_range]})

        #This contains the glyphs for the whole genome
        self.all_glyphs=get_all_glyphs(genes, self.bounds)

        p = create_genome_browser_plot(self.glyph_source, 
                                       x_range, **kwargs)
        p.frame_width=self.frame_width

        #NOTE(review): "Courrier" is not the usual spelling of the Courier font family.
        #It is left untouched because the sequence rendering done by the javascript
        #callback may have been tuned with the resulting fallback font - confirm before changing.
        sty=Styles(font_size='14px',
                font_family="Courrier",
                color="black",
                display="inline-block",
                background_color = "white",
                margin="0",
                margin_left= "2px",
                )
        
        ## Adding the ability to display the sequence when zooming in
        sequence = {
            'seq': str(self.rec.seq).upper(),
            'bounds':self.bounds
        }

        #The Div has the same width as the frame of the gene track
        self.div = Div(height=18, height_policy="fixed", 
                    width=self.frame_width, width_policy="fixed",
                    styles = sty
                    )
        
        xcb = CustomJS(
            args={
                "x_range": p.x_range,
                "sequence": sequence,
                "all_glyphs":self.all_glyphs,
                "glyph_source": self.glyph_source,
                "div": self.div,
                "loaded_range":self.loaded_range,
            },
            code=x_range_change_callback_code
        )

        #The callback is triggered by changes of the start of the x_range
        p.x_range.js_on_change('start', xcb)
        self.x_range=p.x_range
        self.gene_track=p

        if self.show_seq:
            return [p,self.div]
        else:
            return [p]
        
    def _get_search_box(self):
        """Create the search input, with the names of all glyphs as completions, and return it.

        A green semi-transparent `Rect` glyph is added to the gene track; its
        data source is handed to the javascript search callback.
        """
        ## Create a text input widget for search
        text_input = AutocompleteInput(completions=self.all_glyphs["names"], value="")

        ## Adding a Rect glyph to highlight search results
        search_span_source = ColumnDataSource({"x":[],"width":[]})#"y":[]
        h=Rect(x='x',y=-2,width='width',height=self.gene_track.height,fill_color='green',fill_alpha=0.2,line_alpha=0)
        self.gene_track.add_glyph(search_span_source, h)

        call_back_search = CustomJS(
            args={
                "x_range": self.x_range,
                "glyph_source": self.glyph_source,
                "bounds": self.bounds,
                "all_glyphs": self.all_glyphs,
                "loaded_range": self.loaded_range,
                "text_input": text_input,
                "search_span_source": search_span_source,
                "div": self.div,
            },
            code=search_callback_code
        )

        text_input.js_on_change('value',call_back_search)#,xcb)

        return text_input
    
    def show(self):
        """Display the browser elements followed by the figures of the added tracks, stacked in a column."""
        show(column(self.elements + [t.fig for t in self.tracks]))



In [None]:
#| hide
#Useful for javascript development: the callback code is not automatically reimported,
#so both callback sources are read again directly from their files

file_path = "../genomenotebook/javascript/x_range_change_callback_code.js"
with open(file_path, 'r') as handle:
    x_range_change_callback_code = handle.read()

file_path = "../genomenotebook/javascript/search_callback_code.js"
with open(file_path, 'r') as handle:
    search_callback_code = handle.read()

In [None]:
#Paths of the example genome (fasta) and annotation (gff) files inside the directory returned by get_example_data_dir
data_path = get_example_data_dir()
genome_path = os.path.join(data_path, "GCA_000189435.3_ASM18943v3_genomic.fna")
gff_path = os.path.join(data_path, "GCA_000189435.3_ASM18943v3_genomic.gff")

#No seq_id is given, so the first sequence of the fasta file is shown, starting around position 10000
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, init_pos=10000)
g.show()


In [None]:
#| hide
#Testing svg output: output_backend is not a GenomeBrowser parameter, it is forwarded
#through **kwargs to create_genome_browser_plot
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, bounds=(2000,5000), output_backend="svg")
g.show()

In [None]:
#| hide
#Testing out of bounds warning and seq_id warning:
#init_pos=10000 lies outside of bounds=(2000,5000), and "AF" is presumably not the id of any sequence of the fasta file
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, init_pos=10000, bounds=(2000,5000), seq_id="AF")
g.show()



#### Visualising multiple contigs

In [None]:
genome_path = os.path.join(data_path, "jmh43.fna")
gff_path = os.path.join(data_path, "jmh43.gff")

#One browser per sequence, limited to the first 5 sequences of the fasta file
for rec in itertools.islice(SeqIO.parse(genome_path,"fasta"),5):
    
    g=GenomeBrowser(genome_path=genome_path, 
                    gff_path=gff_path, 
                    seq_id=rec.id, 
                    search=False)
    g.show()

In [None]:
#| export
#| hide
class Track:
    """Wrapper around a Bokeh figure (`self.fig`) of a given height, zoomable and pannable along x only."""
    def __init__(self,
                 height: int = 200, #size of the track
                 output_backend="webgl" 
                ):
        figure_options = dict(
            tools="xwheel_zoom,xpan,save,reset",
            active_scroll="xwheel_zoom",
            height=height,
            y_axis_location="right", #this is required in order to keep a proper alignment with the sequence
            output_backend=output_backend,
        )
        self.fig = figure(**figure_options)

        #Positions on the x axis are displayed with thousands separators
        x_axis = self.fig.xaxis[0]
        x_axis.formatter = NumeralTickFormatter(format="0,0")

        self.height = height
        #Placeholders, no data is attached to the track yet
        self.track_loaded_data = None
        self.track_all_data = None
        self.loaded_range = None
        
        


In [None]:
#| hide
#The requested height must be applied to the underlying Bokeh figure
t=Track(height=300)
assert t.fig.height == 300

In [None]:
#| export
@patch
def add_track(self:GenomeBrowser,
             height:int = 200, #size of the track
             output_backend="webgl", #can be set to webgl (more efficient) or svg (for figure export)
             ) -> Track:
    """Adds a track to the GenomeBrowser and returns it. The track shares the x_range of the browser and has the same frame width."""
    new_track = Track(height=height, output_backend=output_backend)

    #Same navigation range and same width as the gene track
    figure_of_track = new_track.fig
    figure_of_track.x_range = self.x_range
    figure_of_track.frame_width = self.frame_width

    #Settings of the browser that the track needs to filter and load its data
    new_track.bounds = self.bounds
    new_track.max_glyph_loading_range = self.max_glyph_loading_range
    #The track gets its own data source, initialised with the data of the browser's loaded range
    new_track.loaded_range = ColumnDataSource(self.loaded_range.data)

    self.tracks.append(new_track)
    return new_track
    

Adding a track with random points as a demonstration.
`genomenotebook` is built on the Bokeh library: `track.fig` is a regular Bokeh figure, so you can draw anything on it with the usual Bokeh plotting methods.

In [None]:
data_path = get_example_data_dir()
genome_path = os.path.join(data_path, "MG1655_U00096.fasta")
gff_path = os.path.join(data_path, "MG1655_U00096.gff3")

g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, bounds=(0,100000), search=False, show_seq=False)

track = g.add_track()

#Seeded generator, so that the demonstration shows the same points every time the notebook is run
rng = np.random.default_rng(0)
x= np.arange(0,100000,100)
y= rng.integers(0,10,size=x.shape) #random integers between 0 and 9
track.fig.scatter(x=x,y=y)
g.show()

In [None]:
#| export
#| hide
@patch
def _set_track_data_source(self:Track, data, pos, columns):
    """Attach `data` to the track: `self.all_data` holds the rows within the track bounds, `self.loaded_data` the rows initially given to the glyphs.

    `pos` is the name of the column of positions and `columns` the names of
    the other columns to keep (None entries are ignored). A javascript
    callback receiving both data sources is registered on the start of the
    x_range of the figure.
    """
    columns=[c for c in columns if c] #some arguments can be None => remove them
    data=data.loc[(self.bounds[0] < data[pos]) & (data[pos] < self.bounds[1]),
                  [pos]+columns]
    #Sort on the position column chosen by the caller, which is not necessarily named "pos"
    data=data.sort_values(pos)
    if len(data)>10**5:
        warnings.warn("You are trying to plot more than 10^5 glyphs, this might crash your memory. \
        Consider using bounds or reducing the number of datapoints.")
        
    self.all_data=ColumnDataSource(data)
    #NOTE(review): for a track created by add_track, loaded_range "start" and "end" hold the
    #initial limits of the x_range, so this keeps the rows with 0 < pos < 2*end rather than a
    #margin around the view. Left unchanged because the javascript callback is not visible
    #from here - confirm the intended initial window.
    self.loaded_data=ColumnDataSource(
        data.loc[(self.fig.x_range.start - self.loaded_range.data["start"][0] < data[pos]
                 ) & (
                 data[pos] < self.fig.x_range.end + self.loaded_range.data["end"][0])]
    )
    
    xcb = CustomJS(
            args = {
                "x_range": self.fig.x_range,
                "all_data":self.all_data,
                "loaded_data": self.loaded_data,
                "track_loaded_range":self.loaded_range,
            },
            code = track_callback_code
        )

    self.fig.x_range.js_on_change('start', xcb)


In [None]:
#| export
@patch
def line(self:Track,
         data: pd.DataFrame, #pandas DataFrame containing the data
         pos: str, #name of the column containing the positions along the genome
         y: str, #name of the column containing the data to be plotted on the y-axis
         **kwargs #enables to pass keyword arguments used by the Bokeh function
        ):
    """Draws the `y` column of `data` against its `pos` column as a line on the track."""
    #Only the position and y columns are kept in the data sources of the track
    self._set_track_data_source(data, pos, columns=[y])
    line_options = dict(source=self.loaded_data, x=pos, y=y)
    self.fig.line(**line_options, **kwargs)


#### Plotting some ChIP-seq data 

In [None]:
g=GenomeBrowser(genome_path=genome_path, 
                gff_path=gff_path, 
                init_pos=50000,
                bounds=(30000,85000), 
                search=False, 
                show_seq=False)

#Importing some coverage data from a BigWig file
#NOTE(review): pyBigWig is None when its import failed (see the try/except around the import), in which case this cell cannot run
bw_file_path=os.path.join(data_path,"ChIP-ACCCA-1.bw")
refname='NC_000913'
with pyBigWig.open(bw_file_path) as bw:
    #g.seq_len is the length of the sequence before the bounds are applied
    cov=bw.values(refname,0,g.seq_len,numpy=True)
    
#Keeping one coverage value every 10 positions
data=pd.DataFrame({"pos": np.arange(0,g.seq_len,10),
                     "cov": cov[::10]})

track=g.add_track()
track.line(data,pos="pos",y="cov", 
           line_color="blue",
           line_width=2)
g.show()

In [None]:
#| export
from bokeh.transform import factor_cmap

In [None]:
#| export
@patch
def scatter(self:Track,
         data: pd.DataFrame, #pandas DataFrame containing the data
         pos: str, #name of the column containing the positions along the genome
         y: str, #name of the column containing the data to be plotted on the y-axis
         factors: str = None, #name of a column of values to be used as factors
         **kwargs, #enables to pass keyword arguments used by the Bokeh function
        ):
    """Draws the `y` column of `data` against its `pos` column as points, optionally colored by the `factors` column.

    When `factors` is given, a legend titled with the name of that column is
    added in the top left corner of the track.
    """
    self._set_track_data_source(data, pos, columns=[y,factors])
    
    if factors is not None:
        #Missing values are not factor levels, and the levels are sorted so that a given
        #level gets the same color on every run and on every track (a set has no stable order)
        levels=tuple(sorted(set(data[factors].dropna()), key=str))
        color=factor_cmap(factors,"Category10_3",levels)
        
        self.fig.scatter(source=self.loaded_data, x=pos, y=y, color=color, legend_group=factors, **kwargs)
        
        self.fig.legend.title = factors
        self.fig.legend.location = "top_left"
    else:
        self.fig.scatter(source=self.loaded_data, x=pos, y=y, **kwargs)


#### Plotting some CRISPR screen data

In [None]:
#Opening the Cui 2018 CRISPRi screen data
#The name first holds the URL of the csv file, then the DataFrame read from it (this requires network access)
cui2018data="https://gitlab.pasteur.fr/dbikard/badSeed_public/raw/master/screen_data.csv"
cui2018data=pd.read_csv(cui2018data,index_col=0)
cui2018data.head()

Unnamed: 0_level_0,gene,essential,pos,ori,coding,fit18,fit75,ntargets,seq
guide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAAAAACCTGCTGGTGAGGC,,,2202483,-,,-4.850012,-1.437546,1,AAAGCAGATCACAGTAAATAAAAAAACCTGCTGGTGAGGCAGGTTC...
AAAAAACGTATTCGCTTGCA,curA,False,1517891,+,False,-0.094026,-0.100313,1,TGTTGATGGCTACAGTGCTGAAAAAACGTATTCGCTTGCAAGGTTT...
AAAAAAGCGCACTTTTTGAC,,,1919717,+,,-1.10931,-0.24674,1,GTAACGCCTGACAGCGCACAAAAAAAGCGCACTTTTTGACTGGCAC...
AAAAAAGCGGTGACTTACGA,bglA,False,3042929,+,False,-1.328831,-0.905068,1,GCGCCCATATCGAAGAGATGAAAAAAGCGGTGACTTACGATGGCGT...
AAAAAATCTGCCCGTGTCGT,gyrA,True,2337231,-,False,-0.840373,-0.598858,1,ATGACTGGAACAAAGCCTATAAAAAATCTGCCCGTGTCGTTGGTGA...


In [None]:
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, bounds=(0,100000), search=False, show_seq=False)

#First track: "fit75" column, points colored by the "ori" column
track=g.add_track(height=100)
track.scatter(data=cui2018data,pos="pos",y="fit75",factors="ori")

#Second track: "fit18" column, points colored by the "ori" column
track2=g.add_track(height=100)
track2.scatter(data=cui2018data,pos="pos",y="fit18",factors="ori")
g.show()


In [None]:
#| export
@patch
def bar(self:Track,
         source: pd.DataFrame, #pandas DataFrame containing the data
         pos: str, #name of the column containing the positions along the genome
         y: str, #name of the column containing the data to be plotted on the y-axis
         z: str = None, #name of a column containing numerical data rendered as a linear color map (cannot be used for line plots)
         factors: str = None, #name of a column of values to be used as factors
         **kwargs, #enables to pass keyword arguments used by the Bokeh function
        ):
    """Draws the `y` column of `source` as vertical bars located at `pos`, optionally colored by the `factors` column.

    When `factors` is given, a legend titled with the name of that column is
    added in the top left corner of the track. The `z` color map is not
    implemented: a warning is emitted and `z` is ignored.
    """
    if z is not None:
        #This used to end in nothing being drawn at all, without any message
        warnings.warn("The z color map is not implemented for bar plots, the z argument is ignored")

    #Same data pipeline as the line and scatter methods (Track has no filter_source method)
    self._set_track_data_source(source, pos, columns=[y,factors])
    
    if factors is not None:
        #Missing values are not factor levels, and the levels are sorted so that a given
        #level gets the same color on every run and on every track
        levels=tuple(sorted(set(source[factors].dropna()), key=str))
        color=factor_cmap(factors,"Category10_3",levels)
        
        self.fig.vbar(source=self.loaded_data, x=pos, top=y, color=color, legend_group=factors, **kwargs)

        self.fig.legend.location = "top_left"
        #The legend is titled with the factors column (the title used to be hard-coded to "ori")
        self.fig.legend.title = factors
    else:
        self.fig.vbar(source=self.loaded_data, x=pos, top=y, **kwargs)
        

Showing the same data as vertical bars

In [None]:
#search and show_seq are left to their defaults here, so the search box and the sequence are included
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, bounds=(70000,110000))
track=g.add_track()
#Bars for the "fit75" column, with the "ori" column used as factors
track.bar(source=cui2018data,pos="pos",y="fit75",factors="ori")
g.show()


AttributeError: 'Track' object has no attribute 'filter_source'

In [None]:
#| hide
#Runs the nbdev export, which writes the cells marked with "#| export" to the python module
import nbdev; nbdev.nbdev_export()