# browser

> Contains the GenomeBrowser class

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp browser

In [None]:
#| hide
from nbdev.showdoc import *


In [None]:
#| export
from fastcore.basics import *

from genomenotebook.utils import (
    get_feature_patches, 
    create_genome_browser_plot,
    get_default_glyphs,
    parse_gff,
)

from genomenotebook.javascript import (
    x_range_change_callback_code, 
    search_callback_code, 
)
from bokeh.models import (
    CustomJS,
    Range1d,
    ColumnDataSource,
    AutocompleteInput,
    Rect,
    Div,
    Styles
)
from bokeh.plotting import show
from bokeh.layouts import column

from Bio import SeqIO

import numpy as np
import warnings

In [None]:
#| export
class GenomeBrowser:
    """Interactive Bokeh genome browser built from a GFF3 annotation file.

    The annotations are parsed when the object is created. If at least one
    feature is found, the sequence, glyph patches and initial x range are
    prepared so that `show` can render the browser; otherwise `show` does nothing.
    """
    def __init__(self,
                 gff_path: str, #path to the gff3 file of the annotations
                 genome_path: str = None, #path to the fasta file of the genome sequence
                 seq_id: str = None, #id of the sequence to show for genomes with multiple contigs
                 init_pos: int = None, #initial position to display
                 init_win: int = 10000, #initial window size (max=20000)
                 bounds: tuple = None, #bounds can be specified. This helps preserve memory by not loading the whole genome if not needed.
                 show_seq: bool = True, #shows the sequence when zooming in
                 search: bool = True, #enables a search bar to lookup a gene name or a DNA sequence
                 attributes: list = ["gene", "locus_tag", "product"], #list of attribute names from the GFF attributes column to be extracted
                 feature_name: str = "gene", #attribute to be displayed as the feature name
                 feature_types: list = ["CDS", "repeat_region", "ncRNA", "rRNA", "tRNA"], # list of feature types to display
                 glyphs: dict = None, #dictionary defining the type and color of glyphs to display for each feature type
                 **kwargs):
        # Extra keyword arguments are stored and forwarded to create_genome_browser_plot by `show`.

        self.gff_path = gff_path
        self.genome_path = genome_path
        # The sequence can only be displayed when a fasta file was provided
        self.show_seq = show_seq if genome_path is not None else False
        # Copy list arguments so that the shared default lists of the signature
        # can never be mutated through an instance
        self.attributes = list(attributes) if isinstance(attributes, list) else attributes
        self.feature_types = list(feature_types) if isinstance(feature_types, list) else feature_types
        self.feature_name = feature_name
        self.glyphs = get_default_glyphs() if glyphs is None else glyphs

        self.features = parse_gff(gff_path,
                                      seq_id=seq_id,
                                      bounds=bounds,
                                      feature_types=self.feature_types
                                     )
        
        self.kwargs=kwargs
        self.bounds=bounds
        self.search=search
        self.init_pos=init_pos
        self.init_win=init_win
        
        # Nothing can be prepared (or shown) when no feature was parsed
        if len(self.features)>0:
            self._prepare_data()
            
    def _prepare_data(self):
        """Prepare the sequence, bounds, glyph patches and initial x range used by `show`."""
        if self.feature_name not in self.features.columns:
            self.features[self.feature_name]=""

        # Positional access: do not rely on the features index containing the label 0
        self.seq_id = self.features.seq_id.iloc[0]
        self._get_sequence()

        if self.bounds is None: self.bounds=(0,self.seq_len)

        self.patches = get_feature_patches(self.features, 
                                         self.bounds[0], 
                                         self.bounds[1],
                                         patch_dict=self.glyphs,
                                         attributes=self.attributes,
                                         name = self.feature_name)

        self._set_init_pos()
        # The initial window cannot be wider than the bounds
        self.init_win = min(self.init_win,self.bounds[1]-self.bounds[0])

        
        self.tracks=[]
        semi_win = self.init_win / 2
            
        # Initial view: init_win positions centered on init_pos, clipped to the bounds
        self.x_range = Range1d(
            max(self.bounds[0],self.init_pos - semi_win), min(self.bounds[1],self.init_pos + semi_win), 
            bounds=self.bounds, 
            max_interval=100000,
            min_interval=40
        )

        self.max_glyph_loading_range = 20000
        self.frame_width = 600
        # Every column referenced by the highlight Rect glyph in _get_browser must exist,
        # even when nothing is highlighted, otherwise Bokeh reports E-1001 (BAD_COLUMN_NAME)
        self.highlight_regions = {"x":[],"width":[],"colors":[],"alpha":[]}

    def _get_sequence(self):
        """Load the sequence record matching `seq_id` and set `seq_len`.

        Without a fasta file, `seq_len` is the largest `right` coordinate of the features.
        Raises a ValueError when the fasta file contains no record at all.
        """
        if self.genome_path is not None: 
            rec=None
            rec_found=False
            for rec in SeqIO.parse(self.genome_path, 'fasta'):
                if rec.id==self.seq_id:
                    rec_found=True
                    break

            if rec is None:
                raise ValueError(f"No sequence record found in fasta file: {self.genome_path}")

            if not rec_found:
                # The last record of the file is used as a fallback in this case
                warnings.warn("seq_id not found in fasta file")
            
            self.rec=rec
            self.seq_len = len(self.rec.seq) #length of the reference sequence before bounds are applied
            if self.bounds:
                self.rec.seq=self.rec.seq[self.bounds[0]:self.bounds[1]]    
        else: 
            self.seq_len = self.features.right.max()
        
    def _set_init_pos(self):
        """Default `init_pos` to the middle of the bounds when it is missing or out of bounds."""
        if self.init_pos is None:
            self.init_pos=sum(self.bounds)//2
        elif self.init_pos>self.bounds[1] or self.init_pos<self.bounds[0]:
            warnings.warn("Requested an initial position outside of the browser bounds")
            self.init_pos=sum(self.bounds)//2

    def _get_browser(self, **kwargs):
        """Build the gene track and the sequence Div, and attach the x range callback.

        `kwargs` are forwarded to `create_genome_browser_plot`.
        Returns the list of elements to display: the gene track, followed by
        the sequence Div when `show_seq` is enabled.
        """
        # Range of positions for which glyphs are loaded initially
        loaded_start = self.x_range.start-self.max_glyph_loading_range
        loaded_end = self.x_range.end+self.max_glyph_loading_range

        #Filter initial glyphs by position
        xs = self.patches['xs']
        in_loaded_range = xs.apply(lambda x: max(x)>loaded_start) & xs.apply(lambda x: min(x)<loaded_end)
        feature_patches = self.patches.loc[in_loaded_range]
        
        self._glyph_source = ColumnDataSource(feature_patches.to_dict(orient="list"))
        
        #Information about the range currently plotted
        self._loaded_range = ColumnDataSource({"start":[loaded_start],
                                                "end":[loaded_end], 
                                                "range":[self.max_glyph_loading_range]})

        self.gene_track = create_genome_browser_plot(self._glyph_source, 
                                       self.x_range, 
                                       attributes=self.attributes,
                                       **kwargs)
        
        # Adding the possibility to highlight regions
        highlight_source = ColumnDataSource(self.highlight_regions)
        h=Rect(x='x',y=0,width='width',height=self.gene_track.height,fill_color="colors",fill_alpha="alpha",line_alpha=0)
        self.gene_track.add_glyph(highlight_source, h)
        
        self.gene_track.frame_width=self.frame_width

        # NOTE(review): "Courrier" is not a standard font family name ("Courier"?).
        # Left unchanged because the sequence layout may have been tuned with the fallback font - confirm.
        sty=Styles(font_size='14px',
                font_family="Courrier",
                color="black",
                display="inline-block",
                background_color = "white",
                margin="0",
                margin_left= "2px",
                )
        
        ## Adding the ability to display the sequence when zooming in
        sequence = {
            'seq': str(self.rec.seq).upper() if self.show_seq else "",
            'bounds':self.bounds
        }

        self._div = Div(height=18, height_policy="fixed", 
                    width=600, width_policy="fixed",
                    styles = sty
                    )
        
        xcb = CustomJS(
            args={
                "x_range": self.gene_track.x_range,
                "sequence": sequence,
                "all_glyphs":self.patches.to_dict(orient="list"),
                "glyph_source": self._glyph_source,
                "div": self._div,
                "loaded_range":self._loaded_range,
            },
            code=x_range_change_callback_code
        )

        # The callback runs whenever the start of the x range changes
        self.gene_track.x_range.js_on_change('start', xcb)
        self.x_range=self.gene_track.x_range

        if self.show_seq:
            return [self.gene_track,self._div]
        else:
            return [self.gene_track]
        
    def _get_search_box(self):
        """Create the autocomplete search input. Must be called after `_get_browser`."""
        ## Create a text input widget for search
        # Completions are all the values found in the patch columns, except the drawing columns
        completions=set()
        for attr in self.patches.columns:
            if attr not in ["xs","ys","color","pos"]:
                completions.update(self.patches[attr])
        
        text_input = AutocompleteInput(completions=list(completions), value="")

        ## Adding BoxAnnotation to highlight search results
        search_span_source = ColumnDataSource({"x":[],"width":[]})#"y":[]
        h=Rect(x='x',y=-2,width='width',height=self.gene_track.height,fill_color='green',fill_alpha=0.2,line_alpha=0)
        self.gene_track.add_glyph(search_span_source, h)

        call_back_search = CustomJS(
            args={
                "x_range": self.x_range,
                "glyph_source": self._glyph_source,
                "bounds": self.bounds,
                "all_glyphs": self.patches.to_dict(orient="list"),
                "loaded_range": self._loaded_range,
                "text_input": text_input,
                "search_span_source": search_span_source,
                "div": self._div,
            },
            code=search_callback_code
        )

        text_input.js_on_change('value',call_back_search)#,xcb)

        return text_input
    
    def show(self):
        """Display the search box (if enabled), the gene track, the sequence and the additional tracks.

        Does nothing when no feature was parsed from the GFF file.
        """
        if len(self.features)>0:
            self.elements = self._get_browser(**self.kwargs)
            if self.search:
                self.elements = [self._get_search_box()]+self.elements
            show(column(self.elements + [t.fig for t in self.tracks]))



In [None]:
#| hide
from genomenotebook.javascript import get_js_code

In [None]:
#| hide
#Useful for javascript development as it is not automatically reimported:
#the two callback code names imported from genomenotebook.javascript are rebound
#here from the .js files.
x_range_change_callback_code = get_js_code("x_range_change_callback_code.js")
search_callback_code = get_js_code("search_callback_code.js")

In [None]:
from genomenotebook.data import get_example_data_dir
import os

In [None]:
# Paths to the example genome sequence (.fna) and annotation (.gff) files
data_path = get_example_data_dir()
genome_path = os.path.join(data_path, "GCA_000189435.3_ASM18943v3_genomic.fna")
gff_path = os.path.join(data_path, "GCA_000189435.3_ASM18943v3_genomic.gff")

# Browser with both sequence and annotations; the initial view is centered on position 10000
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, init_pos=10000)
g.show()


ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p1073', ...)}


In [None]:
#Providing the GFF file as the only input: without a fasta file the sequence is not
#shown and the sequence length is taken from the rightmost feature coordinate
g=GenomeBrowser(gff_path)
g.show()

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p1329', ...)}


In [None]:
#| hide
#testing on the human genome
from genomenotebook.utils import download_file

In [None]:
#| hide
# Fetch the RefSeq GRCh38 annotation next to the notebook.
# NOTE(review): the recorded output suggests the download is skipped when the file
# already exists - confirm in download_file.
file_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz'
human_genome_gff = 'GRCh38_latest_genomic.gff.gz'
download_file(file_url, human_genome_gff)

File already exists: GRCh38_latest_genomic.gff.gz


In [None]:
#| hide
# Parse the human annotation for sequence NC_000001.11 with bounds (10000, 50000),
# without filtering on feature types, and preview the first rows
df=parse_gff(human_genome_gff, 
             seq_id="NC_000001.11",
             bounds=(10000,50000))
df.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,rank,...,assembly_bases_aln,hsp_percent_coverage,regulatory_class,gene_biotype,Note,description,standard_name,left,right,middle
0,NC_000001.11,RefSeq,region,1,248956422,.,+,.,ID=NC_000001.11:1..248956422;Dbxref=taxon:9606...,,...,,,,,,,,1,248956422,124478211.5
1,NC_000001.11,BestRefSeq,pseudogene,11874,14409,.,+,.,"ID=gene-DDX11L1;Dbxref=GeneID:100287102,HGNC:H...",,...,,,,transcribed_pseudogene,,DEAD/H-box helicase 11 like 1 (pseudogene),,11874,14409,13141.5
2,NC_000001.11,BestRefSeq,transcript,11874,14409,.,+,.,ID=rna-NR_046018.2;Parent=gene-DDX11L1;Dbxref=...,,...,,,,,,,,11874,14409,13141.5
3,NC_000001.11,BestRefSeq,exon,11874,12227,.,+,.,ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;D...,,...,,,,,,,,11874,12227,12050.5
4,NC_000001.11,BestRefSeq,exon,12613,12721,.,+,.,ID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;D...,,...,,,,,,,,12613,12721,12667.0


In [None]:
#| hide
# Distinct feature types present in the parsed annotations
set(df.type)

{'biological_region',
 'enhancer',
 'exon',
 'gene',
 'lnc_RNA',
 'match',
 'miRNA',
 'primary_transcript',
 'pseudogene',
 'region',
 'transcript'}

In [None]:
#| hide
# Browse positions 1,000,000 to 2,000,000 of NC_000001.11 from the GFF file alone,
# displaying only the "gene" and "exon" feature types
g=GenomeBrowser(human_genome_gff, 
                seq_id="NC_000001.11",
                bounds=(1000000,2000000),
                feature_types=["gene","exon"],
                attributes=["type","gene","product","start","end"])
g.show()

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p1613', ...)}


In [None]:
#List available attributes
from genomenotebook.utils import available_attributes, available_feature_types

In [None]:
# Column names found for this GFF file (see the recorded output below)
available_attributes(gff_path)

Index(['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
       'attributes', 'product', 'ID', 'Dbxref', 'gene', 'locus_tag', 'Parent',
       'gbkey', 'genome', 'strain', 'gene_biotype', 'protein_id',
       'transl_table', 'Name', 'mol_type', 'Note', 'Is_circular', 'left',
       'right', 'middle'],
      dtype='object')

In [None]:
#Showing different attributes from the GFF file and using protein_id as the feature name
g=GenomeBrowser(gff_path, attributes=["locus_tag","protein_id",'gene','product'],feature_name="protein_id")
g.show()

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p1925', ...)}


In [None]:
#| hide
#Testing svg output: output_backend is not a named parameter of GenomeBrowser,
#so it is forwarded through **kwargs to create_genome_browser_plot
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, bounds=(2000,5000), output_backend="svg")
g.show()

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p2265', ...)}


In [None]:
#| hide
#Testing seq_id warning
#NOTE(review): "AF" is presumably absent from the example files; no output is recorded
#for this cell - confirm which warning (if any) is expected here
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, init_pos=10000, seq_id="AF")
g.show()



In [None]:
#| hide
#Testing out of bounds warning: init_pos=10000 lies outside bounds=(2000,5000), so a
#warning is expected and the initial position falls back to the middle of the bounds
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, init_pos=10000, bounds=(2000,5000))
g.show()

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p2633', ...)}


#### Visualising multiple contigs

In [None]:
import itertools

In [None]:
# Example files for a genome made of several contigs
genome_path = os.path.join(data_path, "jmh43.fna")
gff_path = os.path.join(data_path, "jmh43.gff")

# One browser per contig, for the first 5 records of the fasta file
for rec in itertools.islice(SeqIO.parse(genome_path,"fasta"),5):
    
    # seq_id selects the contig to display; the search bar is disabled
    g=GenomeBrowser(genome_path=genome_path, 
                    gff_path=gff_path, 
                    seq_id=rec.id,
                    feature_name="locus_tag",
                    search=False)
    g.show()

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p3029', ...)}


ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p3414', ...)}


ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p3822', ...)}


ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p4253', ...)}


ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : fill_alpha='alpha' [no close matches], fill_color='colors' [no close matches] {renderer: GlyphRenderer(id='p4707', ...)}


In [None]:
#| export
@patch
def highlight(self:GenomeBrowser,
              regions:list, #list of tuples with the format (start position, stop position)
              colors=None, #list of colors
              alpha=0.2, #transparency
             ):
    """Define the regions that are highlighted on the gene track the next time `show` is called.

    When `colors` is missing or empty every region is drawn in green.
    Raises a ValueError when `colors` does not contain exactly one color per region.
    """
    regions=list(regions)
    n_regions=len(regions)

    if colors is None or len(colors)==0:
        colors=['green']*n_regions
    elif len(colors)!=n_regions:
        raise ValueError(f"Expected one color per region: got {len(colors)} colors for {n_regions} regions")

    # zip(*regions) cannot be unpacked into starts and stops when there is no region
    if n_regions==0:
        self.highlight_regions={"x":[],"width":[],"colors":[],"alpha":[]}
        return

    starts, stops = map(np.array,zip(*regions))
    width=stops-starts
    # The Rect glyph drawing the highlights is positioned by its center, not its left edge
    centers=starts+width/2
    self.highlight_regions={"x":centers,"width":width,"colors":list(colors), "alpha":[alpha]*n_regions}

In [None]:
# Highlight two regions, the first one in green and the second one in red.
# highlight must be called before show, which reads the regions when building the plot.
g=GenomeBrowser(gff_path=gff_path, genome_path=genome_path, bounds=(0,10000))
g.highlight([(4000,5000),(6500,7000)], colors=["green","red"])
g.show()

In [None]:
#| export
from genomenotebook.track import Track

In [None]:
#| export
@patch
def add_track(self:GenomeBrowser,
             height:int = 200, #size of the track
             output_backend="webgl", #can be set to webgl (more efficient) or svg (for figure export)
             ) -> Track:
    """Create a Track linked to this GenomeBrowser, register it and return it.

    The new track shares the x_range of the browser and uses the same frame width,
    bounds and glyph loading range.
    """
    margin = self.max_glyph_loading_range
    new_track = Track(height=height, output_backend=output_backend)

    # Link the figure of the track to the browser
    new_track.fig.x_range = self.x_range
    new_track.fig.frame_width = self.frame_width
    new_track.bounds = self.bounds

    # Range of positions considered loaded: the current view extended by the margin on both sides
    loaded = {
        "start": [self.x_range.start-margin],
        "end": [self.x_range.end+margin],
        "range": [margin],
    }
    new_track.loaded_range = ColumnDataSource(loaded)
    new_track.max_glyph_loading_range = margin

    self.tracks.append(new_track)
    return new_track
    

In [None]:
#Empty track
data_path = get_example_data_dir()
genome_path = os.path.join(data_path, "MG1655_U00096.fasta")
gff_path = os.path.join(data_path, "MG1655_U00096.gff3")

# Browser restricted to the first 100000 positions, without search bar or sequence display
g=GenomeBrowser(genome_path=genome_path, gff_path=gff_path, bounds=(0,100000), search=False, show_seq=False)

# The track is added to g.tracks and its figure is displayed below the gene track by show
track = g.add_track(height=100)
g.show()

In [None]:
#| hide
# Run the nbdev export (cells marked with `#| export` go to the `#| default_exp` module)
import nbdev; nbdev.nbdev_export()