# Utils

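> Helper functions for loading GFF3 annotations, extracting gene attributes and building the Bokeh glyphs of the gene annotation track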

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp utils

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import warnings
from typing import Optional

import gffpandas.gffpandas as gffpd
import numpy as np

from bokeh.plotting import figure
from bokeh.models.tools import BoxZoomTool
from bokeh.models import HoverTool, NumeralTickFormatter, LabelSet
from bokeh.models.glyphs import Patches
from bokeh.models import (
    CustomJS,
    Range1d,
    ColumnDataSource,
)

In [None]:
#| export
#| hide
def create_genome_browser_plot(glyphSource, x_range, **kwargs):
    """Build the Bokeh figure that displays the gene annotation track.

    `glyphSource` is the data source holding the patch columns (`xs`, `ys`,
    `color`), the label columns (`pos`, `names`) and the hover columns
    (`locus_tag`, `gene`, `product`). `x_range` is used as the x range of the
    figure.

    Optional keyword arguments (with their defaults):
    `plot_height` (150), `label_angle` (45), `text_font_size` ("10pt") and
    `output_backend` ("webgl").

    Returns the configured figure.
    """
    height = kwargs.get("plot_height", 150)
    angle = kwargs.get("label_angle", 45)
    font_size = kwargs.get("text_font_size", "10pt")
    backend = kwargs.get("output_backend", "webgl")

    y_low, y_high = get_y_range()
    p_annot = figure(
        tools="xwheel_zoom,xpan,save",
        active_scroll="xwheel_zoom",
        height=height,
        x_range=x_range,
        y_range=Range1d(y_low, y_high),
        output_backend=backend,
    )
    # Box zoom restricted to the horizontal dimension
    p_annot.add_tools(BoxZoomTool(dimensions="width"))

    # Thousands separators on the x axis; no grid and no y axis
    p_annot.xaxis[0].formatter = NumeralTickFormatter(format="0,0")
    p_annot.xgrid.visible = False
    p_annot.ygrid.visible = False
    p_annot.yaxis.visible = False

    # One patch per feature, filled with the colour stored in the source
    gene_patches = Patches(xs="xs", ys="ys", fill_color="color")
    patch_renderer = p_annot.add_glyph(glyphSource, gene_patches)

    # Zero-size markers at the label positions
    # (this seems to be necessary to show the labels)
    p_annot.scatter(x="pos", y=0, size=0, source=glyphSource)

    # Gene labels of the annotation track
    gene_labels = LabelSet(
        x="pos",
        y=-0.9,
        text="names",
        level="glyph",
        angle=angle,
        text_font_size=font_size,
        x_offset=-5,
        y_offset=0,
        source=glyphSource,
        text_align='left',
    )
    p_annot.add_layout(gene_labels)

    # Tooltips are attached to the gene patches only
    hover_fields = [
        ("locus_tag", "@locus_tag"),
        ("gene", "@gene"),
        ("product", "@product"),
    ]
    p_annot.add_tools(HoverTool(renderers=[patch_renderer], tooltips=hover_fields))
    return p_annot

In [None]:
#| export
def get_genome_annotations(gff_path: str, # path to the GFF3 file to read
                           seq_id: str=None, # if given, keep only the features with this seq_id
                           bounds=None, # if given, (lower, upper) pair: keep only features overlapping it
                          ):
    """Reads a GFF3 file into a DataFrame and adds `left` and `right` columns.

    `left` and `right` are copies of the `start` and `end` columns.
    A warning is emitted when filtering on `seq_id` leaves no feature.
    """
    annotation = gffpd.read_gff3(gff_path).df
    if seq_id:
        annotation = annotation.loc[annotation.seq_id == seq_id]
        if annotation.empty:
            # stacklevel=2 makes the warning point at the caller's line
            warnings.warn(
                "The annotation DataFrame is empty. Check that the fasta and gff files have the same sequence id",
                stacklevel=2,
            )

    if bounds:
        lower, upper = bounds[0], bounds[1]
        # Keep every feature that overlaps the open interval (lower, upper)
        annotation = annotation.loc[(annotation.start < upper) & (annotation.end > lower)]

    # The filters above may return a slice of the parsed frame; adding columns
    # to an explicit copy avoids pandas' chained-assignment warning.
    annotation = annotation.copy()
    annotation["left"] = annotation["start"].to_numpy()
    annotation["right"] = annotation["end"].to_numpy()
    return annotation

In [None]:
#| export
from genomenotebook.js_callback_code import get_example_data_dir
import os

In [None]:
# Load the MG1655 GFF3 file from the example data directory, keeping only the
# features of sequence U00096.3, and preview the first rows.
data_path = get_example_data_dir()
gff_path = os.path.join(data_path, "MG1655_U00096.gff3")
annotation=get_genome_annotations(gff_path, "U00096.3")
annotation.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,left,right
0,U00096.3,Genbank,region,1,4641652,.,+,.,ID=U00096.3:1..4641652;Dbxref=taxon:511145;Is_...,1,4641652
1,U00096.3,Genbank,gene,190,255,.,+,.,"ID=gene-b0001;Dbxref=ASAP:ABE-0000006,ECOCYC:E...",190,255
2,U00096.3,Genbank,CDS,190,255,.,+,0,ID=cds-AAC73112.1;Parent=gene-b0001;Dbxref=Uni...,190,255
3,U00096.3,Genbank,gene,337,2799,.,+,.,"ID=gene-b0002;Dbxref=ASAP:ABE-0000008,ECOCYC:E...",337,2799
4,U00096.3,Genbank,CDS,337,2799,.,+,0,ID=cds-AAC73113.1;Parent=gene-b0002;Dbxref=Uni...,337,2799


In [None]:
#| hide
# A seq_id that is absent from the GFF file: this call is expected to emit the
# "annotation DataFrame is empty" warning. The result is stored under its own
# name so that `annotation` is not replaced by an empty DataFrame.
empty_annotation=get_genome_annotations(gff_path, "U00097.3") #mistake in seq_id



In [None]:
#| export
import re

In [None]:
#| export
def extract_attribute(input_str:str, #attribute string to parse
                      attr_name:str, #name of the attribute to extract
                     ) -> Optional[str]: #value of the attribute, or None if it is absent
    """Extracts the attribute called attr_name from the GFF attributes string.

    The first letter of `attr_name` is matched case-insensitively (so "gene"
    also finds "Gene=..."); the rest of the name must match exactly. The name
    must be a whole attribute key: it has to start the string or follow a ";",
    so asking for "gene" does not pick up "pseudogene=...".
    The value is everything after "=" up to the next ";".

    Raises IndexError if `attr_name` is empty.
    """
    first, rest = attr_name[0], attr_name[1:]
    # The name is escaped so that regex metacharacters in it are taken literally.
    pattern = (
        r"(?:^|;)\s*"
        f"[{re.escape(first.lower())}{re.escape(first.upper())}]"
        f"{re.escape(rest)}=([^;]+)"
    )
    match = re.search(pattern, input_str)
    return match.group(1) if match else None

In [None]:
#| hide
# Extract the value of the "gene" attribute from a typical CDS attributes string
input_str = 'ID=cds-ATV02827.1;Parent=gene-SaO11_00001;Dbxref=NCBI_GP:ATV02827.1;Name=ATV02827.1;gbkey=CDS;gene=dnaA;locus_tag=SaO11_00001;product=Chromosomal replication initiator protein DnaA;protein_id=ATV02827.1;transl_table=11'
extract_attribute(input_str,"gene")

'dnaA'

In [None]:
# Keep the gene-like feature types of `annotation`, then extract the
# protein_id attribute of each row (None where the attribute is missing)
genes = annotation[annotation.type.isin(["CDS", "repeat_region", "ncRNA", "rRNA", "tRNA"])    ]
genes.attributes.apply(extract_attribute,attr_name='protein_id')

Series([], Name: attributes, dtype: object)

In [None]:
#| hide
# An attribute that is absent from the string must yield None
input_str = 'locus_tag=SaO11_00001;product=Chromosomal replication initiator protein DnaA;protein_id=ATV02827.1;transl_table=11'
assert extract_attribute(input_str,"gene") is None

In [None]:
#| export
def get_genes_from_annotation(annotation):
    """Selects the gene-like features and adds the columns used for plotting.

    Only features of type CDS, repeat_region, ncRNA, rRNA and tRNA are kept.
    On the "+" strand `start`/`end` are set to `left`/`right`; on the "-"
    strand they are swapped (`start` = `right`, `end` = `left`).
    The `gene`, `locus_tag` and `product` attributes are extracted as columns,
    `gene_or_locus` holds the gene name or, when it is missing, the locus tag,
    and repeat regions get "REP" as their `gene`.
    """
    kept_types = ["CDS", "repeat_region", "ncRNA", "rRNA", "tRNA"]
    genes = annotation[annotation.type.isin(kept_types)].copy()

    forward = genes["strand"] == "+"
    reverse = genes["strand"] == "-"

    # Forward strand: start -> end runs left -> right
    genes.loc[forward, "start"] = genes.loc[forward, "left"].values
    genes.loc[forward, "end"] = genes.loc[forward, "right"].values

    # Reverse strand: start -> end runs right -> left
    genes.loc[reverse, "start"] = genes.loc[reverse, "right"].values
    genes.loc[reverse, "end"] = genes.loc[reverse, "left"].values

    for attr_name in ("gene", "locus_tag"):
        genes[attr_name] = genes.attributes.apply(extract_attribute, attr_name=attr_name)
    genes["gene_or_locus"] = genes["gene"].fillna(genes["locus_tag"])
    genes["product"] = genes.attributes.apply(extract_attribute, attr_name="product")

    is_repeat = genes["type"] == "repeat_region"
    genes.loc[is_repeat, "gene"] = "REP"

    return genes

In [None]:
# Load a second example GFF file without any seq_id filter and derive the
# table of gene-like features from it
gff_path = os.path.join(data_path, "GCA_000189435.3_ASM18943v3_genomic.gff")
annotation=get_genome_annotations(gff_path)
genes=get_genes_from_annotation(annotation)
genes.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,left,right,gene,locus_tag,gene_or_locus,product
2,CP024649.1,Genbank,CDS,517,1878,.,+,0,ID=cds-ATV02827.1;Parent=gene-SaO11_00001;Dbxr...,517,1878,dnaA,SaO11_00001,dnaA,Chromosomal replication initiator protein DnaA
4,CP024649.1,Genbank,CDS,2158,3291,.,+,0,ID=cds-ATV02828.1;Parent=gene-SaO11_00002;Dbxr...,2158,3291,dnaN,SaO11_00002,dnaN,DNA polymerase III%2C beta chain
6,CP024649.1,Genbank,CDS,3681,3917,.,+,0,ID=cds-ATV02829.1;Parent=gene-SaO11_00003;Dbxr...,3681,3917,,SaO11_00003,SaO11_00003,RNA-binding protein
8,CP024649.1,Genbank,CDS,3968,5026,.,+,0,ID=cds-ATV02830.1;Parent=gene-SaO11_00004;Dbxr...,3968,5026,recF,SaO11_00004,recF,DNA replication and repair protein RecF
10,CP024649.1,Genbank,CDS,5036,6970,.,+,0,ID=cds-ATV02831.1;Parent=gene-SaO11_00005;Dbxr...,5036,6970,gyrB,SaO11_00005,gyrB,DNA gyrase subunit B


In [None]:
#| export
#| hide
# (y_min, y_max) of the genome browser plot
Y_RANGE = (-2, 2)


def get_y_range() -> tuple:
    """Return the module-level `Y_RANGE` tuple, i.e. the (y_min, y_max) pair
    used as the y range of the genome browser plot."""
    return Y_RANGE


def get_all_glyphs(genes,bounds:tuple):
    """Return the glyph columns of the features within `bounds`, sorted by position.

    `bounds` is a (left, right) pair passed on to `get_gene_patches`. The
    result is a dict with the same keys as the one returned by
    `get_gene_patches`; every column is reordered the same way, by increasing
    first x coordinate of each patch (`xs[i][0]`). Ties keep their original
    relative order.

    When no feature lies within `bounds`, every key is still present and maps
    to an empty list, so the columns expected by the plot are always defined.
    """
    all_glyphs = get_gene_patches(genes, bounds[0], bounds[1])

    # Sorting row indices (rather than zipping and unzipping the columns) keeps
    # all keys in the result even when there are no rows at all.
    first_x = [patch_xs[0] for patch_xs in all_glyphs["xs"]]
    order = sorted(range(len(first_x)), key=first_x.__getitem__)

    return {k: [column[i] for i in order] for k, column in all_glyphs.items()}

In [None]:
#| export
#| hide
def rect_patch(genes_region):
    """Build the glyph columns that draw each row of `genes_region` as a grey rectangle.

    Each rectangle spans `start`..`end` horizontally and `gene_y_range`
    vertically. Returns a dict of equally long lists: `xs`/`ys` (corner
    coordinates as arrays), `pos` (midpoint between `left` and `right`),
    `names` (empty strings), `gene`, `locus_tag`, `hover_names` (the gene
    names), `product` and `color`.
    """
    bottom, top = gene_y_range
    n_features = genes_region.shape[0]
    starts = genes_region.start.values
    ends = genes_region.end.values

    # Corners in drawing order: bottom-start, top-start, top-end, bottom-end
    corner_xs = [np.array(corners) for corners in zip(starts, starts, ends, ends)]
    corner_ys = [np.array([bottom, top, top, bottom]) for _ in range(n_features)]

    midpoints = genes_region.left + (genes_region.right - genes_region.left) / 2

    return {
        "xs": corner_xs,
        "ys": corner_ys,
        "pos": list(midpoints.values),
        "names": [""] * n_features,
        "gene": list(genes_region.gene.values),
        "locus_tag": list(genes_region.locus_tag.values),
        "hover_names": list(genes_region.gene.values),
        "product": list(genes_region["product"].values),
        "color": ["grey"] * n_features,
    }

In [None]:
#| export
#| hide
def arrow_patch(genes_region):
    """Build the arrow glyph columns for the "+" and "-" strand rows of `genes_region`.

    The two strands are processed separately by `get_arrow_patch`; in every
    column the "+" strand entries come first, followed by the "-" strand ones.
    """
    on_plus = genes_region["strand"] == "+"
    forward = get_arrow_patch(genes_region[on_plus], "+")
    on_minus = genes_region["strand"] == "-"
    reverse = get_arrow_patch(genes_region[on_minus], "-")
    return {column: forward[column] + reverse[column] for column in forward}

In [None]:
#| export
#| hide
# (y_min, y_max) of the gene glyphs in the annotation track
gene_y_range = (-1.5, -1)

def get_arrow_patch(genes_region, ori="+"):
    """Build the glyph columns that draw each row of `genes_region` as an arrow.

    Each arrow is a five-point patch: a body from `start` to the "neck" and a
    tip at `end`. The neck lies 100 units before `end` in the direction of the
    arrow, but never beyond `start`, so short features become a plain triangle.
    `ori` is "+" (arrow towards larger x, orange) or "-" (arrow towards smaller
    x, purple); for "-" the neck computation expects `start` >= `end`.

    Returns a dict of equally long lists: `xs`, `ys`, `pos` (midpoint between
    `left` and `right`), `names` and `hover_names` (both `gene_or_locus`),
    `gene`, `locus_tag`, `product` and `color`.

    Raises ValueError if `ori` is neither "+" nor "-".
    """
    if ori not in ("+", "-"):
        raise ValueError(f"ori must be '+' or '-', got {ori!r}")

    y_min, y_max = gene_y_range
    n_genes = genes_region.shape[0]
    starts = genes_region.start.values
    ends = genes_region.end.values

    if ori == "+":
        neck = np.maximum(starts, ends - 100)
        color = ["orange"] * n_genes
    else:
        neck = np.minimum(starts, ends + 100)
        color = ["purple"] * n_genes

    # Vertex order: base bottom, base top, neck top, tip, neck bottom
    xs = list(zip(starts, starts, neck, ends, neck))
    ys = [
        np.array([y_min, y_max, y_max, (y_max + y_min) / 2, y_min])
        for _ in range(n_genes)
    ]
    genes_mid = (genes_region.right + genes_region.left) / 2
    return dict(
        xs=xs,
        ys=ys,
        pos=list(genes_mid.values),
        names=list(genes_region.gene_or_locus.values),
        gene=list(genes_region.gene.values),
        locus_tag=list(genes_region.locus_tag.values),
        hover_names=list(genes_region.gene_or_locus.values),
        product=list(genes_region["product"].values),
        color=color,
    )

In [None]:
#| export
#| hide
def get_gene_patches(genes, left, right):
    """Build the glyph columns of every feature overlapping the window (`left`, `right`).

    Features whose type is not "repeat_region" are drawn with `arrow_patch`,
    repeat regions with `rect_patch`. In every column the arrow entries come
    first, followed by the rectangle entries.
    """
    in_window = (genes["right"] > left) & (genes["left"] < right)
    is_repeat = genes["type"] == "repeat_region"

    arrows = arrow_patch(genes[in_window & ~is_repeat])
    rectangles = rect_patch(genes[in_window & is_repeat])

    # Concatenate the two sets of patches column by column
    return {column: arrows[column] + rectangles[column] for column in arrows}

In [None]:
#| hide
# Glyph columns of the features overlapping positions 0 to 5000
get_gene_patches(genes,0,5000)

{'xs': [(517, 517, 1778, 1878, 1778),
  (2158, 2158, 3191, 3291, 3191),
  (3681, 3681, 3817, 3917, 3817),
  (3968, 3968, 4926, 5026, 4926)],
 'ys': [array([-1.5 , -1.  , -1.  , -1.25, -1.5 ]),
  array([-1.5 , -1.  , -1.  , -1.25, -1.5 ]),
  array([-1.5 , -1.  , -1.  , -1.25, -1.5 ]),
  array([-1.5 , -1.  , -1.  , -1.25, -1.5 ])],
 'pos': [1197.5, 2724.5, 3799.0, 4497.0],
 'names': ['dnaA', 'dnaN', 'SaO11_00003', 'recF'],
 'gene': ['dnaA', 'dnaN', None, 'recF'],
 'locus_tag': ['SaO11_00001', 'SaO11_00002', 'SaO11_00003', 'SaO11_00004'],
 'hover_names': ['dnaA', 'dnaN', 'SaO11_00003', 'recF'],
 'product': ['Chromosomal replication initiator protein DnaA',
  'DNA polymerase III%2C beta chain',
  'RNA-binding protein',
  'DNA replication and repair protein RecF'],
 'color': ['orange', 'orange', 'orange', 'orange']}

In [None]:
#| hide
# Run nbdev's export step (presumably writes the cells marked `#| export` to the library module)
import nbdev; nbdev.nbdev_export()