In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
def regions_overlap(region1, region2, min_overlap_fraction=0.0):
    """
    Report whether region2 overlaps region1 by at least a given fraction.

    Regions are (start, stop) tuples in GFF-style coordinates (1-based,
    inclusive). Coordinates within each region must be sorted low to high.

    Args:
        region1: (start, stop) of the reference region.
        region2: (start, stop) of the region whose covered fraction is measured.
        min_overlap_fraction: minimum fraction of region2 that has to lie
            inside region1. With the default of 0.0 any shared position is
            enough, including regions that only touch at one coordinate.

    Returns:
        True if the regions share at least one position and the shared length
        divided by the length of region2 is >= min_overlap_fraction,
        otherwise False. A fraction above 1 can never be reached, so such a
        threshold always yields False.
    """
    start1, stop1 = region1[0], region1[1]
    start2, stop2 = region2[0], region2[1]

    # Distance between the innermost boundaries; negative means the closed
    # intervals do not intersect at all.
    shared_span = min(stop1, stop2) - max(start1, start2)
    if shared_span < 0:
        return False

    # Both ends are inclusive, so a span from a to b covers b - a + 1 positions.
    # Taking the true intersection (rather than the distance between one end of
    # each region) keeps the fraction correct when one region contains the other.
    overlap_size = shared_span + 1
    region2_size = stop2 - start2 + 1

    return bool(overlap_size / region2_size >= min_overlap_fraction)

def add_z_order(features, prescedence):
    """
    Assign every feature a stacking level ("z_order") so overlapping features
    are drawn on different levels.

    Args:
        features: dataframe of features with at least the columns "start",
            "type", "left" and "right". It is sorted and modified in place.
        prescedence: list of feature types in order of precedence, e.g.
            ["CDS", "repeat_region", "ncRNA", "rRNA", "tRNA"] will put "CDS"
            features closer to the bottom of the plot than "repeat_region"
            features. Every type present in `features` must be listed,
            otherwise a KeyError is raised.

    Returns:
        The same `features` dataframe, sorted by type precedence and then by
        start, with a "z_order" column added.
    """
    type_order = {feature_type: rank for rank, feature_type in enumerate(prescedence)}

    features.sort_values(by="start", inplace=True)
    # The second sort has to be stable: the default quicksort may shuffle rows
    # of equal type and throw away the ordering by start established above.
    features.sort_values(by="type", inplace=True, kind="mergesort", key=lambda col: col.map(type_order))

    z_order = []
    placed = []  # (left, right, z, type rank) of every feature handled so far
    all_z = {0}  # every level in use; always the contiguous range 0..max
    for _, row in features.iterrows():
        left, right = row["left"], row["right"]
        rank = type_order[row["type"]]

        # Levels this feature may not use because of overlapping features.
        blocked = set()
        for placed_left, placed_right, placed_z, placed_rank in placed:
            if not regions_overlap((left, right), (placed_left, placed_right)):
                continue
            if rank > placed_rank:
                # A lower-precedence feature must sit above the overlapping
                # higher-precedence one, so its level and all below are taken.
                blocked.update(range(placed_z + 1))
            else:
                blocked.add(placed_z)

        if len(blocked) == len(all_z):
            # Every existing level is taken: open a new one on top.
            z = max(all_z) + 1
            all_z.add(z)
        else:
            z = min(all_z - blocked)

        z_order.append(z)
        placed.append((left, right, z, rank))

    features["z_order"] = z_order
    return features
    


# Examples



## Simple browser

Stack the records of a GenBank file in a single genome browser, one track per record. In this example the search bar and the sequence display are turned off.

In [None]:
#Using the example E. coli genome data from the package
import genomenotebook as gn
from genomenotebook import utils
import os
import pandas as pd
from Bio import SeqIO
from bokeh.io import output_notebook #|hide_line
from bokeh.plotting import show as bk_show #|hide_line
from bokeh.layouts import column, row #|hide_line
from bokeh.plotting import output_file, save #|hide_line
from collections import OrderedDict

#### Code from Domainator
def get_cds_unique_name(feature):
    """
        Return a unique name for a feature.

        An existing "cds_id" qualifier wins (its first value is returned).
        Otherwise the name is derived from the position of every location part
        on the contig, formatted as start_strand_end.
    """
    qualifiers = feature.qualifiers
    if "cds_id" in qualifiers:
        return qualifiers["cds_id"][0]

    part_names = []
    for part in feature.location.parts:
        # the strand is included to account for circular contigs
        fields = (part.stranded_start_human_readable, part.strand, part.stranded_end_human_readable)
        part_names.append("_".join(str(field) for field in fields))

    # joined with spaces so the name can be split into multiple lines when writing genbank files
    return " ".join(part_names)

def get_cds_name(feature):
    """
        Return a display name for a feature: the first value of its "gene_id"
        qualifier, else of its "locus_tag" qualifier, else the name produced
        by get_cds_unique_name.
    """
    qualifiers = feature.qualifiers
    for key in ("gene_id", "locus_tag"):
        if key in qualifiers:
            return qualifiers[key][0]
    return get_cds_unique_name(feature)
#### End code from Domainator


def _feature_attributes(feature, name_func):
    """Build the ordered attribute mapping shared by every location part of a feature."""
    attributes_list = [("ID", name_func(feature))]
    for key, value in feature.qualifiers.items():
        # "translation" is left out, and "ID" always comes from name_func
        if key in ("translation", "ID"):
            continue
        if len(value) == 1:
            attributes_list.append((key, value[0]))
        else:
            attributes_list.append((key, "; ".join(value)))
    return OrderedDict(attributes_list)


def _hide_x_ticks(fig):
    """Hide tick marks and tick labels of the x-axis of a track figure."""
    fig.xaxis.major_tick_line_color = None
    fig.xaxis.minor_tick_line_color = None
    fig.xaxis.major_label_text_font_size  = '0pt'


def plot_multi_genbank(recs, init_pos=None, feature_types=["CDS", "repeat_region", "ncRNA", "rRNA", "tRNA"], name_func=get_cds_name, show_labels=True):
    """
        Plot several sequence records as stacked tracks that share one x range.

        recs: iterator of SeqRecords; must yield at least one record
        init_pos: where to center the plot
        feature_types: feature types to plot; features of any other type are
            skipped. The list order is also used as the precedence passed to
            add_z_order.
        name_func: callable taking a feature and returning the value stored as
            its "ID" attribute
        show_labels: forwarded to GenomeBrowser

        returns the GenomeBrowser of the first record, with the gene track of
        every further record added to it as an extra track

        raises ValueError if recs yields no record
    """
    strand_dict = {1: "+", -1: "-"}
    columns = ["seq_id", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]

    # read genbank record(s)
    feature_dfs = [] # list of dataframes, one for each seq record
    names = []
    for rec in recs:
        names.append(rec.id)
        feature_rows = []
        for feature in rec.features:
            if feature.type not in feature_types:
                continue
            # The attributes depend only on the feature, so build them once
            # and hand every part its own copy.
            attributes = _feature_attributes(feature, name_func)
            for part in feature.location.parts:
                # part.start + 1: the table uses 1-based inclusive starts
                feature_rows.append([rec.id, 'Genbank', feature.type, part.start+1, part.end, '.', strand_dict.get(part.strand, "."), ".", OrderedDict(attributes)])

        feature_df = utils.set_positions(pd.DataFrame(feature_rows, columns=columns))
        add_z_order(feature_df, feature_types)  # adds the z_order column in place
        feature_dfs.append(feature_df)

    if not feature_dfs:
        raise ValueError("recs must contain at least one SeqRecord")

    browsers = []
    for feature_df in feature_dfs:
        browser = gn.GenomeBrowser(features=feature_df, init_pos=init_pos, width=1000, show_seq=False, search=False, attributes=None, feature_types=feature_types, color_attribute="Color", label_angle=0, show_labels=show_labels, feature_height=0.15, label_vertical_offset=-0.15, label_justify="left", label_horizontal_offset = 5, glyphs=gn.get_default_glyphs(arrow_colors=("Cyan",), box_colors=("Cyan",)), feature_name={"CDS":"gene_id","Domainator":"name"})
        browser._get_browser_elements()
        browsers.append(browser)

    main_browser = browsers[0]
    last_index = len(browsers) - 1

    main_browser.gene_track.xaxis.axis_label = names[0]
    if last_index > 0:
        # only the bottom-most track keeps its tick marks and tick labels
        _hide_x_ticks(main_browser.gene_track)

    for i, browser in enumerate(browsers[1:], start=1):
        track = main_browser.add_track()
        track.fig = browser.gene_track
        # label the x-axis only, like the first track (fig.axis addresses every axis of the figure)
        track.fig.xaxis.axis_label = names[i]
        track.fig.x_range = main_browser.x_range  # pan and zoom together with the first track
        if i < last_index:
            _hide_x_ticks(track.fig)

    return main_browser


output_notebook(hide_banner=True) #|hide_line
data_path = gn.get_example_data_dir()

# Parse the example GenBank file from the example data directory and stack its
# records, plotting only CDS and Domainator features, without labels.
g = plot_multi_genbank(
    SeqIO.parse(os.path.join(data_path, "colored_genbank.gb"), "genbank"),
    init_pos=50000,
    feature_types=["CDS", "Domainator"],  # another candidate type: "Domain_Search"
    show_labels=False,
)
g.show()
#g.save_html("test2.html", title="stacked_contigs")


