# utils

> This contains useful functions

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp utils

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
from bokeh.io import output_notebook #|hide_line
output_notebook(hide_banner=True) #|hide_line

In [None]:
#| export
import numpy as np
import pandas as pd
import io

from collections import defaultdict, OrderedDict
import warnings
import gzip
import urllib.request
import os
import re
from platform import uname

from Bio import SeqIO
from Bio.Seq import Seq

from typing import List, Optional, Dict, Tuple
from IPython.display import display, HTML



In [None]:
#| export
def download_file(url, save_path):
    """Fetch `url` into `save_path`, skipping the download when `save_path` already exists.

    A message describing which of the two happened is printed. Returns None.
    """
    if not os.path.exists(save_path):
        urllib.request.urlretrieve(url, save_path)
        print(f"File downloaded and saved: {save_path}")
        return
    print(f"File already exists: {save_path}")

In [None]:
#| export
def is_gzipped_file(file_path):
    """Return True if `file_path` is a readable gzip file, False otherwise.

    The two-byte gzip magic number is checked first, so an empty file is not
    reported as gzipped (gzip.open() happily reads zero bytes from it).
    A file that cannot be opened, or whose gzip stream is invalid or truncated,
    yields False instead of an exception.
    """
    try:
        with open(file_path, 'rb') as raw:
            if raw.read(2) != b"\x1f\x8b":
                return False
        with gzip.open(file_path, 'rb') as f:
            # Attempt to read a small chunk to validate the header and stream
            f.read(1)
        return True
    except (OSError, EOFError):
        # OSError covers missing files and gzip.BadGzipFile; EOFError is raised for truncated streams
        return False

In [None]:
#| export
def default_open_gz(gff_path):
    """Open `gff_path` in text mode, using `gzip.open` when the file is gzipped and the builtin `open` otherwise.

    The caller is responsible for closing the returned file object.
    """
    opener, mode = (gzip.open, 'rt') if is_gzipped_file(gff_path) else (open, 'r')
    return opener(gff_path, mode)

In [None]:
# Example usage
file_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz'
human_genome_gff = 'GRCh38_latest_genomic.gff.gz'

download_file(file_url, human_genome_gff)
is_gzipped_file(human_genome_gff)

File already exists: GRCh38_latest_genomic.gff.gz


True

In [None]:
#| export
def extract_attribute(input_str:str, #attribute string to parse
                      attr_name:str, #name of the attribute to extract
                     ) -> Optional[str]:
    """Extracts the attribute called attr_name from the GFF attributes string.

    The first letter of attr_name is matched case-insensitively (e.g. "dbxref"
    finds "Dbxref"); the rest of the name is matched exactly.
    Returns the attribute value, or None when the attribute is absent.
    """
    if not attr_name:
        return None

    first, rest = attr_name[0], attr_name[1:]
    # The key must start the string or follow a ";" so that e.g. "gene" does not
    # match inside "pseudogene=". re.escape keeps names with regex metacharacters
    # (or characters not allowed in group names, like "-") from breaking the pattern.
    pattern = rf"(?:^|;)\s*[{re.escape(first.lower())}{re.escape(first.upper())}]{re.escape(rest)}=([^;]+)"
    match = re.search(pattern, input_str)
    if match:
        return match.group(1)
    return None

In [None]:
input_str = 'ID=cds-ATV02827.1;Parent=gene-SaO11_00001;Dbxref=NCBI_GP:ATV02827.1;Name=ATV02827.1;gbkey=CDS;gene=dnaA;locus_tag=SaO11_00001;product=Chromosomal replication initiator protein DnaA;protein_id=ATV02827.1;transl_table=11'
extract_attribute(input_str,"gene")

'dnaA'

In [None]:
#| hide
input_str = 'locus_tag=SaO11_00001;product=Chromosomal replication initiator protein DnaA;protein_id=ATV02827.1;transl_table=11'
assert extract_attribute(input_str,"gene") == None

In [None]:
#| export
def extract_all_attributes(input_str:str)->OrderedDict: #TODO: why is this not limited by the attributes subset provided to GenomeBrowser?
    """Extracts all attributes from the GFF attributes column.

    Returns an OrderedDict mapping each `key=value` pair found in `input_str`
    (values run up to the next ";") in order of appearance. If a key occurs
    more than once the last value wins.
    """
    # Raw string so that "\w" reaches the regex engine instead of being parsed
    # as an (invalid) string escape sequence.
    pattern = r"(?P<key>\w+[-\w]*)=(?P<value>[^;]+)"
    return OrderedDict(re.findall(pattern, input_str))

In [None]:
#| export
def extract_attributes(input_str:str, #the attributes string (9th column) of a GFF line
                       attributes: Optional[List[str]] = None #an optional list of attribute names to extract. If None all attributes are extracted.
                       )->OrderedDict: 
    """Extracts attributes from the GFF attributes column.

    Returns an OrderedDict of the `key=value` pairs found in `input_str`, in
    order of appearance, restricted to the keys listed in `attributes` when it
    is not None.
    """
    # Raw string so that "\w" reaches the regex engine instead of being parsed
    # as an (invalid) string escape sequence.
    pattern = r"(?P<key>\w+[-\w]*)=(?P<value>[^;]+)"
    match = re.findall(pattern, input_str)
    if attributes is not None:
        match = [m for m in match if m[0] in attributes]
    return OrderedDict(match)

In [None]:
#| export
def get_attributes(df: pd.DataFrame, #a features DataFrame with at least a "type" column and an "attributes_str" column
                   attributes: Optional[Dict[str, List]] = None # a dictionary with feature types as keys and a list of attributes to extract as values 
                   ) -> List:
    """Builds one attribute dictionary per row of df.

    For a row whose feature type is a key of `attributes`, only the attributes listed
    for that type are extracted; for any other row (or when `attributes` is None)
    every attribute is extracted.
    """
    def _wanted(feature_type):
        # None means "no restriction" for extract_attributes
        if attributes is None or feature_type not in attributes:
            return None
        return attributes[feature_type]

    return [
        extract_attributes(attr_str, _wanted(feature_type))
        for feature_type, attr_str in zip(df["type"], df["attributes_str"])
    ]

In [None]:
#| export
def attributes_to_columns(features: pd.DataFrame):
    """Expands the GFF attribute strings of `features.attributes` into one column per attribute key.

    Returns a copy of `features` with a column added (or overwritten, if a column
    with that name already exists) for every attribute key found in any row.
    Rows that lack a given attribute get an empty string in that column.
    """
    attr_dicts=features.attributes.apply(extract_all_attributes)
    # dict.fromkeys keeps the first-seen order, so the new columns come out in a
    # deterministic order (a set would not guarantee that).
    all_keys=list(dict.fromkeys(k for d in attr_dicts for k in d.keys()))

    features=features.copy()
    for k in all_keys:
        # Missing attributes are filled with "" here; the previous fillna("") call
        # discarded its result and therefore had no effect. Only the attribute
        # columns are filled so that other columns keep their dtype.
        features[k]=[d.get(k,"") for d in attr_dicts]

    return features
    

In [None]:
#| export
def set_positions(annotation: pd.DataFrame, # an annotation DataFrame extracted from a gff file
                            ) ->  pd.DataFrame:
    """Returns a copy of `annotation` with `left`, `right` and `middle` columns added.

    `left` and `right` are the incoming `start` and `end` values. For features on the
    "-" strand `start` and `end` are swapped (start becomes `right`, end becomes `left`);
    all other rows keep their `start` and `end`. `middle` is the mean of `left` and `right`.
    """
    annotation = annotation.copy()

    left = annotation["start"].to_numpy()
    right = annotation["end"].to_numpy()
    on_minus_strand = (annotation["strand"] == "-").to_numpy()

    annotation["left"] = left
    annotation["right"] = right

    # On the minus strand the feature begins at its right-most coordinate
    annotation["start"] = np.where(on_minus_strand, right, left)
    annotation["end"] = np.where(on_minus_strand, left, right)

    annotation["middle"] = (annotation["right"] + annotation["left"]) / 2

    return annotation

In [None]:
#| export
#| hide
class EmptyDataFrame(Exception):
    """Raised by the annotation parsers when no annotation DataFrame could be produced."""

In [None]:
#| export
def parse_gff(gff_path:str, # path to the gff file
              seq_id: Optional[str] = None, # sequence id (first column of the gff), if not None, then return only the annotations for the seq_id with this name
              first: bool = True, # if True then return only the annotations for the first sequence (or the first with seq_id)
              bounds: Optional[tuple] = None, # (left limit, right limit)
              feature_types: Optional[list] = None, # list of feature types to extract
              attributes: Optional[Dict[str, List]] = None, # a dictionary with feature types as keys and a list of attributes to extract as values 
             )->List[pd.DataFrame]:
    """ Parses a GFF3 file and returns a list of Pandas DataFrames, one per contig (seq_id) kept.

    If seq_id is given, only the annotations of that contig are returned.
    If seq_id is None and first is True, only the first contig of the file is parsed.
    If seq_id is None and first is False, every contig that has at least one retained feature is returned.
    If feature_types is None then all feature types are extracted.
    If bounds is given, only features with start < bounds[1] and end > bounds[0] are kept.

    Comment lines, blank lines and everything from the "##FASTA" directive onwards are ignored.
    Raises EmptyDataFrame when no feature passes the filters.
    """

    if attributes is None:
        attributes = {}

    gff_columns = ["seq_id", "source","type","start","end","score","strand","phase","attributes_str"]

    def _slurp_buffer(file_buffer):
        """Turns the buffered GFF lines of one contig into an annotation DataFrame."""
        # Reset the file pointer to the beginning of the file buffer
        file_buffer.seek(0)
        df=pd.read_csv(file_buffer,sep="\t",header=None)
        df.columns=gff_columns
        df["attributes"] = get_attributes(df, attributes)
        df=df.drop(columns=["attributes_str"])
        return set_positions(df)

    out = list()

    # Only one DataFrame is wanted when `first` is set or a specific seq_id was requested
    single_sequence = first or seq_id is not None
    # seq_id whose lines are retained; None means "retain every contig"
    target_id = seq_id

    #NOTE: This assumes that all lines for a given seq_id are consecutive, which is generally the case for gff files.
    with default_open_gz(gff_path) as gff_file:
        # Create an in-memory file buffer using the io.StringIO class
        file_buffer = io.StringIO()
        buffer_empty = True
        last_seq_id = None
        for line in gff_file:
            if line.startswith("##FASTA"):
                # Everything after this directive is sequence data, not annotations
                break
            if line.startswith("#") or not line.strip():
                # Comments/directives and blank lines must not be mistaken for a new contig
                continue

            r=line.split('\t')
            current_line_seqid = r[0]

            if last_seq_id is not None and current_line_seqid != last_seq_id and not buffer_empty:
                # Seeing a new segment of the gff: flush what was collected for the previous contig
                out.append(_slurp_buffer(file_buffer))
                file_buffer = io.StringIO()
                buffer_empty = True
                if single_sequence:
                    break
            last_seq_id = current_line_seqid

            if target_id is None and first:
                # No seq_id requested: restrict parsing to the first contig of the file
                target_id = current_line_seqid

            if target_id is not None and current_line_seqid != target_id:
                continue
            if feature_types is not None and r[2] not in feature_types:
                continue
            if bounds is not None and not (int(r[3])<bounds[1] and int(r[4])>bounds[0]):
                continue

            # Write each retained line to the file buffer
            file_buffer.write(line)
            buffer_empty=False

        if not buffer_empty:
            out.append(_slurp_buffer(file_buffer))
    
    if len(out) == 0:
        raise EmptyDataFrame("The annotation DataFrame is empty. Check that the feature_types and seq_id are correct, and that bounds (if specified) fall within the size of your genome.")
    return out

In [None]:
from genomenotebook.data import get_example_data_dir
import os

In [None]:
data_path = get_example_data_dir()
gff_path = os.path.join(data_path, "jmh43.gff")
df=parse_gff(gff_path, 
             bounds=(10000,50000))[0]
df.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,left,right,middle
0,NZ_JAGURL010000100.1,RefSeq,region,1,16949,.,+,.,"{'ID': 'NZ_JAGURL010000100.1:1..16949', 'Dbxre...",1,16949,8475.0
1,NZ_JAGURL010000100.1,RefSeq,gene,10059,8311,.,-,.,"{'ID': 'gene-KFX61_RS20985', 'Name': 'KFX61_RS...",8311,10059,9185.0
2,NZ_JAGURL010000100.1,Protein Homology,CDS,10059,8311,.,-,0,"{'ID': 'cds-WP_225638104.1', 'Parent': 'gene-K...",8311,10059,9185.0
3,NZ_JAGURL010000100.1,RefSeq,gene,10541,10092,.,-,.,"{'ID': 'gene-KFX61_RS20990', 'Name': 'KFX61_RS...",10092,10541,10316.5
4,NZ_JAGURL010000100.1,Protein Homology,CDS,10541,10092,.,-,0,"{'ID': 'cds-WP_016268480.1', 'Parent': 'gene-K...",10092,10541,10316.5


In [None]:
df=parse_gff(gff_path, 
             seq_id="NZ_JAGURL010000013.1",
             bounds=(10000,50000))[0]
df.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,left,right,middle
0,NZ_JAGURL010000013.1,RefSeq,region,1,98838,.,+,.,"{'ID': 'NZ_JAGURL010000013.1:1..98838', 'Dbxre...",1,98838,49419.5
1,NZ_JAGURL010000013.1,RefSeq,gene,10195,11175,.,+,.,"{'ID': 'gene-KFX61_RS05880', 'Name': 'KFX61_RS...",10195,11175,10685.0
2,NZ_JAGURL010000013.1,Protein Homology,CDS,10195,11175,.,+,0,"{'ID': 'cds-WP_048697200.1', 'Parent': 'gene-K...",10195,11175,10685.0
3,NZ_JAGURL010000013.1,RefSeq,gene,11234,11824,.,+,.,"{'ID': 'gene-KFX61_RS05885', 'Name': 'KFX61_RS...",11234,11824,11529.0
4,NZ_JAGURL010000013.1,Protein Homology,CDS,11234,11824,.,+,0,"{'ID': 'cds-WP_048697198.1', 'Parent': 'gene-K...",11234,11824,11529.0


In [None]:
#| hide
#testing feature_types
gff_path = os.path.join(data_path, "jmh43.gff")
df=parse_gff(gff_path, 
             seq_id="NZ_JAGURL010000013.1",
             feature_types=["gene"],
             bounds=(10000,50000))[0]
assert(df.loc[0,"type"]=="gene")


In [None]:
#| hide
#testing attributes
attributes = {"gene":["Name","ID","X"], "CDS":["ID"]}
df=parse_gff(gff_path, 
             attributes=attributes,
             bounds=(10000,50000))[0]
df.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,left,right,middle
0,NZ_JAGURL010000100.1,RefSeq,region,1,16949,.,+,.,"{'ID': 'NZ_JAGURL010000100.1:1..16949', 'Dbxre...",1,16949,8475.0
1,NZ_JAGURL010000100.1,RefSeq,gene,10059,8311,.,-,.,"{'ID': 'gene-KFX61_RS20985', 'Name': 'KFX61_RS...",8311,10059,9185.0
2,NZ_JAGURL010000100.1,Protein Homology,CDS,10059,8311,.,-,0,{'ID': 'cds-WP_225638104.1'},8311,10059,9185.0
3,NZ_JAGURL010000100.1,RefSeq,gene,10541,10092,.,-,.,"{'ID': 'gene-KFX61_RS20990', 'Name': 'KFX61_RS...",10092,10541,10316.5
4,NZ_JAGURL010000100.1,Protein Homology,CDS,10541,10092,.,-,0,{'ID': 'cds-WP_016268480.1'},10092,10541,10316.5


In [None]:
#| hide
#testing EmptyDataFrame Error
# Start from a known state: without this the assert would raise NameError (fresh kernel)
# or pass on a stale value left by another cell.
exception_raised = False
try:
    df=parse_gff(gff_path, 
                seq_id="NZ_JAGURL010000013.1",
                feature_types=["x"],
                bounds=(10000,50000))[0]
except EmptyDataFrame:
    exception_raised = True

assert exception_raised


In [None]:
#| hide
#Testing mistake in seq_id
# Reset the flag so that a value set by an earlier cell cannot make this test pass by accident.
exception_raised = False
try:
    features=parse_gff(gff_path, "U00097.3") #mistake in seq_id
except EmptyDataFrame:
    exception_raised = True

assert exception_raised

In [None]:
genome_path = os.path.join(data_path, "MG1655_U00096.fasta")
gff_path = os.path.join(data_path, "MG1655_U00096.gff3")
parse_gff(gff_path)[0].head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,left,right,middle
0,U00096.3,Genbank,region,1,4641652,.,+,.,"{'ID': 'U00096.3:1..4641652', 'Dbxref': 'taxon...",1,4641652,2320826.5
1,U00096.3,Genbank,gene,190,255,.,+,.,"{'ID': 'gene-b0001', 'Dbxref': 'ASAP:ABE-00000...",190,255,222.5
2,U00096.3,Genbank,CDS,190,255,.,+,0,"{'ID': 'cds-AAC73112.1', 'Parent': 'gene-b0001...",190,255,222.5
3,U00096.3,Genbank,gene,337,2799,.,+,.,"{'ID': 'gene-b0002', 'Dbxref': 'ASAP:ABE-00000...",337,2799,1568.0
4,U00096.3,Genbank,CDS,337,2799,.,+,0,"{'ID': 'cds-AAC73113.1', 'Parent': 'gene-b0002...",337,2799,1568.0


In [None]:
#| export
def available_feature_types(gff_path):
    """Returns the set of feature types (third column) present in a GFF file.

    Lines starting with "#" and lines that do not split into exactly nine
    tab-separated fields are ignored.
    """
    with default_open_gz(gff_path) as handle:
        rows = (line.split('\t') for line in handle if line[0] != "#")
        return {fields[2] for fields in rows if len(fields) == 9}

In [None]:
data_path = get_example_data_dir()
gff_path = os.path.join(data_path, "MG1655_U00096.gff3")
available_feature_types(gff_path)

{'CDS',
 'exon',
 'gene',
 'mobile_genetic_element',
 'ncRNA',
 'origin_of_replication',
 'pseudogene',
 'rRNA',
 'recombination_feature',
 'region',
 'repeat_region',
 'sequence_feature',
 'tRNA'}

In [None]:
#| export
def available_attributes(gff_path):
    """Returns the column labels of the first DataFrame produced by `parse_gff(gff_path)`.

    Note that these are the DataFrame columns ('seq_id', 'type', 'attributes', ...),
    not the keys stored inside the 'attributes' dictionaries.
    """
    return parse_gff(gff_path)[0].columns

In [None]:
available_attributes(gff_path)

Index(['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
       'attributes', 'left', 'right', 'middle'],
      dtype='object')

In [None]:
#| export
def parse_fasta(genome_path, seq_id):
    """Retrieves the sequence (the `.seq` of the Biopython record) whose id matches seq_id in a fasta file.

    Returns None, after emitting a warning, when no record with that id exists
    (previously this case crashed with an AttributeError on `None.seq`).
    """
    with open(genome_path,'r') as f:
        for rec in SeqIO.parse(f, 'fasta'):
            if rec.id==seq_id:
                return rec.seq

    warnings.warn("seq_id not found in fasta file")
    return None

In [None]:
#| hide
faa_path = os.path.join(data_path, "MG1655_U00096.fasta")
rec = next(SeqIO.parse(faa_path, 'fasta'))
testseq = str(rec.seq)

rec = parse_fasta(faa_path,rec.id)
assert(str(rec) == testseq)

In [None]:
#| export
def regions_overlap(region1, region2, min_overlap_fraction=0.0):
    """
        regions are tuples of start and stop coordinates
        returns true if a fraction of region2 >= min_overlap_fraction overlaps with region1
        coordinates within regions must be sorted low to high

        Coordinates are treated as inclusive (gff style), so two regions that share
        a single position overlap by one position.
        Returns False when the regions do not overlap at all, whatever min_overlap_fraction is.
    """

    # A fraction above 1 can never be reached
    if min_overlap_fraction > 1:
        return False

    # The intersection of two inclusive intervals is [max(starts), min(ends)]
    overlap_size = min(region1[1], region2[1]) - max(region1[0], region2[0]) + 1
    if overlap_size <= 0:
        return False

    # With sorted coordinates and a non-empty intersection this is always >= overlap_size >= 1
    region_2_size = region2[1] - region2[0] + 1

    return bool((overlap_size / region_2_size) >= min_overlap_fraction)
    

In [None]:
#| hide
assert(regions_overlap((0,100),(90,200))==True)
assert(regions_overlap((0,100),(190,200))==False)
assert(regions_overlap((190,200),(0,100))==False)
assert(regions_overlap((190,200),(0,195))==True)

In [None]:
#| export
from collections import defaultdict

In [None]:
#| export
def add_z_order(features, 
                prescedence = ["source", "CDS", "repeat_region", "ncRNA", "rRNA", "tRNA","exon"]):
    """
        features is a dataframe of features
        prescedence is a list of feature types in order of prescedence, e.g. ["CDS", "repeat_region", "ncRNA", "rRNA", "tRNA"] will put "CDS" features closer to the bottom of the plot than "repeat_region" features.
        Feature types that are not listed rank after all listed types.

        Modifies features in place: a z_order column is added and the rows end up sorted by start.
        Returns None.
    """
    #TODO: possibility for "linking attributes" to link features and cause them to have the same z-order and occupy their entire envelope.
    type_order = defaultdict(lambda: len(prescedence)+1)
    type_order.update({t: i for i, t in enumerate(prescedence)})
    features.sort_values(by="start", inplace=True)
    # kind="stable" keeps the start ordering within each feature type; the default
    # sort algorithm does not guarantee that ties keep their previous order.
    features.sort_values(by="type", inplace=True, key=lambda x: x.map(type_order), kind="stable")
    z_order = []
    added = []  # (left, right, z, type rank) of every feature placed so far
    all_z = {0}  # z levels opened so far
    for index, row in features.iterrows():
        left, right = row["left"], row["right"]
        z = 0
        z_found = set()  # z levels this feature cannot use
        for (l_a, r_a, z_a, z_o) in added:
            if regions_overlap((left, right), (l_a, r_a)):
                if type_order[row["type"]] > z_o:
                    # overlapping a feature of a higher-precedence type: every level up to and including its level is blocked
                    for i in range(z_a+1):
                        z_found.add(i)
                else:
                    z_found.add(z_a)
        if len(z_found) == len(all_z):
            # every existing level is blocked: open a new one on top
            z = max(all_z) + 1
            all_z.add(z)
        else:
            # lowest level that is still free
            z = min(all_z - z_found)
        z_order.append(z)
        added.append((left, right, z, type_order[row["type"]]))
    features["z_order"] = z_order

    features.sort_values(by="start", inplace=True)

In [None]:
#| hide
gff_path = os.path.join(data_path, "jmh43.gff")
df=parse_gff(gff_path, 
             seq_id="NZ_JAGURL010000013.1",
             feature_types=["gene"],
             bounds=(10000,50000))[0]
add_z_order(df)
df.iloc[3:6]

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes,left,right,middle,z_order
3,NZ_JAGURL010000013.1,RefSeq,gene,14538,16334,.,+,.,"{'ID': 'gene-KFX61_RS05895', 'Name': 'KFX61_RS...",14538,16334,15436.0,0
4,NZ_JAGURL010000013.1,RefSeq,gene,16377,16847,.,+,.,"{'ID': 'gene-KFX61_RS05900', 'Name': 'KFX61_RS...",16377,16847,16612.0,0
5,NZ_JAGURL010000013.1,RefSeq,gene,18894,16822,.,-,.,"{'ID': 'gene-KFX61_RS05905', 'Name': 'KFX61_RS...",16822,18894,17858.0,1


In [None]:
#| export
#### Code from Domainator
def get_cds_unique_name(feature):
    """
        Returns the first "cds_id" qualifier of the feature when it has one; otherwise builds
        a name from the position of each part of the feature location on the contig.
    """
    qualifiers = feature.qualifiers
    if "cds_id" in qualifiers:
        return qualifiers["cds_id"][0]

    # need the strand information to account for circular contigs.
    name_parts = []
    for part in feature.location.parts:
        fields = (part.stranded_start_human_readable, part.strand, part.stranded_end_human_readable)
        name_parts.append("_".join(str(field) for field in fields))
    # space so it can be split into multiple lines when writing genbank files
    return " ".join(name_parts)

def get_cds_name(feature): #(contig_id, feature):
    """
        Returns the first "gene_id" qualifier if present, else the first "locus_tag" qualifier,
        else the name given by get_cds_unique_name.
    """
    for key in ("gene_id", "locus_tag"):
        if key in feature.qualifiers:
            return feature.qualifiers[key][0]
    return get_cds_unique_name(feature)
#### End code from Domainator

In [None]:
#| export
from Bio import SeqRecord

In [None]:
#| export
strand_dict = {1: "+", -1: "-"}

def seqRecord_to_df(rec: SeqRecord,
                    feature_types: Optional[List[str]] = None, # if None then get all features, otherwise only those with type in FeatureTypes.
                    attributes: Optional[Dict[str,List]] = None 
                    # if None, then get all attributes of all feature types. If dict, then only get attributes of feature types keys. If value is None, get all
                    )->pd.DataFrame:
    """Builds a GFF-like features DataFrame from the features of a sequence record.

    One row is produced per location part of each retained feature. `start` is the
    part start + 1 and `end` is the part end; the strand is "+", "-" or "." and the
    source column is always 'Genbank'. The `attributes` column holds an OrderedDict
    of the feature qualifiers ("translation" is always left out), with multi-valued
    qualifiers joined with "; ".
    """

    def _qualifier_pairs(feature, wanted):
        # (key, value) pairs of the qualifiers to keep; wanted=None keeps all of them
        pairs = []
        for key, value in feature.qualifiers.items():
            if key == "translation":
                continue
            if wanted is not None and key not in wanted:
                continue
            pairs.append((key, value[0] if len(value) == 1 else "; ".join(value)))
        return pairs

    rows = []
    for feature in rec.features:
        if feature_types is not None and feature.type not in feature_types:
            continue
        wanted = None if attributes is None else attributes.get(feature.type, None)
        for part in feature.location.parts:
            rows.append([
                rec.id,
                'Genbank',
                feature.type,
                part.start+1,
                part.end,
                '.',
                strand_dict.get(part.strand, "."),
                ".",
                OrderedDict(_qualifier_pairs(feature, wanted)),
            ])

    return pd.DataFrame(rows, columns=["seq_id", "source", "type", "start", "end", "score", "strand", "phase", "attributes"])

In [None]:
gb_path=os.path.join(data_path, "colored_genbank.gb")
recs=SeqIO.parse(gb_path, "genbank")
rec=next(recs)
df=seqRecord_to_df(rec, feature_types=["CDS"])
df.loc[1]["attributes"]

OrderedDict([('gene_id', 'pDONR201_2'),
             ('gene_type', 'bacteria'),
             ('complete', 'true'),
             ('gc', '50'),
             ('length', '306'),
             ('source', 'GeneMark.hmm2'),
             ('score', '20.71'),
             ('phase', '0'),
             ('name', 'pDONR201_2'),
             ('cds_id', '1264_-1_959'),
             ('domainator_pdonr_hmms', 'CcdB (CcdB protein, 1.1e-32, 103.1)'),
             ('Color', '#FF0000')])

In [None]:
df=seqRecord_to_df(rec, feature_types=["CDS"], attributes={"CDS":["gene_type"]})
df.head()

Unnamed: 0,seq_id,source,type,start,end,score,strand,phase,attributes
0,pDONR201_1,Genbank,CDS,2,106,.,+,.,{'gene_type': 'bacteria'}
1,pDONR201_1,Genbank,CDS,959,1264,.,-,.,{'gene_type': 'bacteria'}
2,pDONR201_1,Genbank,CDS,1266,1391,.,-,.,{'gene_type': 'bacteria'}
3,pDONR201_1,Genbank,CDS,1606,2265,.,-,.,{'gene_type': 'bacteria'}
4,pDONR201_1,Genbank,CDS,2916,3677,.,+,.,{'gene_type': 'bacteria'}


In [None]:
#| export

def parse_recs(recs, # iterator over Bio.SeqRecord.SeqRecord
                   seq_id: Optional[str] = None, # sequence id (first column of the gff), if not None, then return only the annotations for the seq_id with this name
                   first = True, # if True then return only the annotations for the first sequence (or the first with seq_id)
                   bounds: Optional[tuple] = None, # (left limit, right limit)
                   feature_types: Optional[list] = None, # list of feature types to extract
                   attributes: Optional[Dict[str, List]] = None, # a dictionary with feature types as keys and a list of attributes to extract as values 
               )->Tuple[List[Seq], List[pd.DataFrame]]:
    """Converts sequence records into parallel lists of sequences and annotation DataFrames.

    Records whose id differs from seq_id (when seq_id is given) are skipped. Iteration stops
    after the first retained record when `first` is true or a seq_id was requested.
    When bounds is given, only features with end > bounds[0] and start < bounds[1] are kept.
    Raises EmptyDataFrame when no record was retained.
    """
    # we only want one record when `first` is set or a specific seq_id was requested
    stop_after_match = first or seq_id is not None

    seqs = [] # list of Seqs
    feature_dfs = [] # list of dataframes, one per retained record
    for rec in recs:
        if seq_id is not None and seq_id != rec.id:
            continue

        df = seqRecord_to_df(rec, feature_types=feature_types, attributes=attributes)
        if bounds is not None:
            in_window = (df.end>bounds[0]) & (df.start<bounds[1])
            df = df.loc[in_window]

        feature_dfs.append(set_positions(df))
        seqs.append(rec.seq)
        if stop_after_match:
            break

    if not feature_dfs:
        raise EmptyDataFrame("The annotation DataFrame is empty. Check that the feature_types and seq_id are correct, and that bounds (if specified) fall within the size of your genome.")
    return seqs, feature_dfs

In [None]:
#| export    
def parse_genbank(gb_path, # path to the genbank file
                  seq_id: Optional[str] = None, # sequence id (first column of the gff), if not None, then return only the annotations for the seq_id with this name
                  first = True, # if True then return only the annotations for the first sequence (or the first with seq_id)
                  bounds: Optional[tuple] = None, # (left limit, right limit)
                  feature_types: Optional[list] = None, # list of feature types to extract
                  attributes: Optional[Dict[str, List]] = None, # a dictionary with feature types as keys and a list of attributes to extract as values 
                  )->Tuple[List[Seq], List[pd.DataFrame]]:
    """Reads a genbank file and returns what `parse_recs` produces for its records: (list of sequences, list of annotation DataFrames)."""
    with open(gb_path,"r") as f:
        # The records are consumed by parse_recs while the file is still open
        return parse_recs(SeqIO.parse(f, "genbank"), seq_id, first, bounds, feature_types, attributes)


In [None]:
gb_path=os.path.join(data_path, "colored_genbank.gb")
recs=SeqIO.parse(gb_path, "genbank")
rec=next(recs)
testid=rec.id

seqs, dfs=parse_genbank(gb_path, seq_id = testid,
                 feature_types=["CDS", "Domainator"]
                )

# dfs[0].head()
assert len(dfs) == len(seqs)
assert len(dfs) == 1
assert dfs[0].loc[0, "seq_id"] == "pDONR201_1"

In [None]:
gb_path=os.path.join(data_path, "colored_genbank.gb")
seqs, dfs=parse_genbank(gb_path,
                 seq_id=None,
                 first=False,
                 feature_types=["CDS", "Domainator"]
                )

assert len(dfs) == len(seqs)
assert len(dfs) == 4
assert dfs[0].loc[0, "seq_id"] == "pDONR201_1"
assert dfs[1].loc[0, "seq_id"] == "pDONR201_2"
assert dfs[2].loc[0, "seq_id"] == "pDONR201_3"
assert dfs[3].loc[0, "seq_id"] == "pDONR201_4"

In [None]:
#| export

def inspect_feature_types(file_path: str, 
                          frmt: str #gff or genbank
                          ):
    """Outputs a table that recapitulates the feature types and attributes available in the file.

    For each feature type (in order of first appearance) the attribute names of the
    first feature of that type are listed, one per row. The file is parsed with the
    default arguments of `parse_gff` / `parse_genbank`.
    Raises ValueError if frmt is neither "gff" nor "genbank".
    """
    
    if frmt == "genbank":
        _, dfs=parse_genbank(file_path)
    elif frmt == "gff":
        dfs=parse_gff(file_path)
    else:
        # Previously an unknown format failed later with an unrelated UnboundLocalError
        raise ValueError(f"frmt must be 'gff' or 'genbank', not {frmt!r}")

    table_data=[]
    for df in dfs:
        # unique() keeps the order of appearance, so the table is the same on every run
        for t in df["type"].unique():
            attributes = df.loc[df["type"]==t, "attributes"].iloc[0]
            for i, attr in enumerate(attributes):
                # the feature type is only written on the first row of its group
                table_data.append([t if i == 0 else "", attr])

    df_output = pd.DataFrame(table_data, columns=["feature_type", "attributes"])
    display(HTML(df_output.to_html(index=False)))

In [None]:
inspect_feature_types(gff_path, "gff")

feature_type,attributes
exon,ID
,Parent
,anticodon
,gbkey
,inference
,locus_tag
,product
CDS,ID
,Parent
,Dbxref


In [None]:
#| hide
gb_path=os.path.join(data_path, "colored_genbank.gb")
inspect_feature_types(gb_path,"genbank")

feature_type,attributes
CDS,gene_id
,gene_type
,partial
,gc
,length
,source
,score
,phase
,name
,cds_id


In [None]:
#| hide
def split_string(string, max_length=10):
    """Breaks `string` into lines by inserting newlines.

    A string no longer than max_length is returned unchanged. Otherwise the string is cut
    at the last space found at an index between 1 and max_length (inclusive), or at
    max_length when there is no such space, and the remainder (leading whitespace removed)
    is processed the same way.
    """
    if len(string) <= max_length:
        return string

    for split_index in range(max_length, 0, -1):
        if string[split_index] == ' ':
            break
    else:
        # If no suitable breaking point found, split at max_length
        split_index = max_length

    head = string[:split_index]
    remainder = string[split_index:].lstrip()
    return head + '\n' + split_string(remainder, max_length)



In [None]:
#| hide
# Example usage
long_string = "This is a very long string that needs to be split into multiple lines because it exceeds 50 characters."

split_result = split_string(long_string, max_length=50)
print(split_result)


This is a very long string that needs to be split
into multiple lines because it exceeds 50
characters.


In [None]:
#| hide
#| export
def in_wsl() -> bool:
    """True when the platform release string contains 'microsoft-standard' (used here as the sign of running under WSL)."""
    kernel_release = uname().release
    return 'microsoft-standard' in kernel_release

In [None]:
#| hide
in_wsl()

True

In [None]:
#| hide
#| export
def add_extension(filename,extension="svg"):
    """Appends "." + extension to filename unless filename already ends with that extension.

    The comparison ignores case on both sides, and a leading dot in `extension` is
    accepted (".svg" and "svg" are equivalent). A filename with a different
    extension gets the new one appended, e.g. "plot.png" -> "plot.png.svg".
    """
    suffix = '.' + extension.lstrip('.')
    _, ext = os.path.splitext(filename)
    if ext.lower() != suffix.lower():
        filename += suffix
    return filename

In [None]:
#| hide
add_extension("test.svg"), add_extension("test",extension="png")

('test.svg', 'test.png')

In [None]:
#| hide
def sort_list_dict(d: dict, #a dictionnary for which all values are of type list
                   ref_list="xs", #key of the list to use as a reference for sorting
                   func=lambda x: x, #A custom function can be supplied to customize the sort order. Default is the identity function.
                  ):
    """Sorts all the lists of d together, following the order of d[ref_list] after applying func to its elements.

    Returns a new dictionary with the same keys; d itself is not modified.
    Raises ValueError if ref_list is not a key of d.
    """
    ks=list(d.keys())
    ref_list_ix=ks.index(ref_list)
    # Sort all the lists in the dictionary based on the values of the reference list
    sorted_rows = sorted(zip(*[d[k] for k in ks]), key= lambda row: func(row[ref_list_ix]))

    if not sorted_rows:
        # Nothing to sort: unzipping an empty sequence would yield no columns at all
        # and every key would be lost, so return empty lists explicitly.
        return {k: [] for k in ks}

    # Convert the sorted tuples back into separate lists, one per original key
    return {k: list(column_values) for k, column_values in zip(ks, zip(*sorted_rows))}


In [None]:
#| hide
d={'xs':[[2,5],[3,1]],'a':[1,3],'b':["c","d"]}
sort_list_dict(d, ref_list='xs', func= lambda x: x[1]) #sort according to the second element of each element of list 'xs'

{'xs': [[3, 1], [2, 5]], 'a': [3, 1], 'b': ['d', 'c']}

In [None]:
#| hide
#| export
from bokeh.plotting import show as bk_show #Need to rename the bokeh show function so that there is no confusion with GenomeBrowser.show
from bokeh.layouts import column, row
from bokeh.io import output_notebook, reset_output
from bokeh.plotting import save as bk_save #Aliased with the same bk_ prefix as bk_show so that it is clearly the bokeh function
from bokeh.plotting import output_file as bk_output_file #Aliased with the same bk_ prefix as bk_show so that it is clearly the bokeh function
from bokeh.io import output_notebook, reset_output, export_png, export_svgs, export_svg
from svgutils import compose
import os
import warnings
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

In [None]:
#| hide
#| export
def _save(elements, heights, width, fname:str, title:str="Genome Plot"):
    """Export `elements`, stacked in a Bokeh column, to the SVG or PNG file `fname`.

    Parameters
    ----------
    elements : Bokeh objects handed to `column` to build the exported layout.
    heights : heights used to stack the individual SVG files into a composite;
        presumably one height per exported plot, in layout order (TODO confirm
        against callers). Only read for SVG output.
    width : width of the composite SVG (50 is added for the axis and labels).
    fname : output file name; must end in .svg or .png (case-insensitive).
    title : title passed to Bokeh's `output_file`.

    Raises
    ------
    ValueError
        If `fname` does not end in .svg or .png.

    Notes
    -----
    When `in_wsl()` is true, a headless Chrome webdriver is created and passed to
    the Bokeh export function. If it cannot be created, a warning with
    installation hints is emitted and the export falls back to `webdriver=None`.
    A webdriver created here is always quit, and the Bokeh output state is always
    reset, even if the export fails.

    For SVG output with more than one entry in `heights`, the individual SVG
    files are additionally stacked vertically into `<base_name>_composite.svg`.
    """
    base_name, ext = os.path.splitext(fname)
    ext = ext.lower()
    if ext not in {".svg", ".png"}:
        raise ValueError(f"filename must end in svg or png, not {ext}")

    reset_output()
    browser = None
    try:
        bk_output_file(filename=fname, title=title)
        layout = column(elements)

        if in_wsl():
            ## Setup chrome options
            chrome_options = Options()
            chrome_options.add_argument("--headless") # Ensure GUI is off
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-3d-apis")
            chrome_options.add_argument("--disable-blink-features")

            try:
                browser = webdriver.Chrome(options=chrome_options)
            except Exception:
                # Best effort: warn and fall back to webdriver=None.
                # `Exception` (not a bare except) so that KeyboardInterrupt and
                # SystemExit still propagate.
                warnings.warn("""If using WSL you can install chromedriver following these instructions:https://scottspence.com/posts/use-chrome-in-ubuntu-wsl
                                  Also make sure the chromedriver-binary python package has the same major version number as your chrome install.
                                  Check the chrome version using: google-chrome --version
                                   Then use pip to force install of a web driver with a compatible version, for example:
                                   pip install --force-reinstall -v "chromedriver-binary==121.0.6167.184.0"
                                   """)

        if ext == ".svg":
            export_svgs(layout, filename=fname, webdriver=browser)
            if len(heights)>1:
                # Several plots: the code below assumes export_svgs wrote the first
                # plot to `fname` and the following ones to `<base_name>_1.svg`,
                # `<base_name>_2.svg`, ... (NOTE(review): confirm against the Bokeh
                # version in use). Stack them vertically into one composite figure.
                total_height=sum(heights)
                svgelements=[compose.SVG(fname)]
                offset=heights[0]
                for i, height in enumerate(heights[1:]):
                    svgelements.append(
                        compose.SVG(f"{base_name}_{i+1}.svg").move(0,offset)
                    )
                    offset+=height

                compose.Figure(width+50, # +50 accounts for axis and labels
                               total_height,
                               *svgelements).save(f"{base_name}_composite.svg")
        else:
            export_png(layout, filename=fname, webdriver=browser)
    finally:
        # A webdriver created here is owned by this function: quit it so that no
        # headless Chrome process is left running.
        if browser is not None:
            browser.quit()
        reset_output()

In [None]:
#| hide
import numpy as np
import pandas as pd
from genomenotebook.browser import GenomeBrowser
from genomenotebook.plot import GenomePlot
from genomenotebook.data import get_example_data_dir

In [None]:
#| hide
# Smoke test for _save: export a small browser to PNG and check that the file is not empty
data_path = get_example_data_dir()
fasta_path = os.path.join(data_path, "MG1655_U00096.fasta")
gff_path = os.path.join(data_path, "MG1655_U00096.gff3")

# Browser restricted to bounds (0, 5000), with one extra track showing a sine wave sampled every 100 positions
g=GenomeBrowser(gff_path=gff_path, fasta_path=fasta_path, bounds=(0,5000), search=False)
track = g.add_track(height=100)
track.scatter(data=pd.DataFrame(dict(x=np.arange(0,5000,100),y=np.sin(np.arange(0,5000,100)))), y="y", pos="x")

# Private GenomePlot method; presumably populates `plot.elements`, which is read just below
plot = GenomePlot(g)
plot._collect_elements()

# A single height is passed, so this call only exercises the PNG branch of _save
_save(plot.elements, [g.height], g.width, fname="test_p.png")

# The relative fname above is written to the current working directory
file_path = os.path.join(os.getcwd(), "test_p.png")
assert os.path.getsize(file_path) > 0, f"File is empty: {file_path}"


In [None]:
#| hide
# Clean up the PNG written to the working directory by the _save smoke test in the previous cell
os.remove("test_p.png")

In [None]:
#| hide
#| export
def _save_html(elements, fname:str, title:str):
    """Save `elements`, stacked in a Bokeh column, to the HTML file `fname`.

    The file is written with `mode='inline'` and uses `title` as the document
    title. The global Bokeh output state is reset before the call and again
    afterwards, even if saving fails, so that the file output configured here
    does not leak into later `show`/`save` calls.
    """
    reset_output()
    try:
        bk_output_file(filename=fname, title=title, mode='inline')
        bk_save(column(elements))
    finally:
        reset_output()

In [None]:
#| hide
#| export
def _gb_show(elements):
    """Display `elements`, stacked in a Bokeh column, in the notebook.

    The global Bokeh output state is reset, switched to notebook output (banner
    hidden) for the duration of the call, and reset again afterwards, even if
    showing fails.
    """
    reset_output()
    try:
        output_notebook(hide_banner=True)
        bk_show(column(elements))
    finally:
        reset_output()

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()  # run nbdev's export step so the cells flagged for export are written to the library