In [1]:
import matplotlib
from matplotlib import pyplot
from statistics import median, mean

# Apply matplotlib's "ggplot" style sheet so that the plots made here look roughly like ggplot plots.
pyplot.style.use("ggplot")

# Consistent mapping from codon position (CP) to plot color.
#
# CPs 1-3 use a (hopefully colorblind-friendly) grayscale scheme: CP 1 = black, CP 2 = gray, CP 3 = white.
# These should be drawn with black borders (see BORDERCOLOR) so that the gray / white are easily visible
# on a light background.
#
# "CP 4" = green is not a real codon position: it is shorthand for "positions in non-coding regions of the
# genome", since it can be interesting to summarize these positions alongside the CPs. Green is not quite
# as colorblind-friendly -- in the future it would be good to add a pattern or something.
cp2color = {
    1: "#000000",
    2: "#888888",
    3: "#ffffff",
    4: "#29D321",
}
BORDERCOLOR = "#000000"

# The blue used in the ggplot-style matplotlib color settings (from https://gist.github.com/huyng/816622).
SCATTERPLOT_PT_COLOR = "#348ABD"

# Sequence (edge) names -> the easier-to-remember names used in the report.
seq2name = {
    "edge_6104": "CAMP",
    "edge_1671": "BACTERIA",
    "edge_2358": "BACTEROIDALES",
}
# The sequence names, in the same order in which they are listed in seq2name.
SEQS = list(seq2name)

# Sequence lengths. Used in a few places, notably gene stats and gene utils.
seq2len = {
    "edge_1671": 2153394,
    "edge_2358": 2806161,
    "edge_6104": 1289244,
}

# Whether or not each edge is in an isolated component of the graph. Used in the phasing analysis.
# NOTE(review): the variable name says "circular" while this description says "isolated component" --
# confirm that the two are meant to be equivalent here.
seq2iscircular = {
    "edge_1671": True,
    "edge_2358": True,
    "edge_6104": False,
}

In [2]:
def use_thousands_sep(mpl_axis):
    """Makes an axis' major tick labels use thousands separators (e.g. "1,234,567").

    Parameters
    ==========

    mpl_axis: matplotlib axis
        The axis whose major formatter will be replaced. For an arbitrary pyplot figure, you can call
        this function with pyplot.gca().xaxis or pyplot.gca().yaxis.
    """
    # Modified from https://stackoverflow.com/a/25973637 -- this is modified to work better with integers
    # (matplotlib seems to store all values as floats internally, even essentially integral things -- so we
    # check whether a value is "close enough" to an integer, and if so remove the trailing ".0" that happens
    # when you try to format a float of an integer -- see https://stackoverflow.com/a/21583817.)
    def format_tick(x, pos):
        # Go through float() rather than calling x.is_integer() directly: tick values are usually floats,
        # but integer tick values can lack an is_integer() method (plain ints only gained it in
        # Python 3.12), which would make the formatter crash.
        if float(x).is_integer():
            return "{:,}".format(int(x))
        return "{:,}".format(x)

    mpl_axis.set_major_formatter(matplotlib.ticker.FuncFormatter(format_tick))

In [1]:
def get_parent_gene_info_of_position(pos, genes_df):
    """Finds the gene(s) whose coordinates contain a given position.

    Parameters
    ==========

    pos: int
        1-indexed position in a genome.

    genes_df: pd.DataFrame
        Result from calling parse_sco(). Each row describes one gene: the row's index is used as the
        gene number, and its LeftEnd / RightEnd values are used as the gene's (inclusive) coordinates.

    Returns
    =======

    parent_info: list
        Gene numbers (as ints) of all genes that contain this position, in the order in which the genes
        occur in genes_df. Usually this holds a single element (the position is located in a single gene);
        it holds multiple elements if genes overlap at this position, and is empty if this position is
        located in an intergenic region.
    """
    # A gene contains pos if LeftEnd <= pos <= RightEnd (both ends inclusive).
    return [
        int(row.Index)
        for row in genes_df.itertuples()
        if int(row.LeftEnd) <= pos <= int(row.RightEnd)
    ]

In [2]:
def color_positions_by_gene_info(posleft, posright, genes_df,
                                 color_multi_genes="#888888", color_no_genes="#ffffff"):
    """Assigns a color to every position in a range, based on the position's parent gene(s).

    Parameters
    ==========

    posleft: int
        1-indexed position in a genome. Inclusive.

    posright: int
        1-indexed position in a genome. Inclusive.

    genes_df: pd.DataFrame
        Result from calling parse_sco(). Each row describes one gene: the row's index is used as the
        gene number, and its LeftEnd / RightEnd values are used as the gene's (inclusive) coordinates.

    color_multi_genes: str
        Color assigned to positions that are covered by more than one gene.

    color_no_genes: str
        Color assigned to positions that are not covered by any gene.

    Returns
    =======

    parent_info: list
        List of colors with one element per position in the range [posleft, posright]: element 0 is the
        color of position posleft, element 1 is the color of position posleft + 1, and so on. Positions
        covered by exactly one gene get that gene's color (a hex string sampled from the viridis colormap).
    """
    num_genes = len(genes_df.index)

    # From https://matplotlib.org/3.1.1/tutorials/colors/colormap-manipulation.html -- adapted from
    # phasing graph (call&plot) notebook
    # NOTE(review): matplotlib.cm.get_cmap() is deprecated in recent matplotlib releases -- confirm the
    # matplotlib version this notebook is run with.
    cmap = matplotlib.cm.get_cmap("viridis", num_genes)
    hexcolors = [matplotlib.colors.to_hex(cmap(c)) for c in range(num_genes)]

    # One (initially empty) list of gene numbers per position in the range. Position p is stored at
    # offset p - posleft.
    offset_to_genes = [[] for _ in range(posleft, posright + 1)]

    for gene in genes_df.itertuples():
        gene_num = int(gene.Index)
        # Only record the part of the gene that falls inside [posleft, posright]. Without this clipping
        # (and without indexing relative to posleft), ranges that don't start at position 1 would get the
        # wrong positions colored, and genes extending past the range would raise an IndexError.
        first_pos = max(int(gene.LeftEnd), posleft)
        last_pos = min(int(gene.RightEnd), posright)
        for pos in range(first_pos, last_pos + 1):
            offset_to_genes[pos - posleft].append(gene_num)

    offset_to_color = []
    for genes in offset_to_genes:
        if len(genes) == 0:
            offset_to_color.append(color_no_genes)
        elif len(genes) > 1:
            offset_to_color.append(color_multi_genes)
        else:
            # Assumes gene numbers run from 1 to num_genes, so that gene number g maps to
            # hexcolors[g - 1] -- TODO confirm against parse_sco().
            offset_to_color.append(hexcolors[genes[0] - 1])

    return offset_to_color