In [1]:
%cd ~/LevSeq

/home/fli/LevSeq


In [2]:
%load_ext blackcellmagic

In [3]:
import re
import os
import html

from copy import deepcopy

import numpy as np
import pandas as pd
import biopandas as Bio

import panel as pn
import holoviews as hv

from bokeh.io import output_notebook, show
from bokeh.plotting import figure

# Enable Bokeh to display plots in the notebook
hv.extension('bokeh')
pn.extension()
output_notebook()

In [4]:

# Three-letter -> one-letter amino acid codes ("Ter" is the stop codon, "*").
# The insertion order is kept because the values are later used as an axis order.
AA_DICT = dict(
    Ala="A",
    Cys="C",
    Asp="D",
    Glu="E",
    Phe="F",
    Gly="G",
    His="H",
    Ile="I",
    Lys="K",
    Leu="L",
    Met="M",
    Asn="N",
    Pro="P",
    Gln="Q",
    Arg="R",
    Ser="S",
    Thr="T",
    Val="V",
    Trp="W",
    Tyr="Y",
    Ter="*",
)

In [5]:
def match_plate2parent(df: pd.DataFrame, parent_dict: dict | None = None) -> dict:

    """
    Find plate names correpsonding to each parent sequence.

    Args:
    - df : pd.DataFrame
        A pandas DataFrame containing the data for a single plate.
        The DataFrame should have the following columns:
        - "Plate" : str
            The plate identifier.
        - "Well" : str
            The well identifier.
        - "Mutations" : str
            The mutations in the well.
    - parent_dict : dict
        A dictionary containing the parent name for each aa_varient.

    Returns:
    - dict
        A dictionary containing the plate names for each parent sequence.
    """

    if parent_dict is None:

        # add aa_variant column if not present by translating from the nc_variant column
        if "aa_variant" not in df.columns:
            df["aa_variant"] = df["nc_variant"].apply(
                Bio.sequence.Sequence(df["nc_variant"]).translate
            )

        # get all the parents from the df
        parents = df[df["Mutations"] == "#PARENT#"].reset_index(drop=True).copy()

        # get the parent nc_variant
        parent_aas = (
            df[df["Mutations"] == "#PARENT#"][["Mutations", "aa_variant"]]
            .drop_duplicates()["aa_variant"]
            .tolist()
        )

        parent_dict = {f"Parent-{i+1}": parent for i, parent in enumerate(parent_aas)}

    # get the plate names for each parent
    parent2plate = {
        p_name: df[df["aa_variant"] == p_seq]["Plate"].unique().tolist()
        for p_name, p_seq in parent_dict.items()
    }

    # reverse the dictionary to have plate names as keys and rasie flag if there are multiple parents for a plate
    plate2parent = {}
    for parent, plates in parent2plate.items():
        for plate in plates:
            if plate in plate2parent:
                raise ValueError(f"Multiple parents found for plate {plate}")
            else:
                plate2parent[plate] = parent

    return plate2parent


def detect_outliers_iqr(series: pd.Series) -> pd.Index:

    """
    Flag outliers in a Series with the 1.5 x IQR rule.

    The interquartile range (IQR) is the difference between the third
    quartile (Q3) and the first quartile (Q1). A value is an outlier when it
    lies strictly below Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR.

    Args:
    - series : pandas.Series
        A pandas Series containing the values to screen.

    Returns:
    - pd.Index
        The index labels of the outlying entries (empty if there are none).

    Example:
    --------
    >>> import pandas as pd
    >>> data = pd.Series([10, 12, 14, 15, 18, 20, 22, 23, 24, 25, 100])
    >>> detect_outliers_iqr(data).tolist()
    [10]
    """

    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)

    # distance of the fences from the quartiles
    fence = 1.5 * (third_quartile - first_quartile)

    too_low = series.lt(first_quartile - fence)
    too_high = series.gt(third_quartile + fence)

    return series[too_low | too_high].index


def norm2parent(plate_df: pd.DataFrame) -> pd.DataFrame:

    """
    For a given plate,
    normalize the pdt values to the mean pdt of the parent wells,
    computed after removing IQR outliers among the parents.

    The input frame is NOT modified: a new DataFrame with an added
    "pdt_norm" column is returned. This keeps the function safe to use with
    `groupby(...).apply(...)`, where mutating the passed group is unsupported.

    Args:
    - plate_df : pd.DataFrame
        A pandas DataFrame containing the data for a single plate.
        The DataFrame should have the following columns:
        - "Mutations" : str
            The mutations in the well ("#PARENT#" marks parent wells).
        - "pdt" : float
            The pdt value for the well.

    Returns:
    - pd.DataFrame
        A copy of plate_df with the extra column "pdt_norm".
        If the plate has no parent wells, the parent mean is NaN and
        "pdt_norm" is NaN for every row.
    """

    # pdt of the parent wells; positional index so the outlier labels returned
    # below refer unambiguously to rows of this Series
    parent_pdt = plate_df.loc[plate_df["Mutations"] == "#PARENT#", "pdt"].reset_index(
        drop=True
    )

    # drop parent wells flagged as outliers before averaging
    filtered_parent_pdt = parent_pdt.drop(index=detect_outliers_iqr(parent_pdt))

    # normalize the whole plate to the mean of the filtered parents
    return plate_df.assign(pdt_norm=plate_df["pdt"] / filtered_parent_pdt.mean())


def process_mutation(mutation: str) -> pd.Series:
    """
    Parse a mutation string into a site count and per-site details.

    Args:
    - mutation : str
        Either "#PARENT#" or "_"-joined substitutions such as "A59V_L69*".

    Returns:
    - pd.Series
        Two entries: the number of "_"-separated sites (0 for the parent) and
        a list with one (parent_aa, site_number, mutated_aa) tuple of strings
        per site. Sites that do not match the substitution pattern, and the
        parent itself, are reported as (None, None, None).
    """
    no_details = (None, None, None)

    # the parent carries no mutation: 0 sites and a single placeholder tuple
    if mutation == "#PARENT#":
        return pd.Series([0, [no_details]])

    tokens = mutation.split("_")

    parsed = []
    for token in tokens:
        hit = re.match(r"^([A-Z])(\d+)([A-Z*])$", token)
        parsed.append(hit.groups() if hit else no_details)

    return pd.Series([len(tokens), parsed])

In [6]:
# Load the processed plate data.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
df = pd.read_csv("/home/fli/LevSeq/sandbox/processed_plate_data.csv")
# ignore deletions, which are encoded as "Mutations" == "-"
df = df[df["Mutations"] != "-"].copy()

# count the number of mutated sites and append the per-site mutation details
df[["num_sites", "mut_dets"]] = df["Mutations"].apply(process_mutation)

# normalize every plate to its parent wells (adds the "pdt_norm" column)
df = df.groupby("Plate").apply(norm2parent).reset_index(drop=True).copy()

# add a new column called Parent_Name to the df
# using the dict output of match_plate2parent
# that maps each plate to its parent
plate2parent = match_plate2parent(df, parent_dict=None)
df["Parent_Name"] = df["Plate"].map(plate2parent)
df

  df = df.groupby("Plate").apply(norm2parent).reset_index(drop=True).copy()


Unnamed: 0,Plate,Well,Row,Column,Mutations,pdt,nc_variant,aa_variant,num_sites,mut_dets,pdt_norm,Parent_Name
0,HMC0225-P1,A2,A,2,#PARENT#,273777.8326,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.832833,Parent-1
1,HMC0225-P1,A3,A,3,A59V,219122.0179,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.666570,Parent-1
2,HMC0225-P1,A4,A,4,A59S,28506.2887,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, S)]",0.086716,Parent-1
3,HMC0225-P1,A5,A,5,A59N,97889.0927,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, N)]",0.297779,Parent-1
4,HMC0225-P1,A6,A,6,A59V,99170.9124,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.301678,Parent-1
...,...,...,...,...,...,...,...,...,...,...,...,...
636,HMC0226-P9,H8,H,8,L69N,5006.7981,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(L, 69, N)]",0.053158,Parent-2
637,HMC0226-P9,H9,H,9,#PARENT#,71968.1903,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.764094,Parent-2
638,HMC0226-P9,H10,H,10,L69A,44687.1407,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(L, 69, A)]",0.474448,Parent-2
639,HMC0226-P9,H11,H,11,L69M,49032.8029,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(L, 69, M)]",0.520587,Parent-2


In [7]:
def prep_single_ssm(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the table used for single-site SSM summaries.

    Keeps the rows with at most one mutated site and expands the first entry
    of "mut_dets" into the columns "parent_aa", "site_numb" and "mut_aa",
    plus "parent_aa_loc" (parent residue followed by the site number).

    Args:
    - df: pd.DataFrame, input full dataframe with "num_sites" and "mut_dets"

    Returns:
    - pd.DataFrame, new dataframe; "site_numb" is an int with 0 where the
      site number was missing
    """

    # rows with zero or one mutated site
    single_ssm_df = df.loc[df["num_sites"] <= 1].copy()

    # the only (first) detail tuple of every remaining row
    first_details = [dets[0] for dets in single_ssm_df["mut_dets"]]
    single_ssm_df[["parent_aa", "site_numb", "mut_aa"]] = pd.DataFrame(
        first_details,
        index=single_ssm_df.index,
    )

    # parent_aa_loc is built from the still-textual site number;
    # only afterwards is site_numb filled with 0 and cast to int
    return single_ssm_df.assign(
        parent_aa_loc=lambda frame: frame["parent_aa"] + frame["site_numb"],
        site_numb=lambda frame: frame["site_numb"].fillna(0).astype(int),
    )

In [8]:
# Build the single-site table: rows with at most one mutated site (parents included)
single_ssm_df = prep_single_ssm(df)
single_ssm_df

Unnamed: 0,Plate,Well,Row,Column,Mutations,pdt,nc_variant,aa_variant,num_sites,mut_dets,pdt_norm,Parent_Name,parent_aa,site_numb,mut_aa,parent_aa_loc
0,HMC0225-P1,A2,A,2,#PARENT#,273777.8326,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.832833,Parent-1,,0,,
1,HMC0225-P1,A3,A,3,A59V,219122.0179,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.666570,Parent-1,A,59,V,A59
2,HMC0225-P1,A4,A,4,A59S,28506.2887,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, S)]",0.086716,Parent-1,A,59,S,A59
3,HMC0225-P1,A5,A,5,A59N,97889.0927,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, N)]",0.297779,Parent-1,A,59,N,A59
4,HMC0225-P1,A6,A,6,A59V,99170.9124,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.301678,Parent-1,A,59,V,A59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
636,HMC0226-P9,H8,H,8,L69N,5006.7981,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(L, 69, N)]",0.053158,Parent-2,L,69,N,L69
637,HMC0226-P9,H9,H,9,#PARENT#,71968.1903,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.764094,Parent-2,,0,,
638,HMC0226-P9,H10,H,10,L69A,44687.1407,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(L, 69, A)]",0.474448,Parent-2,L,69,A,L69
639,HMC0226-P9,H11,H,11,L69M,49032.8029,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(L, 69, M)]",0.520587,Parent-2,L,69,M,L69


In [9]:
# Distinct (parent, plate, site) combinations; parent wells have no site (missing parent_aa_loc)
single_ssm_df[["Parent_Name", "Plate", "parent_aa_loc"]].drop_duplicates()

Unnamed: 0,Parent_Name,Plate,parent_aa_loc
0,Parent-1,HMC0225-P1,
1,Parent-1,HMC0225-P1,A59
72,Parent-1,HMC0225-P1,V89
93,Parent-1,HMC0225-P2,
95,Parent-1,HMC0225-P2,V89
137,Parent-1,HMC0225-P2,I93
183,Parent-1,HMC0225-P3,
184,Parent-1,HMC0225-P3,I93
230,Parent-1,HMC0225-P3,H120
276,Parent-1,HMC0225-P4,


In [10]:
# Map each parent to the list of its mutated sites (rows without a site are dropped)
single_ssm_df[["Parent_Name", "parent_aa_loc"]].drop_duplicates().dropna().groupby('Parent_Name')['parent_aa_loc'].apply(list).to_dict()

{'Parent-1': ['A59', 'V89', 'I93', 'H120', 'I149'],
 'Parent-2': ['L69',
  'Y71',
  'I149',
  'V178',
  'C93',
  'K49',
  'R90',
  'E84',
  'H187',
  'E35',
  'A59']}

In [11]:
# Example: select the variants of Parent-1 that are mutated at site A59
# (parent wells are not selected here because their parent_aa_loc is missing)
site_df = single_ssm_df[(single_ssm_df["Parent_Name"] == "Parent-1") & (single_ssm_df["parent_aa_loc"] == "A59")].copy()
site_df

Unnamed: 0,Plate,Well,Row,Column,Mutations,pdt,nc_variant,aa_variant,num_sites,mut_dets,pdt_norm,Parent_Name,parent_aa,site_numb,mut_aa,parent_aa_loc
1,HMC0225-P1,A3,A,3,A59V,219122.0179,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.66657,Parent-1,A,59,V,A59
2,HMC0225-P1,A4,A,4,A59S,28506.2887,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, S)]",0.086716,Parent-1,A,59,S,A59
3,HMC0225-P1,A5,A,5,A59N,97889.0927,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, N)]",0.297779,Parent-1,A,59,N,A59
4,HMC0225-P1,A6,A,6,A59V,99170.9124,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.301678,Parent-1,A,59,V,A59
5,HMC0225-P1,A7,A,7,A59V,74236.142,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.225827,Parent-1,A,59,V,A59
6,HMC0225-P1,A8,A,8,A59G,72259.2286,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, G)]",0.219813,Parent-1,A,59,G,A59
7,HMC0225-P1,A9,A,9,A59L,205426.6833,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, L)]",0.624909,Parent-1,A,59,L,A59
8,HMC0225-P1,A10,A,10,A59M,296001.5469,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, M)]",0.900438,Parent-1,A,59,M,A59
9,HMC0225-P1,A11,A,11,A59M,400101.0178,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, M)]",1.217109,Parent-1,A,59,M,A59
10,HMC0225-P1,A12,A,12,A59L,30080.0161,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, L)]",0.091504,Parent-1,A,59,L,A59


In [12]:
# get the parent wells from the plates that contain the selected site
site_parent_df = single_ssm_df[(single_ssm_df["Mutations"] == "#PARENT#") & (single_ssm_df["Plate"].isin(site_df["Plate"].unique()))].copy()
site_parent_df

# fill the missing mut_aa, site_numb and parent_aa_loc of the parent wells
# with the values of the selected site (mut_aa becomes the parent residue)
site_parent_df["mut_aa"] = site_parent_df["mut_aa"].fillna(site_df["parent_aa"].values[0])
# NOTE(review): prep_single_ssm already replaced missing site_numb with 0, so this
# fillna changes nothing and parent wells keep site_numb == 0
site_parent_df["site_numb"] = site_parent_df["site_numb"].fillna(site_df["site_numb"].values[0])
site_parent_df["parent_aa_loc"] = site_parent_df["parent_aa_loc"].fillna(site_df["parent_aa_loc"].values[0])
site_parent_df

Unnamed: 0,Plate,Well,Row,Column,Mutations,pdt,nc_variant,aa_variant,num_sites,mut_dets,pdt_norm,Parent_Name,parent_aa,site_numb,mut_aa,parent_aa_loc
0,HMC0225-P1,A2,A,2,#PARENT#,273777.8326,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.832833,Parent-1,,0,A,A59
13,HMC0225-P1,B3,B,3,#PARENT#,345530.6499,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.051106,Parent-1,,0,A,A59
16,HMC0225-P1,B6,B,6,#PARENT#,288440.3666,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.877437,Parent-1,,0,A,A59
21,HMC0225-P1,B11,B,11,#PARENT#,375571.7131,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.142491,Parent-1,,0,A,A59
24,HMC0225-P1,C2,C,2,#PARENT#,329797.7622,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.003246,Parent-1,,0,A,A59
26,HMC0225-P1,C4,C,4,#PARENT#,367748.1168,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.118691,Parent-1,,0,A,A59
28,HMC0225-P1,C6,C,6,#PARENT#,362135.3731,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.101617,Parent-1,,0,A,A59
39,HMC0225-P1,D5,D,5,#PARENT#,439126.4257,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.335824,Parent-1,,0,A,A59
52,HMC0225-P1,E6,E,6,#PARENT#,421835.9195,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.283227,Parent-1,,0,A,A59
65,HMC0225-P1,F7,F,7,#PARENT#,302014.9486,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.918731,Parent-1,,0,A,A59


In [13]:
# stack the parent wells on top of the site variants and renumber the rows
merged_site_df = pd.concat([site_parent_df, site_df]).reset_index(drop=True).copy()
merged_site_df

Unnamed: 0,Plate,Well,Row,Column,Mutations,pdt,nc_variant,aa_variant,num_sites,mut_dets,pdt_norm,Parent_Name,parent_aa,site_numb,mut_aa,parent_aa_loc
0,HMC0225-P1,A2,A,2,#PARENT#,273777.8326,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.832833,Parent-1,,0,A,A59
1,HMC0225-P1,B3,B,3,#PARENT#,345530.6499,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.051106,Parent-1,,0,A,A59
2,HMC0225-P1,B6,B,6,#PARENT#,288440.3666,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",0.877437,Parent-1,,0,A,A59
3,HMC0225-P1,B11,B,11,#PARENT#,375571.7131,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.142491,Parent-1,,0,A,A59
4,HMC0225-P1,C2,C,2,#PARENT#,329797.7622,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,0,"[(None, None, None)]",1.003246,Parent-1,,0,A,A59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,HMC0225-P1,F8,F,8,A59R,32262.7528,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, R)]",0.098143,Parent-1,A,59,R,A59
68,HMC0225-P1,F9,F,9,A59V,247830.9906,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, V)]",0.753903,Parent-1,A,59,V,A59
69,HMC0225-P1,F10,F,10,A59Q,119894.7475,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, Q)]",0.364720,Parent-1,A,59,Q,A59
70,HMC0225-P1,F11,F,11,A59Q,115839.9939,ATGACTCCCTCGGACATCTCGGGGTATGATTATGGGCGTGTCGAGA...,MTPSDISGYDYGRVEKSPITDLEFDLLKKTVMLGEEDVMYLKKAAD...,1,"[(A, 59, Q)]",0.352386,Parent-1,A,59,Q,A59


In [24]:
def get_single_ssm_site_df(single_ssm_df: pd.DataFrame, parent: str, site: str) -> pd.DataFrame:
    """
    Get the single site SSM data for a given site with appended parent data.

    The parent wells of every plate that contains the site are placed before
    the variant rows. For those parent rows the missing "mut_aa" is filled
    with the parent residue of the site, "parent_aa_loc" with the site label,
    and "site_numb" (missing, or the 0 placeholder used for parents) is set
    to the site number.

    Args:
    - single_ssm_df: pd.DataFrame, input single site SSM dataframe
    - parent: str, parent to filter the data on
    - site: str, site to filter the data on (a "parent_aa_loc" value, e.g. "A59")

    Returns:
    - pd.DataFrame, parent rows followed by the site rows, with a fresh
      0..n-1 index. Empty (same columns) if no row matches parent and site.
    """

    # get the site data
    site_df = single_ssm_df[
        (single_ssm_df["Parent_Name"] == parent)
        & (single_ssm_df["parent_aa_loc"] == site)
    ].copy()

    # nothing matches: return the empty selection instead of failing on [0] below
    if site_df.empty:
        return site_df.reset_index(drop=True)

    # get parents from those plates
    site_parent_df = single_ssm_df[
        (single_ssm_df["Mutations"] == "#PARENT#")
        & (single_ssm_df["Plate"].isin(site_df["Plate"].unique()))
    ].copy()

    # values describing the selected site, taken from its first row
    site_parent_aa = site_df["parent_aa"].iloc[0]
    site_number = site_df["site_numb"].iloc[0]
    site_label = site_df["parent_aa_loc"].iloc[0]

    # relabel the parent rows so that they line up with the site rows
    site_parent_df["mut_aa"] = site_parent_df["mut_aa"].fillna(site_parent_aa)
    # site_numb of parent rows is either missing or already the 0 placeholder,
    # so a plain fillna would leave the placeholder in place
    unset_site = site_parent_df["site_numb"].isna() | site_parent_df["site_numb"].eq(0)
    site_parent_df.loc[unset_site, "site_numb"] = site_number
    site_parent_df["parent_aa_loc"] = site_parent_df["parent_aa_loc"].fillna(site_label)

    # now merge the two dataframes
    return pd.concat([site_parent_df, site_df]).reset_index(drop=True)

def prep_aa_order(df: pd.DataFrame, add_na: bool = False) -> pd.DataFrame:
    """
    Order a dataframe by amino acid so that plots share one residue order.

    "mut_aa" is converted to an ordered categorical whose categories follow
    the values of AA_DICT and the rows are sorted by that order. The input
    dataframe is left unchanged.

    Args:
    - df: pd.DataFrame, input dataframe with a "mut_aa" column
    - add_na: bool, if True "#N.A.#" is appended as the last category

    Returns:
    - pd.DataFrame, new dataframe sorted by "mut_aa" with a fresh 0..n-1
      index. Values that are not among the categories become missing and
      are sorted first (their category code is -1).
    """

    # Define the order of x-axis categories
    x_order = list(AA_DICT.values())

    if add_na:
        x_order.append("#N.A.#")

    # Convert `mut_aa` to a categorical column with the specified order;
    # assign() builds a new frame instead of overwriting the caller's column
    ordered_df = df.assign(
        mut_aa=pd.Categorical(df["mut_aa"], categories=x_order, ordered=True)
    )

    # Sort by the position of each residue in `x_order`
    return ordered_df.sort_values(
        "mut_aa", key=lambda col: col.cat.codes
    ).reset_index(drop=True)

In [25]:
def get_parent2sitedict(df: pd.DataFrame) -> dict:

    """
    Map every parent to the sites at which it has single-site mutants.

    Args:
    - df : pd.DataFrame
        Dataframe with the columns "Parent_Name" and "parent_aa_loc"
        (a residue letter followed by the site number, e.g. "A59").

    Returns:
    - dict
        Parent name -> list of its distinct sites, sorted by site number.
        Rows with a missing parent or site are ignored.
    """

    # distinct, complete (parent, site) pairs
    pairs = df[["Parent_Name", "parent_aa_loc"]].drop_duplicates().dropna()

    sites_by_parent = (
        pairs.groupby("Parent_Name")["parent_aa_loc"].apply(list).to_dict()
    )

    # order the sites numerically, skipping the leading residue letter
    return {
        parent: sorted(sites, key=lambda site: int(str(site)[1:]))
        for parent, sites in sites_by_parent.items()
    }

In [26]:
def get_y_label(y: str):

    """
    Return a readable y-axis label for the column name `y`.

    Names containing "pdt" map to "Product" and names containing "area" to
    "Yield" (both case-insensitive); the two fitness ratio names lose their
    "fitness_" prefix; any other name is used as is. If the name contains
    "norm", the label is lower-cased and prefixed with "Normalized ".
    """
    lowered = y.lower()

    # exact-name translations, consulted only when no keyword matches
    exact_labels = {
        "fitness_ee2/(ee1+ee2)": "ee2/(ee1+ee2)",
        "fitness_ee1/(ee1+ee2)": "ee1/(ee1+ee2)",
    }

    if "pdt" in lowered:
        label = "Product"
    elif "area" in lowered:
        label = "Yield"
    else:
        label = exact_labels.get(y, y)

    # normalize the y label
    return f"Normalized {label.lower()}" if "norm" in lowered else label


def plot_bar_point(
    df: pd.DataFrame,
    x: str,
    y: str,
    y_label: str = None,
    title: str = None,
    if_max: bool = False,
) -> hv.Layout:
    """
    Draw the mean of `y` per `x` category as bars with the individual wells on top.

    Args:
    - df: pd.DataFrame with the columns `x`, `y`, "Plate" and "Well"
    - x: str, column used for the categories on the x-axis
    - y: str, column holding the plotted values
    - y_label: str, y-axis label; derived from `y` with get_y_label if not given
    - title: str, plot title
    - if_max: bool, if True the row with the largest `y` of every category
      is drawn once more as an orange point

    Returns:
    - the `*`-composition of the bars, the points and, if requested,
      the highlighted maxima
    """

    # mean value per category for the bars
    mean_per_category = df[[y, x]].sort_values(x).groupby(x).mean()
    bars = hv.Bars(mean_per_category, kdims=x, vdims=y).opts(
        title=title,
        ylabel=y_label or get_y_label(y),
        color=y,
        cmap="coolwarm",
        width=600,
        height=400,
        xrotation=45,
    )

    # one point per well; Plate and Well are carried along for the hover tool
    points = hv.Scatter(df, x, [y, "Plate", "Well"]).opts(
        color=y, cmap="gray", size=8, alpha=0.5, tools=["hover"]
    )

    combined = bars * points
    if not if_max:
        return combined

    # highlight the best well of every category
    best_rows = df.loc[df.groupby(x)[y].idxmax()]
    max_points = hv.Scatter(best_rows, x, [y, "Plate", "Well"]).opts(
        color="orange", size=10, alpha=1, tools=["hover"]
    )
    return combined * max_points

In [27]:
def get_parent_plot(df: pd.DataFrame, y: str = "pdt_norm") -> hv.Bars:

    """
    Plot the maximum of `y` for every parent as a bar chart.

    Args:
    - df : pd.DataFrame
        A pandas DataFrame containing the data for all parents.
        It needs the column "Parent_Name" and the column named by `y`.
    - y : str
        The column whose per-parent maximum is plotted.

    Returns:
    - hv.Bars
        A holoviews Bars object with one bar per parent.
    """

    max_per_parent = df.groupby("Parent_Name")[y].max().reset_index()
    bars = hv.Bars(max_per_parent, kdims="Parent_Name", vdims=y)

    return bars.opts(title="Max Value by Parent", width=600, height=400)


def agg_parent_plot(df: pd.DataFrame, ys: list = ["pdt_norm"]) -> pn.Row:

    """
    Plot each requested metric across parents, one plot per metric.

    Every plot shows the mean per parent as bars, the individual wells as
    points and the best well of each parent highlighted.

    Args:
    - df : pd.DataFrame
        A pandas DataFrame containing the data for all parents.
        It needs the columns "Parent_Name", "Plate" and "Well".
    - ys : list
        Column names to plot; names missing from df are skipped.

    Returns:
    - pn.Row with one plot per metric, or None if no metric is in df
    """

    parent_plots = []
    for y in ys:
        # silently skip metrics that this dataset does not provide
        if y not in df.columns:
            continue
        parent_plots.append(
            plot_bar_point(
                df,
                x="Parent_Name",
                y=y,
                title=f"{get_y_label(y)} across parents",
                if_max=True,
            )
        )

    return pn.Row(*parent_plots) if parent_plots else None


def plot_single_ssm_avg(
    single_ssm_df: pd.DataFrame,
    parent_name: str,
    y: str = "pdt_norm",
    width: int = 600,
):
    """
    Plot a heatmap of the average `y` of the single site mutations of one parent.

    Parameters:
    - single_ssm_df: DataFrame containing the single site mutation data, with
      the columns "Parent_Name", "site_numb", "parent_aa_loc", "mut_aa" and `y`.
    - parent_name: parent whose rows are plotted.
    - y: column that is averaged per (parent_aa_loc, mut_aa) pair.
    - width: plot width; the height is 30 per distinct site number, at least 160.

    Returns:
    - hv.HeatMap with mut_aa on the x-axis and parent_aa_loc on the y-axis.
    """

    # rows of the requested parent, with mut_aa as an ordered categorical
    sliced_df = prep_aa_order(single_ssm_df[single_ssm_df["Parent_Name"] == parent_name].copy())

    # scale the height with the number of distinct sites, with a lower bound
    height = max(30 * sliced_df["site_numb"].nunique(), 160)

    # average y per (site, residue); rows with a missing site, residue or y are
    # dropped first, and the result is sorted by the trailing number of
    # parent_aa_loc, then by mut_aa
    return hv.HeatMap(
        data=sliced_df[["parent_aa_loc", "mut_aa", y]]
        .dropna()
        .groupby(by=["parent_aa_loc", "mut_aa"])
        .mean()
        .sort_values(
            ["parent_aa_loc", "mut_aa"],
            key=lambda col: col.str.extract(r"(\d+)$").fillna(0).astype(int).iloc[:, 0]
            if col.name == "parent_aa_loc"
            else col
            # key=lambda col: col[1:].astype(int)
            # if col.name == "single_mutated_sites_w_parent"
            # else col,
        )
        .reset_index(),
        kdims=["mut_aa", "parent_aa_loc"],
        vdims=[y],
    ).opts(
        height=height,
        width=width,
        cmap="coolwarm",
        # color_levels=color_levels,
        colorbar=True,
        colorbar_opts=dict(title=y, width=8),
        xrotation=45,
        title=f"Average single site mutations for {parent_name}",
        xlabel="Residue",
        ylabel="Position",
        invert_yaxis=True,
        tools=["hover"],
    )


def agg_single_ssm_exp_avg(
    single_ssm_df: pd.DataFrame,
    parent_name: str,
    ys: list = ["pdt_norm"],
):
    """
    Build the average single site mutation heatmaps of one parent,
    one heatmap per requested metric.

    Args:
    - single_ssm_df: pd.DataFrame, single site SSM dataframe
    - parent_name: str, parent whose heatmaps are drawn
    - ys: list, column names to plot; names missing from single_ssm_df
      are skipped

    Returns:
    - pn.Row with one heatmap per metric, or None if no metric is available
    """

    # check the columns of the dataframe that was passed in,
    # not of a notebook-global dataframe
    avg_ssm_plots = [
        plot_single_ssm_avg(single_ssm_df=single_ssm_df, parent_name=parent_name, y=y)
        for y in ys
        if y in single_ssm_df.columns
    ]

    if not avg_ssm_plots:
        return None

    return pn.Row(*avg_ssm_plots)

In [28]:
# set up the data behind the dashboard:
# the parent names, the single-site table and the sites available per parent
parents = df["Parent_Name"].unique().tolist()
single_ssm_df = prep_single_ssm(df)
sites_dict = get_parent2sitedict(single_ssm_df)

# Function to generate plots based on parent selection
def get_subplots(parent):
    """
    Build the per-parent part of the dashboard.

    Reads the notebook-level `single_ssm_df` and `sites_dict`.

    Args:
    - parent: parent name; must be a key of `sites_dict`

    Returns:
    - pn.Column with the average single-site heatmap(s) of the parent,
      a site dropdown and the bar/point plot of the selected site
    """

    site_dropdown = pn.widgets.Select(name="Sites", options=sites_dict[parent])

    def update_site_plot(site):
        # redraw the per-residue plot for the site chosen in the dropdown

        site_df = prep_aa_order(get_single_ssm_site_df(single_ssm_df, parent=parent, site=site))

        site_info = site_df["parent_aa_loc"].unique()[0]

        return plot_bar_point(
            df=site_df,
            x="mut_aa",
            y="pdt_norm",
            # y_label: str = None,
            title=f"{site_info} for {parent}",
            if_max=False,
        )

    # re-run update_site_plot whenever the dropdown value changes
    site_plot = pn.bind(update_site_plot, site=site_dropdown)

    return pn.Column(
        agg_single_ssm_exp_avg(
            single_ssm_df=single_ssm_df,
            parent_name=parent,
            # ys: list,
        ),
        site_dropdown,
        site_plot,
    )


# Dropdown for parent selection
parent_dropdown = pn.widgets.Select(name="Parent", options=parents)

# Initial parent plots
# NOTE(review): initial_subplots is not used by the layout below, which binds
# get_subplots to the dropdown instead; confirm it is still needed
initial_subplots = get_subplots(parents[0])

# Panel layout: across-parent overview, the parent dropdown, and the
# per-parent plots that are rebuilt when the dropdown changes
dashboard = pn.Column(
    agg_parent_plot(df), parent_dropdown, pn.bind(get_subplots, parent=parent_dropdown)
)

dashboard.servable()

  .groupby(by=["parent_aa_loc", "mut_aa"])
  df[[y, x]].sort_values(x).groupby(x).mean(),
  .groupby(by=["parent_aa_loc", "mut_aa"])
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df = obj.data.set_index(index_cols).groupby(index_cols, sort=False).first()


BokehModel(combine_events=True, render_bundle={'docs_json': {'fc0d6b55-5d0c-44ab-b049-ba8dcb563053': {'version…

In [23]:
# export the dashboard as an html file with the widget states embedded
# NOTE(review): this cell's recorded execution count (23) is lower than that of the
# cells above it (24-28); re-run the notebook top to bottom before exporting
dashboard.save("/home/fli/LevSeq/sandbox/HMC.html", embed=True)

  9%|▉         | 2/22 [00:00<00:02,  8.83it/s]

  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),


 23%|██▎       | 5/22 [00:00<00:03,  5.02it/s]

  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),


 36%|███▋      | 8/22 [00:01<00:03,  4.55it/s]

  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),


 45%|████▌     | 10/22 [00:02<00:02,  4.49it/s]

  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),


 50%|█████     | 11/22 [00:02<00:02,  4.48it/s]

  .groupby(by=["parent_aa_loc", "mut_aa"])
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df = obj.data.set_index(index_cols).groupby(index_cols, sort=False).first()
  df[[y, x]].sort_values(x).groupby(x).mean(),


                                               

  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df[[y, x]].sort_values(x).groupby(x).mean(),
  .groupby(by=["parent_aa_loc", "mut_aa"])
  df[[y, x]].sort_values(x).groupby(x).mean(),
  df = obj.data.set_index(index_cols).groupby(index_cols, sort=False).first()
  df[[y, x]].sort_values(x).groupby(x).mean(),
