In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [2]:
%load_ext blackcellmagic
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

from Bio.Seq import Seq

In [117]:
# General imports
import glob
import os
import re
import pickle
import datetime
import pathlib

# Data manipulation
# import growth_analysis as ga
import pandas as pd
pd.set_option('display.max_columns', 40)
import numpy as np

# Basic plotting
import holoviews as hv
import bokeh
from bokeh.io import export_svg
from bokeh.themes.theme import Theme
# Bokeh theme shared by every HoloViews plot in this notebook:
# black Arial text throughout, with non-italic axis/legend/colorbar titles.
theme = Theme(
    json={
        "attrs": {
            # title centered and bigger
            "Title": dict(
                align="center",
                text_font_size="15px",
                text_color="black",
                text_font="arial",
            ),
            # no italic labels
            "Axis": dict(
                axis_label_text_font_style="normal",
                axis_label_text_color="black",
                major_label_text_color="black",
                axis_label_text_font="arial",
                major_label_text_font="arial",
            ),
            "Legend": dict(
                title_text_font_style="normal",
                title_text_color="black",
                label_text_color="black",
                label_text_font="arial",
            ),
            "ColorBar": dict(
                title_text_font_style="normal",
                major_label_text_color="black",
                major_label_text_font="arial",
                title_text_color="black",
                title_text_font="arial",
            ),
        }
    }
)

# Activate the bokeh backend, then attach the theme to its renderer
hv.extension("bokeh")
hv.renderer("bokeh").theme = theme

import panel as pn
# Set Panel's comms backend to "vscode" (presumably because the notebook is run inside VS Code — confirm)
pn.config.comms = "vscode"

# Large data plotting
import datashader as ds
from holoviews.operation.datashader import datashade, rasterize

# Making graphs
import networkx as nx
import matplotlib.pyplot as plt
import itertools
from tqdm.auto import tqdm
from multiprocessing import Pool

from SSMuLA.landscape_info import LIB_INFO_DICT
from SSMuLA.vis import save_bokeh_hv

In [103]:
class ProcessDHFR:
    """
    Class to clean up the DHFR data

    Reads a csv with (at least) the columns ``seq`` (nucleotide sequence)
    and ``fitness``, translates each sequence to amino acids, and exposes
    codon-level and amino-acid-level views of the table.

    Every property is recomputed on access (the csv is re-read each time),
    so bind the result to a variable when it is needed more than once.
    """

    def __init__(self, input_csv: str = "data/DHFR/fitness_landscape/DHFR.csv") -> None:

        """
        Args:
        - input_csv: str: path to the input csv file
        """

        self._input_csv = input_csv

    @property
    def lib_info(self) -> dict:
        """Return the library information"""
        return LIB_INFO_DICT["DHFR"]

    @property
    def split_AA_cols(self) -> list:
        """Return the columns for the split amino acids,
        one ``AA<key>`` name per key of ``lib_info["positions"]``"""
        return [f"AA{i}" for i in self.lib_info["positions"].keys()]

    @property
    def input_df(self) -> pd.DataFrame:
        """Return the input dataframe (freshly read from the csv)"""
        return pd.read_csv(self._input_csv)

    def _add_split_aa_cols(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add one single-character column per site, taken from ``df["AAs"]``.

        Uses vectorised string indexing rather than building a ``pd.Series``
        per row, which is prohibitively slow on the full codon-level table.
        The frame is modified in place and returned.
        """
        for pos, col in enumerate(self.split_AA_cols):
            # .str[pos] yields NaN when a string is shorter than pos + 1
            df[col] = df["AAs"].str[pos]
        return df

    @property
    def df_aa(self) -> pd.DataFrame:
        """Return the input dataframe with amino acid translations"""

        # input_df is a fresh frame on every access, so it is safe to modify
        df = self.input_df

        # Translate the sequence to amino acids
        df["AAs"] = df["seq"].apply(lambda x: str(Seq(x).translate()))

        return df[["AAs", "seq", "fitness"]].copy()

    @property
    def df_split_aa(self) -> pd.DataFrame:
        """Return the input dataframe with amino acid translations
        and split into individual amino acids"""

        # Split combo into individual amino acids
        df = self._add_split_aa_cols(self.df_aa)

        return df[["AAs", *self.split_AA_cols, "seq", "fitness"]].copy()

    @property
    def df_avg_aa(self) -> pd.DataFrame:
        """Return the mean fitness of each amino acid combination,
        i.e. averaged over all nucleotide sequences with the same translation"""

        # Group by amino acid combo and take the average fitness
        df = self.df_aa.groupby("AAs")["fitness"].mean().reset_index()

        # Split combo into individual amino acids
        df = self._add_split_aa_cols(df)

        return df[["AAs", *self.split_AA_cols, "fitness"]].copy()

In [17]:
# Amino-acid-level table: mean fitness per translated combo, plus one column per site
ProcessDHFR().df_avg_aa

Unnamed: 0,AAs,AA1,AA2,AA3,fitness
0,***,*,*,*,-0.726347
1,**A,*,*,A,-0.748223
2,**C,*,*,C,-0.750235
3,**D,*,*,D,-0.737349
4,**E,*,*,E,-0.739465
...,...,...,...,...,...
9256,YYS,Y,Y,S,-0.712637
9257,YYT,Y,Y,T,-0.730895
9258,YYV,Y,Y,V,-0.722095
9259,YYW,Y,Y,W,-0.760234


In [18]:
# Number of distinct nucleotide sequences in the raw input table
ProcessDHFR().input_df["seq"].nunique()

261333

In [22]:
# Distribution of the raw fitness values (one value per nucleotide sequence)
hv.Distribution(ProcessDHFR().input_df["fitness"]).opts(width=400, height=400)

In [23]:

# Distribution of fitness after averaging all sequences that share a translation
hv.Distribution(ProcessDHFR().df_avg_aa["fitness"]).opts(width=400, height=400)

In [118]:
def _mean_median_spikes(values, color, mean_label, median_label):
    """Return (mean marker, median marker) for one fitness series.

    The mean is drawn as a dotted spike and the median as a solid spike,
    both in ``color`` with the same line width.
    """
    shared_style = dict(line_color=color, line_width=1.2)
    mean_marker = hv.Spikes([values.mean()], label=mean_label).opts(
        line_dash="dotted", **shared_style
    )
    median_marker = hv.Spikes([values.median()], label=median_label).opts(
        **shared_style
    )
    return mean_marker, median_marker


# Fitness per nucleotide sequence vs. fitness averaged per amino-acid combo
codon_fit = ProcessDHFR().input_df["fitness"]
residue_fit = ProcessDHFR().df_avg_aa["fitness"]

# Codon-level density with its mean / median markers
codon_density = hv.Distribution(codon_fit, label="Codon-level").opts(
    width=400,
    height=400,
    line_color=None,
)
codon_mean, codon_median = _mean_median_spikes(
    codon_fit, "blue", "Mean codon fitness", "Median codon fitness"
)
dist1 = codon_density * codon_mean * codon_median

# Residue-level density with its mean / median markers
residue_density = hv.Distribution(residue_fit, label="Residue-level").opts(
    color="orange",
    line_color=None,
)
residue_mean, residue_median = _mean_median_spikes(
    residue_fit, "orange", "Mean residue fitness", "Median residue fitness"
)
dist2 = residue_density * residue_mean * residue_median

# Put both levels on the same axes
overlay_dist = dist1 * dist2

# Legend placement, title and x-axis label for the combined figure
overlay_dist.opts(
    legend_position="top_right",
    title="DHFR fitness distribution",
    xlabel="Fitness",
)

# Hand the overlay to the project's save helper
# (optional dpi / scale arguments are left unset)
save_bokeh_hv(
    overlay_dist,
    plot_name="DHFR fitness distribution",
    plot_path="results/fitness_distribution",
    bokehorhv="hv",
)

In [64]:
dist1

In [7]:
# Rows whose translation contains a stop codon ("*").
# `df` is rebuilt here so the cell runs on a fresh kernel, and regex=False
# matches the literal "*" instead of relying on an invalid string escape.
df = ProcessDHFR().df_aa[["seq", "fitness", "AAs"]]
df[df["AAs"].str.contains("*", regex=False)]


Unnamed: 0,seq,fitness,AAs
49,AAAAAATAA,-0.707481,KK*
51,AAAAAATAG,-0.749359,KK*
57,AAAAAATGA,-0.662051,KK*
113,AAAAACTAA,-0.711992,KN*
115,AAAAACTAG,-0.736583,KN*
...,...,...,...
261255,TTTTTGTAG,-0.662173,FL*
261261,TTTTTGTGA,-0.712708,FL*
261317,TTTTTTTAA,-0.778114,FF*
261319,TTTTTTTAG,-0.795470,FF*


In [None]:
# calc active vs inactive based on 1.96 std > mean fitness
#
# For each replicate column, the cutoff is
#     mean + 1.96 * std   of the variants that contain a stop codon ('# Stop' > 0).
# A variant is called active only when it exceeds the cutoff in BOTH replicates.
#
# NOTE(review): `summary_AA_df` is not defined in the visible cells of this
# notebook — confirm where it comes from before running this cell.

N_STD = 1.96  # number of standard deviations above the stop-codon mean

# Stop-codon-containing variants, used as the inactive baseline
stop_rows = summary_AA_df[summary_AA_df['# Stop'] > 0]

avg_stop1 = stop_rows['mu_1-bg/max'].mean()
std_stop1 = stop_rows['mu_1-bg/max'].std()
fit_min1 = N_STD*std_stop1 + avg_stop1

avg_stop2 = stop_rows['mu_2-bg/max'].mean()
std_stop2 = stop_rows['mu_2-bg/max'].std()
fit_min2 = N_STD*std_stop2 + avg_stop2

# Vectorised mask, computed once and reused for the count, the plot and the new column
is_active = (summary_AA_df['mu_1-bg/max'] > fit_min1) & (summary_AA_df['mu_2-bg/max'] > fit_min2)
active_df = summary_AA_df[is_active]

print('95%', len(active_df))

# Replicate 1 vs replicate 2 for the active variants, with a y = x reference line.
# The title reports both cutoffs; the original referenced undefined names here.
plot_95 = hv.Curve([(-0.5,-0.5),(1,1)]).opts(line_color='black',line_width=0.5)*datashade(
    hv.Points(
        active_df,
        kdims=['mu_1-bg/max','mu_2-bg/max'])
).opts(
    frame_height=500,
    frame_width=500,
    xlim=(0,1),
    ylim=(0,1),
    title=f'fit min (1.96 standard deviations above the mean): {fit_min1:.3f}, {fit_min2:.3f}',
    xlabel='mu_1-bg/max',
    ylabel='mu_2-bg/max',
    fontscale=1.5,
)

summary_AA_df['active'] = is_active
