In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [2]:
%load_ext blackcellmagic
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [7]:
from SSMuLA.preprocess import ProcessDHFR

In [8]:
ProcessDHFR()

<SSMuLA.preprocess.ProcessDHFR at 0x7f626d1315d0>

In [3]:
import pandas as pd
import numpy as np

from Bio.Seq import Seq

In [None]:
# General imports
import glob
import os
import re
import pickle
import datetime
import pathlib

# Data manipulation
# import growth_analysis as ga
import pandas as pd
pd.set_option('display.max_columns', 40)
import numpy as np

# Basic plotting
import holoviews as hv
import bokeh
from bokeh.io import export_svg
from bokeh.themes.theme import Theme
# Shared bokeh theme: black arial text everywhere, non-italic labels
theme = Theme(
    json=dict(
        attrs=dict(
            # title centered and bigger
            Title=dict(
                align="center",
                text_font_size="15px",
                text_color="black",
                text_font="arial",
            ),
            # no italic axis labels
            Axis=dict(
                axis_label_text_font_style="normal",
                axis_label_text_color="black",
                major_label_text_color="black",
                axis_label_text_font="arial",
                major_label_text_font="arial",
            ),
            Legend=dict(
                title_text_font_style="normal",
                title_text_color="black",
                label_text_color="black",
                label_text_font="arial",
            ),
            ColorBar=dict(
                title_text_font_style="normal",
                major_label_text_color="black",
                major_label_text_font="arial",
                title_text_color="black",
                title_text_font="arial",
            ),
        )
    )
)

hv.extension('bokeh')
hv.renderer('bokeh').theme = theme

import panel as pn
pn.config.comms = "vscode"

# Large data plotting
import datashader as ds
from holoviews.operation.datashader import datashade, rasterize

# Making graphs
import networkx as nx
import matplotlib.pyplot as plt
import itertools
from tqdm.auto import tqdm
from multiprocessing import Pool

from SSMuLA.landscape_global import ACTIVE_THRESH_DICT, LIB_INFO_DICT, calc_active_cutoff
from SSMuLA.vis import save_bokeh_hv, CODON_AA_COLOER_DICT, plot_fit_dist
from SSMuLA.util import checkNgen_folder

In [None]:
class ProcessDHFR:
    """
    Class to clean up the DHFR data

    On construction the codon-level csv is translated to amino acids, the
    fitness is averaged per amino acid combination, the active cutoff is
    appended with calc_active_cutoff, the result is written to output_csv
    and the overlaid fitness distribution plot is saved.

    The raw and derived dataframes are built once per instance and cached;
    every dataframe property returns a copy of the cached frame, so callers
    may modify what they get back without affecting later accesses.
    """

    def __init__(self, input_csv: str = "data/DHFR/fitness_landscape/DHFR.csv") -> None:

        """
        Args:
        - input_csv, str: path to the input csv file
        """

        self._input_csv = input_csv

        # filled lazily by _cached
        self._cache = {}

        # append the active cutoffs
        self._df_avg_aa_append, self._avg_aa_active_cutoff = calc_active_cutoff(
            self.df_avg_aa, ["fitness"]
        )

        # save the appended dataframe
        self._df_avg_aa_append.to_csv(self.output_csv, index=False)

        self._overlay_fit_dist()

    def _cached(self, key: str, build) -> pd.DataFrame:

        """
        Build a dataframe once and return a copy of it on every call

        Args:
        - key, str: name under which the dataframe is cached
        - build, callable: zero-argument function creating the dataframe

        Returns:
        - pd.DataFrame: a copy of the cached dataframe
        """

        # setdefault keeps this usable on instances created without __init__
        cache = self.__dict__.setdefault("_cache", {})

        if key not in cache:
            cache[key] = build()

        return cache[key].copy()

    def _add_split_aa_cols(self, df: pd.DataFrame) -> pd.DataFrame:

        """
        Add one column per position holding the single amino acid

        Args:
        - df, pd.DataFrame: dataframe with an "AAs" column, modified in place

        Returns:
        - pd.DataFrame: the same dataframe with split_AA_cols added
        """

        cols = self.split_AA_cols

        # one constructor call instead of building a pd.Series for every row
        df[cols] = pd.DataFrame(
            [list(aas) for aas in df["AAs"]], index=df.index, columns=cols
        )

        return df

    def _overlay_fit_dist(
        self,
        title: str = "DHFR fitness distribution",
    ) -> hv.Overlay:

        """
        Plot the codon and average amino acid fitness distributions
        together with the calculated and the defined active cutoffs,
        and save the plot under results/fitness_distribution

        Args:
        - title, str: plot title, also used as the name of the saved plot

        Returns:
        - hv.Overlay: overlay of the distributions and the cutoffs
        """

        # Overlay the two plots
        overlay_dist = (
            self.codon_fit_dist
            * self.avg_aa_fit_dist
            * hv.Spikes([self.avg_aa_active_cutoff], label = "Calculated active cutoff").opts(
                color="gray", line_width=1.6
            )
            * hv.Spikes([ACTIVE_THRESH_DICT["DHFR"]], label = "Defined active cutoff").opts(
                color="gray", line_dash="dashed", line_width=1.6
            )
        )

        # Customize the plot options
        overlay_dist.opts(
            legend_position="top_right",
            title=title,
            xlabel="Fitness",
        )

        # Save the plot with the legend
        save_bokeh_hv(
            overlay_dist,
            plot_name=title,
            plot_path="results/fitness_distribution",
            bokehorhv="hv",
        )
        return overlay_dist

    @property
    def lib_info(self) -> dict:
        """Return the library information"""
        return LIB_INFO_DICT["DHFR"]

    @property
    def split_AA_cols(self) -> list:
        """Return the columns for the split amino acids"""
        return [f"AA{str(i)}" for i in self.lib_info["positions"].keys()]

    @property
    def input_df(self) -> pd.DataFrame:
        """Return the input dataframe (read from disk once, then cached)"""
        return self._cached("input_df", lambda: pd.read_csv(self._input_csv))

    @property
    def df_aa(self) -> pd.DataFrame:

        """Return the input dataframe with amino acid translations"""

        def build() -> pd.DataFrame:

            df = self.input_df

            # Translate the sequence to amino acids
            df["AAs"] = df["seq"].apply(lambda x: str(Seq(x).translate()))

            return df[["AAs", "seq", "fitness"]]

        return self._cached("df_aa", build)

    @property
    def df_split_aa(self) -> pd.DataFrame:

        """Return the input dataframe with amino acid translations
        and split into individual amino acids"""

        def build() -> pd.DataFrame:

            # Split combo into individual amino acids
            df = self._add_split_aa_cols(self.df_aa)

            return df[["AAs", *self.split_AA_cols, "seq", "fitness"]]

        return self._cached("df_split_aa", build)

    @property
    def df_avg_aa(self) -> pd.DataFrame:

        """Return the average fitness of each amino acid combination"""

        def build() -> pd.DataFrame:

            # Group by amino acid and take the average fitness
            df = self.df_aa.groupby("AAs")["fitness"].mean().reset_index()

            # Split combo into individual amino acids
            df = self._add_split_aa_cols(df)

            return df[["AAs", *self.split_AA_cols, "fitness"]]

        return self._cached("df_avg_aa", build)

    @property
    def df_avg_aa_append(self) -> pd.DataFrame:
        """Return the average fitness of each amino acid with the active cutoff appended"""
        return self._df_avg_aa_append

    @property
    def avg_aa_active_cutoff(self) -> float:
        """Return the active cutoff for the average amino acid"""
        return self._avg_aa_active_cutoff

    @property
    def codon_fit(self) -> pd.Series:
        """Return the codon-level fitness as a series"""
        return self.input_df["fitness"]

    @property
    def avg_aa_fit(self) -> pd.Series:
        """Return the fitness averaged per amino acid combination as a series"""
        return self.df_avg_aa["fitness"]

    @property
    def codon_fit_dist(self) -> hv.Distribution:
        """Return the fitness distribution based on codon"""
        return plot_fit_dist(self.codon_fit, "codon")

    @property
    def avg_aa_fit_dist(self) -> hv.Distribution:
        """Return the fitness distribution based on average amino acid"""
        return plot_fit_dist(self.avg_aa_fit, "AA")

    @property
    def output_csv(self) -> str:
        """Return the path to the output csv"""
        output_csv = self._input_csv.replace("fitness_landscape", "processed")
        # check if the folder exists
        # NOTE(review): presumably checkNgen_folder creates it when missing - confirm
        checkNgen_folder(output_csv)
        return output_csv

In [None]:
# Run the DHFR processing with the default input csv; the constructor writes
# the processed csv and saves the fitness distribution plot
ProcessDHFR()

In [None]:
# Call checkNgen_folder on the processed-output path, derived from the default
# input csv with the same substitution ProcessDHFR.output_csv uses
# NOTE(review): presumably this creates the folder when missing - confirm in SSMuLA.util
checkNgen_folder("data/DHFR/fitness_landscape/DHFR.csv".replace("fitness_landscape", "processed"))

In [None]:
# Calculate the active cutoff on the average amino acid fitness
# NOTE(review): avg_aa_df is only assigned in a later cell of this notebook,
# so this cell raises NameError on a fresh Restart & Run All
calc_active_cutoff(
    df=avg_aa_df, fitness_cols = ["fitness"]
)

In [None]:
# Fraction of rows for each value of the 'active' column
# NOTE(review): avg_aa_df is only assigned in a later cell of this notebook,
# where the 'active' column is also set explicitly - confirm the intended run order
avg_aa_df['active'].value_counts()/len(avg_aa_df)

In [None]:
# Show the fitness averaged per amino acid combination
# (a new ProcessDHFR instance re-runs the constructor: csv written, plot saved)
ProcessDHFR().df_avg_aa

In [None]:
# Number of distinct values in the "seq" column of the input csv
# (a new ProcessDHFR instance re-runs the constructor: csv written, plot saved)
ProcessDHFR().input_df["seq"].nunique()

In [None]:
# Build the processing object once and reuse it: every ProcessDHFR() call
# re-runs the constructor (active cutoff, csv written, plot saved)
dhfr_fitclass = ProcessDHFR()

codon_df = dhfr_fitclass.input_df
avg_aa_df = dhfr_fitclass.df_avg_aa

codon_fit = codon_df["fitness"]
aa_fit = avg_aa_df["fitness"]

# NOTE(review): PRESENTATION_PALETTE_SATURATE6 is not imported in any visible
# cell of this notebook - confirm where it is defined and import it at the top

# Create the first distribution plot: codon-level fitness with mean and median
condon_dist = (
    hv.Distribution(codon_fit, label="Codon-level").opts(
        width=400,
        height=400,
        color=PRESENTATION_PALETTE_SATURATE6["blue"],
        line_color=None,
    )
    * hv.Spikes([codon_fit.mean()], label="Mean codon fitness").opts(
        line_dash="dotted", line_color=PRESENTATION_PALETTE_SATURATE6["blue"], line_width=1.6
    )  # for label
    * hv.Spikes([codon_fit.median()], label="Median codon fitness").opts(
        line_color=PRESENTATION_PALETTE_SATURATE6["blue"], line_width=1.6
    )
)

# Create the second distribution plot: AA-level fitness with mean and median
aa_dist = (
    hv.Distribution(aa_fit, label="AA-level").opts(
        color=PRESENTATION_PALETTE_SATURATE6["orange"],
        line_color=None,
    )
    * hv.Spikes([aa_fit.mean()], label="Mean AA fitness").opts(
        line_dash="dotted", line_color=PRESENTATION_PALETTE_SATURATE6["orange"], line_width=1.6
    )
    * hv.Spikes([aa_fit.median()], label="Median AA fitness").opts(
        line_color=PRESENTATION_PALETTE_SATURATE6["orange"], line_width=1.6
    )
)

# Overlay the two plots built above (dist1 / dist2 were never defined)
overlay_dist = condon_dist * aa_dist

# Customize the plot options
overlay_dist.opts(
    legend_position="top_right",
    title="DHFR fitness distribution",
    xlabel="Fitness",
)  # ylabel='Frequency')

# Save the plot with the legend
save_bokeh_hv(
    overlay_dist,
    plot_name="DHFR fitness distribution",
    plot_path="results/fitness_distribution",
    bokehorhv = "hv",
    # dpi: int = 300,
    # scale: float = 1,
    )

In [None]:
# Codon-level dataframe with the translation ("AAs") and one column per
# amino acid position; kept in a variable for the counting cells below
df_split_aa = ProcessDHFR().df_split_aa

In [None]:
# Count the rows behind every amino acid combination; the grouping keys are
# moved back from the index into regular columns
site_counts = (
    df_split_aa
    .groupby(["AAs", "AA1", "AA2", "AA3"])
    .count()
    .reset_index()
)
site_counts

In [None]:
from SSMuLA.param import TRANSLATE_DICT, CODON_COUNT_PER_AA

In [None]:
# Number of codon-level rows sharing each amino acid combination
df_split_aa["Counts"] = df_split_aa.groupby("AAs")["AAs"].transform("count")

# One row per amino acid combination, ordered by the combination
count_aa = (
    df_split_aa[["AAs", "AA1", "AA2", "AA3", "Counts"]]
    .drop_duplicates()
    .sort_values("AAs")
    .reset_index(drop=True)
)

# Normalize the count by the product of the CODON_COUNT_PER_AA values of the three sites
count_aa["Norm_counts"] = count_aa.apply(
    lambda row: row["Counts"]
    / (
        CODON_COUNT_PER_AA[row["AA1"]]
        * CODON_COUNT_PER_AA[row["AA2"]]
        * CODON_COUNT_PER_AA[row["AA3"]]
    ),
    axis=1,
)
count_aa

In [None]:
# Distinct normalized counts across all amino acid combinations
count_aa["Norm_counts"].unique()

In [None]:
# site_counts is a DataFrame when it comes from groupby(...).count().reset_index(),
# and DataFrame has no to_frame(); only convert when it is still a Series
site_counts_df = (
    site_counts.to_frame() if isinstance(site_counts, pd.Series) else site_counts
)
site_counts_df.plot(kind='bar', figsize=(20, 5))

In [None]:
# Rows whose amino acid combination contains "*"; matched as a literal
# character (regex=False) instead of the invalid "\*" escape sequence
aa_stop_df = avg_aa_df[avg_aa_df["AAs"].str.contains("*", regex=False)]
aa_stop_df

In [None]:
# Active vs inactive: the cutoff is the mean fitness of the "*"-containing
# combinations plus 1.96 standard deviations

avg_stop, std_stop = aa_stop_df["fitness"].mean(), aa_stop_df["fitness"].std()
fit_min = avg_stop + 1.96 * std_stop

# report how many combinations lie above the cutoff, and the cutoff itself
print("95%", len(avg_aa_df[avg_aa_df["fitness"] > fit_min]), fit_min)

# flag every row as active (fitness > fit_min) or not (fitness <= fit_min)
avg_aa_df.loc[avg_aa_df["fitness"] > fit_min, "active"] = True
avg_aa_df.loc[avg_aa_df["fitness"] <= fit_min, "active"] = False
avg_aa_df


In [None]:
# share of rows in each active / inactive class
active_percnet = avg_aa_df["active"].value_counts().div(len(avg_aa_df))
active_percnet

In [None]:
# Show the fitness cutoff (mean + 1.96 std of the "*"-containing rows) computed earlier
fit_min