In [None]:
# No parameters required

# Growth curve analysis of validation mutants

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams["svg.fonttype"] = "none"

## Specify paths

In [None]:
# Input
# Plate layout table (tab-separated); merged with the readings on "plate" and "well" further down
layout_path = "../growth_data/20240129_full_layout.tsv"
# Plate reader exports. Order matters: the 1-based position in this list is used as the plate number
source_data = [
    "../growth_data/20240128_validations_DMS_control.xlsx",
    "../growth_data/20240128_validations_DMS_caspo.xlsx",
    "../growth_data/20240131_validations_DMS_mica.xlsx",
    "../growth_data/20240131_validations_DMS_ani.xlsx",
]
# DMS classification table, merged with the growth data on "compound" and "aa_seq"
DMS_data = "../classified/BY4741_FKS1-HS1/refined_classification.csv"

# Output
# Folder receiving the exported dataframe(s)
df_outpath = "../growth_data/"
# Folder receiving the exported figure(s)
graph_outpath = "../graphs/"

## Get layout

In [None]:
# Load the tab-separated plate layout; Sanger_validated is parsed as a
# nullable boolean so that it can be used directly as a row mask
layout = pd.read_table(
    layout_path,
    header=0,
    dtype={"Sanger_validated": "boolean"},
)
layout

In [None]:
# Distinct genotypes among the wells flagged as Sanger-validated
validated_genotypes = layout.loc[layout.Sanger_validated, "genotype"].unique()
print("Unique genotypes validated by Sanger:\n", validated_genotypes)

## Get plate reader data

In [None]:
def _time_of_day_to_seconds(t):
    """Return the number of seconds elapsed between midnight and time-of-day `t`."""
    return (t.hour * 60 + t.minute) * 60 + t.second + t.microsecond / 10**6


def get_data(fpath):
    """Read one plate-reader Excel export and return it in long format.

    Parameters
    ----------
    fpath : str or path-like
        Excel file to read. The first 26 rows and last 36 rows are skipped
        (presumably reader metadata around the data table -- confirm against
        the export format). The first column is used as the index and the
        next one must hold the timepoints ("Time").

    Returns
    -------
    pandas.DataFrame
        Long dataframe with columns "Time" (elapsed time in hours), "well"
        and "OD", one row per well and timepoint.

    Raises
    ------
    KeyError
        If the temperature column is not found in the export.
    """
    ### Read excel file
    source_df = pd.read_excel(fpath, index_col=0, header=0, skiprows=26, skipfooter=36)

    ### Parse timepoints
    t0, t1 = source_df.iloc[[0, 1], 0]  # Get the first and second timepoints

    # Convert first timepoint from time (of the day) object to time duration (h)
    t0_s = _time_of_day_to_seconds(t0)
    t0_h = t0_s / 3600

    # Difference between second and first timepoint in h (measurement interval)
    delta = (_time_of_day_to_seconds(t1) - t0_s) / 3600

    # Rewrite column of timepoints using calculated values.
    # Built from an integer range so that exactly one value per row is produced:
    # np.arange with a float step can return one element too many because of
    # rounding in (stop - start) / step.
    source_df["Time"] = t0_h + np.arange(len(source_df)) * delta

    # Delete column with temperature. Both the proper header ("T<degree sign> 600")
    # and its mis-decoded form are accepted; escapes keep this robust to the
    # encoding of the notebook file itself.
    temperature_cols = [
        col
        for col in ("T\u00b0 600", "T\u00c2\u00b0 600")
        if col in source_df.columns
    ]
    if not temperature_cols:
        raise KeyError(f"Temperature column not found in {fpath!r}")
    source_df = source_df.drop(columns=temperature_cols)

    # Rename columns: keep only what follows the last "=" when there is one
    source_df.columns = [x.split("=")[-1] if "=" in x else x for x in source_df.columns]

    # Reshape: wide to long dataframe
    return source_df.melt(id_vars="Time", var_name="well", value_name="OD")

In [None]:
# Read every export and tag its rows with the plate number,
# i.e. the 1-based position of the file in source_data
df_list = [
    get_data(path).assign(plate=plate_number)
    for plate_number, path in enumerate(source_data, start=1)
]

# Stack all plates into a single long dataframe
fulldf = pd.concat(df_list, ignore_index=True)
fulldf

## Annotate data

In [None]:
# Restrict the layout to Sanger-validated wells, then attach it to the readings.
# The merge uses the default inner join, so readings from other wells are dropped.
validated_layout = layout[layout.Sanger_validated]
annotdf = pd.merge(fulldf, validated_layout, on=["plate", "well"])
annotdf

## Visualize growth curves

```
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

grid = sns.FacetGrid(data=annotdf, col = 'compound', hue='genotype', palette='hls')
grid.map(sns.lineplot, 'Time', 'OD')

grid.set_titles(row_template='{row_name}', col_template='{col_name}')
grid.set_axis_labels('Time (h)', 'OD')
grid.add_legend(title = 'Genotype')
grid.fig.subplots_adjust(top=0.9)
grid.tight_layout()
```

## Calculate normalized area under the curve (AUC)

In [None]:
def get_auc(g):
    """Return the area under the growth curve of one group (trapezoidal rule).

    Parameters
    ----------
    g : pandas.DataFrame
        Must contain an "OD" column. If it also contains a "Time" column, the
        rows are ordered by time and OD is integrated over "Time"; otherwise
        the samples are assumed to be evenly spaced with unit spacing.

    Returns
    -------
    float
        Area under the curve.
    """
    # np.trapezoid only exists in NumPy >= 2.0; older releases provide np.trapz
    integrate = getattr(np, "trapezoid", None) or np.trapz

    if "Time" in getattr(g, "columns", ()):
        ordered = g.sort_values("Time")
        return integrate(ordered["OD"].to_numpy(), x=ordered["Time"].to_numpy())

    return integrate(g.OD)

In [None]:
# Upper bound (in h) of the time window over which the AUC is computed
AUC_MAX_TIME_H = 40

# One AUC per well. Time is handed to get_auc together with OD so that the
# time axis is available to the integrator.
aucdf = (
    annotdf[annotdf["Time"] <= AUC_MAX_TIME_H]
    .groupby(["genotype", "aa_seq", "clone", "compound", "well"])[["Time", "OD"]]
    .apply(func=get_auc)
    .reset_index(name="auc")
)
aucdf

In [None]:
# Summarise the per-well AUCs for each genotype and compound:
# mean, smallest and largest value
replicate_groups = aucdf.groupby(["genotype", "aa_seq", "compound"])
aggdf = replicate_groups.agg(
    auc=pd.NamedAgg(column="auc", aggfunc="mean"),
    auc_min=pd.NamedAgg(column="auc", aggfunc="min"),
    auc_max=pd.NamedAgg(column="auc", aggfunc="max"),
).reset_index()
aggdf.head(10)

In [None]:
def get_l2fc(val, minv, maxv, comp, df):
    """Return log2 fold-changes of three AUC values relative to the wild type.

    Parameters
    ----------
    val, minv, maxv : float
        Mean, minimum and maximum AUC of the genotype of interest.
    comp : str
        Compound (condition) in which the values were measured.
    df : pandas.DataFrame
        Table with "compound", "genotype" and "auc" columns; the wild type is
        the row whose genotype is "BY".

    Returns
    -------
    tuple of float
        log2(val / wt), log2(minv / wt), log2(maxv / wt).

    Raises
    ------
    ValueError
        If `df` does not contain exactly one wild-type row for `comp`.
        (A sentinel string used to be returned here; the caller unpacks the
        result with zip, which silently spread its characters over the
        output columns.)
    """
    # Retrieve corresponding WT value for the condition
    wt = df.loc[(df.compound == comp) & (df.genotype == "BY"), "auc"].values

    # Make sure a single value was extracted
    if len(wt) != 1:
        raise ValueError(
            f"Expected exactly one wild-type (BY) AUC for compound {comp!r}, "
            f"found {len(wt)}"
        )

    wt_auc = wt[0]
    # Log2 fold-change for mean, min and max
    return np.log2(val / wt_auc), np.log2(minv / wt_auc), np.log2(maxv / wt_auc)

In [None]:
# For every row, compute the (mean, min, max) log2 fold-changes against the
# wild type of the same compound, then split the triplets into three columns
l2fc_triplets = aggdf.apply(
    lambda row: get_l2fc(row.auc, row.auc_min, row.auc_max, row.compound, aggdf),
    axis=1,
)
aggdf["L2FC"], aggdf["L2FC_min"], aggdf["L2FC_max"] = zip(*l2fc_triplets)
aggdf.head(10)

In [None]:
# Asymmetric error-bar lengths: distance from the mean L2FC down to the
# minimum and up to the maximum
aggdf = aggdf.assign(
    min_yerr=lambda d: d["L2FC"] - d["L2FC_min"],
    max_yerr=lambda d: d["L2FC_max"] - d["L2FC"],
)
aggdf.head(10)

## Visualize log2 fold changes

In [None]:
# One panel per compound, one coloured point per genotype
sns.catplot(data=aggdf, y="L2FC", col="compound", hue="genotype")

## Compare with DMS scores

In [None]:
# Load the DMS classification table; the first CSV column is used as the index
DMS_aa = pd.read_csv(DMS_data, index_col=0)
DMS_aa

In [None]:
# Pair each growth measurement with the DMS record of the same amino acid
# sequence and compound (default inner join: unmatched rows are dropped)
corrdf = pd.merge(aggdf, DMS_aa, on=["compound", "aa_seq"])
corrdf.head(10)

In [None]:
# Imports needed only for this figure, hoisted out of the plotting loop
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

sns.set_theme(
    rc={
        "font.family": "Arial",
        "font.size": 8,
        "legend.title_fontsize": 8,
        "legend.fontsize": 8,
        "axes.labelsize": 8,
        "axes.titlesize": 8,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        "xtick.major.pad": 2,
        "ytick.major.pad": 2,
        "xtick.bottom": True,
        "ytick.left": True,
        "xtick.major.size": 2,
        "ytick.major.size": 2,
    },
    style="ticks",
)

# Panel order and display names of the conditions
lcomp = ["none", "anidulafungin", "caspofungin", "micafungin"]
comp_dict = {
    "caspofungin": "Caspofungin",
    "micafungin": "Micafungin",
    "anidulafungin": "Anidulafungin",
    "none": "Control",
}
# Colour of each DMS class (values of the "sensres" column)
classes = ["resistant", "sensitive", "deleterious"]
class_palette = ["#C75DAB", "#F1F1F1", "#009B9E"]
class_cmap = dict(zip(classes, class_palette))

# Initialize list to save linear regression parameters for each condition
reglist = []

grid = sns.lmplot(
    corrdf,
    x="s",
    y="L2FC",
    col="compound",
    col_order=lcomp,
    markers="none",  # dots are redrawn with color = resistance class
    # note: i've tried coloring the markers using scatter_kws, doesn't work
    facet_kws={"despine": False},
    line_kws={"color": "lightgrey"},
    height=2,
    aspect=0.9,
)

for i, c in enumerate(lcomp):
    ax = grid.axes[0][i]
    graphdf = corrdf[corrdf.compound == c]
    ax.title.set_text(comp_dict[c])

    # Drawing error bars
    ax.errorbar(
        x=graphdf.s,
        y=graphdf.L2FC,
        xerr=[graphdf.min_s, graphdf.max_s],
        yerr=[graphdf.min_yerr, graphdf.max_yerr],
        fmt="none",
        ecolor="grey",  # only error bars, no dots
    )

    # Drawing dots, colored by their classification (resistant, WT-like, etc)
    ax.scatter(
        x=graphdf.s,
        y=graphdf.L2FC,
        c=graphdf.sensres.map(class_cmap),
        ec="grey",
        zorder=100,
    )

    # Calculate and display Spearman correlation coefficient.
    # The mathtext part is a raw string so that "\i" is not parsed as an
    # (invalid) Python escape sequence.
    sr, sp = stats.spearmanr(graphdf.s, graphdf.L2FC)
    ax.text(
        0.3,
        -0.5,
        rf"$\rho$ = {sr:.2f}" + "\n" + r"$\it{p}$-val = " + f"{sp:.1e}",
        ha="left",
    )

    # I am leaving some unused code below to compare linear regression with seaborn (statsmodels OLS) and sklearn
    # but ultimately I use the regression done by seaborn (statsmodels OLS)
    # then I fetch the slope and intercept from seaborn drawn lines and append them to a dataframe

    # Perform linear regression using sklearn package
    X = graphdf.s.values.reshape(-1, 1)
    y = graphdf.L2FC.values.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=6
    )  # hold out 20% of the dataset for testing, i.e. fit on the other 80%
    reg = LinearRegression().fit(X_train, y_train)

    # Extract slope and intercept from sklearn Linear Regression model
    # (comparison only, not used further in this cell)
    slopeSK = reg.coef_.flatten()[0]
    interceptSK = reg.intercept_[0]

    # Extract slope and intercept from seaborn lmplot (statsmodels OLS)
    # by fitting the coordinates of the first line drawn on the panel
    reg_line = ax.get_lines()[0]
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        x=reg_line.get_xdata(),
        y=reg_line.get_ydata(),
    )

    # Save slope and intercept for each condition in a list -> append to master list
    reglist.append([c, slope, intercept])

# Highlighting interesting cases
int_mut = ["V641W", "L642K"]
deviating_df = corrdf[
    (corrdf.compound == "micafungin") & (corrdf.genotype.isin(int_mut))
]
# Look the panel up by name instead of relying on its position in lcomp
mica_ax = grid.axes[0][lcomp.index("micafungin")]
mica_ax.scatter(
    x=deviating_df.s,
    y=deviating_df.L2FC,
    c=deviating_df.sensres.map(class_cmap),
    ec="k",
    lw=2,
    zorder=200,
)
# .. and label them
# Each label is taken from the row it annotates: the rows of deviating_df
# follow the order of corrdf, not the order of int_mut, so pairing them by
# position could swap the labels.
for row in deviating_df.itertuples():
    mica_ax.annotate(
        row.genotype,
        (row.s, row.L2FC),  # x, y
        xytext=(-4, 0),  # distance of text label from xy coords
        textcoords="offset fontsize",  # xytext coords given in fontsize
    )

grid.set_axis_labels("DMS selection coefficient", "Log2FC(AUC)")

# Convert list of list to dataframe
regdf = pd.DataFrame(reglist, columns=["compound", "slope", "intercept"])

plt.savefig(f"{graph_outpath}/validations.svg", format="svg", dpi=300)

In [None]:
# Slope and intercept of the regression line drawn for each compound
regdf

In [None]:
def get_estimate_DMS_score(v, r, df):
    """Invert the per-compound regression line to turn `v` into a DMS-scale value.

    `r` is a row (Series) carrying a "compound" entry and `df` holds one
    "slope"/"intercept" pair per compound. The value returned is
    (v - intercept) / slope for the compound of `r`; it is NaN when the
    compound has no entry in `df`.
    """
    # Attach the regression parameters of the row's compound (left join on "compound")
    row_frame = r.to_frame().T
    with_params = row_frame.merge(df, how="left", on="compound")

    # First (and normally only) match
    slope = with_params["slope"][0]
    intercept = with_params["intercept"][0]

    # Solve v = slope * x + intercept for x
    return (v - intercept) / slope

In [None]:
# Mutants for which we need to estimate the DMS score
mut_no_DMS = [x for x in aggdf.genotype.unique() if x not in corrdf.genotype.unique()]
print(mut_no_DMS)

In [None]:
# Estimate a DMS-scale score for every row by inverting the regression line
# of its compound
aggdf["s"] = aggdf.apply(
    lambda row: get_estimate_DMS_score(row.L2FC, row, regdf), axis=1
)

# Rows to export: mutants without a DMS score plus the two highlighted mutants.
# Selected in a single .loc call and copied explicitly, because columns are
# added below and writing into a slice of aggdf raises SettingWithCopyWarning.
rescued_df = aggdf.loc[
    aggdf.genotype.isin(mut_no_DMS + ["V641W", "L642K"]),
    ["compound", "genotype", "aa_seq", "s"],
].copy()
rescued_df["Nham_aa"] = 1

# The genotype label is parsed by position: first character, characters 2-4
# and last character are stored as wt_aa, aa_pos and alt_aa respectively
rescued_df["aa_pos"] = rescued_df["genotype"].str[1:4]
rescued_df["alt_aa"] = rescued_df["genotype"].str[-1]
rescued_df["wt_aa"] = rescued_df["genotype"].str[0]

rescued_df[["compound", "aa_seq", "Nham_aa", "aa_pos", "alt_aa", "wt_aa", "s"]].to_csv(
    f"{df_outpath}/validation_DMS_missing_estimates.csv", index=False
)
rescued_df