In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
sns.set(context="notebook", style="ticks", font="Helvetica")
import numpy as np

In [None]:
datasets = sorted([
    "somatosensory_converted_into_tpm",
    "Dopaminergic_TPM_clean",
    "kakadarov_tpm",
    "Cheng_ES_TPM",
    "Gaublomme_GSE75109_TPM_clean",
    "Gaublomme_GSE75110_TPM_clean",
    "Gaublomme_GSE75111_TPM_clean",
    "Rbp4_positive_cells",
    "Alveolar_cells_Type_I_Merged_Batches",
    "Alveolar_cells_Type_II_Merged_Batches",
#     "Alveolar_cells_both_types",
    "klein",
    "hepat_TPM_yang_clean",
    "Yu_First_wave_endocrine_cells",
])

In [None]:
# Maps each dataset identifier to the short cell-type label used in figures
# and in the summary table. The insertion order matters: the summary table
# index is built from `cell_type_names.values()`.
cell_type_names = {
    "somatosensory_converted_into_tpm": "Somatosensory N",
    "Dopaminergic_TPM_clean": "Dopaminergic N",
    "kakadarov_tpm": "CD8+ T cell",
    "Cheng_ES_TPM": "Isolated ESC",
    "Gaublomme_GSE75109_TPM_clean": "Th17 A",
    "Gaublomme_GSE75110_TPM_clean": "Th17 B",
    "Gaublomme_GSE75111_TPM_clean": "Th17 C",
    "Rbp4_positive_cells": "Corticostriatal N",
    "Alveolar_cells_Type_I_Merged_Batches": "Lung ACI",
    "Alveolar_cells_Type_II_Merged_Batches": "Lung ACII",
    # "Alveolar_cells_both_types" is intentionally left out (no label defined).
    "klein": "Cultured ESC",
    "hepat_TPM_yang_clean": "Liver HB/HC",
    "Yu_First_wave_endocrine_cells": "Pancreatic EC",
}

In [None]:
def load_pvalues(stretch):
    """Load the p-value table of every dataset for one stretch label.

    Reads ``<dataset>/chr_stat_test_pvalues_<stretch>.csv`` for each entry of
    the module-level ``datasets`` list, tags the rows with a ``dataset``
    column holding the dataset identifier, and returns all tables
    concatenated (the per-file row index is kept as-is).
    """
    tagged = []
    for name in datasets:
        table = pd.read_csv("{}/chr_stat_test_pvalues_{}.csv".format(name, stretch))
        table.loc[:, "dataset"] = name
        tagged.append(table)
    return pd.concat(tagged)

In [None]:
# One concatenated p-value table per stretch label, keyed by that label.
dfs = {stretch: load_pvalues(stretch) for stretch in ("7", "14", "21")}

In [None]:
# The plots below work on the table loaded for stretch label "14".
df = dfs["14"]

In [None]:
# Swap dataset identifiers for their display labels. `replace` with a flat
# dict matches values in every column, not only `dataset`, and returns a new
# frame, so `dfs["14"]` itself keeps the raw identifiers.
df = df.replace(cell_type_names)

In [None]:
def transform(df):
    """Reshape a wide original/shuffled table into long form.

    Takes a frame with ``metric``, ``orig_value`` and ``shuffled_value``
    columns and returns a frame with ``metric``, ``value`` and
    ``Distribution`` columns: first all rows labelled "Original" (values from
    ``orig_value``), then all rows labelled "Median Shuffled" (values from
    ``shuffled_value``). The input row index is kept, so every index label
    appears twice in the result. The input frame is not modified.
    """
    labelled_sources = (
        ("orig_value", "Original"),
        ("shuffled_value", "Median Shuffled"),
    )
    parts = [
        df[["metric", source]]
        .rename(columns={source: "value"})
        .assign(Distribution=label)
        for source, label in labelled_sources
    ]
    return pd.concat(parts)

# Long-form (Original vs. Median Shuffled) table, built per dataset label.
t_df = (
    df.groupby("dataset")
    .apply(transform)
    .reset_index()
)

In [None]:
# Columns consumed by the volcano plots: effect size (original minus
# shuffled), significance on a -log2 scale, and the hue/style label.
df = df.assign(**{
    "Difference": df.orig_value - df.shuffled_value,
    "-log2 p-value": -np.log2(df.pvalue),
    "Cell Type": df.dataset,
})
# Threshold drawn as a horizontal line in the volcano plots: p = 0.025
# expressed on the same -log2 scale.
cutoff = -np.log2(0.025)
cutoff

In [None]:
# Seven "Paired" colours, each repeated twice in a row and then trimmed to 13
# entries, so consecutive cell types share a colour (the volcano plot tells
# them apart with alternating marker shapes).
current_palette = sns.color_palette("Paired", n_colors=7)
new_palette = [colour for colour in current_palette for _repeat in range(2)][:13]

def plot_volcano(_df, title, ax, legend=False):
    """Draw a volcano-style scatter of effect size against significance.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must provide the "Difference", "-log2 p-value" and "Cell Type"
        columns.
    title : str
        Title set on ``ax``.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    legend : bool or str, default False
        Forwarded to ``sns.scatterplot``; when truthy the legend is anchored
        at the upper right corner of the axes.

    Notes
    -----
    Reads the module-level ``cutoff`` (horizontal threshold line) and
    ``new_palette`` (hue colours), and resets the global seaborn theme.
    """
    sns.set(font_scale=1.5, style="ticks", font="Arial")
    filled_markers = ['o', 'X'] * 7
    sns.scatterplot(
        data=_df,
        x="Difference",
        y="-log2 p-value",
        hue="Cell Type",
        style="Cell Type",
        markers=filled_markers,
        legend=legend,
        s=75,
        alpha=0.9,
        palette=new_palette,
        ax=ax,
    )
    # Symmetric x-range around zero sized by the largest absolute difference.
    # The previous max(max, min) ignored the negative side, clipping points
    # when it was the larger one and inverting the axis when every
    # difference was negative.
    xlim = _df.Difference.abs().max() * 1.1
    # Significance threshold, drawn dashed as the former `dashes=True`
    # request intended. Plain matplotlib is used because positional x/y in
    # sns.lineplot is rejected by newer seaborn releases.
    ax.axhline(cutoff, linestyle="--", color="grey", alpha=0.5)
    # Full-height marker at zero difference; replaces the deprecated
    # sns.distplot rug with height 1.0.
    ax.axvline(0, linewidth=2, alpha=0.5, color="grey")
    # Leave matplotlib's automatic limits in place when there is no usable
    # range (empty input, NaN, or all differences equal to zero).
    if np.isfinite(xlim) and xlim > 0:
        ax.set_xlim([-xlim, xlim])
    if legend:
        ax.legend(bbox_to_anchor=(1.0, 1.0))
    ax.set_title(title)

_, axs = plt.subplots(nrows=3, ncols=1, figsize=(4, 7))
plt.subplots_adjust(hspace=1.1)
# One stacked panel per metric; only the top panel carries the legend.
volcano_panels = [
    ("percentile_10", "IC (1st Decile)", "full"),
    ("percentile_90", "IC (9th Decile)", False),
    ("quantile_ratio", "IC (Quantile Ratio)", False),
]
for panel_ax, (panel_metric, panel_title, panel_legend) in zip(axs, volcano_panels):
    plot_volcano(df.loc[df.metric == panel_metric], panel_title, panel_ax, legend=panel_legend)
plt.show()


In [None]:
# Inspect the 9th-decile rows with a fresh positional index.
is_percentile_90 = df.metric == "percentile_90"
df.loc[is_percentile_90].reset_index()

In [None]:
def draw_pvalues(ax, x, dataset, metric):
    """Write a p-value above a bar group.

    Looks up the rows of the module-level ``df`` matching ``metric`` and
    ``dataset``, and places the first matching ``pvalue`` (formatted in
    scientific notation with one decimal) on ``ax`` at horizontal position
    ``x``, 0.05 above the larger of ``orig_value`` and ``shuffled_value``.
    """
    selected = df.loc[(df.metric == metric) & (df.dataset == dataset)]
    tallest = selected[["orig_value", "shuffled_value"]].max().max()
    label = "{:.1E}".format(selected["pvalue"].values[0])
    ax.text(x, tallest + 0.05, label, horizontalalignment='center', fontsize=20)

In [None]:
sns.set(font_scale=1.8, style="ticks", font="Arial")
_df = df.loc[df.metric == "percentile_10"]
_df = _df.reset_index()
# Hand-picked row positions after the reset.
# NOTE(review): with the sorted `datasets` order and one percentile_10 row per
# dataset these would be klein, Cheng_ES_TPM and somatosensory - confirm.
_df = _df.loc[[11,2,12]]
_datasets = _df.dataset
_df = _df.groupby("dataset").apply(transform).reset_index()
# Pin the bar order to `_datasets`: the p-value labels below are placed by
# position in `_datasets`, while the grouped frame is ordered by group key.
# Without `order=` the two only line up when the selected labels happen to be
# in sorted order already.
g = sns.catplot(
    data=_df,
    x="dataset",
    y="value",
    hue="Distribution",
    kind="bar",
    order=list(_datasets),
    sharey=False,
    aspect=1.1,
)
plt.xticks(rotation = 30, ha="right")
plt.ylabel("1st Decile")
plt.xlabel("Cell Type")
for x, name in enumerate(_datasets):
    draw_pvalues(g.ax, x, name, "percentile_10")

In [None]:
# One `chr_ks.csv` table per dataset directory, in `datasets` order.
ks_paths = ["{}/chr_ks.csv".format(name) for name in datasets]
ks_dfs = [pd.read_csv(path) for path in ks_paths]

In [None]:
# Stack the per-dataset KS tables, label the rows with the dataset
# identifiers (this requires exactly one row per dataset in total), then
# switch the row labels to the display names.
ks = (
    pd.concat(ks_dfs)
    .set_axis(datasets, axis=0)
    .rename(index=cell_type_names)
)

In [None]:
# Display the KS table (one row per cell-type label).
ks

In [None]:
# t_df[(t_df.dataset == "Cheng_ES_TPM") & (t_df.metric == "median")]
# ks.to_csv("ks_summary.csv",float_format="%.1E")
def get_row(dataset, stretch):
    """Build one summary row for a cell type and stretch label.

    Parameters
    ----------
    dataset : str
        Cell-type display label (a value of ``cell_type_names``).
    stretch : str
        Stretch label; a key of the module-level ``dfs`` and a column of the
        module-level ``ks`` table.

    Returns
    -------
    pandas.Series
        Named ``(dataset, stretch)``. The first entry is the KS value under
        ``("ks", "pvalue")``; it is followed by ``orig_value``,
        ``shuffled_value`` and ``pvalue`` for each metric, in the order of
        the ``metrics`` list.

    Raises
    ------
    IndexError
        If a metric has no row for the requested dataset and stretch.
    """
    metrics = ["percentile_10", "percentile_90", "percentile_97.5", "percentile_02.5", "quantile_ratio", "median"]
    # Read the table that belongs to the requested stretch. Previously the
    # metric values always came from the global `df` (stretch "14"), so rows
    # for every stretch repeated the same numbers next to a stretch-specific
    # KS value.
    stretch_df = dfs[stretch]
    # `dfs` keeps the raw dataset identifiers; translate them to display
    # labels so they can be compared with `dataset`.
    cell_types = stretch_df.dataset.replace(cell_type_names)
    _df = stretch_df.loc[stretch_df.metric.isin(metrics) & (cell_types == dataset)]
    row_values = [ks.loc[dataset, stretch]]
    for metric in metrics:
        metric_rows = _df.loc[_df.metric == metric]
        values = metric_rows[["orig_value", "shuffled_value"]].values
        row_values += list(np.reshape(values, (-1,))) + [metric_rows["pvalue"].values[0]]
    index = pd.MultiIndex.from_product([metrics, ["orig_value", "shuffled_value", "pvalue"]])
    # Prepend the ("ks", "pvalue") entry so the labels line up with row_values.
    index = pd.MultiIndex.from_frame(pd.DataFrame(np.vstack([[("ks", "pvalue")],list(index.values)])))
    row = pd.Series(row_values, index=index, name=(dataset, stretch))
    return row



In [None]:
# One summary row per (cell-type label, stretch label) combination.
m_index = pd.MultiIndex.from_product([cell_type_names.values(), ["7", "14", "21"]])
summary_rows = [get_row(cell_type, stretch) for cell_type, stretch in m_index.values]
summary = pd.DataFrame(summary_rows, index=m_index)

In [None]:
# Display the assembled summary table.
summary

In [None]:
# Persist the summary table next to the notebook.
summary.to_csv("chr_stats_test_summary.csv")
