In [None]:
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import ks_2samp
from statsmodels.distributions.empirical_distribution import ECDF
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()
from statsmodels.stats.proportion import proportion_confint
from matplotlib.colors import LogNorm

In [None]:
# Run configuration: these names are interpolated into the input/output paths used by the cells below.
dich_type = "geomean"  # selects the "<dataset>/intermediate/<dich_type>/" and "<dataset>/results/<dich_type>/" sub-folders
dataset = "Yu_First_wave_endocrine_cells"  # dataset folder name and stem of the top-level "<dataset>.csv"
checkpoint = "1000"  # NOTE(review): not referenced in any visible cell — confirm it is still needed

In [None]:
# Table read from "<dataset>.csv", indexed by its gene_id column.
orig_csv_path = "{}.csv".format(dataset)
orig_df = pd.read_csv(orig_csv_path, index_col="gene_id")
# Disabled alternative: read the dichotomised table under the same name instead.
# orig_df = pd.read_csv("{}/intermediate/{}/dichotomised_genes.csv".format(dataset, dich_type), index_col="gene_id")

In [None]:
# Dichotomised gene table for the configured dataset / dichotomisation type, indexed by gene_id.
dichotomised_csv_path = "{}/intermediate/{}/dichotomised_genes.csv".format(dataset, dich_type)
dichotomised_df = pd.read_csv(dichotomised_csv_path, index_col="gene_id")

In [None]:
# Chromosome labels used to locate the per-chromosome files: 1-19 plus X and Y.
# chr_names = list(range(1,22)) + ["X", "Y"]
chr_names = [*range(1, 20), "X", "Y"]

ic_path_template = "{}/results/{}/stage1_chr{}_IC.csv"
chr_path_template = "{}/intermediate/chr{}_filtered.csv"

# One stage-1 IC table per chromosome, in the same order as chr_names.
ic_dfs = [pd.read_csv(ic_path_template.format(dataset, dich_type, chrom)) for chrom in chr_names]
# Filtered per-chromosome tables, keyed by chromosome label.
chrs = {chrom: pd.read_csv(chr_path_template.format(dataset, chrom)) for chrom in chr_names}
filtered_genes = pd.read_csv("{}/intermediate/{}/filtered_dichotomised_genes.csv".format(dataset, dich_type))

# Tag every IC table (in place) with the chromosome it was read from.
for chrom_table, name in zip(ic_dfs, chr_names):
    chrom_table.loc[:, "chromosome"] = name

# Stack all chromosomes into a single table with a fresh 0..n-1 index.
ic_df = pd.concat(ic_dfs).reset_index(drop=True)

In [None]:
# Preview only: the rows without missing values are displayed, but the result is
# not assigned, so ic_df itself still contains any NaN rows afterwards.
ic_df.dropna()

In [None]:
# IC values computed on shuffled data, read from the results folder of the configured run.
shuffled_ic_path = "{}/results/{}/shuffled_IC.csv".format(dataset, dich_type)
shuffled_ic_df = pd.read_csv(shuffled_ic_path)

In [None]:
# Display the shuffled IC table for visual inspection.
shuffled_ic_df

# KS-test

In [None]:
# Global seaborn styling (notebook context, "ticks" style, Helvetica font) for the plots that follow.
sns.set(context="notebook", style="ticks", font="Helvetica")

def get_norm(arr):
    """Build histogram keyword arguments that give every observation equal weight.

    Each of the ``len(arr)`` observations receives weight ``1 / len(arr)``, so
    the bar heights of a histogram drawn with these weights sum to 1.

    Parameters
    ----------
    arr : sized collection
        The observations that will be plotted; only its length is used.

    Returns
    -------
    dict
        ``{"weights": ndarray}`` suitable for passing as ``hist_kws``.
    """
    count = len(arr)
    uniform_weights = np.ones(count) / count
    return dict(weights=uniform_weights)

# Two-sample Kolmogorov-Smirnov comparison of the IC values from the original
# table against the shuffled table, once per stretch size. For every stretch
# the cell shows the KS result, a normalised histogram of both samples and
# their empirical CDFs, and records the KS result in ks_summaries.
ks_summaries = []
n_bins = 50
for stretch in [7, 14, 21]:
    display("Stretch size {}".format(stretch))

    # Both tables are filtered with the same inclusive n_genes threshold so the
    # two samples are selected by identical rules. Missing IC values are dropped
    # up front so the test, the histograms and the ECDFs all see the same data.
    min_genes = stretch / 7 * 6
    sequential = ic_df[(ic_df.stretch == stretch) & (ic_df.n_genes >= min_genes)].ic.dropna()
    shuffled = shuffled_ic_df[(shuffled_ic_df.stretch == stretch) & (shuffled_ic_df.n_genes >= min_genes)].ic.dropna()

    # Run the test once and reuse the result for display and for the summary.
    ks_result = ks_2samp(sequential, shuffled)
    display(ks_result)
    ks_summaries.append({
        "stretch_size": stretch,
        "ks": ks_result
    })

    # Bin edges are derived from the shuffled sample and shared by both histograms.
    bins = np.histogram(shuffled, bins=n_bins)[1]
    sns.distplot(sequential, kde=False, hist_kws=get_norm(sequential), label="Original", bins=bins)
    sns.distplot(shuffled, kde=False, hist_kws=get_norm(shuffled), label="Shuffled", bins=bins)
    plt.legend()
    # Optional annotation arrows (disabled):
#     plt.arrow(2.18,1.5,0,-1,head_width=0.02, fc='k', ec='k')
#     plt.arrow(2.23,1.5,0,-1,head_width=0.02, fc='k', ec='k')
#     plt.arrow(2.57,1.2,0,-1,head_width=0.05, fc='k', ec='k')
#     plt.arrow(2.63,1.2,0,-1,head_width=0.05, fc='k', ec='k')

    plt.show()

    # Empirical CDFs of both samples and their pointwise difference on a fixed grid.
    df_ecdf = ECDF(sequential)
    shuffled_df_ecdf = ECDF(shuffled)
    x = np.arange(0, 5, 0.01)
    # x/y are passed by keyword: positional data arguments are rejected by newer seaborn.
    sns.lineplot(x=x, y=df_ecdf(x), drawstyle='steps-post', label="Original ECDF")
    sns.lineplot(x=x, y=shuffled_df_ecdf(x), drawstyle='steps-post', label="Shuffled ECDF")
    sns.lineplot(x=x, y=shuffled_df_ecdf(x) - df_ecdf(x), drawstyle='steps-post', label="Shuffled - Original")
    plt.legend()
    plt.show()

For the figure: the stretch-size-14 histogram only, re-plotted with a larger font and labelled axes.

In [None]:
# Figure version of the stretch-14 comparison: same filter and KS test as the
# per-stretch loop, drawn with a larger font and explicit axis labels.
stretch = 14
n_bins = 50
sns.set(font_scale=1.5, style="ticks", font="Arial")
display("Stretch size {}".format(stretch))

# Missing IC values are dropped before testing so the KS test sees the same
# observations as the histograms.
min_genes = stretch / 7 * 6
sequential = ic_df[(ic_df.stretch == stretch) & (ic_df.n_genes >= min_genes)].ic.dropna()
shuffled = shuffled_ic_df[(shuffled_ic_df.stretch == stretch) & (shuffled_ic_df.n_genes >= min_genes)].ic.dropna()

# Run the test once and reuse the result.
ks_result = ks_2samp(sequential, shuffled)
display(ks_result)

# Keep exactly one entry per stretch size in ks_summaries: replace an existing
# entry for this stretch instead of appending another one. This keeps the cell
# idempotent on re-runs and avoids duplicate stretch columns in the exported CSV.
figure_summary = {
    "stretch_size": stretch,
    "ks": ks_result
}
for position, summary in enumerate(ks_summaries):
    if summary["stretch_size"] == stretch:
        ks_summaries[position] = figure_summary
        break
else:
    ks_summaries.append(figure_summary)

# Bin edges are derived from the shuffled sample and shared by both histograms.
bins = np.histogram(shuffled, bins=n_bins)[1]

sns.distplot(sequential, kde=False, hist_kws=get_norm(sequential), label="Original", bins=bins)
sns.distplot(shuffled, kde=False, hist_kws=get_norm(shuffled), label="Shuffled", bins=bins)
plt.legend()
plt.xlabel("IC")
plt.ylabel("Frequency")
# Optional annotation arrows (disabled):
#     plt.arrow(2.18,1.5,0,-1,head_width=0.02, fc='k', ec='k')
#     plt.arrow(2.23,1.5,0,-1,head_width=0.02, fc='k', ec='k')
#     plt.arrow(2.57,1.2,0,-1,head_width=0.05, fc='k', ec='k')
#     plt.arrow(2.63,1.2,0,-1,head_width=0.05, fc='k', ec='k')

plt.show()

In [None]:
# Export the KS p-values as a single-row CSV with one column per stretch size.
ks_pvalue_rows = [
    {"stretch": summary["stretch_size"], "pvalue": summary["ks"][1]}  # element 1 of the KS result is the p-value
    for summary in ks_summaries
]
ks_pvalues_wide = pd.DataFrame(ks_pvalue_rows).set_index("stretch").T
# index=False drops the "pvalue" row label; the header row holds the stretch sizes.
ks_pvalues_wide.to_csv("{}/chr_ks.csv".format(dataset), index=False)

# Permutation tests (genes shuffled)

In [None]:
def get_statistics(df):
    """Summarise the ``ic`` column of ``df`` with a fixed set of statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``ic`` column.

    Returns
    -------
    pandas.Series
        Indexed, in this order, by ``median``, ``percentile_90``,
        ``percentile_10``, ``percentile_97.5``, ``percentile_02.5`` and
        ``quantile_ratio`` (90th percentile divided by 10th percentile).
    """
    ic_values = df.ic
    lower_decile = ic_values.quantile(0.1)
    upper_decile = ic_values.quantile(0.9)

    summary = {}
    summary["median"] = ic_values.median()
    summary["percentile_90"] = upper_decile
    summary["percentile_10"] = lower_decile
    summary["percentile_97.5"] = ic_values.quantile(0.975)
    summary["percentile_02.5"] = ic_values.quantile(0.025)
    summary["quantile_ratio"] = upper_decile / lower_decile
    # Skewness is deliberately left out of the summary:
    # summary["skew"] = ic_values.skew()
    return pd.Series(summary)

In [None]:
# Permutation test per stretch size: the summary statistics of the original IC
# values are compared with the distribution of the same statistics computed on
# every shuffle iteration. Each stretch produces a p-value table, diagnostic
# plots and a CSV export.
sns.set(context="notebook", style="ticks", font="Helvetica")


def _plot_statistic_against_shuffled(statistic, shuffled_stats, orig_stats, median_stats, rug_height, ax=None):
    """Histogram of one shuffled statistic with rug marks for the original and median-shuffled values."""
    sns.distplot(shuffled_stats[statistic], ax=ax, kde=False, rug=False, label="Shuffled")
    sns.distplot([orig_stats[statistic]], ax=ax, kde=False, hist=False, rug=True, rug_kws={"height": rug_height}, label="Original")
    sns.distplot([median_stats[statistic]], ax=ax, kde=False, hist=False, rug=True, rug_kws={"height": rug_height}, label="Median Shuffled")


# Statistics that additionally get their own full-size plot, with the x-axis label to use.
single_plots = [
    ("quantile_ratio", "Quantile Ratio"),
    ("percentile_10", "10th Percentile"),
    ("percentile_90", "90th Percentile"),
]
# (bar label, row key in the p-value table) for the summary bar chart, in display order.
bar_metrics = [
    ("Quantile Ratio", "quantile_ratio"),
    ("90%", "percentile_90"),
    ("97.5%", "percentile_97.5"),
    ("2.5%", "percentile_02.5"),
    ("10%", "percentile_10"),
]
# (column of the p-value table, legend label) drawn for every bar metric.
bar_sources = [("orig_value", "Original"), ("shuffled_value", "Median Shuffled")]

permutation_summaries = []
for stretch in [7, 14, 21]:
    display("Stretch " + str(stretch))
    min_genes = stretch / 7 * 6
    orig = ic_df[(ic_df.stretch == stretch) & (ic_df.n_genes >= min_genes)]
    shuffled = shuffled_ic_df[(shuffled_ic_df.stretch == stretch) & (shuffled_ic_df.n_genes >= min_genes)]

    orig_statistics = get_statistics(orig)
    # One row of statistics per shuffle iteration.
    shuffled_statistics = shuffled.groupby("iteration").progress_apply(get_statistics)
    # The reference value is the median over iterations of each statistic;
    # get_statistics(shuffled) on the pooled table would NOT give that.
    total_shuffled_statistic = shuffled_statistics.median()

    # Disabled one-sided variant:
#     lower_count = (shuffled_statistics <= orig_statistics).sum()
#     upper_count = (shuffled_statistics >= orig_statistics).sum()
#     lower_pvalue = (lower_count + 1) / (shuffled_statistics.shape[0] + 1)
#     upper_pvalue = (upper_count + 1) / (shuffled_statistics.shape[0] + 1)

    # Two-sided p-value: share of iterations whose absolute deviation from the
    # shuffle mean is at least as large as the original's, with +1 added to
    # both the count and the number of iterations.
    shuf_mean = shuffled_statistics.mean(axis=0)
    orig_diff = (orig_statistics - shuf_mean).abs()
    shuf_diff = (shuffled_statistics - shuf_mean).abs()
    n_iterations = len(shuffled_statistics)
    at_least_as_extreme = (shuf_diff >= orig_diff).sum(axis=0)
    pvalue = (at_least_as_extreme + 1) / (n_iterations + 1)
    print("shuf_mean", shuf_mean)
    print("OrigDiff", orig_diff)
    print("shuf_diff", shuf_diff)

    pvalues = pd.DataFrame({
        "orig_value": orig_statistics,
        "shuffled_value": total_shuffled_statistic,
        "pvalue": pvalue,
    })
    # Disabled companion of the one-sided variant above:
#     pvalues["significance"] = pvalues.apply(lambda x: "LOWER" if x.lower_pvalue <= 0.025 else ("HIGHER" if x.upper_pvalue <= 0.025 else "-----"), axis=1)
    permutation_summaries.append(pvalues)
    display(pvalues)

    # Overview grid: one panel per statistic (panels beyond the number of statistics stay empty).
    _, axs = plt.subplots(3, 3, figsize=(15, 12))
    for ax, statistic in zip(np.array(axs).ravel(), orig_statistics.index):
        _plot_statistic_against_shuffled(statistic, shuffled_statistics, orig_statistics, total_shuffled_statistic, 0.5, ax=ax)
        ax.legend()
    plt.show()

    # Full-size plots for the selected statistics.
    for statistic, axis_label in single_plots:
        _plot_statistic_against_shuffled(statistic, shuffled_statistics, orig_statistics, total_shuffled_statistic, 0.95)
        plt.legend()
        plt.xlabel(axis_label)
        plt.show()

    # Long-format table for the bar chart: Original and Median Shuffled value per metric.
    df = pd.DataFrame(
        [
            [bar_label, pvalues.loc[row_key, value_column], source_label]
            for bar_label, row_key in bar_metrics
            for value_column, source_label in bar_sources
        ],
        columns=["metric", "value", "Distribution"],
    )

    sns.catplot(data=df, x="metric", y="value", hue="Distribution", kind="bar", sharey=False)
    plt.xlabel("")
    plt.ylabel("")
    plt.show()

#     if stretch == 14:
    # Name the index only after display so the exported CSV gets a "metric" header column.
    pvalues.index.name = "metric"
    pvalues.to_csv("{}/chr_stat_test_pvalues_{}.csv".format(dataset, stretch))
    