In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
# Hook tqdm into pandas so that `.progress_apply` is available; the permutation
# test further down calls it on a groupby object.
tqdm_notebook().pandas()
from statsmodels.distributions.empirical_distribution import ECDF
import numpy as np
from scipy.stats import ks_2samp
from matplotlib.colors import LogNorm

In [None]:
# Names of the dataset and of the family definition; every input path below is
# derived from these two strings.
dataset = "somatosensory_converted_into_tpm"
families = "clean_panther4march"

# Per-family IC tables for the original and the shuffled data.
# Later cells read the `family_size` and `ic` columns of both tables, and the
# `iteration` column of the shuffled one.
ic_path = "{}/{}/results/family_IC.csv".format(dataset, families)
shuffled_ic_path = "{}/{}/results/shuffled_family_IC.csv".format(dataset, families)
df = pd.read_csv(ic_path, index_col="family_id")
shuffled_df = pd.read_csv(shuffled_ic_path, index_col="family_id")

# The two source tables themselves: the dataset (indexed by gene_id) and the
# family definition file.
raw_df = pd.read_csv("{}.csv".format(dataset), index_col="gene_id")
family_df = pd.read_csv("{}.csv".format(families))

In [None]:
# Distribution of family sizes across all families (50 bins).
df["family_size"].hist(bins=50)

In [None]:
# How many families exist for each family size.
counts = df["family_size"].value_counts()

# Only sizes shared by at least 30 families are analysed below. The order is
# the one produced by value_counts (most frequent size first).
has_enough_families = counts.ge(30)
family_sizes = counts.loc[has_enough_families].index

In [None]:
def get_statistics(df):
    """Summarise the distribution of the ``ic`` column of ``df``.

    Defined once at module level instead of inside the loop: the overview plot
    in a later cell calls it as well, and it must exist even when
    ``family_sizes`` is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Any frame with an ``ic`` column.

    Returns
    -------
    pandas.Series
        Median, 90th/10th/97.5th/2.5th percentiles and the 90%/10% quantile
        ratio of ``ic``. pandas skips NaN values in ``median``/``quantile``.
    """
    ics = df.ic
    lower_decile = ics.quantile(0.1)
    upper_decile = ics.quantile(0.9)
    return pd.Series({
        "median": ics.median(),
        "percentile_90": upper_decile,
        "percentile_10": lower_decile,
        "percentile_97.5": ics.quantile(0.975),
        "percentile_02.5": ics.quantile(0.025),
        "quantile_ratio": upper_decile / lower_decile,
    })


# One entry per analysed family size: the size, the KS result and the
# permutation-test table.
summaries = []

for family_size in family_sizes:
    display("Family size: "+str(family_size))
    orig = df.loc[df.family_size == family_size]
    shuffled = shuffled_df.loc[shuffled_df.family_size == family_size]

    # Drop missing ICs once and use the same values for the KS test, the
    # histograms and the ECDFs. Previously only the histograms dropped them,
    # so the KS statistic and the ECDF curves were computed on different data
    # than the plots (and than the NaN-skipping quantiles used further down).
    orig_ic = orig.ic.dropna()
    shuffled_ic = shuffled.ic.dropna()

    # --- KS test: original vs. all shuffled ICs of this family size ---
    ks_result = ks_2samp(orig_ic, shuffled_ic)
    display(ks_result)
    sns.distplot(orig_ic, norm_hist=True, rug=False, kde=False, label="Original")
    sns.distplot(shuffled_ic, norm_hist=True, rug=False, kde=False, label="Shuffled")
    plt.legend()
    plt.show()

    # Empirical CDFs of both samples and their pointwise difference.
    orig_ecdf = ECDF(orig_ic)
    shuffled_ecdf = ECDF(shuffled_ic)
    ic_grid = np.arange(0, 5, 0.01)
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.lineplot(x=ic_grid, y=orig_ecdf(ic_grid), label="Original")
    sns.lineplot(x=ic_grid, y=shuffled_ecdf(ic_grid), label="Shuffled")
    sns.lineplot(x=ic_grid, y=shuffled_ecdf(ic_grid) - orig_ecdf(ic_grid), label="Difference")
    plt.show()

    # --- Permutation test on the summary statistics ---
    orig_statistics = get_statistics(orig)
    # One row of statistics per shuffle iteration.
    shuffled_statistics = shuffled.groupby("iteration").progress_apply(get_statistics)
    # The shuffled reference value is the median over the per-iteration
    # statistics, NOT the statistic of the pooled shuffled data.
    total_shuffled_statistic = shuffled_statistics.median()

    # Two-sided p-value: the share of iterations whose statistic lies at least
    # as far from the shuffled mean as the original one does. The +1 in both
    # numerator and denominator keeps the p-value strictly positive.
    shuf_mean = shuffled_statistics.mean(axis=0)
    orig_diff = np.abs(orig_statistics - shuf_mean)
    shuf_diff = shuffled_statistics.subtract(shuf_mean).abs()
    pvalue = ((shuf_diff >= orig_diff).sum(axis=0) + 1) / (shuffled_statistics.shape[0] + 1)
    pvalues = pd.DataFrame({
        "orig_value": orig_statistics,
        "shuffled_value": total_shuffled_statistic,
        "pvalue": pvalue,
    })

    display(pvalues)
    _, axs = plt.subplots(3, 3, figsize=(15, 12))
    axes = np.array(axs).flatten()
    for ax, statistic in zip(axes, orig_statistics.index):
        sns.distplot(shuffled_statistics[statistic], ax=ax, kde=False, rug=True, label="Shuffled")
        sns.distplot([orig_statistics[statistic]], ax=ax, kde=False, hist=False, rug=True, rug_kws={"height": 0.5}, label="Original")
        sns.distplot([total_shuffled_statistic[statistic]], ax=ax, kde=False, hist=False, rug=True, rug_kws={"height": 0.5}, label="Combined Shuffled")
        ax.legend()
    # The grid has more panels than there are statistics; hide the spare ones.
    for unused_ax in axes[len(orig_statistics.index):]:
        unused_ax.set_visible(False)
    plt.show()

    summaries.append({
        "family_size": family_size,
        "ks": ks_result,  # computed once above and reused here
        "pvalues": pvalues,
    })

In [None]:
# NOTE(review): appears to be a leftover from interactive development. It keeps
# the first summary in a global `x`; `build_row` below takes its own `x`
# parameter and does not read this global. Raises IndexError when `summaries`
# is empty.
x = summaries[0]
def build_row(x):
    """Flatten one entry of ``summaries`` into a single row of the summary table.

    Parameters
    ----------
    x : dict
        Has the keys ``"family_size"``, ``"ks"`` (an object exposing
        ``.statistic`` and ``.pvalue``) and ``"pvalues"`` (a DataFrame indexed
        by statistic name with the columns ``orig_value``, ``shuffled_value``
        and ``pvalue``).

    Returns
    -------
    pandas.Series
        Values under a two-level index: the group ("", "KS", "Quantile Ratio",
        "90%", "10%", "Median") and the field within that group.

    Notes
    -----
    The number of families is counted from the module-level ``df``.
    """
    size = x["family_size"]
    ks_result = x["ks"]
    stats = x["pvalues"]

    group_labels = ["", "", "KS", "KS"]
    field_labels = ["Size", "No. Families", "Statistic", "P-value"]
    values = [
        size,
        len(df.loc[df.family_size == size]),
        ks_result.statistic,
        ks_result.pvalue,
    ]

    # (column group shown in the table, row label inside the p-value frame)
    reported_statistics = (
        ("Quantile Ratio", "quantile_ratio"),
        ("90%", "percentile_90"),
        ("10%", "percentile_10"),
        ("Median", "median"),
    )
    for group, statistic in reported_statistics:
        triple = stats.loc[statistic, ["orig_value", "shuffled_value", "pvalue"]]
        values.extend(triple.values)
        group_labels.extend([group] * 3)
        field_labels.extend(["Original", "Shuffled", "P-value"])

    return pd.Series(values, index=[group_labels, field_labels])
    
# One row per analysed family size, in the order of `summaries`.
summary_table = pd.DataFrame([build_row(x) for x in summaries])

# Each row mixes counts with floating-point statistics, so the two count
# columns are cast back to integers here. The cast result is assigned through
# `[]`, which replaces the column. `summary_table.loc[:, col] = ...` writes
# into the existing float column on recent pandas versions and silently
# discards the integer dtype.
for count_column in [("", "Size"), ("", "No. Families")]:
    summary_table[count_column] = summary_table[count_column].astype(int)

summary_table.to_csv("{}/family_stat_test_pvalues.csv".format(dataset))
summary_table
# Optional LaTeX export of the same table:
# print(summary_table.to_latex(float_format="%.3f", multicolumn_format="c", column_format="|l|rr|rrr|rrr|rrr|rrr|", index=False))

In [None]:
sns.set(font_scale=1.2, style="ticks", font="Arial")

# Summary statistics of the shuffled ICs per family size, pooled over all
# shuffle iterations. `get_statistics` comes from the per-family-size analysis
# cell above.
shuffled_statistics = shuffled_df.groupby("family_size").apply(get_statistics)

plt.figure(figsize=(8,2.8))
sns.scatterplot(data=df, x="family_size", y="ic", label="Original")

# Median and the 2.5th/97.5th percentiles of the shuffled ICs as reference lines.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
for column, label in [
    ("median", "Shuffled (Median)"),
    ("percentile_97.5", "Shuffled (97.5th Percentile)"),
    ("percentile_02.5", "Shuffled (2.5th Percentile)"),
]:
    sns.lineplot(x=shuffled_statistics.index, y=shuffled_statistics[column], label=label)

plt.ylabel("IC")
plt.xlabel("Family Size")
try:
    # Current Matplotlib spelling; `basey` was removed in Matplotlib 3.5.
    plt.yscale("log", base=2)
except (TypeError, ValueError):
    # Matplotlib releases before 3.3 only accept the axis-specific keyword.
    plt.yscale("log", basey=2)
plt.xscale("log")
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
plt.show()