# Prepare gene name lookup dictionary

In [None]:
import pandas as pd

# Gene annotation table of the GSE92742 release (tab-separated, header in the first row).
gene_df = pd.read_csv(
    "/mnt/storage/cmap/2017/GSE92742_Broad_LINCS_gene_info.txt",
    sep="\t",
    header=0,
    index_col=None,
)

# Lookup from the stringified "pr_gene_id" to "pr_gene_symbol"; used further down to
# rename the row index of the expression matrices.
id2hgnc = dict(
    zip(
        map(str, gene_df["pr_gene_id"].tolist()),
        gene_df["pr_gene_symbol"].tolist(),
    )
)

# Read Header information to see for which genes we have overexpression experiments

In [None]:
# Signature metadata: one row per signature ("sig_id") with its perturbagen, type and cell line.
df = pd.read_csv(
    "/mnt/storage/cmap/2017/GSE92742_Broad_LINCS_sig_info.txt",
    sep="\t",
    header=0,
    index_col=None,
)
#df = df[df["qc_pass"] == 1]

# Split the signatures by perturbation type.
pert_type = df["pert_type"]
df_oe = df.loc[pert_type == "trt_oe"]
df_kd = df.loc[pert_type == "trt_sh"]
df_drug = df.loc[pert_type == "trt_cp"]

# Perturbagen names, one entry per signature (duplicates are kept).
overexpressed_genes = df_oe["pert_iname"].tolist()
knockdown_genes = df_kd["pert_iname"].tolist()

In [None]:
# Preview the first rows of the "trt_oe" signature table.
df_oe.head()

In [None]:
# Column names available in the full signature table.
df.columns

In [None]:
# Sorted list of the distinct cell lines that occur in the "trt_oe" signatures.
# Notes on some of the cell lines:
sorted(df_oe["cell_id"].unique())
# U937: Monocyte-like malignant
# NCIH716: Adenocarcinoma from colon
# SW480: Colorectal Cancer
# MCF10A: non-tumor epithelial 

In [None]:
# Column names of the "trt_sh" subset (same columns as `df`, since it is a row filter of it).
df_kd.columns

In [None]:
# Number of distinct perturbagen names among the "trt_oe" signatures.
len(df_oe["pert_iname"].unique())

In [None]:
# Number of distinct perturbagen names among the "trt_sh" signatures.
len(df_kd["pert_iname"].unique())

In [None]:
# Check whether "MGAT3" occurs among the values of the "cmap_name" column.
# NOTE(review): every other cell reads perturbagen names from "pert_iname" - confirm that
# "cmap_name" is actually a column of this signature table.
"MGAT3" in df["cmap_name"].values

In [None]:
# Manually chosen alternative genes, listed per gene / variant they stand in for:
# alternative for NR5A2: ZNF281
# for rs11739663:  EXOC3, CEP72
# for OSMR: RICTOR
# for rs11742570: PTGER4
# for rs4703855: ZNF366
# for rs564349 (ERGIC1 is Coregene): DUSP1, BNIP1
# for rs2538470: CUL1
# for rs75900472: JAK2
# for rs12422544: LRRK2
# for rs17085007: RPL21 (maybe)
# for rs28374715: NUSAP1
# for rs17694108: CEBPA, CEBPG
# for rs2823286: NRIP1
#
# NOTE(review): CEBPA and RPL21 appear in the notes above but are not part of the list
# below - confirm that leaving them out is intentional.
additional_hsps = ["ZNF281", "EXOC3", "CEP72", "RICTOR", "PTGER4", "ZNF366", "DUSP1", "BNIP1", "CUL1", "JAK2", "LRRK2", "NUSAP1", "CEBPG", "NRIP1"]


In [None]:
def get_sig_ids(df_oe, hgnc):
    """Return the "sig_id" entries of all rows whose "pert_iname" equals ``hgnc``.

    The result is a pandas Series that keeps the row labels of ``df_oe``.
    """
    is_perturbagen = df_oe["pert_iname"] == hgnc
    return df_oe.loc[is_perturbagen, "sig_id"]

def get_cell_types(df_oe, hgnc):
    """Return the "cell_id" entries of all rows whose "pert_iname" equals ``hgnc``.

    The result is a pandas Series that keeps the row labels of ``df_oe``, so it lines up
    element by element with the Series returned by ``get_sig_ids`` for the same arguments.
    """
    is_perturbagen = df_oe["pert_iname"] == hgnc
    return df_oe.loc[is_perturbagen, "cell_id"]

In [None]:
# Lookup from signature id to perturbagen name for the "trt_oe" signatures.
id2perturbagen = dict(zip(df_oe["sig_id"], df_oe["pert_iname"]))

# Extract all overexpression experiments from one cell type

In [None]:
from cmapPy.pandasGEXpress.parse import parse
from scipy.stats import ttest_ind, fisher_exact, mannwhitneyu, ks_2samp
from speos.postprocessing.postprocessor import PostProcessor
import numpy as np
from tqdm.notebook import tqdm

celltype = "PC3"

dfs = []
wide_dfs = []
perturbagen_pvals = []

# One expression frame per perturbagen whose signatures could be loaded.
columns = []
for perturbagen in tqdm(df_oe[df_oe.cell_id == celltype].pert_iname.unique()):
    # Signature ids of this perturbagen that were measured in the selected cell line.
    ids = [
        raw_id
        for raw_id, cell in zip(get_sig_ids(df_oe, perturbagen), get_cell_types(df_oe, perturbagen))
        if cell == celltype
    ]
    try:
        responses = parse("/mnt/storage/cmap/2017/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", cid=ids)
    except Exception:
        print("Could not load response for perturbagen {}".format(perturbagen))
        # Skip this perturbagen. Falling through would append the `responses` object of the
        # previous iteration a second time (or raise NameError on the first iteration).
        continue

    # Replace the gene ids in the row index by gene symbols.
    columns.append(responses.data_df.rename(index=id2hgnc))

# check that indices are in same order:
lead_index = columns[0].index

for column in columns:
    assert lead_index.equals(column.index), "row index differs between perturbagens"

# One column per signature, renamed from signature id to perturbagen name.
new_oe_df = pd.concat(columns, axis=1)
new_oe_df = new_oe_df.rename(axis=1, mapper=id2perturbagen)

new_oe_df.to_csv("/mnt/storage/cmap/2017/oe_df_{}.tsv".format(celltype), sep="\t")


# Now Knockdown

In [None]:
from cmapPy.pandasGEXpress.parse import parse
from scipy.stats import ttest_ind, fisher_exact, mannwhitneyu, ks_2samp
from speos.postprocessing.postprocessor import PostProcessor
import numpy as np
from tqdm.notebook import tqdm

# Lookup from signature id to perturbagen name, now for the "trt_sh" signatures.
id2perturbagen = {exp_id: perturbagen for exp_id, perturbagen in zip(df_kd["sig_id"], df_kd["pert_iname"])}

celltype = "HEK293T"

dfs = []
wide_dfs = []
perturbagen_pvals = []

# One expression frame per perturbagen whose signatures could be loaded.
columns = []
for perturbagen in tqdm(df_kd[df_kd.cell_id == celltype].pert_iname.unique()):
    # Signature ids of this perturbagen that were measured in the selected cell line.
    ids = [
        raw_id
        for raw_id, cell in zip(get_sig_ids(df_kd, perturbagen), get_cell_types(df_kd, perturbagen))
        if cell == celltype
    ]
    try:
        responses = parse("/mnt/storage/cmap/2017/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", cid=ids)
    except Exception:
        print("Could not load response for perturbagen {}".format(perturbagen))
        # Skip this perturbagen. Falling through would append the `responses` object of the
        # previous iteration a second time (or raise NameError on the first iteration).
        continue

    # Replace the gene ids in the row index by gene symbols.
    columns.append(responses.data_df.rename(index=id2hgnc))

# check that indices are in same order:
lead_index = columns[0].index

for column in columns:
    assert lead_index.equals(column.index), "row index differs between perturbagens"

# One column per signature, renamed from signature id to perturbagen name.
new_kd_df = pd.concat(columns, axis=1)
new_kd_df = new_kd_df.rename(axis=1, mapper=id2perturbagen)

new_kd_df.to_csv("/mnt/storage/cmap/2017/kd_df_{}.tsv".format(celltype), sep="\t")


# Read Coregenes

In [None]:
import json
from extensions.preprocessing import preprocess_labels

# Notebook-level trait key; "uc" is one of the keys accepted by get_coregenes below.
trait = "uc"

def get_coregenes(trait: str, background):
    """Assemble the gene sets of one trait.

    Args:
        trait: trait key, one of "uc", "cad", "scz", "ad", "ra" (anything else raises KeyError).
        background: iterable of gene symbols from which the peripheral set is derived.

    Returns:
        A tuple ``(allcore, other_coregenes, hsps, noncore)``:
        allcore is the set of candidates scored 11 plus the genes returned by
        ``preprocess_labels``, restricted to symbols present in ``id2hgnc``;
        other_coregenes is the list of candidates with any other score;
        hsps is the list read from the first column of ``../hsps/<trait>.txt``;
        noncore is ``background`` without ``allcore`` and without ``other_coregenes``.
    """
    # Some traits use a different prefix in the label and result file names.
    file_prefix = {
        "uc": "uc",
        "cad": "cad_really",
        "scz": "scz",
        "ad": "alz",
        "ra": "ra",
    }[trait]

    mendelians = preprocess_labels("../extensions/{}_only_genes.tsv".format(file_prefix))

    hsp_table = pd.read_csv("../hsps/{}.txt".format(trait), header=None, index_col=None)
    hsps = hsp_table.iloc[:, 0].tolist()

    with open("/mnt/storage/speos/results/{}_film_nohetioouter_results.json".format(file_prefix), "r") as file:
        candidate2cs = json.load(file)[0]

    # Split the candidates on their score.
    # NOTE(review): 11 is presumably the maximum consensus score - confirm.
    coregenes = []
    other_coregenes = []
    for gene, score in candidate2cs.items():
        if score == 11:
            coregenes.append(gene)
        else:
            other_coregenes.append(gene)

    known_symbols = set(id2hgnc.values())
    allcore = (set(coregenes) | set(mendelians)) & known_symbols

    noncore = set(background) - allcore - set(other_coregenes)

    return allcore, other_coregenes, hsps, noncore

In [None]:
# Gene sets for trait "ra", using every gene symbol in id2hgnc as background.
allcore, other_coregenes, hsps,  noncore = get_coregenes("ra", id2hgnc.values())

In [None]:
# `coregenes` only exists as a local variable inside get_coregenes; the core set that is
# returned to the notebook is `allcore`.
print(len(allcore))
print(len(noncore))

In [None]:
# Thumbnail: box plot on top of a violin plot, core genes vs. peripheral genes.
# NOTE(review): this cell needs `dfs` as a DataFrame with "Group"/"Expression" columns as
# well as `plt`, `sns` and `cm`, all of which are only defined by cells further down.

# The group labels contain the group sizes ("Core Gene\n(n=...)"), so they are looked up
# in the data; hard-coded sizes would only match one particular gene set and silently
# select nothing for any other.
group_labels = dfs["Group"].unique().tolist()
core_label = next(label for label in group_labels if label.startswith("Core Gene"))
peripheral_label = next(label for label in group_labels if label.startswith("Peripheral"))

fig, ax = plt.subplots(figsize=(3.5*cm, 6*cm))

bp = ax.boxplot(x=[dfs["Expression"][dfs["Group"] == core_label], dfs["Expression"][dfs["Group"] == peripheral_label]],
              positions=[0,1], widths=[0.08, 0.08], showfliers=False, zorder=5, patch_artist=True)

sns.violinplot(dfs, x="Group", y="Expression", fill=False, palette={core_label: "#01016f", peripheral_label: "#5a5a5a"},
               linewidth=0.5, ax=ax, inner=None)


# Recolour the box plot elements.
for feature, color in zip(['boxes', "medians", "whiskers", "caps"], ["darkgray", "black", "darkgray", "darkgray"]):
    plt.setp(bp[feature], color=color)

# Significance annotation between the two groups.
ax.text(0.5, y=3, s="n.s.", ha="center")

plt.savefig("Perturbation_ns_thumbnail.svg", bbox_inches="tight")

In [None]:
from cmapPy.pandasGEXpress.parse import parse
from scipy.stats import ttest_ind, fisher_exact, mannwhitneyu, ks_2samp
from speos.postprocessing.postprocessor import PostProcessor
import numpy as np

# Compare core vs. peripheral target genes for single perturbagens in one cell line.
dfs = []
wide_dfs = []
perturbagen_pvals = []
for perturbagen_hsp in ["DAP"]:
    try:
        ids = []
        raw_ids = get_sig_ids(df_oe, perturbagen_hsp)
        cell_lines = get_cell_types(df_oe, perturbagen_hsp)
        for raw_id, cell in zip(raw_ids, cell_lines):
            if cell in ["HT29"]:
                ids.append(raw_id)
        print("Found {} signatures for perturbagen {}.".format(len(ids), perturbagen_hsp))
        responses = parse("/mnt/storage/cmap/2017/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", cid=ids)
    except Exception:
        print("Could not load response for perturbagen {}".format(perturbagen_hsp))
        # Skip this perturbagen: without a freshly loaded `responses` the code below would
        # either fail with NameError or silently reuse the data of another perturbagen.
        continue

    
    # Replace the gene ids in the row index by gene symbols.
    responses.data_df.rename(index=id2hgnc, inplace=True)

    mendelian_expression = []
    nonmendelian_expression = []

    genenames = []
    mendelian = []
    expression = []
    # row.item() requires exactly one value per row, i.e. exactly one loaded signature.
    for i, row in responses.data_df.iterrows():
        if row.name in allcore:
            genenames.append(row.name)
            mendelian_expression.append(row.item())
            expression.append(row.item())
            mendelian.append(True)
        elif row.name in noncore:
            genenames.append(row.name)
            nonmendelian_expression.append(row.item())
            expression.append(row.item())
            mendelian.append(False)

    global_mean = np.mean(mendelian_expression + nonmendelian_expression)

    # NOTE(review): the core-gene values are shifted by a constant +0.5 before testing and
    # plotting, so the p-value below and everything in `dfs` reflect this artificial offset
    # (`wide_dfs` keeps the unshifted values). Confirm this is only meant for the
    # illustrative thumbnail and never for reported results.
    mendelian_expression = np.asarray(mendelian_expression) +0.5
    nonmendelian_expression = np.asarray(nonmendelian_expression)


    print(perturbagen_hsp)
    pvals = []
    pvals.append(ttest_ind(mendelian_expression, nonmendelian_expression)[1])

    print("Found {} out of 1 cell lines significant".format(sum(np.asarray(pvals) < 0.05)))

    # Long format: one row per target gene, with the group label carrying the group size.
    dfs.append(pd.DataFrame(data={"Perturbagen": [perturbagen_hsp] * (mendelian_expression.shape[0] + nonmendelian_expression.shape[0]),
                            "Expression": mendelian_expression.squeeze().tolist() + nonmendelian_expression.squeeze().tolist(),
                            "Group": ["Core Gene\n(n={})".format(mendelian_expression.shape[0])] * mendelian_expression.shape[0] + ["Peripheral\n(n={})".format(nonmendelian_expression.shape[0])] * nonmendelian_expression.shape[0]}))
    # Wide format: one column per perturbagen, indexed by target gene.
    wide_dfs.append(pd.DataFrame(data={perturbagen_hsp: expression},
                            index = genenames)
    )
    perturbagen_pvals.append(pvals)
wide_dfs = pd.concat(wide_dfs, axis=1)
dfs = pd.concat(dfs)


In [None]:
# Mean of the core-gene values left over from the last perturbagen processed by the loop
# that fills `mendelian_expression` (these values include the +0.5 offset added there).
mendelian_expression.mean()

In [None]:
# Mean of the peripheral-gene values left over from the last perturbagen processed.
nonmendelian_expression.mean()

In [None]:
# Thumbnail: box plot on top of a violin plot, core genes vs. peripheral genes.
# NOTE(review): `plt`, `sns` and `cm` are only brought into scope by a cell further down.

# The group labels contain the group sizes ("Core Gene\n(n=...)"), so they are looked up
# in the data; hard-coded sizes would only match one particular gene set and silently
# select nothing for any other.
group_labels = dfs["Group"].unique().tolist()
core_label = next(label for label in group_labels if label.startswith("Core Gene"))
peripheral_label = next(label for label in group_labels if label.startswith("Peripheral"))

fig, ax = plt.subplots(figsize=(3.5*cm, 6*cm))
bp = ax.boxplot(x=[dfs["Expression"][dfs["Group"] == core_label], dfs["Expression"][dfs["Group"] == peripheral_label]],
              positions=[0,1], widths=[0.08, 0.08], showfliers=False, zorder=5, patch_artist=True)
sns.violinplot(dfs, x="Group", y="Expression", fill=False, palette={core_label: "#01016f", peripheral_label: "#5a5a5a"},
               linewidth=0.5, ax=ax)


# Recolour the box plot elements.
for feature, color in zip(['boxes', "medians", "whiskers", "caps"], ["darkgray", "black", "darkgray", "darkgray"]):
    plt.setp(bp[feature], color=color)
# Significance annotation between the two groups.
ax.text(0.5, y=3, s="***", ha="center")
plt.savefig("Perturbation_sign_thumbnail.svg", bbox_inches="tight")

# using the large dfs

In [None]:
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection
from random import shuffle, seed

def _unify_replicate_columns(stat_df):
    """Collapse columns that share a name into one column per perturbagen.

    Keeps the smallest FDR and p-value, and the statistic / mean difference with the
    largest absolute value. Rows of the result are ordered FDR, pval, statistic, meandiff.
    """
    row_order = ["FDR", "pval", "statistic", "meandiff"]
    if len(stat_df.columns) == 0:
        # Nothing to aggregate; avoid running groupby/agg on an empty frame.
        return stat_df.loc[row_order, :]

    def strongest(values):
        return max(values.min(), values.max(), key=abs)

    return stat_df.transpose().groupby(stat_df.columns).agg({"FDR": "min", "pval": "min", "statistic": strongest, "meandiff": strongest}).transpose()


def _fraction_significant(stat_df):
    """Fraction of columns with FDR < 0.05, or NaN when there are no columns."""
    n_columns = len(stat_df.columns)
    if n_columns == 0:
        return np.nan
    return (stat_df.loc["FDR", :] < 0.05).sum() / n_columns


def get_differential_percentages(full_df, coregenes, hsps, noncore, use_min=True, randomize_core=False, random_seed=None, use_t_test=True):
    """Test, per perturbagen, whether core target genes respond differently from peripheral ones.

    Args:
        full_df: DataFrame with target genes as row index and one column per signature.
            Column names are perturbagen names; anything after the first "." is stripped
            (e.g. "GENE.1" is treated as "GENE").
        coregenes: collection of core gene names.
        hsps: collection of HSP gene names.
        noncore: set of peripheral gene names.
        use_min: if True, signatures of the same perturbagen are merged into one column
            (smallest FDR / p-value, largest-magnitude statistic / mean difference).
        randomize_core: if True, the target rows are split into a random mock core set of
            size ``len(coregenes)`` and the remaining genes instead of coregenes / noncore.
            The grouping of perturbagens (columns) is not randomised.
        random_seed: seed for the random split; the same seed yields the same split.
        use_t_test: use ``ttest_ind`` if True, otherwise ``mannwhitneyu``.

    Returns:
        ``(table, overall, from_core, from_hsp, from_peripheral)``. ``table`` has the boolean
        rows "Core Gene", "HSP", "Peripheral" followed by the statistics rows, one column
        per perturbagen. Each of the other four is ``(fraction, n)``: the fraction of
        perturbagens with FDR < 0.05 and the number of perturbagens, for all perturbagens
        and for those that are core genes, HSPs, or peripheral non-HSP genes. A fraction is
        NaN when its group is empty.
    """
    stat_rows = ["meandiff", "statistic", "pval", "FDR"]

    if randomize_core:
        if random_seed is not None:
            seed(random_seed)
        # Shuffle a de-duplicated *list* and slice it. Popping from a set would draw genes
        # in hash order, which for strings changes between interpreter sessions and would
        # make `random_seed` ineffective.
        # NOTE(review): the mock set has len(coregenes) genes, even if some core genes are
        # not rows of full_df - confirm this is the intended size.
        background_genes = list(dict.fromkeys(full_df.index.tolist()))
        shuffle(background_genes)
        mock_coregenes = background_genes[:len(coregenes)]
        remaining_genes = background_genes[len(coregenes):]
        coregene_target = full_df.loc[full_df.index.isin(mock_coregenes), :]
        noncore_target = full_df.loc[full_df.index.isin(remaining_genes), :]
    else:
        coregene_target = full_df.loc[full_df.index.isin(coregenes), :]
        noncore_target = full_df.loc[full_df.index.isin(noncore), :]

    test = ttest_ind if use_t_test else mannwhitneyu

    # One value per column (signature) in each of the four arrays.
    result = test(coregene_target, noncore_target)
    large_result = [
        (coregene_target.mean(axis=0) - noncore_target.mean(axis=0)).values,
        result[0],
        result[1],
        fdrcorrection(result[1])[1],
    ]

    result_df = pd.DataFrame(data=large_result, columns=coregene_target.columns, index=stat_rows)
    result_df.columns = [column.split(".")[0] for column in result_df.columns]
    if use_min:
        full_result_df_unified = _unify_replicate_columns(result_df)
    else:
        full_result_df_unified = result_df
    overall_percentage = _fraction_significant(full_result_df_unified)

    # Classify every signature by the gene set its perturbagen belongs to.
    perturbagen_names = [value.split(".")[0] for value in full_df.columns]
    peripheral_non_hsp = noncore.difference(hsps)

    def summarize(members):
        """Fraction significant and perturbagen count among signatures whose perturbagen is in `members`."""
        mask = np.asarray([name in members for name in perturbagen_names], dtype=bool)
        part_result_df = pd.DataFrame(data=[values[mask] for values in large_result], columns=coregene_target.columns[mask], index=stat_rows)
        part_result_df.columns = [column.split(".")[0] for column in part_result_df.columns]
        part_unified = _unify_replicate_columns(part_result_df)
        return _fraction_significant(part_unified), len(part_unified.columns)

    from_coregenes = summarize(coregenes)
    from_hsps = summarize(hsps)
    from_peripherals = summarize(peripheral_non_hsp)

    # Membership flags for the columns of the returned table.
    coregene_mask = np.asarray([value in coregenes for value in full_result_df_unified.columns]).astype(np.bool_)
    hsp_mask = np.asarray([value in hsps for value in full_result_df_unified.columns]).astype(np.bool_)
    noncore_mask = np.asarray([value in peripheral_non_hsp for value in full_result_df_unified.columns]).astype(np.bool_)

    mask_df = pd.DataFrame(data=[coregene_mask, hsp_mask, noncore_mask], columns=full_result_df_unified.columns, index=["Core Gene", "HSP", "Peripheral"])
    full_result_df_unified = pd.concat((mask_df, full_result_df_unified), axis=0)
    return full_result_df_unified, (overall_percentage, len(full_result_df_unified.columns)), from_coregenes, from_hsps, from_peripherals




In [None]:
# Load the knockdown matrix of one cell line (target genes as index, one column per signature).
# NOTE(review): the extraction cell visible above writes kd_df_HEK293T.tsv; kd_df_PC3.tsv
# has to exist from an earlier run with celltype = "PC3".
celltype = "PC3"

full_kd_df = pd.read_csv("/mnt/storage/cmap/2017/kd_df_{}.tsv".format(celltype), header=0, sep="\t", index_col=0)

In [None]:
# Write the per-perturbagen result table (perturbagens as rows) to disk.
# NOTE(review): the file name says "UC", but the last visible get_coregenes call that
# fills allcore / hsps / noncore uses trait "ra" - confirm which trait is loaded here.
get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True)[0].transpose().to_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_UC_{}.tsv".format(celltype), sep="\t")

In [None]:
# Same analysis with a randomly drawn mock core set (no seed given) for inspection.
get_differential_percentages(full_kd_df, allcore, hsps, noncore, randomize_core=True)

In [None]:
# Observed result for the real core gene set.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True)


# Null distribution: 100 repetitions with a random mock core set, seeded with 0..99.
# Only the fraction (first tuple element) of each perturbagen group is kept.
random_core = []
random_hsp = []
random_peri = []
for i in tqdm(range(100)):
    _, _, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(full_kd_df, allcore, hsps, noncore, randomize_core=True, random_seed=i)
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

In [None]:
# Size of the core gene set currently loaded.
len(allcore)

In [None]:
# Show the result table with perturbagens as rows.
# NOTE(review): this rebinds results_df to its transpose, so running the cell a second
# time flips the table back.
results_df = results_df.transpose()
results_df

In [None]:
import seaborn as sns
from speos.visualization.settings import *
import matplotlib.pyplot as plt
# Heatmap: fraction of perturbagens with a significant core-vs-peripheral difference,
# per perturbagen group (columns) for the real core set and the random controls (rows).
fig, ax= plt.subplots(figsize=(8*cm,5*cm))

# Number of core genes that are present as target rows.
num_target_core_genes = len(allcore.intersection(set(full_kd_df.index)))

kd_matrix_mean = pd.DataFrame(index=["HSP" + "\n(n=%s)" % hsp_result[1], "Peripheral\n" + "(n=%s)" % peri_result[1], "Core Gene\n" + "(n=%s)" % core_result[1]],
                         data={"Core Genes\n" + "n={}".format(num_target_core_genes): [hsp_result[0], peri_result[0],  core_result[0]],
                               "Random Genes\n" + "n={} ({}x)".format(num_target_core_genes, len(random_hsp)): [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)]})

ax = sns.heatmap(kd_matrix_mean.transpose(), vmin=0,  vmax=1, cmap="Purples", annot=True, fmt=".1%", ax=ax,
                 cbar_kws={'label': "Fraction Significant\nDifferential Perturbations",
                           "pad": 0.01})
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 6)
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)
ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Knockdown)", fontsize=7)
plt.tight_layout()
# The file name has two placeholders (trait and cell line); passing only `celltype`
# raised IndexError. `trait` is the notebook-level trait key.
plt.savefig("Perturbation_knockdown_{}_{}.svg".format(trait, celltype), bbox_inches="tight")

# getting Knockdown for every trait for every celltype

In [None]:

def full_knockdown(trait, celltype, background, restriction: set = set()):
    """Run the knockdown analysis for one trait (or a union of traits) in one cell line.

    Writes the t-test result table as TSV and saves a heatmap (based on the
    Mann-Whitney U test) comparing the real core gene set with 100 random controls.

    Args:
        trait: a trait key, or an iterable of trait keys whose core, other-core and HSP
            sets are united and whose peripheral sets are intersected.
        celltype: cell line name; selects the ``kd_df_<celltype>.tsv`` input file.
        background: gene symbols handed to ``get_coregenes`` as background.
        restriction: optional perturbagen column names; if non-empty, only those that are
            columns of the knockdown matrix are analysed and "_restricted" is appended to
            the output file names.
    """
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns

    cm = 1/2.54
    small_font = 6

    # Figure styling (font, tick label size, line and tick dimensions).
    style_settings = {
        'font.family': 'Helvetica',
        'xtick.labelsize': small_font,
        'ytick.labelsize': small_font,
        'axes.linewidth': 0.4,
        'ytick.major.size': 3,
        'ytick.major.width': 0.5,
        'ytick.minor.size': 2,
        'ytick.minor.width': 0.3,
        'xtick.major.size': 2,
        'xtick.major.width': 0.3,
        'xtick.minor.size': 1,
        'xtick.minor.width': 0.1,
    }
    for setting, value in style_settings.items():
        mpl.rcParams[setting] = value

    print ("Starting KD Analysis for {} {}".format(trait, celltype))

    # Collect the gene sets, either for a single trait or merged over several traits.
    if isinstance(trait, str):
        allcore, other_coregenes, hsps, noncore = get_coregenes(trait, background)
        traitstring = trait
    else:
        allcore, other_coregenes, hsps = set(), set(), set()
        noncore = set(background)
        for single_trait in trait:
            trait_core, trait_other, trait_hsps, trait_noncore = get_coregenes(single_trait, background)
            allcore |= set(trait_core)
            other_coregenes |= set(trait_other)
            hsps |= set(trait_hsps)
            # A gene stays peripheral only if it is peripheral for every trait.
            noncore = noncore.intersection(trait_noncore)
        traitstring = "_".join(trait)

    kd_path = "/mnt/storage/cmap/2017/kd_df_{}.tsv".format(celltype)
    full_kd_df = pd.read_csv(kd_path, header=0, sep="\t", index_col=0)

    if len(restriction) > 0:
        available = [name for name in restriction if name in full_kd_df.columns]
        full_kd_df = full_kd_df[available]
        typestring = celltype + "_restricted"
    else:
        typestring = celltype

    # Result table of the t-test variant goes to disk ...
    ttest_table = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True)[0]
    ttest_table.transpose().to_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}.tsv".format(traitstring, typestring), sep="\t")

    # ... while the figure uses the Mann-Whitney U variant.
    _, _, core_result, hsp_result, peri_result = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True, use_t_test=False)

    # Random controls: 100 mock core sets, seeded 0..99.
    random_core, random_hsp, random_peri = [], [], []
    for repetition in range(100):
        random_outcome = get_differential_percentages(full_kd_df, allcore, hsps, noncore, randomize_core=True, random_seed=repetition, use_t_test=False)
        random_core.append(random_outcome[2][0])
        random_hsp.append(random_outcome[3][0])
        random_peri.append(random_outcome[4][0])

    fig, ax = plt.subplots(figsize=(8*cm, 5*cm))

    num_target_core_genes = len(allcore.intersection(set(full_kd_df.index)))

    perturbagen_groups = [
        "HSP\n(n=%s)" % hsp_result[1],
        "Peripheral\n(n=%s)" % peri_result[1],
        "Core Gene\n(n=%s)" % core_result[1],
    ]
    observed_label = "Core Genes\nn={}".format(num_target_core_genes)
    random_label = "Random Genes\nn={} ({}x)".format(num_target_core_genes, len(random_hsp))
    kd_matrix_mean = pd.DataFrame(
        index=perturbagen_groups,
        data={
            observed_label: [hsp_result[0], peri_result[0], core_result[0]],
            random_label: [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)],
        },
    )

    colorbar_label = "Fraction Significant\nDifferential Perturbations"
    ax = sns.heatmap(
        kd_matrix_mean.transpose(),
        vmin=0,
        vmax=1,
        cmap="Purples",
        annot=True,
        fmt=".1%",
        ax=ax,
        cbar_kws={'label': colorbar_label, "pad": 0.01},
    )
    cbar = ax.collections[-1].colorbar
    cbar.ax.set_ylabel(colorbar_label, fontsize=5)
    ax.set_ylabel("Target Gene Set", fontsize=7)
    ax.set_xlabel("Perturbagen (Knockdown)", fontsize=7)
    plt.tight_layout()
    plt.savefig("Perturbation_knockdown_{}_{}.svg".format(traitstring, typestring), bbox_inches="tight")

In [None]:
# UC in PC3, restricted to the perturbagens in `all_perturbagens`.
# NOTE(review): `all_perturbagens` is only assigned in a cell further down the notebook.
full_knockdown("uc", "PC3", id2hgnc.values(), restriction=all_perturbagens)

In [None]:
import contextlib
import joblib
from tqdm import tqdm

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager that makes joblib report finished batches to ``tqdm_object``.

    While the context is active, joblib's batch-completion callback class is replaced by a
    subclass that advances the progress bar by the batch size. On exit the original class
    is restored and the progress bar is closed.
    """
    original_callback = joblib.parallel.BatchCompletionCallBack

    class _ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    joblib.parallel.BatchCompletionCallBack = _ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = original_callback
        tqdm_object.close()

In [None]:
from joblib import Parallel, delayed

# Run full_knockdown for every (trait, cell line) pair in parallel.
traits = ["uc", "ra", "cad", "ad", "scz"]
celltypes = ["PC3", "HT29", "HEK293T"]

combinations = []

# Note: after these loops the notebook-level names `trait` and `celltype` hold the last
# pair ("scz", "HEK293T").
for trait in traits:
    for celltype in celltypes:
        combinations.append((trait, celltype))

# Up to 15 worker processes, one task per combination, with a progress bar.
with tqdm_joblib(tqdm(desc="My calculation", total=len(combinations))) as progress_bar:
    Parallel(n_jobs=15)(delayed(full_knockdown)(trait, celltype, id2hgnc.values()) for (trait, celltype) in combinations)

# and once more, restricting the perturbagens to those that are also used with HEK293T

In [None]:
from joblib import Parallel, delayed

# Run full_knockdown for every (trait, cell line) pair, restricted to the perturbagens
# that are columns of the HEK293T knockdown matrix.
traits = ["uc", "ra", "cad", "ad", "scz"]
celltypes = ["PC3", "HT29", "HEK293T"]

# The restriction is the same for every combination, so the file is read only once
# instead of once per (trait, cell line) pair.
hek293t_perturbagens = pd.read_csv("/mnt/storage/cmap/2017/kd_df_HEK293T.tsv", header=0, sep="\t", index_col=0).columns.tolist()

combinations = []
restrictions = []

for trait in traits:
    for celltype in celltypes:
        combinations.append((trait, celltype))
        restrictions.append(hek293t_perturbagens)

# Up to 15 worker processes, one task per combination, with a progress bar.
with tqdm_joblib(tqdm(desc="My calculation", total=len(combinations))) as progress_bar:
    Parallel(n_jobs=15)(delayed(full_knockdown)(trait, celltype, id2hgnc.values(), restriction) for (trait, celltype), restriction in zip(combinations, restrictions))

In [None]:
# Joint analysis of UC, CAD and SCZ in HEK293T (trait given as a list, so the gene sets
# are merged and the outputs are named "uc_cad_scz").
full_knockdown(["uc", "cad", "scz"], "HEK293T", id2hgnc.values())

In [None]:
# Joint analysis of all five traits, once per cell line.
for celltype in celltypes:
    full_knockdown(["uc","ra", "cad","ad", "scz"], celltype, id2hgnc.values())

# Multi-factorial analysis of overlap Genes

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Core and peripheral gene sets of the three traits (other-core and HSP lists are discarded).
cad_core, _, _, cad_noncore = get_coregenes("cad", id2hgnc.values())
uc_core, _, _, uc_noncore = get_coregenes("uc", id2hgnc.values())
scz_core, _, _, scz_noncore = get_coregenes("scz", id2hgnc.values())


# Genes that are peripheral for all three traits. This rebinds the notebook-level
# `noncore` used by earlier cells.
noncore = cad_noncore.intersection(uc_noncore).intersection(scz_noncore)

celltype = "HEK293T"

# Knockdown matrix of the selected cell line (target genes as index).
full_kd_df = pd.read_csv("/mnt/storage/cmap/2017/kd_df_{}.tsv".format(celltype), header=0, sep="\t", index_col=0)



In [None]:
# Perturbagens with FDR < 0.05 in the UC result table of this cell line (the table
# written by full_knockdown); uc_sign ends up as a list of index labels.
uc_sign = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_uc_{}.tsv".format(celltype), sep="\t", index_col=0)
uc_sign = uc_sign.index[uc_sign.FDR < 0.05].tolist()
len(uc_sign)

In [None]:
# Knockdown column of the first significant UC perturbagen.
full_kd_df[uc_sign[0]]

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Group membership depends only on the target gene (row), not on the perturbagen, so it is
# determined once instead of re-iterating over all rows for every perturbagen.
row_groups = []
for target_gene in full_kd_df.index:
    if target_gene in uc_core:
        row_groups.append("UC Core")
    elif target_gene in cad_core or target_gene in scz_core:
        row_groups.append("Other Core")
    elif target_gene in noncore:
        row_groups.append("Peripheral")
    else:
        # Genes in none of the three groups are left out of the comparison.
        row_groups.append(None)

is_grouped = [group is not None for group in row_groups]
groups = [group for group in row_groups if group is not None]

# One Tukey HSD comparison of the three groups per significant UC perturbagen.
results = []
for gene in uc_sign:
    scores = full_kd_df.loc[is_grouped, gene].tolist()

    results.append(pairwise_tukeyhsd(endog=scores,
                              groups=groups,
                              alpha=0.05))

In [None]:
# Summary table of one example comparison (the ninth perturbagen in uc_sign).
results[8].summary()

In [None]:
# Collect, per perturbagen, the value in column 6 of rows 1 and 3 of the summary table.
# NOTE(review): presumably column 6 is the "reject" flag, row 1 the comparison
# "Other Core" vs "Peripheral" and row 3 "Peripheral" vs "UC Core" (row 0 being the
# header) - confirm against the summary layout, which depends on the group ordering.
other_vs_periph = []

core_vs_periph = []

for result in results:
    other_vs_periph.append(result.summary()[1][6].data)
    core_vs_periph.append(result.summary()[3][6].data)

    

In [None]:
# Sum of the collected core_vs_periph values divided by the number of perturbagens.
np.asarray(core_vs_periph).sum() / len(core_vs_periph)

In [None]:
# Sum of the collected other_vs_periph values divided by the number of perturbagens.
np.asarray(other_vs_periph).sum() / len(other_vs_periph)

# Two-Step approach using nested t-tests

In [None]:
from scipy.stats import ttest_ind

# Per perturbagen: p-value and mean difference of three comparisons against the shared
# peripheral set - "outer" (UC core), "inner" (CAD + SCZ core), "joint" (all three).
results_outer = {}
meandiff_outer = {}
results_inner = {}
meandiff_inner = {}
joint_results = {}
joint_meandiff = {}

triple_core = uc_core.union(cad_core).union(scz_core)
double_core = cad_core.union(scz_core)

# Perturbagens with FDR < 0.05 in the joint uc_cad_scz result table.
triple_perturbagens = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_uc_cad_scz_{}.tsv".format(celltype), sep="\t", index_col=0)
triple_perturbagens = triple_perturbagens.index[triple_perturbagens.FDR < 0.05].tolist()


#for gene in set(uc_sign).intersection(triple_perturbagens):
# Strip everything after the first "." from the column names (modifies full_kd_df in
# place). Several columns can then share one name, in which case row[gene] below yields
# several values per row.
full_kd_df.columns = [column.split(".")[0] for column in full_kd_df.columns]
#for gene in uc_sign:
for gene in triple_perturbagens:
    # Position of the replicate column selected in the first comparison; stays None when
    # the t-test returns a single p-value.
    index = None
    # first: UC core genes vs. peripherals
    core_scores = []
    periph_scores = []

    for i, row in full_kd_df.iterrows():
        if row.name in uc_core:
            core_scores.append(row[gene])
        elif row.name in noncore:
            periph_scores.append(row[gene])


    
    result = ttest_ind(core_scores, periph_scores)[1]

    # An array of p-values means one p-value per replicate column: keep the replicate
    # with the smallest p-value and reuse its position for the two comparisons below.
    if isinstance(result, np.ndarray):
        index = np.argmin(result)
        result = result[index]
        meandiff_outer[gene] = np.mean(np.asarray(periph_scores)[:, index]) - np.mean(np.asarray(core_scores)[:, index])
    
    else:
        # Mean differences in this cell are peripheral minus core.
        meandiff_outer[gene] = np.mean(periph_scores) - np.mean(core_scores)

    results_outer[gene] = result

    
    
    # then: CAD and SCZ core genes vs. the peripherals shared by all three traits
    core_scores = []
    periph_scores = []

    for i, row in full_kd_df.iterrows():
        if row.name in double_core:
            core_scores.append(row[gene])
        elif row.name in noncore:
            periph_scores.append(row[gene])
    
    if index is not None:
        results_inner[gene] =  ttest_ind(core_scores, periph_scores)[1][index]
        meandiff_inner[gene] =  np.mean(np.asarray(periph_scores)[:, index]) - np.mean(np.asarray(core_scores)[:, index])
    else:
        results_inner[gene] =  ttest_ind(core_scores, periph_scores)[1]
        meandiff_inner[gene] =  np.mean(periph_scores) - np.mean(core_scores)

    # finally: core genes of all three traits vs. the same peripherals
    core_scores = []
    periph_scores = []

    for i, row in full_kd_df.iterrows():
        if row.name in triple_core:
            core_scores.append(row[gene])
        elif row.name in noncore:
            periph_scores.append(row[gene])
    
    if index is not None:
        joint_results[gene] =  ttest_ind(core_scores, periph_scores)[1][index]
        joint_meandiff[gene] =  np.mean(np.asarray(periph_scores)[:, index]) - np.mean(np.asarray(core_scores)[:, index])
    else:
        joint_results[gene] =  ttest_ind(core_scores, periph_scores)[1]
        joint_meandiff[gene] =  np.mean(periph_scores) - np.mean(core_scores)

In [None]:
# Replace the joint values by the "meandiff" and "FDR" entries of the stored uc_cad_scz
# table, keyed by the perturbagens in uc_sign.
# NOTE(review): a cell that fills results_outer from triple_perturbagens and then looks up
# joint_meandiff[gene] for those keys can hit a KeyError for genes missing from uc_sign.
joint_effects = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_uc_cad_scz_{}.tsv".format(celltype), sep="\t", index_col=0)

joint_meandiff = {gene: joint_effects.loc[gene, "meandiff"] for gene in uc_sign}
joint_results = {gene: joint_effects.loc[gene, "FDR"] for gene in uc_sign}

In [None]:
# One line per perturbagen through its three mean differences:
# x=0 outer (UC), x=1 inner (CAD+SCZ), x=2 joint (UC+CAD+SCZ).
fig, ax = plt.subplots()
#colors = ["blue" if pval < 0.05 else "gray" for pval in results_outer.values()]
#ax.scatter(np.zeros_like(list(meandiff_outer.values())), np.abs(list(meandiff_outer.values())), color=colors)

#colors = ["orange" if pval < 0.05 else "gray" for pval in results_inner.values()]
#ax.scatter(np.ones_like(list(meandiff_inner.values())), meandiff_inner.values(),  color=colors)

#colors = ["purple" if pval < 0.05 else "gray" for pval in joint_results.values()]
#ax.scatter(np.ones_like(list(joint_meandiff.values())) * 2, joint_meandiff.values(),  color=colors)
lines = []
labels = []
for gene in results_outer.keys():
    line = ax.plot((0,1, 2), (meandiff_outer[gene], meandiff_inner[gene], joint_meandiff[gene]), zorder=-5, marker="o")
    lines.extend(line)
    labels.append(gene)

#ax.legend(lines, labels)
# Reference line at zero difference.
ax.axhline(y=0)

In [None]:
# Mean differences per perturbagen (x axis) for the three comparisons; markers are filled
# in the series colour where the corresponding value is below 0.05, light gray otherwise.
fig, ax = plt.subplots()
#colors = ["blue" if pval < 0.05 else "gray" for pval in results_outer.values()]
#ax.scatter(np.zeros_like(list(meandiff_outer.values())), np.abs(list(meandiff_outer.values())), color=colors)

#colors = ["orange" if pval < 0.05 else "gray" for pval in results_inner.values()]
#ax.scatter(np.ones_like(list(meandiff_inner.values())), meandiff_inner.values(),  color=colors)

#colors = ["purple" if pval < 0.05 else "gray" for pval in joint_results.values()]
#ax.scatter(np.ones_like(list(joint_meandiff.values())) * 2, joint_meandiff.values(),  color=colors)
lines = []
labels = []

# Connecting lines; all three series are plotted against positions derived from
# meandiff_outer, so the three dicts are assumed to share length and key order.
line = ax.plot(range(len(meandiff_outer.values())), meandiff_outer.values(), zorder=-5, marker="o", color="blue", markersize=4)
lines.extend(line)
line = ax.plot(range(len(meandiff_outer.values())), meandiff_inner.values(), zorder=-5, marker="o", color="orange", markersize=4)
lines.extend(line)
line = ax.plot(range(len(meandiff_outer.values())), joint_meandiff.values(), zorder=-5, marker="o", color="purple", markersize=4)
lines.extend(line)

color = ["blue" if value < 0.05 else "lightgray" for value in results_outer.values()]
ax.scatter(range(len(color)), meandiff_outer.values(), marker="o", color=color, edgecolor="blue", s=20, zorder=5)

color = ["orange" if value < 0.05 else "lightgray" for value in results_inner.values()]
ax.scatter(range(len(color)), meandiff_inner.values(), marker="o", color=color, edgecolor="orange", s=20, zorder=5)

color = ["purple" if value < 0.05 else "lightgray" for value in joint_results.values()]
ax.scatter(range(len(color)), joint_meandiff.values(), marker="o", color=color, edgecolor="purple", s=20, zorder=5)
# NOTE(review): the next two statements look like leftovers - they add the purple line to
# `lines` a second time and append the `gene` left over from an earlier loop to the
# otherwise unused `labels`.
lines.extend(line)
labels.append(gene)

ax.set_xticks(range(len(meandiff_outer.values())))
ax.set_xticklabels(meandiff_outer.keys(), rotation=90)

ax.legend(lines, ["UC", "CAD+SCZ", "UC+CAD+SCZ"])
ax.axhline(y=0, color="black", linestyle="--", zorder=-10)

In [None]:
from scipy.stats import fisher_exact

# Background population: every perturbagen listed in the joint UC+CAD+SCZ
# result table of the current cell type.
joint_table = pd.read_csv(
    "/mnt/storage/cmap/2017/differential_perturbation_knockdown_uc_cad_scz_{}.tsv".format(celltype),
    sep="\t",
    index_col=0,
)
all_perturbagens = set(joint_table.index.tolist())

# Perturbagens with FDR < 0.05 in results_inner.
double_sign = {key for key, value in results_inner.items() if value < 0.05}

uc_sign = set(uc_sign)

# 2x2 contingency table handed to Fisher's exact test:
#
#                     UC sign      not UC sign
# double sign          both        double_only
# not double sign     uc_only        neither
both = uc_sign & double_sign
double_only = double_sign - uc_sign
uc_only = uc_sign - double_sign
neither = all_perturbagens - uc_sign - double_sign

array = [
    [len(both), len(double_only)],
    [len(uc_only), len(neither)],
]

fisher_exact(array)

In [None]:
from scipy.stats import ttest_ind

triple_core = uc_core.union(cad_core).union(scz_core)
double_core = cad_core.union(scz_core)

# Perturbagens with FDR < 0.05 in the joint UC+CAD+SCZ table of this cell type.
triple_perturbagens = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_uc_cad_scz_{}.tsv".format(celltype), sep="\t", index_col=0)
triple_perturbagens = triple_perturbagens.index[triple_perturbagens.FDR < 0.05].tolist()

# Keep only the part of each column name before the first "." (presumably this
# strips the ".1", ".2" suffixes pandas appends to duplicated column names -
# TODO confirm). Note that this renames the columns of full_kd_df in place.
full_kd_df.columns = [column.split(".")[0] for column in full_kd_df.columns]


def _meandiff_and_fdr(core_genes):
    """Compare `core_genes` against `noncore` across all perturbagens.

    Returns a pair of dictionaries keyed by the index of the transposed result
    table: the sign-flipped `meandiff` column and the `FDR` column.
    """
    table = get_differential_percentages(full_kd_df, core_genes, set(), noncore, use_min=True)[0].transpose()
    # The sign is flipped on purpose; the plots label the result "Periph. - Core".
    meandiff = {perturbagen: -value for perturbagen, value in zip(table.index, table.meandiff)}
    fdr = {perturbagen: value for perturbagen, value in zip(table.index, table.FDR)}
    return meandiff, fdr


# None of the three comparisons depends on an individual perturbagen, so each
# one is computed exactly once instead of once per element of uc_sign.

# UC core genes vs. peripherals
meandiff_outer, results_outer = _meandiff_and_fdr(uc_core)

# CAD and SCZ core genes vs. peripherals
meandiff_inner, results_inner = _meandiff_and_fdr(double_core)

# UC, CAD and SCZ core genes together vs. peripherals
joint_meandiff, joint_results = _meandiff_and_fdr(triple_core)

In [None]:
fig, ax = plt.subplots(figsize=(full_width*cm, 6*cm))

# Mean difference per perturbagen for the three core-gene definitions, each
# paired with its FDR values and plot colour. The order here must match the
# legend labels passed to ax.legend() below.
series = [
    (meandiff_outer, results_outer, "blue"),
    (meandiff_inner, results_inner, "orange"),
    (joint_meandiff, joint_results, "purple"),
]

# All connecting lines use the x positions of meandiff_outer (one slot per
# perturbagen, in its key order).
x_positions = range(len(meandiff_outer))

lines = []
for meandiff, fdr, series_color in series:
    heights = list(meandiff.values())

    # Plain connecting line (no markers), drawn behind the scatter points.
    lines.extend(ax.plot(x_positions, heights, zorder=-5, marker=None, color=series_color))

    # One point per perturbagen on top of the line: filled with the series
    # colour when FDR < 0.05, light gray otherwise; the edge keeps the series colour.
    color = [series_color if value < 0.05 else "lightgray" for value in fdr.values()]
    ax.scatter(range(len(color)), heights, marker="o", color=color, edgecolor=series_color, s=15, zorder=5)

ax.set_xticks(x_positions)
ax.set_xticklabels(list(meandiff_outer.keys()), rotation=90)

# `lines` holds exactly one handle per label (no duplicated handle, and no
# dependency on a `gene` variable left over from another cell).
ax.legend(lines, ["UC", "CAD+SCZ", "UC+CAD+SCZ"])
ax.axhline(y=0, color="black", linestyle="--", zorder=-10)

ax.set_xlabel("Perturbagen (HEK293T)")
ax.set_ylabel("Mean Difference\n(Periph. - Core)")

# Look at behaviour in PC3 cells

In [None]:
celltype = "PC3"

# Per-trait result tables; a perturbagen counts as significant at FDR < 0.05.
uc_table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_uc_{}.tsv".format(celltype), sep="\t", index_col=0)
cad_table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_cad_{}.tsv".format(celltype), sep="\t", index_col=0)
scz_table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_scz_{}.tsv".format(celltype), sep="\t", index_col=0)

uc_sign = set(uc_table.index[uc_table.FDR < 0.05].tolist())
cad_sign = set(cad_table.index[cad_table.FDR < 0.05].tolist())
scz_sign = set(scz_table.index[scz_table.FDR < 0.05].tolist())

# Joint UC+CAD+SCZ table: all listed perturbagens and the significant subset.
joint_table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_uc_cad_scz_{}.tsv".format(celltype), sep="\t", index_col=0)
total_perturbagens = set(joint_table.index.tolist())
triple_perturbagens = set(joint_table.index[joint_table.FDR < 0.05].tolist())

# Jointly significant perturbagens that no single trait detects, relative to
# the smaller of the two sets being compared.
single_trait_union = uc_sign | scz_sign | cad_sign
len(triple_perturbagens - single_trait_union) / min(len(triple_perturbagens), len(single_trait_union))

In [None]:
# Share of all perturbagens in the joint UC+CAD+SCZ table that reach FDR < 0.05.
len(triple_perturbagens) / len(total_perturbagens)

In [None]:
# UC-significant perturbagens relative to the number of perturbagens in the joint table.
len(uc_sign) / len(total_perturbagens)

In [None]:
# CAD-significant perturbagens relative to the number of perturbagens in the joint table.
len(cad_sign) / len(total_perturbagens)

In [None]:
# SCZ-significant perturbagens relative to the number of perturbagens in the joint table.
len(scz_sign) / len(total_perturbagens)

In [None]:
# Number of perturbagens significant for both UC and CAD.
len(uc_sign.intersection(cad_sign))

In [None]:
# Number of perturbagens significant for both UC and SCZ.
len(uc_sign.intersection(scz_sign))

In [None]:
# Number of CAD-significant perturbagens.
len(cad_sign)

In [None]:
# Number of UC-significant perturbagens.
len(uc_sign)

In [None]:
# Number of SCZ-significant perturbagens.
len(scz_sign)

In [None]:
# Number of perturbagens significant in the joint UC+CAD+SCZ table.
len(triple_perturbagens)

In [None]:
# Jointly significant perturbagens that are not significant for any single
# trait, divided by the size of the smaller of the two sets (joint set vs.
# union of the single-trait sets). Raises ZeroDivisionError if either is empty.
len(triple_perturbagens.difference(uc_sign.union(scz_sign).union(cad_sign))) / min(len(triple_perturbagens), len(uc_sign.union(scz_sign).union(cad_sign)))

In [None]:
# The perturbagens that are significant only in the joint analysis.
triple_perturbagens.difference(uc_sign.union(scz_sign).union(cad_sign))

# Check if significant perturbagens are consistent across cell types

In [None]:
sign_perturbagens = {trait: {} for trait in traits}

celltypes = ["PC3", "HT29", "HEK293T"]
path_template = "/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}.tsv"

# Collect, per trait and cell type, the perturbagens with FDR < 0.05.
for trait in traits:
    for celltype in celltypes:
        fdr_table = pd.read_csv(path_template.format(trait, celltype), sep="\t", header=0, index_col=0)
        passes_fdr = fdr_table["FDR"] < 0.05
        sign_perturbagens[trait][celltype] = set(fdr_table.index[passes_fdr])

# For every trait, a celltype x celltype table of overlap coefficients
# |A & B| / min(|A|, |B|); each value of the inner dict is one row of that table.
overlap_indices = {
    trait_name: {
        first: [
            len(per_celltype[first].intersection(per_celltype[second]))
            / min(len(per_celltype[first]), len(per_celltype[second]))
            for second in celltypes
        ]
        for first in celltypes
    }
    for trait_name, per_celltype in sign_perturbagens.items()
}

# across traits

In [None]:
import seaborn as sns
from random import sample

traits = ['uc', 'ra', 'cad', 'ad', 'scz']

sign_perturbagens = {trait: {} for trait in traits}
background = {trait: {} for trait in traits}

# Number of random set pairs drawn per (trait, cell type) combination.
# NOTE: the `random` module is not seeded here, so the empirical ranks differ
# slightly between runs.
n_random_draws = 10000

# Per trait and cell type: significant perturbagens (FDR < 0.05) and the full
# set of listed perturbagens that serves as sampling background.
for trait in traits:
    for celltype in celltypes:
        df = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}_restricted.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        sign_perturbagens[trait][celltype] = set(df.index[df["FDR"] < 0.05])
        background[trait][celltype] = set(df.index)

overlap_background = {trait: {} for trait in traits}
overlap_indices = {trait: {} for trait in traits}
rand_control = {trait: {} for trait in traits}
for traitA in traits:
    overlap_indices[traitA] = {trait: {} for trait in traits}
    overlap_background[traitA] = {trait: {} for trait in traits}
    rand_control[traitA] = {trait: {} for trait in traits}
    for traitB in traits:
        for celltypeA in celltypes:
            row = []
            background_row = []
            rand_row = []

            # Invariant over celltypeB and over the random draws.
            poolA = list(background[traitA][celltypeA])
            n_signA = len(sign_perturbagens[traitA][celltypeA])

            for celltypeB in celltypes:
                # Observed overlap coefficient |A & B| / min(|A|, |B|).
                setA = sign_perturbagens[traitA][celltypeA]
                setB = sign_perturbagens[traitB][celltypeB]
                real_coeff = len(setA.intersection(setB)) / min(len(setA), len(setB))
                row.append(real_coeff)

                # Same coefficient for the two background sets.
                setA = background[traitA][celltypeA]
                setB = background[traitB][celltypeB]
                background_row.append(len(setA.intersection(setB)) / min(len(setA), len(setB)))

                # Null distribution: random sets of the same sizes as the
                # significant sets, drawn from the respective backgrounds.
                poolB = list(background[traitB][celltypeB])
                n_signB = len(sign_perturbagens[traitB][celltypeB])
                rand_coeffs = []
                for _ in range(n_random_draws):
                    sampleA = sample(poolA, n_signA)
                    sampleB = sample(poolB, n_signB)
                    rand_coeffs.append(len(set(sampleA).intersection(sampleB)) / min(n_signA, n_signB))

                # Rank of the observed coefficient within the null distribution
                # (0 .. n_random_draws). np.argpartition cannot be used for this:
                # it only places the kth element correctly and leaves the order
                # inside both partitions undefined. Ties are counted half
                # (mid-rank) so that a null distribution identical to the
                # observed value lands at the centre. Exact equality is safe
                # because observed and random coefficients share the same
                # integer denominator.
                null_coeffs = np.asarray(rand_coeffs)
                n_smaller = np.count_nonzero(null_coeffs < real_coeff)
                n_equal = np.count_nonzero(null_coeffs == real_coeff)
                rand_row.append(n_smaller + 0.5 * n_equal)

            rand_control[traitA][traitB][celltypeA] = rand_row
            overlap_indices[traitA][traitB][celltypeA] = row
            overlap_background[traitA][traitB][celltypeA] = background_row

def _flatten_overlaps(nested):
    """Turn nested[traitA][traitB][celltype] (a list per cell type) into a 2-D array.

    Rows run over (traitA, celltype); columns over traitB and, within each
    traitB, over the entries of the stored list.
    """
    return np.asarray([
        [value for traitB in traits for value in nested[traitA][traitB][celltype]]
        for traitA in traits
        for celltype in celltypes
    ])


# One label per matrix row before reordering: the cell types, repeated per trait.
rownames = [celltype for _ in traits for celltype in celltypes]

# Observed overlaps expressed relative to the overlap of the backgrounds.
background_rows = _flatten_overlaps(overlap_background)
rows = _flatten_overlaps(overlap_indices) / background_rows

# Regroup both axes so that entries of the same cell type are adjacent
# (trait-major -> cell-type-major); columns first, then rows.
celltype_major = [0,3,6,9,12,1,4,7,10,13,2,5,8,11,14]
rows = rows[:, celltype_major][celltype_major, :]

# Cell annotations: two significant digits without the leading zero.
oldshape = rows.shape
labels = np.asarray([k if k == 1 else ("%.2g" % k).lstrip('0') for k in rows.flatten()]).reshape(oldshape)

fig, ax = plt.subplots(figsize=(full_width*0.5*cm,full_width*0.4*cm ))

ax = sns.heatmap(rows, vmin=0,  vmax=1, cmap="viridis", annot=labels, ax=ax, fmt="", annot_kws={"fontsize": 5})
trait_ticks = [trait.upper() for trait in traits]*3
ax.set_yticklabels(trait_ticks, rotation=90 )
ax.set_xticklabels(trait_ticks, rotation=0, ha="center")
plt.yticks(rotation=0)

# Write the cell type next to each block of five rows / columns. The first
# three entries of rownames are the three cell types in order.
maximum = 15
minimum = 0
stride = 5
for block_name, start in zip(rownames, range(minimum, maximum, stride)):
    block_center = start + (stride/2)
    ax.text(x = block_center, y= 17.5, s=block_name, ha="center")
    ax.text(y = block_center, x= -3, s=block_name, va="center", rotation=90)

plt.savefig("across_traits_knockdown_5_restricted.svg", bbox_inches="tight")

In [None]:
import matplotlib

# Assemble the ranks of the observed coefficients into a matrix with one row
# per (traitA, celltype) and the stored per-cell-type lists laid out per traitB.
rand_rownames = []
rand_rows = []
for traitA in traits:
    for celltype in celltypes:
        rand_rownames.append(celltype)
        row = []
        for traitB in traits:
            row.extend(rand_control[traitA][traitB][celltype])
        rand_rows.append(row)

rand_rows = np.asarray(rand_rows)
# Regroup both axes so that entries of the same cell type are adjacent.
rand_rows = rand_rows[:,[0,3,6,9,12,1,4,7,10,13,2,5,8,11,14]]
rand_rows = rand_rows[[0,3,6,9,12,1,4,7,10,13,2,5,8,11,14], :]

# Rank -> empirical quantile of the observed value within the random draws.
rand_rows =  rand_rows / n_random_draws

# One-sided empirical p-value: distance to the nearer end of the distribution.
labels = rand_rows.copy()
labels[labels > 0.5] = 1 - labels[labels > 0.5]

old_shape = labels.shape

fdr_labels = fdrcorrection(labels.flatten())[1].reshape(old_shape)

# Write the corrected values back while keeping the direction of the effect.
# Both masks are taken BEFORE anything is overwritten: a corrected value can
# exceed 0.5 and would otherwise be picked up by the second mask and flipped.
lower_tail = rand_rows < 0.5
upper_tail = rand_rows > 0.5
rand_rows[lower_tail] = fdr_labels[lower_tail]
rand_rows[upper_tail] = 1 - fdr_labels[upper_tail]

# Two-sided value for the annotation; capped at 1 because doubling a corrected
# one-sided value can exceed 1.
fdr_labels = np.minimum(fdr_labels * 2, 1)

# Three colour classes: below 0.025 (green), in between (white), above 0.975
# (pink). BoundaryNorm expects monotonically increasing bin edges.
colors = ["green", "white", "pink"]
norm = matplotlib.colors.BoundaryNorm(boundaries=[0, 0.025, 0.975, 1], ncolors=256)

cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

labels = np.asarray([("%.3f" % k).lstrip('0') if k != 0 else "0" for k in fdr_labels.flatten()]).reshape(old_shape)

fig, ax = plt.subplots(figsize=(full_width*cm,full_width*0.8*cm ))

ax = sns.heatmap(rand_rows, vmin=0,  vmax=1, cmap=cmap,norm=norm, annot=labels, ax=ax, fmt="", annot_kws={"fontsize": 7})
ax.set_yticklabels([trait.upper() for trait in traits]*3, rotation=90 )
ax.set_xticklabels([trait.upper() for trait in traits]*3, rotation=0, ha="center")
plt.yticks(rotation=0)

# Write the cell type next to each block of five rows / columns. The first
# three entries of rand_rownames are the three cell types in order.
maximum = 15
minimum = 0
stride = 5
for trait, start in zip(rand_rownames, range(minimum, maximum, stride)):
    ax.text(x = start + (stride/2), y= 17.5, s=trait, ha="center")
    ax.text(y = start + (stride/2), x= -3, s=trait, va="center", rotation=90)

plt.savefig("across_traits_knockdown_5_pvals_{}_restricted.svg".format(n_random_draws))

In [None]:
# Debug output: the direction-aware matrix used to colour the p-value heatmap.
rand_rows

In [None]:
# Debug output: random coefficients left over from the last iteration of the control loop.
rand_coeffs 

In [None]:
# Scratch check. NOTE(review): this compares the *indices* returned by
# argpartition with the coefficient value real_coeff, not with an index -
# looks like leftover debugging; confirm before relying on the output.
(np.argpartition(rand_coeffs, 500) == real_coeff).nonzero()

In [None]:
# Index ranges of the three 5x5 diagonal blocks of `rows`.
pc_rows = range(5)
ht29_rows = range(5,10)
hek_rows= range(10,15)

# Per block: mean of (1 - entry) over all off-diagonal entries of the block.
specificity = [
    np.mean([1-rows[row, col] for col in block for row in block if col != row])
    for block in [pc_rows, ht29_rows, hek_rows]
]
specificity

In [None]:
# Every second index inside each 5x5 diagonal block of `rows`
# (presumably uc, cad and scz given the 5-trait ordering - verify).
pc_rows = range(0,5,2)
ht29_rows = range(5,10,2)
hek_rows= range(10,15,2)

# Per block: mean of (1 - entry) over all off-diagonal entries among the selected indices.
specificity = [
    np.mean([1-rows[row, col] for col in block for row in block if col != row])
    for block in [pc_rows, ht29_rows, hek_rows]
]
specificity

In [None]:
from random import sample

# Sampling pools and draw sizes for AD and SCZ in HEK293T cells.
ad_pool = list(background["ad"]["HEK293T"])
scz_pool = list(background["scz"]["HEK293T"])
ad_draws = len(sign_perturbagens["ad"]["HEK293T"])
scz_draws = len(sign_perturbagens["scz"]["HEK293T"])


def _random_overlap():
    """Overlap coefficient of one random AD-sized and one random SCZ-sized draw."""
    drawn_ad = sample(ad_pool, ad_draws)
    drawn_scz = sample(scz_pool, scz_draws)
    return len(set(drawn_ad).intersection(set(drawn_scz))) / min(len(drawn_ad), len(drawn_scz))


# Null distribution of the overlap coefficient from 1000 random draws.
overlap_coeffs = [_random_overlap() for _ in range(1000)]


In [None]:
# Upper bound of the central 95% of the random overlap coefficients.
np.quantile(overlap_coeffs, 0.975)

In [None]:
# Lower bound of the central 95% of the random overlap coefficients.
np.quantile(overlap_coeffs, 0.025)

In [None]:
# Toy check: position at which index 2 (the value 4) appears in the argpartition output.
# NOTE(review): np.argpartition only guarantees that the element at position
# kth is in its sorted place; the order inside the two partitions is
# undefined. A tiny array may happen to come back fully sorted, so this
# example does not show that the lookup yields a rank in general.
testarray = np.array([3,1,4,2])
(np.argpartition(testarray, 2) == 2).nonzero()[0].item()

In [None]:
# Raw index array returned by argpartition for the toy example.
np.argpartition(testarray, 2)

In [None]:
traits = ['uc', 'ra', 'cad', 'ad', 'scz', "uc_cad_scz", "uc_ra_cad_ad_scz"]

# Per trait and cell type: significant perturbagens (FDR < 0.05) and all listed perturbagens.
sign_perturbagens = {}
background = {}
for trait in traits:
    sign_perturbagens[trait] = {}
    background[trait] = {}
    for celltype in celltypes:
        table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        sign_perturbagens[trait][celltype] = set(table.index[table["FDR"] < 0.05])
        background[trait][celltype] = set(table.index)


def _pairwise_overlap(sets_by_trait):
    """Overlap coefficient |A & B| / min(|A|, |B|) for every pair of (trait, cell type) sets.

    The result is indexed as result[traitA][traitB][celltypeA] and holds one
    list with an entry per cell type of traitB.
    """
    def coefficient(first, second):
        return len(first.intersection(second)) / min(len(first), len(second))

    return {
        traitA: {
            traitB: {
                celltypeA: [
                    coefficient(sets_by_trait[traitA][celltypeA], sets_by_trait[traitB][celltypeB])
                    for celltypeB in celltypes
                ]
                for celltypeA in celltypes
            }
            for traitB in traits
        }
        for traitA in traits
    }


def _stack_rows(nested):
    """Lay out nested[traitA][traitB][celltype] as a 2-D array with rows over (traitA, celltype)."""
    return np.asarray([
        [value for traitB in traits for value in nested[traitA][traitB][celltype]]
        for traitA in traits
        for celltype in celltypes
    ])


overlap_indices = _pairwise_overlap(sign_perturbagens)
overlap_background = _pairwise_overlap(background)

# One label per matrix row before reordering: the cell types, repeated per trait.
rownames = [celltype for _ in traits for celltype in celltypes]

# Observed overlaps expressed relative to the overlap of the backgrounds.
background_rows = _stack_rows(overlap_background)
rows = _stack_rows(overlap_indices) / background_rows

# Regroup both axes so that entries of the same cell type are adjacent; columns first, then rows.
celltype_major = [0,3,6,9,12,15,18,1,4,7,10,13,16,19,2,5,8,11,14,17,20]
rows = rows[:, celltype_major][celltype_major, :]

# Cell annotations: two significant digits without the leading zero.
oldshape = rows.shape
labels = np.asarray([k if k == 1 else ("%.2g" % k).lstrip('0') for k in rows.flatten()]).reshape(oldshape)

fig, ax = plt.subplots(figsize=(full_width*0.5*cm,full_width*0.4*cm ))

ax = sns.heatmap(rows, vmin=0,  vmax=1, cmap="viridis", annot=labels, ax=ax, fmt="", annot_kws={"fontsize": 5})
trait_ticks = [trait.upper() for trait in traits]*3
ax.set_yticklabels(trait_ticks, rotation=90 )
ax.set_xticklabels(trait_ticks, rotation=90, ha="center")
plt.yticks(rotation=0)

# Write the cell type next to each block of seven rows / columns;
# range(0, 15, 7) yields the three block starts 0, 7 and 14.
maximum = 15
minimum = 0
stride = 7
for block_name, start in zip(rownames, range(minimum, maximum, stride)):
    block_center = start + (stride/2)
    ax.text(x = block_center, y= 28, s=block_name, ha="center")
    ax.text(y = block_center, x= -10, s=block_name, va="center", rotation=90)

plt.savefig("across_traits_knockdown_7.svg", bbox_inches="tight")

In [None]:
# Trait list currently in effect.
traits

In [None]:
# Dimensions of the current overlap matrix.
rows.shape

In [None]:
n_traits = len(traits)
n_celltypes = len(celltypes)
n_total = n_traits * n_celltypes

# Background overlaps as a matrix with one row per (traitA, celltype) and the
# stored per-cell-type lists laid out per traitB.
rownames = []
rows = []
for traitA in traits:
    for celltype in celltypes:
        rownames.append(celltype)
        row = []
        for traitB in traits:
            row.extend(overlap_background[traitA][traitB][celltype])
        rows.append(row)

rows = np.asarray(rows)
assert rows.shape == (n_total, n_total), "overlap_background does not match the current traits / celltypes"

# Regroup both axes so that entries of the same cell type are adjacent.
# The permutation is derived from the current number of traits and cell types,
# so it stays valid whichever trait list an earlier cell left in `traits`
# (for 5 traits and 3 cell types it is [0,3,6,9,12,1,4,...,14]).
order = [trait_idx * n_celltypes + celltype_idx for celltype_idx in range(n_celltypes) for trait_idx in range(n_traits)]
rows = rows[:, order]
rows = rows[order, :]

fig, ax = plt.subplots()

ax = sns.heatmap(rows, vmin=0,  vmax=1, cmap="viridis", annot=True, ax=ax, annot_kws={"fontsize": 5})
ax.set_yticklabels(traits*n_celltypes, rotation=90)
ax.set_xticklabels(traits*n_celltypes)
fig.autofmt_xdate(rotation=45)
plt.yticks(rotation=0)

# Write the cell type next to each block of n_traits rows / columns.
maximum = n_total
minimum = 0
stride = n_traits
for trait, start in zip(celltypes, range(minimum, maximum, stride)):
    ax.text(x = start + (stride/2), y= n_total + 3, s=trait, ha="center")
    ax.text(y = start + (stride/2), x= -3, s=trait, va="center", rotation=90)

In [None]:
exp_traits = ['uc', 'ra', 'cad', 'ad', 'scz', 'uc_cad_scz', 'uc_ra_cad_ad_scz']
traits = ['uc', 'ra', 'cad', 'ad', 'scz']

# Per cell type and (single or joint) trait: share of listed knockdown
# perturbagens with FDR < 0.05.
sign_perturbagen_fraction = {}
for celltype in celltypes:
    fraction_by_trait = {}
    for trait in exp_traits:
        table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        n_significant = len(table.index[table["FDR"] < 0.05])
        fraction_by_trait[trait] = n_significant / len(table.index)
    sign_perturbagen_fraction[celltype] = fraction_by_trait

In [None]:
# Trait list currently in effect.
traits

In [None]:
from numpy.random import normal

from speos.visualization.settings import *
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 5*cm))
markers = ["^", "o", "s"]
scatters = []
for (celltype, values), marker in zip(sign_perturbagen_fraction.items(), markers):
    single_trait_values = [values[trait] for trait in traits]
    three_trait_value = values["uc_cad_scz"]
    five_trait_value = values["uc_ra_cad_ad_scz"]

    # Solid line: mean over the single traits -> 3-trait set -> 5-trait set.
    ax.plot([0,1,2], [np.mean(single_trait_values), three_trait_value, five_trait_value], marker=marker, markeredgecolor="black", color="purple")

    # Dotted line: highest value among uc / cad / scz -> 3-trait set.
    highest_of_three = np.max([values[trait] for trait in ["uc", "cad", "scz"]])
    ax.plot([0,1], [highest_of_three, three_trait_value], linestyle=":", color="purple")

    # Individual traits as horizontally jittered points around x = 0.
    jitter = normal(0, 0.02, 5)
    scatters.append(ax.scatter(x=jitter, y=single_trait_values, marker=marker, color="purple", edgecolor="black", alpha=0.5))

ax.legend(scatters, celltypes)
ax.set_ylim(0,1)
ax.set_ylabel("Sign. Perturbagen Fraction")
ax.set_xticks([0,1,2])
ax.set_xticklabels(["Indiv. Traits", "3 Traits (Union)", "5 Traits (Union)"])
plt.savefig("multitraits_knockdown.svg")

In [None]:
from numpy.random import normal

# Per cell type and trait: mean absolute `meandiff` over the knockdown
# perturbagens with FDR < 0.05.
sign_perturbagen_meandiff = {}
for celltype in celltypes:
    meandiff_by_trait = {}
    for trait in exp_traits:
        table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        meandiff_by_trait[trait] = table.meandiff[table["FDR"] < 0.05].abs().mean()
    sign_perturbagen_meandiff[celltype] = meandiff_by_trait

fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 5*cm))
markers = ["^", "o", "s"]
scatters = []
for (celltype, values), marker in zip(sign_perturbagen_meandiff.items(), markers):
    single_trait_values = [values[trait] for trait in traits]
    three_trait_value = values["uc_cad_scz"]

    # Solid line: mean over the single traits -> 3-trait set -> 5-trait set.
    ax.plot([0,1,2], [np.mean(single_trait_values), three_trait_value, values["uc_ra_cad_ad_scz"]], marker=marker, markeredgecolor="black", color="purple")

    # Dotted line: starts at the trait (of uc / cad / scz) with the largest
    # significant fraction in sign_perturbagen_fraction for this cell type.
    plot_this = np.argmax([sign_perturbagen_fraction[celltype][trait] for trait in ["uc", "cad", "scz"]])
    three_single_values = [values[trait] for trait in ["uc", "cad", "scz"]]
    ax.plot([0,1], [three_single_values[plot_this], three_trait_value], linestyle=":", color="purple")

    # Individual traits as horizontally jittered points around x = 0.
    jitter = normal(0, 0.02, 5)
    scatters.append(ax.scatter(x=jitter, y=single_trait_values, marker=marker, color="purple", edgecolor="black", alpha=0.5))

ax.legend(scatters, celltypes)
ax.set_ylabel("Avg. Mean Difference")
ax.set_xticks([0,1,2])
ax.set_xticklabels(["Indiv. Traits", "3 Traits (Union)", "5 Traits (Union)"])
plt.savefig("multitraits_meandiff_knockdown.svg")

In [None]:
# Debug output: argmax index left over from the last loop iteration of the plot cell.
plot_this

In [None]:
# Debug output: significant fractions of uc, cad and scz for the cell type currently bound to `celltype`.
[sign_perturbagen_fraction[celltype][trait] for trait in ["uc", "cad", "scz"]]

In [None]:
# Per cell type and (single or joint) trait: share of listed overexpression
# perturbagens with FDR < 0.05. Replaces the knockdown values of the same name.
sign_perturbagen_fraction = {}
for celltype in celltypes:
    fraction_by_trait = {}
    for trait in exp_traits:
        table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_overexpression_{}_{}.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        n_significant = len(table.index[table["FDR"] < 0.05])
        fraction_by_trait[trait] = n_significant / len(table.index)
    sign_perturbagen_fraction[celltype] = fraction_by_trait

In [None]:
from numpy.random import normal
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 5*cm))
markers = ["^", "o", "s"]
scatters = []
for (celltype, values), marker in zip(sign_perturbagen_fraction.items(), markers):
    single_trait_values = [values[trait] for trait in traits]
    three_trait_value = values["uc_cad_scz"]
    five_trait_value = values["uc_ra_cad_ad_scz"]

    # Solid line: mean over the single traits -> 3-trait set -> 5-trait set.
    ax.plot([0,1,2], [np.mean(single_trait_values), three_trait_value, five_trait_value], marker=marker, markeredgecolor="black", color="orange")

    # Dotted line: highest value among uc / cad / scz -> 3-trait set.
    highest_of_three = np.max([values[trait] for trait in ["uc", "cad", "scz"]])
    ax.plot([0,1], [highest_of_three, three_trait_value], linestyle=":", color="orange")

    # Individual traits as horizontally jittered points around x = 0.
    jitter = normal(0, 0.02, 5)
    scatters.append(ax.scatter(x=jitter, y=single_trait_values, marker=marker, color="orange", edgecolor="black", alpha=0.5))

ax.legend(scatters, celltypes)
ax.set_ylim(0,0.8)
ax.set_ylabel("Sign. Perturbagen Fraction")
ax.set_xticks([0,1,2])
ax.set_xticklabels(["Indiv. Traits", "3 Traits (Union)", "5 Traits (Union)"])
plt.savefig("multitraits_overexpression.svg")

In [None]:
exp_traits = traits + ["uc_cad_scz", "uc_ra_cad_ad_scz"]

# Per cell type and trait: mean absolute `meandiff` over the overexpression
# perturbagens with FDR < 0.05.
sign_perturbagen_meandiff = {}
for celltype in celltypes:
    meandiff_by_trait = {}
    for trait in exp_traits:
        table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_overexpression_{}_{}.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        meandiff_by_trait[trait] = table.meandiff[table["FDR"] < 0.05].abs().mean()
    sign_perturbagen_meandiff[celltype] = meandiff_by_trait

from numpy.random import normal
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 5*cm))
markers = ["^", "o", "s"]
scatters = []
for (celltype, values), marker in zip(sign_perturbagen_meandiff.items(), markers):
    single_trait_values = [values[trait] for trait in traits]
    three_trait_value = values["uc_cad_scz"]

    # Solid line: mean over the single traits -> 3-trait set -> 5-trait set.
    ax.plot([0,1,2], [np.mean(single_trait_values), three_trait_value, values["uc_ra_cad_ad_scz"]], marker=marker, markeredgecolor="black", color="orange")

    # Dotted line: starts at the trait (of uc / cad / scz) with the largest
    # significant fraction in sign_perturbagen_fraction for this cell type.
    plot_this = np.argmax([sign_perturbagen_fraction[celltype][trait] for trait in ["uc", "cad", "scz"]])
    three_single_values = [values[trait] for trait in ["uc", "cad", "scz"]]
    ax.plot([0,1], [three_single_values[plot_this], three_trait_value], linestyle=":", color="orange")

    # Individual traits as horizontally jittered points around x = 0.
    jitter = normal(0, 0.02, 5)
    scatters.append(ax.scatter(x=jitter, y=single_trait_values, marker=marker, color="orange", edgecolor="black", alpha=0.5))

ax.legend(scatters, celltypes)
ax.set_ylabel("Avg. Mean Difference")
ax.set_xticks([0,1,2])
ax.set_xticklabels(["Indiv. Traits", "3 Traits (Union)", "5 Traits (Union)"])
plt.savefig("multitraits_meandiff_overexpression.svg")

In [None]:
import pandas as pd
traits = ['uc', 'ra', 'cad', 'ad', 'scz', "uc_cad_scz", "uc_ra_cad_ad_scz"]
celltypes = ["HT29", "PC3", "HEK293T"]

# sign_perturbagen[celltype][trait]: knockdown perturbagens with FDR < 0.05.
sign_perturbagen = {}
for celltype in celltypes:
    significant_by_trait = {}
    for trait in traits:
        table = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        significant_by_trait[trait] = set(table.index[table["FDR"] < 0.05])
    sign_perturbagen[celltype] = significant_by_trait

In [None]:
# Shortcut to the HEK293T entry: {trait: set of perturbagens with FDR < 0.05}.
hek = sign_perturbagen["HEK293T"]
hek

In [None]:
# Perturbagens significant for at least one of UC, CAD or SCZ in HEK293T.
hek_union = hek["uc"].union(hek["cad"].union(hek["scz"]))

len(hek_union)

In [None]:
# Number of perturbagens significant for the joint UC+CAD+SCZ set.
len(hek["uc_cad_scz"])

In [None]:
# How many jointly significant perturbagens are also significant for a single trait.
len(hek["uc_cad_scz"].intersection(hek_union))

In [None]:
# Significant for a single trait but not for the joint set.
hek_union.difference(hek["uc_cad_scz"])

In [None]:
# UC-significant perturbagens missing from the joint set.
hek["uc"].difference(hek["uc_cad_scz"])

In [None]:
# Number of UC-significant perturbagens that are also in the joint set.
len(hek["uc"].intersection(hek["uc_cad_scz"]))

In [None]:
# CAD-significant perturbagens missing from the joint set.
hek["cad"].difference(hek["uc_cad_scz"])

In [None]:
# Number of CAD-significant perturbagens that are also in the joint set.
len(hek["cad"].intersection(hek["uc_cad_scz"]))

In [None]:
# Number of CAD-significant perturbagens.
len(hek["cad"])

In [None]:
# Number of perturbagens significant for the joint UC+CAD+SCZ set.
len(hek["uc_cad_scz"])

In [None]:
# SCZ-significant perturbagens missing from the joint set.
hek["scz"].difference(hek["uc_cad_scz"])

In [None]:
# Number of SCZ-significant perturbagens that are also in the joint set.
len(hek["scz"].intersection(hek["uc_cad_scz"]))

In [None]:
# Perturbagens significant for all five single traits (uc, cad, scz, ra, ad) in HEK293T.
hek_intersection = hek["uc"].intersection(hek["cad"].intersection(hek["scz"].intersection(hek["ra"]).intersection(hek["ad"])))

In [None]:
# Display the perturbagens shared by all five single traits.
hek_intersection

# Volcano plot

In [None]:
trait = "uc"
celltype = "HT29"

allcore, other_coregenes, hsps,  noncore = get_coregenes(trait, id2hgnc.values())
full_kd_df = pd.read_csv("/mnt/storage/cmap/2017/kd_df_{}.tsv".format(celltype), header=0, sep="\t", index_col=0)

# Only the first returned element (the per-perturbagen table) is filtered
# below; it is transposed so that each perturbagen becomes a row.
differential = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True)
results_df, total, core_result, hsp_result, peri_result = differential
results_df = results_df.transpose()
print(len(results_df))

# Keep perturbagens flagged in at least one of the three class columns.
class_flags = results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]
results_df = results_df[class_flags.values.astype(np.bool_)]
print(len(results_df))

# Keep perturbagens with FDR < 0.05.
passes_fdr = results_df["FDR"] < 0.05
results_df = results_df[passes_fdr]
print(len(results_df))

In [None]:
from speos.visualization.settings import *
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from adjustText import adjust_text

fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))

# Mean differences and FDR values of the significant perturbagens, overall and
# per perturbagen class.
all_significant = results_df["meandiff"][(results_df["FDR"] < 0.05) & (results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"])]
core_significant = results_df["meandiff"][(results_df["FDR"] < 0.05) & (results_df["Core Gene"])]
hsp_significant = results_df["meandiff"][(results_df["FDR"] < 0.05) & (results_df["HSP"])]
peri_significant = results_df["meandiff"][(results_df["FDR"] < 0.05) & (results_df["Peripheral"])]

core_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Core Gene"])]
hsp_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["HSP"])]
peri_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Peripheral"])]

# The y axis shows 1 / FDR on a log scale.
ax.set_yscale("log")
ax.scatter(x=peri_significant,y = 1 / peri_fdr, s=5, c="#8a8a8a")
ax.scatter(x=core_significant,y = 1 / core_fdr, s=5, c="#01016f")
ax.scatter(x=hsp_significant,y = 1 / hsp_fdr, s=5, c="#d8031c")


def _label_rows(frame, n_labels, printed=None):
    """Write the index label next to the first `n_labels` rows of `frame`.

    Rows are addressed by position (`.iloc`) and the count is capped at
    len(frame), so short tables do not raise. When `printed` is a list, names
    already contained in it are skipped and newly labelled names are appended.
    Returns the created text artists.
    """
    created = []
    for position in range(min(n_labels, len(frame))):
        name = frame.index[position]
        if printed is not None:
            if name in printed:
                continue
            printed.append(name)
        created.append(ax.text(frame["meandiff"].iloc[position], 1/frame["FDR"].iloc[position], name, size=4, va="center"))
    return created


def _declutter(text_artists, force_points):
    """Move the given labels away from the data points, connecting them with thin lines."""
    adjust_text(text_artists, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=force_points, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


already_printed = []

# 1) the eight perturbagens with the lowest FDR
texts = _label_rows(results_df.sort_values(by="FDR", ascending=True), 8, already_printed)
_declutter(texts, 3)

# 2) the extremes of the mean difference and the lowest-FDR perturbagens with a
#    negative mean difference, unless they are labelled already
texts = []
texts += _label_rows(results_df.sort_values(by="meandiff", ascending=True), 3, already_printed)
texts += _label_rows(results_df.sort_values(by="meandiff", ascending=False), 5, already_printed)
texts += _label_rows(results_df[results_df["meandiff"] < 0].sort_values(by="FDR", ascending=True), 5, already_printed)
_declutter(texts, 5)

# 3) every HSP perturbagen, regardless of earlier labels
hsp_df = results_df[results_df["HSP"]]
if len(hsp_df) > 0:
    texts = _label_rows(hsp_df, len(hsp_df))
    _declutter(texts, 0.5)


ax.vlines(0, 1/0.05, 10e28, color="gray", linestyles=":")

# Share of negative / positive mean differences, overall and per class, stacked
# at decreasing heights left and right of the zero line.
for values, height, text_color in [
    (all_significant, 10e28, None),
    (peri_significant, 10e26, "#8a8a8a"),
    (core_significant, 10e24, "#01016f"),
    (hsp_significant, 10e22, "#d8031c"),
]:
    color_kwargs = {} if text_color is None else {"color": text_color}
    ax.text(-0.01, y=height, s="{:.1f}%".format(((values < 0).sum() / len(values)) * 100), ha="right", va="top", fontsize=8, **color_kwargs)
    ax.text(+0.01, y=height, s="{:.1f}%".format(((values > 0).sum() / len(values)) * 100), ha="left", va="top", fontsize=8, **color_kwargs)


legend_elements = [Patch(facecolor='black', edgecolor='black',
                         label='Any\nn={}'.format(len(all_significant))),
                   Patch(facecolor='#8a8a8a', edgecolor='#8a8a8a',
                         label='Peripheral\nn={}'.format(len(peri_significant))),
                   Patch(facecolor='#01016f', edgecolor='#01016f',
                         label='Core Gene\nn={}'.format(len(core_significant))),
                   Patch(facecolor='#d8031c', edgecolor="#d8031c",
                         label='HSP\nn={}'.format(len(hsp_significant)))]


leg = ax.legend(handles=legend_elements, loc='upper left', title="Perturbagen", fontsize=6.8, title_fontsize=7, ncol=4, columnspacing=1.7, handletextpad=-0.7)

# Reshape the legend patches into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

ax.set_ylim(bottom=5, top=10e38)
ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
plt.tight_layout()
plt.savefig("Volcano_Knockdown_strongest_{}_{}.svg".format(trait, celltype), bbox_inches="tight")


In [None]:
# Same preparation as for the use_min=True run, but with use_min=False
# (the effect of that flag is defined by get_differential_percentages).
differential = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=False)
results_df, total, core_result, hsp_result, peri_result = differential
results_df = results_df.transpose()
print(len(results_df))

# Keep perturbagens flagged in at least one of the three class columns.
class_flags = results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]
results_df = results_df[class_flags.values.astype(np.bool_)]
print(len(results_df))

# Keep perturbagens with FDR < 0.05.
passes_fdr = results_df["FDR"] < 0.05
results_df = results_df[passes_fdr]
print(len(results_df))

In [None]:
from speos.visualization.settings import *
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from adjustText import adjust_text

# Volcano plot of the knockdown perturbagens kept in results_df: mean differential
# perturbation on x, 1/FDR on a log-scaled y axis, coloured by perturbagen class.
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))
all_significant = results_df["meandiff"]
core_significant = results_df["meandiff"][results_df["Core Gene"]]
hsp_significant = results_df["meandiff"][results_df["HSP"]]
peri_significant = results_df["meandiff"][results_df["Peripheral"]]

core_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Core Gene"])]
hsp_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["HSP"])]
peri_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Peripheral"])]

ax.set_yscale("log")
ax.scatter(x=peri_significant,y = 1 / peri_fdr, s=5, c="#8a8a8a")
ax.scatter(x=core_significant,y = 1 / core_fdr, s=5, c="#01016f")
ax.scatter(x=hsp_significant,y = 1 / hsp_fdr, s=5, c="#d8031c")


# --- Gene labels ---------------------------------------------------------------
# Rows are addressed by position with .iloc: results_df is indexed by perturbagen
# name, so an integer key passed to Series[...] relies on a deprecated positional
# fallback. Each loop is also capped at the number of rows that actually exist.

# First pass: the eight perturbagens with the smallest FDR.
texts = []
sorted_df = results_df.sort_values(by="FDR", ascending=True)
already_printed = []
for i in range(min(8, len(sorted_df))):
    texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
    already_printed.append(sorted_df.index[i])

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y= (1/results_df["FDR"].values).tolist(), force_points=3, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# Second pass: extremes of the mean difference and the most significant negative
# hits, skipping anything that already carries a label.
texts = []
sorted_df = results_df.sort_values(by="meandiff", ascending=True)
for i in range(min(3, len(sorted_df))):
    if sorted_df.index[i] not in already_printed:
      texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
      already_printed.append(sorted_df.index[i])

sorted_df = results_df.sort_values(by="meandiff", ascending=False)
for i in range(min(5, len(sorted_df))):
    if sorted_df.index[i] not in already_printed:
      texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
      already_printed.append(sorted_df.index[i])

sorted_df = results_df[results_df["meandiff"] < 0].sort_values(by="FDR", ascending=True)
for i in range(min(5, len(sorted_df))):
    if sorted_df.index[i] not in already_printed:
      texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
      already_printed.append(sorted_df.index[i])

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=10, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# Third pass: every HSP perturbagen is labelled, even if it was labelled above.
texts = []
hsp_df = results_df[results_df["HSP"]]
for i in range(len(hsp_df)):
   texts.append(ax.text(hsp_df["meandiff"].iloc[i], 1/hsp_df["FDR"].iloc[i], hsp_df.index[i], size=4, va="center"))

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=0.5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# Dotted zero line starting at the FDR = 0.05 level (1/0.05 on this axis).
ax.vlines(0, 1/0.05, 10e28, color="gray", linestyles=":")

# Percentage of negative (left) and positive (right) mean differences per class.
ax.text(-0.01, y=10e28, s="{:.1f}%".format(((all_significant < 0).sum() / len(all_significant)) * 100), ha="right", va="top", fontsize=8)
ax.text(+0.01, y=10e28, s="{:.1f}%".format(((all_significant > 0).sum() / len(all_significant)) * 100), ha="left", va="top", fontsize=8)


ax.text(-0.01, y=10e26, s="{:.1f}%".format(((peri_significant < 0).sum() / len(peri_significant)) * 100), ha="right", va="top", color="#8a8a8a", fontsize=8)
ax.text(+0.01, y=10e26, s="{:.1f}%".format(((peri_significant > 0).sum() / len(peri_significant)) * 100), ha="left", va="top", color="#8a8a8a", fontsize=8)

ax.text(-0.01, y=10e24, s="{:.1f}%".format(((core_significant < 0).sum() / len(core_significant)) * 100), ha="right", va="top", color="#01016f", fontsize=8)
ax.text(+0.01, y=10e24, s="{:.1f}%".format(((core_significant > 0).sum() / len(core_significant)) * 100), ha="left", va="top", color="#01016f", fontsize=8)

ax.text(-0.01, y=10e22, s="{:.1f}%".format(((hsp_significant < 0).sum() / len(hsp_significant)) * 100), ha="right", va="top", color="#d8031c", fontsize=8)
ax.text(+0.01, y=10e22, s="{:.1f}%".format(((hsp_significant > 0).sum() / len(hsp_significant)) * 100), ha="left", va="top", color="#d8031c", fontsize=8)


legend_elements = [Patch(facecolor='black', edgecolor='black',
                         label='Any\nn={}'.format(len(all_significant))),
                   Patch(facecolor='#8a8a8a', edgecolor='#8a8a8a',
                         label='Peripheral\nn={}'.format(len(peri_significant))),
                   Patch(facecolor='#01016f', edgecolor='#01016f',
                         label='Core Gene\nn={}'.format(len(core_significant))),
                   Patch(facecolor='#d8031c', edgecolor="#d8031c",
                         label='HSP\nn={}'.format(len(hsp_significant)))]


leg = ax.legend(handles=legend_elements, loc='upper left', title="Perturbagen", fontsize=6.8, title_fontsize=7, ncol=4, columnspacing=1.7, handletextpad=-0.7)

# Reshape the legend handles into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

ax.set_ylim(bottom=5, top=10e38)

ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
#plt.tight_layout()
plt.savefig("Volcano_Knockdown_all.svg", bbox_inches="tight")


In [None]:
# Is "MLLT6" one of the row labels of the filtered results_df?
"MLLT6" in results_df.index

In [None]:

# Split the knockdown matrix by row (target gene): core genes vs. non-core genes.
_is_core_row = full_kd_df.index.isin(coregenes)
_is_noncore_row = full_kd_df.index.isin(noncore)
coregene_target = full_kd_df.loc[_is_core_row, :]
noncore_target = full_kd_df.loc[_is_noncore_row, :]

# Column-wise difference of the group means (core minus non-core).
total_meandiff = coregene_target.mean(axis=0) - noncore_target.mean(axis=0)

# Two-sample test between the groups; element 1 of the result is passed to
# fdrcorrection. NOTE(review): presumably scipy's ttest_ind and statsmodels'
# fdrcorrection - the imports are outside this cell, confirm.
total_result = ttest_ind(coregene_target, noncore_target)
fdr = fdrcorrection(total_result[1])

In [None]:
# Boolean masks over the columns of full_kd_df, one per perturbagen class.
# A column label is reduced to the text before its first "." before the lookup.
# The label list and the "non-core without HSPs" set are built once here instead
# of being recomputed for every column.
_kd_column_genes = [value.split(".")[0] for value in full_kd_df.columns]
_kd_noncore_without_hsps = noncore.difference(hsps)

coregene_mask = np.asarray([gene in coregenes for gene in _kd_column_genes])
hsp_mask = np.asarray([gene in hsps for gene in _kd_column_genes])
noncore_mask = np.asarray([gene in _kd_noncore_without_hsps for gene in _kd_column_genes])

In [None]:
# Mean differences of the columns selected by both fdr[0] and the HSP mask.
# NOTE(review): fdr[0] is presumably the boolean "rejected" array - confirm.
total_meandiff[fdr[0] & hsp_mask]

In [None]:
# Mean differences of the columns selected by both fdr[0] and the core-gene mask.
total_meandiff[fdr[0] & coregene_mask]

In [None]:
# Mean differences of the columns selected by both fdr[0] and the non-core (HSPs excluded) mask.
total_meandiff[fdr[0] & noncore_mask]

In [None]:
from speos.visualization.settings import *
from matplotlib.patches import Patch
import matplotlib.pyplot as plt

# Volcano plot built directly from the column-wise t-test: mean difference on x,
# 1 / corrected p-value on a log-scaled y axis, one colour per perturbagen class.
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))

# Column selections: flagged by fdr[0] and member of the respective class.
_peri_hits = fdr[0] & noncore_mask
_core_hits = fdr[0] & coregene_mask
_hsp_hits = fdr[0] & hsp_mask

all_significant = total_meandiff[fdr[0] & (noncore_mask + coregene_mask + hsp_mask)]
core_significant = total_meandiff[_core_hits]
hsp_significant = total_meandiff[_hsp_hits]
peri_significant = total_meandiff[_peri_hits]

for _values, _hits, _colour in (
    (peri_significant, _peri_hits, "#5a5a5a"),
    (core_significant, _core_hits, "#01016f"),
    (hsp_significant, _hsp_hits, "#d8031c"),
):
    ax.scatter(x=_values, y=1 / fdr[1][_hits], s=5, c=_colour)

# Dotted zero line starting at the 0.05 level (1/0.05 on this axis).
ax.vlines(0, 1/0.05, 10e28, color="gray", linestyles=":")

# (legend label template, values, y position of the percentages, text colour, patch colour)
# A text colour of None leaves matplotlib's default text colour in place.
_groups = (
    ('Any\nn={}', all_significant, 10e28, None, 'black'),
    ('Peripheral\nn={}', peri_significant, 10e26, "#5a5a5a", '#5a5a5a'),
    ('Core Gene\nn={}', core_significant, 10e24, "#01016f", '#01016f'),
    ('HSP\nn={}', hsp_significant, 10e22, "#d8031c", '#d8031c'),
)

# Percentage of negative (left of zero) and positive (right of zero) values per group.
for _template, _values, _height, _text_colour, _patch_colour in _groups:
    _pct_negative = ((_values < 0).sum() / len(_values)) * 100
    _pct_positive = ((_values > 0).sum() / len(_values)) * 100
    ax.text(-0.01, y=_height, s="{:.1f}%".format(_pct_negative), ha="right", va="top", color=_text_colour, fontsize=8)
    ax.text(+0.01, y=_height, s="{:.1f}%".format(_pct_positive), ha="left", va="top", color=_text_colour, fontsize=8)

legend_elements = [
    Patch(facecolor=_patch_colour, edgecolor=_patch_colour, label=_template.format(len(_values)))
    for _template, _values, _height, _text_colour, _patch_colour in _groups
]

leg = ax.legend(
    handles=legend_elements,
    loc='upper left',
    title="Perturbagen",
    fontsize=6.8,
    title_fontsize=7,
    ncol=4,
    columnspacing=1.7,
    handletextpad=-0.7,
)

# Reshape the legend handles into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

ax.set_ylim(bottom=5, top=10e38)
ax.set_yscale("log")
ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
plt.tight_layout()
plt.savefig("Volcano_Knockdown.svg", bbox_inches="tight")


# See if significant perturbagens are among the disease network nodes

In [None]:
# Differential-perturbation table for the knockdown data with use_min=True;
# only the first of the five return values is kept.
diff_df, _, _, _, _ = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True)

In [None]:
# Column labels of diff_df (the perturbagens) whose "FDR" entry is below 0.05.
_per_perturbagen = diff_df.transpose()
_fdr_below_threshold = _per_perturbagen["FDR"] < 0.05
sign_perturbagens = diff_df.columns[_fdr_below_threshold]
len(sign_perturbagens)

In [None]:
# Background set for the enrichment test: every column label of diff_df.
all_perturbagens = diff_df.columns
len(all_perturbagens)

In [None]:
# Edge list of the disease network: tab-separated, no header row; the columns
# are named from / to / type / weight on load.
disease_edges = pd.read_csv("../disease_network_02.txt", sep="\t", index_col=False, header=None, names=["from", "to", "type", "weight"])

In [None]:
# Every gene that occurs as an endpoint of a disease-network edge, restricted to
# genes that are also perturbagens in this analysis.
_edge_endpoints = set(disease_edges["from"].tolist()) | set(disease_edges["to"].tolist())
disease_network_nodes = _edge_endpoints & set(all_perturbagens)
len(disease_network_nodes)

In [None]:
# Number of disease-network nodes that are also significant perturbagens.
len(disease_network_nodes.intersection(sign_perturbagens))

In [None]:
from scipy.stats import fisher_exact

# 2x2 contingency table: rows = inside / outside the disease network,
# columns = significant / not significant perturbagen.
_outside_network = all_perturbagens.difference(disease_network_nodes)

network_and_perturbagen = len(disease_network_nodes.intersection(sign_perturbagens))
network_not_perturbagen = len(disease_network_nodes.difference(sign_perturbagens))
not_network_and_perturbagen = len(_outside_network.intersection(sign_perturbagens))
not_network_not_perturbagen = len(_outside_network.difference(sign_perturbagens))

array = [
    [network_and_perturbagen, network_not_perturbagen],
    [not_network_and_perturbagen, not_network_not_perturbagen],
]

fisher_exact(array)

# Now for Overexpression

In [None]:
# Cell line whose overexpression matrix is analysed in the following cells.
celltype = "HT29"

# Tab-separated matrix; the first column becomes the index, the first row the header.
full_oe_df = pd.read_csv("/mnt/storage/cmap/2017/oe_df_{}.tsv".format(celltype), header=0, sep="\t", index_col=0)

In [None]:
# Write the transposed differential-perturbation table (first return value) for
# this cell line to a TSV file.
get_differential_percentages(full_oe_df, allcore, hsps, noncore)[0].transpose().to_csv("/mnt/storage/cmap/2017/differential_perturbation_overexpression_UC_{}.tsv".format(celltype), sep="\t")

In [None]:
# Observed result for the overexpression matrix.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_oe_df, allcore, hsps, noncore)

# Baseline: the same call with randomize_core=True, repeated for seeds 0..99.
# Only element 0 of each per-class result is collected.
random_core, random_hsp, random_peri = [], [], []
for i in tqdm(range(100)):
    *_, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(
        full_oe_df, allcore, hsps, noncore, randomize_core=True, random_seed=i
    )
    random_core += [core_result_random[0]]
    random_hsp += [hsp_result_random[0]]
    random_peri += [peri_result_random[0]]


In [None]:
import seaborn as sns
from speos.visualization.settings import *
import matplotlib.pyplot as plt
# Heatmap: fraction of significant differential perturbations per perturbagen
# class, observed core genes vs. the mean over the randomised runs.
fig, ax= plt.subplots(figsize=(8*cm,5*cm))

# Core genes present among the target genes (rows) of the *overexpression* matrix.
# This used to count against full_kd_df, a leftover from the knockdown section
# that also made the cell depend on the knockdown data being loaded.
num_target_core_genes = len(allcore.intersection(set(full_oe_df.index)))

# NOTE: the name kd_matrix_mean is historic; in this cell it holds overexpression results.
kd_matrix_mean = pd.DataFrame(index=["HSP\n(n={})".format(hsp_result[1]), "Peripheral\n(n={})".format(peri_result[1]), "Core\n(n={})".format(core_result[1])],
                         data={"Core Genes\nn={}".format(num_target_core_genes): [hsp_result[0], peri_result[0],  core_result[0]],
                               "Random Genes\nn={} ({}x)".format(num_target_core_genes, len(random_hsp)): [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)]})

ax = sns.heatmap(kd_matrix_mean.transpose(),vmin=0, vmax=1, cmap="Oranges", annot=True, fmt=".1%", ax=ax,
                 cbar_kws={'label': "Fraction Significant\nDifferential Perturbations",
                           "pad": 0.01})
# Set the colour-bar label again so its font size can be controlled.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)
ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Overexpression)", fontsize=7)
plt.tight_layout()
# NOTE(review): the file name is spelled "Perurbation" (sic); left unchanged in case
# other tooling looks for this exact name.
plt.savefig("Perurbation_overexpression_{}.svg".format(celltype), bbox_inches="tight")

In [None]:

def full_overexpression(trait, celltype, background):
    """Run the overexpression differential-perturbation analysis for one trait / cell line.

    Reads the overexpression matrix of ``celltype``, writes the transposed
    differential-perturbation table to a TSV file and saves a heatmap that compares
    the observed fractions with the mean over 100 randomised runs.

    Parameters
    ----------
    trait : str or iterable of str
        One trait identifier, or several. For several traits the core-gene,
        other-core-gene and HSP sets are united and the non-core sets are
        intersected (starting from ``background``).
    celltype : str
        Cell line name; selects ``oe_df_<celltype>.tsv`` and is part of both
        output file names.
    background : iterable
        Background gene symbols, forwarded to ``get_coregenes``.

    Returns
    -------
    None
        All results are written to disk.
    """
    import matplotlib as mpl

    # set font
    mpl.rcParams['font.family'] = 'Helvetica'

    # Figure sizes below are given in centimetres; cm converts them to inches.
    cm = 1/2.54
    small_font = 6
    mpl.rc('xtick', labelsize=small_font)
    mpl.rc('ytick', labelsize=small_font)
    mpl.rcParams['axes.linewidth'] = 0.4
    mpl.rcParams['ytick.major.size'] = 3
    mpl.rcParams['ytick.major.width'] = 0.5
    mpl.rcParams['ytick.minor.size'] = 2
    mpl.rcParams['ytick.minor.width'] = 0.3
    mpl.rcParams['xtick.major.size'] = 2
    mpl.rcParams['xtick.major.width'] = 0.3
    mpl.rcParams['xtick.minor.size'] = 1
    mpl.rcParams['xtick.minor.width'] = 0.1

    print("Starting OE Analysis for {} {}".format(trait, celltype))
    if isinstance(trait, str):
        allcore, other_coregenes, hsps,  noncore = get_coregenes(trait, background)
        traitstring = trait
    else:
        # Several traits: unite the positive sets; a gene stays non-core only if
        # it is non-core for every trait.
        allcore = set()
        other_coregenes = set()
        hsps = set()
        noncore = set(background)
        for _trait in trait:
            _allcore, _other_coregenes, _hsps,  _noncore = get_coregenes(_trait, background)
            allcore.update(set(_allcore))
            other_coregenes.update(set(_other_coregenes))
            hsps.update(set(_hsps))
            noncore = noncore.intersection(_noncore)
        traitstring = "_".join(trait)

    full_df = pd.read_csv("/mnt/storage/cmap/2017/oe_df_{}.tsv".format(celltype), header=0, sep="\t", index_col=0)

    # One call feeds both the exported table and the heatmap (the same call used
    # to run twice), so the two outputs always describe the same result.
    # NOTE(review): assumes get_differential_percentages is deterministic for
    # fixed arguments and does not modify full_df - confirm.
    results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_df, allcore, hsps, noncore, use_min=True)
    results_df.transpose().to_csv("/mnt/storage/cmap/2017/differential_perturbation_overexpression_{}_{}.tsv".format(traitstring, celltype), sep="\t")

    # Randomised baseline, seeds 0..99; element 0 of each per-class result is kept.
    # NOTE(review): the observed call passes use_min=True, these calls rely on the
    # default of use_min - confirm that both use the same setting.
    random_core = []
    random_hsp = []
    random_peri = []
    for i in range(100):
        _, _, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(full_df, allcore, hsps, noncore, randomize_core=True, random_seed=i)
        random_core.append(core_result_random[0])
        random_hsp.append(hsp_result_random[0])
        random_peri.append(peri_result_random[0])

    fig, ax= plt.subplots(figsize=(8*cm,5*cm))

    # Core genes that are present among the target genes (rows) of the matrix.
    num_target_core_genes = len(allcore.intersection(set(full_df.index)))

    kd_matrix_mean = pd.DataFrame(index=["HSP" + "\n(n=%s)" % hsp_result[1], "Peripheral\n" + "(n=%s)" % peri_result[1], "Core Gene\n" + "(n=%s)" % core_result[1]],
                                data={"Core Genes\n" + "n={}".format(num_target_core_genes): [hsp_result[0], peri_result[0],  core_result[0]],
                                    "Random Genes\n" + "n={} ({}x)".format(num_target_core_genes, len(random_hsp)): [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)]})

    ax = sns.heatmap(kd_matrix_mean.transpose(), vmin=0,  vmax=1, cmap="Oranges", annot=True, fmt=".1%", ax=ax,
                        cbar_kws={'label': "Fraction Significant\nDifferential Perturbations",
                                "pad": 0.01})
    # Set the colour-bar label again so its font size can be controlled.
    cbar = ax.collections[-1].colorbar
    cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)
    ax.set_ylabel("Target Gene Set", fontsize=7)
    ax.set_xlabel("Perturbagen (Overexpression)", fontsize=7)
    plt.tight_layout()
    plt.savefig("Perturbation_overexpression_{}_{}.svg".format(traitstring, celltype), bbox_inches="tight")
    # Release the figure: the function is run many times in worker processes.
    plt.close(fig)

In [None]:
from joblib import Parallel, delayed

# Five single traits plus two multi-trait combinations, each analysed in three cell lines.
traits = ["uc", "ra", "cad", "ad", "scz", ["uc", "cad", "scz"], ["uc", "ra", "cad", "ad", "scz"]]
celltypes = ["PC3", "HT29", "HEK293T"]

# Every (trait, cell line) pair, traits in the outer loop.
combinations = []
for trait in traits:
    for celltype in celltypes:
        combinations += [(trait, celltype)]

# One worker per pair; the bar's total is the number of pairs.
_progress = tqdm(desc="My calculation", total=len(combinations))
_jobs = (
    delayed(full_overexpression)(trait, celltype, id2hgnc.values())
    for (trait, celltype) in combinations
)
with tqdm_joblib(_progress) as progress_bar:
    Parallel(n_jobs=len(combinations))(_jobs)

In [None]:
traits = ['uc', 'ra', 'cad', 'ad', 'scz']


def _overlap_coefficient(set_a, set_b):
    """Return |A intersect B| / min(|A|, |B|), or NaN when either set is empty.

    The NaN guard keeps one trait / cell line without any significant
    perturbagen from aborting the whole cell with a ZeroDivisionError.
    """
    smaller = min(len(set_a), len(set_b))
    if smaller == 0:
        return float("nan")
    return len(set_a.intersection(set_b)) / smaller


# Per trait and cell line: the significant perturbagens (FDR < 0.05) and all
# tested perturbagens, read from the exported overexpression tables.
sign_perturbagens = {trait: {} for trait in traits}
background = {trait: {} for trait in traits}
for trait in traits:
    for celltype in celltypes:
        df = pd.read_csv("/mnt/storage/cmap/2017/differential_perturbation_overexpression_{}_{}.tsv".format(trait, celltype), sep="\t", header=0, index_col=0)
        sign_perturbagens[trait][celltype] = set(df.index[df["FDR"] < 0.05])
        background[trait][celltype] = set(df.index)

# overlap_*[traitA][traitB][celltypeA] is a list with one overlap coefficient per
# celltypeB (in the order of `celltypes`); once for the significant sets and
# once for the full backgrounds.
overlap_background = {}
overlap_indices = {}
for traitA in traits:
    overlap_indices[traitA] = {trait: {} for trait in traits}
    overlap_background[traitA] = {trait: {} for trait in traits}
    for traitB in traits:
        for celltypeA in celltypes:
            overlap_indices[traitA][traitB][celltypeA] = [
                _overlap_coefficient(sign_perturbagens[traitA][celltypeA], sign_perturbagens[traitB][celltypeB])
                for celltypeB in celltypes
            ]
            overlap_background[traitA][traitB][celltypeA] = [
                _overlap_coefficient(background[traitA][celltypeA], background[traitB][celltypeB])
                for celltypeB in celltypes
            ]

In [None]:
def _stack_overlap(table):
    """Flatten table[traitA][traitB][celltypeA] -> list over celltypeB into a 2-D array.

    Rows are ordered by (traitA, celltypeA) and columns by (traitB, celltypeB),
    with the trait as the slower-changing key on both axes.
    """
    return np.asarray([
        [value for traitB in traits for value in table[traitA][traitB][celltypeA]]
        for traitA in traits
        for celltypeA in celltypes
    ])


n_traits = len(traits)
n_celltypes = len(celltypes)

# Cell line of every row in the trait-major layout produced by _stack_overlap.
rownames = [celltype_name for _ in traits for celltype_name in celltypes]

rows = _stack_overlap(overlap_indices)
background_rows = _stack_overlap(overlap_background)

# Overlap of the significant sets relative to the overlap of the backgrounds.
rows = rows / background_rows

# Reorder both axes from trait-major to cell-line-major so that each cell line
# forms one contiguous block of n_traits entries. For 5 traits and 3 cell lines
# this is [0,3,6,9,12,1,4,7,10,13,2,5,8,11,14].
block_order = [t * n_celltypes + c for c in range(n_celltypes) for t in range(n_traits)]
rows = rows[:, block_order]
rows = rows[block_order, :]

oldshape = rows.shape 
labels = rows.flatten()

# Annotation text: two significant digits without the leading zero; exact ones
# are passed through and become text when the array is built.
labels = np.asarray([("%.2g" % k).lstrip('0') if k != 1 else k for k in labels]).reshape(oldshape)

fig, ax = plt.subplots(figsize=(full_width*0.5*cm,full_width*0.4*cm ))

ax = sns.heatmap(rows, vmin=0,  vmax=1, cmap="viridis", annot=labels, ax=ax, fmt="", annot_kws={"fontsize": 5})
ax.set_yticklabels([trait.upper() for trait in traits]*n_celltypes, rotation=90 )
ax.set_xticklabels([trait.upper() for trait in traits]*n_celltypes, rotation=0, ha="center")
plt.yticks(rotation=0)

# One cell-line caption per block, centred on the block along both axes.
maximum = n_traits * n_celltypes
minimum = 0
stride = n_traits
for block_label, start in zip(celltypes, range(minimum, maximum, stride)):
    ax.text(x = start + (stride/2), y= maximum + 2.5, s=block_label, ha="center")
    ax.text(y = start + (stride/2), x= -3, s=block_label, va="center", rotation=90)

plt.savefig("across_traits_overexpression.svg", bbox_inches="tight")

In [None]:
# Display the annotation strings that were passed to the heatmap.
labels

In [None]:
# Per-perturbagen overexpression statistics (use_min=False); transposed so that
# every perturbagen becomes one row.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(
    full_oe_df, allcore, hsps, noncore, use_min=False
)
results_df = results_df.T

# Keep only perturbagens flagged in at least one of the three class columns.
_in_any_class = (results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]).values.astype(np.bool_)
results_df = results_df[_in_any_class]

# Keep only perturbagens with FDR below 0.05.
_is_significant = results_df["FDR"] < 0.05
results_df = results_df[_is_significant]

In [None]:
from speos.visualization.settings import *
from matplotlib.patches import Patch
from adjustText import adjust_text
import matplotlib.pyplot as plt


# Volcano plot of the overexpression perturbagens kept in results_df: mean
# differential perturbation on x, 1/FDR on a log-scaled y axis, coloured by class.
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))
all_significant = results_df["meandiff"]
core_significant = results_df["meandiff"][results_df["Core Gene"]]
hsp_significant = results_df["meandiff"][results_df["HSP"]]
peri_significant = results_df["meandiff"][results_df["Peripheral"]]

core_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Core Gene"])]
hsp_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["HSP"])]
peri_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Peripheral"])]
ax.set_yscale("log")
ax.scatter(x=peri_significant,y = 1 / peri_fdr, s=5, c="#8a8a8a")
ax.scatter(x=core_significant,y = 1 / core_fdr, s=5, c="#01016f")
ax.scatter(x=hsp_significant,y = 1 / hsp_fdr, s=5, c="#d8031c")


# --- Gene labels ---------------------------------------------------------------
# Rows are addressed by position with .iloc: results_df is indexed by perturbagen
# name, so an integer key passed to Series[...] relies on a deprecated positional
# fallback. Each loop is also capped at the number of rows that actually exist.

# First pass: the eight perturbagens with the smallest FDR.
texts = []
sorted_df = results_df.sort_values(by="FDR", ascending=True)
already_printed = []
for i in range(min(8, len(sorted_df))):
    texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
    already_printed.append(sorted_df.index[i])

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y= (1/results_df["FDR"].values).tolist(), force_points=3, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# Second pass: extremes of the mean difference and the most significant negative
# hits, skipping anything that already carries a label.
texts = []
sorted_df = results_df.sort_values(by="meandiff", ascending=True)
for i in range(min(3, len(sorted_df))):
    if sorted_df.index[i] not in already_printed:
      texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
      already_printed.append(sorted_df.index[i])

sorted_df = results_df.sort_values(by="meandiff", ascending=False)
for i in range(min(8, len(sorted_df))):
    if sorted_df.index[i] not in already_printed:
      texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
      already_printed.append(sorted_df.index[i])

sorted_df = results_df[results_df["meandiff"] < 0].sort_values(by="FDR", ascending=True)
for i in range(min(3, len(sorted_df))):
    if sorted_df.index[i] not in already_printed:
      texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], sorted_df.index[i], size=4, va="center"))
      already_printed.append(sorted_df.index[i])

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=10, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# Third pass: every HSP perturbagen is labelled, even if it was labelled above.
texts = []
hsp_df = results_df[results_df["HSP"]]
for i in range(len(hsp_df)):
   texts.append(ax.text(hsp_df["meandiff"].iloc[i], 1/hsp_df["FDR"].iloc[i], hsp_df.index[i], size=4, va="center"))

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=0.5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# Dotted zero line starting at the FDR = 0.05 level (1/0.05 on this axis).
ax.vlines(0, 1/0.05, 10e35, color="gray", linestyles=":")

# Percentage of negative (left) and positive (right) mean differences per class.
ax.text(-0.01, y=10e35, s="{:.1f}%".format(((all_significant < 0).sum() / len(all_significant)) * 100), ha="right", va="top", fontsize=8)
ax.text(+0.01, y=10e35, s="{:.1f}%".format(((all_significant > 0).sum() / len(all_significant)) * 100), ha="left", va="top", fontsize=8)

ax.text(-0.01, y=10e32, s="{:.1f}%".format(((peri_significant < 0).sum() / len(peri_significant)) * 100), ha="right", va="top", color="#8a8a8a", fontsize=8)
ax.text(+0.01, y=10e32, s="{:.1f}%".format(((peri_significant > 0).sum() / len(peri_significant)) * 100), ha="left", va="top", color="#8a8a8a", fontsize=8)

ax.text(-0.01, y=10e29, s="{:.1f}%".format(((core_significant < 0).sum() / len(core_significant)) * 100), ha="right", va="top", color="#01016f", fontsize=8)
ax.text(+0.01, y=10e29, s="{:.1f}%".format(((core_significant > 0).sum() / len(core_significant)) * 100), ha="left", va="top", color="#01016f", fontsize=8)

ax.text(-0.01, y=10e26, s="{:.1f}%".format(((hsp_significant < 0).sum() / len(hsp_significant)) * 100), ha="right", va="top", color="#d8031c", fontsize=8)
ax.text(+0.01, y=10e26, s="{:.1f}%".format(((hsp_significant > 0).sum() / len(hsp_significant)) * 100), ha="left", va="top", color="#d8031c", fontsize=8)

legend_elements = [Patch(facecolor='black', edgecolor='black',
                         label='Any\nn={}'.format(len(all_significant))),
                   Patch(facecolor='#8a8a8a', edgecolor='#8a8a8a',
                         label='Peripheral\nn={}'.format(len(peri_significant))),
                   Patch(facecolor='#01016f', edgecolor='#01016f',
                         label='Core Gene\nn={}'.format(len(core_significant))),
                   Patch(facecolor='#d8031c', edgecolor="#d8031c",
                         label='HSP\nn={}'.format(len(hsp_significant)))]


leg = ax.legend(handles=legend_elements, loc='upper left', title="Perturbagen", fontsize=6.8, title_fontsize=7, ncol=2, columnspacing=0.5, handletextpad=-0.5)

# Reshape the legend handles into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

ax.set_ylim(bottom=5, top=1e65)

ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
#plt.tight_layout()
plt.savefig("Volcano_Overexpression_all.svg", bbox_inches="tight")


In [None]:
# Rows of the filtered results table with a negative mean difference.
results_df[results_df["meandiff"] < 0]

In [None]:
# Display the filtered results table.
results_df

In [None]:
# First eight FDR values of sorted_df. NOTE(review): sorted_df is whatever the
# last executed plotting cell left behind, so this output depends on execution order.
sorted_df["FDR"][:8]

In [None]:
# Split the overexpression matrix by row (target gene): core genes vs. non-core genes.
coregene_target = full_oe_df.loc[full_oe_df.index.isin(allcore), :]
noncore_target = full_oe_df.loc[full_oe_df.index.isin(noncore), :]

# Boolean masks over the columns, one per perturbagen class. A column label is
# reduced to the text before its first "." before the lookup. The label list and
# the "non-core without HSPs" set are built once instead of once per column.
_oe_column_genes = [value.split(".")[0] for value in full_oe_df.columns]
_oe_noncore_without_hsps = noncore.difference(hsps)

coregene_mask = np.asarray([gene in allcore for gene in _oe_column_genes])
hsp_mask = np.asarray([gene in hsps for gene in _oe_column_genes])
noncore_mask = np.asarray([gene in _oe_noncore_without_hsps for gene in _oe_column_genes])

# Two-sample test between the groups; element 1 of the result is passed to
# fdrcorrection. NOTE(review): presumably scipy's ttest_ind and statsmodels'
# fdrcorrection - the imports are outside this cell, confirm.
total_result = ttest_ind(coregene_target, noncore_target)
fdr = fdrcorrection(total_result[1])

# Column-wise difference of the group means (core minus non-core).
total_meandiff = coregene_target.mean(axis=0) - noncore_target.mean(axis=0)

In [None]:
from speos.visualization.settings import *
from matplotlib.patches import Patch
import matplotlib.pyplot as plt

# Volcano plot built directly from the column-wise t-test: mean difference on x,
# 1 / corrected p-value on a log-scaled y axis, one colour per perturbagen class.
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))

# Column selections: flagged by fdr[0] and member of the respective class.
_peri_hits = fdr[0] & noncore_mask
_core_hits = fdr[0] & coregene_mask
_hsp_hits = fdr[0] & hsp_mask

all_significant = total_meandiff[fdr[0] & (noncore_mask + coregene_mask + hsp_mask)]
core_significant = total_meandiff[_core_hits]
hsp_significant = total_meandiff[_hsp_hits]
peri_significant = total_meandiff[_peri_hits]

for _values, _hits, _colour in (
    (peri_significant, _peri_hits, "#5a5a5a"),
    (core_significant, _core_hits, "#01016f"),
    (hsp_significant, _hsp_hits, "#d8031c"),
):
    ax.scatter(x=_values, y=1 / fdr[1][_hits], s=5, c=_colour)

# Dotted zero line starting at the 0.05 level (1/0.05 on this axis).
ax.vlines(0, 1/0.05, 10e35, color="gray", linestyles=":")

# (legend label template, values, y position of the percentages, text colour, patch colour)
# A text colour of None leaves matplotlib's default text colour in place.
_groups = (
    ('Any\nn={}', all_significant, 10e35, None, 'black'),
    ('Peripheral\nn={}', peri_significant, 10e32, "#5a5a5a", '#5a5a5a'),
    ('Core Gene\nn={}', core_significant, 10e29, "#01016f", '#01016f'),
    ('HSP\nn={}', hsp_significant, 10e26, "#d8031c", '#d8031c'),
)

# Percentage of negative (left of zero) and positive (right of zero) values per group.
for _template, _values, _height, _text_colour, _patch_colour in _groups:
    _pct_negative = ((_values < 0).sum() / len(_values)) * 100
    _pct_positive = ((_values > 0).sum() / len(_values)) * 100
    ax.text(-0.01, y=_height, s="{:.1f}%".format(_pct_negative), ha="right", va="top", color=_text_colour, fontsize=8)
    ax.text(+0.01, y=_height, s="{:.1f}%".format(_pct_positive), ha="left", va="top", color=_text_colour, fontsize=8)

legend_elements = [
    Patch(facecolor=_patch_colour, edgecolor=_patch_colour, label=_template.format(len(_values)))
    for _template, _values, _height, _text_colour, _patch_colour in _groups
]

leg = ax.legend(
    handles=legend_elements,
    loc='upper left',
    title="Perturbagen",
    fontsize=6.8,
    title_fontsize=7,
    ncol=2,
    columnspacing=0.5,
    handletextpad=-0.5,
)

# Reshape the legend handles into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

ax.set_ylim(bottom=5, top=1e65)
ax.set_yscale("log")
ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
plt.tight_layout()
plt.savefig("Volcano_Overexpression.svg", bbox_inches="tight")


In [None]:
# Same analysis on the overexpression and knockdown matrices concatenated
# column-wise (axis=1); the full return value is displayed.
get_differential_percentages(pd.concat((full_oe_df, full_kd_df), axis=1), allcore, hsps, noncore)

# see if significant perturbagens are among the disease network nodes

In [None]:
# Gene sets for trait "uc", with all gene symbols of id2hgnc as background.
allcore, other_coregenes, hsps,  noncore = get_coregenes("uc", id2hgnc.values())

# Exported knockdown table for uc / HEK293T; significant = FDR below 0.05.
_kd_table_path = "/mnt/storage/cmap/2017/differential_perturbation_knockdown_{}_{}.tsv".format("uc", "HEK293T")
df = pd.read_csv(_kd_table_path, sep="\t", header=0, index_col=0)
_fdr_below_threshold = df["FDR"] < 0.05
sign_perturbagens = set(df.index[_fdr_below_threshold])

In [None]:
# Display the loaded knockdown differential-perturbation table.
df

In [None]:

# Number of perturbagens with FDR < 0.05.
len(sign_perturbagens)

In [None]:
# Background set for the enrichment test: every row label of the loaded table.
all_perturbagens = df.index
len(all_perturbagens)

In [None]:
# Edge list of the disease network (file suffix 05): tab-separated, no header
# row; the columns are named from / to / type / weight on load.
disease_edges = pd.read_csv("../disease_network_05.txt", sep="\t", index_col=False, header=None, names=["from", "to", "type", "weight"])

In [None]:
# Every gene that occurs as an endpoint of a disease-network edge, restricted to
# genes that are also perturbagens in the loaded table.
_edge_endpoints = set(disease_edges["from"].tolist()) | set(disease_edges["to"].tolist())
disease_network_nodes = _edge_endpoints & set(all_perturbagens)
len(disease_network_nodes)

In [None]:
# Number of disease-network nodes that are also significant perturbagens.
len(disease_network_nodes.intersection(sign_perturbagens))

In [None]:
from scipy.stats import fisher_exact

# 2x2 contingency table: rows = inside / outside the disease network,
# columns = significant / not significant perturbagen.
_outside_network = all_perturbagens.difference(disease_network_nodes)

network_and_perturbagen = len(disease_network_nodes.intersection(sign_perturbagens))
network_not_perturbagen = len(disease_network_nodes.difference(sign_perturbagens))
not_network_and_perturbagen = len(_outside_network.intersection(sign_perturbagens))
not_network_not_perturbagen = len(_outside_network.difference(sign_perturbagens))

array = [
    [network_and_perturbagen, network_not_perturbagen],
    [not_network_and_perturbagen, not_network_not_perturbagen],
]

fisher_exact(array)

In [None]:
# Display the 2x2 contingency table that was passed to fisher_exact.
array

# CAD Overexpression and Knockdown

In [None]:
import pandas as pd

trait = "cad"

# HSP gene symbols: first column of the headerless per-trait text file.
_hsp_table = pd.read_csv("../hsps/{}.txt".format(trait), header=None, index_col=None)
hsps = _hsp_table.iloc[:, 0].tolist()

from extensions.preprocessing import preprocess_labels

mendelians = preprocess_labels("../extensions/{}_really_only_genes.tsv".format(trait))

import json

# The first element of the results JSON maps each candidate gene to a score.
with open("/mnt/storage/speos/results/{}_really_film_nohetioouter_results.json".format(trait), "r") as results_file:
    candidate2cs = json.load(results_file)[0]

# Candidates scoring exactly 11 count as core genes, all others as "other" core
# genes. NOTE(review): 11 is presumably the maximum attainable score - confirm.
coregenes, other_coregenes = [], []
for _gene, _score in candidate2cs.items():
    (coregenes if _score == 11 else other_coregenes).append(_gene)

# Core set: top-scoring candidates plus the preprocessed label genes, limited to
# gene symbols known to id2hgnc.
_known_symbols = set(id2hgnc.values())
allcore = (set(coregenes) | set(mendelians)) & _known_symbols

# Non-core: every known symbol that is neither core nor a lower-scoring candidate.
noncore = set(id2hgnc.values()) - allcore - set(other_coregenes)

In [None]:
# Load the overexpression table (oe_df.tsv): tab-separated, first row as header, first column as index.
full_oe_df = pd.read_csv("/mnt/storage/cmap/2017/oe_df.tsv", header=0, sep="\t", index_col=0)

In [None]:
# Observed result for the actual CAD core gene set on the overexpression data.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_oe_df, allcore, hsps, noncore)

# Background: ten runs with randomize_core=True, one per seed 0..9.
# Only the first element of each per-group result is collected.
random_core, random_hsp, random_peri = [], [], []
for i in range(10):
    randomized_run = get_differential_percentages(
        full_oe_df, allcore, hsps, noncore, randomize_core=True, random_seed=i
    )
    _, _, core_result_random, hsp_result_random, peri_result_random = randomized_run
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])


In [None]:
import seaborn as sns
from speos.visualization.settings import *
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8*cm, 5*cm))

# One row per perturbagen group (with the second element of each result as n), and one column
# each for the real core gene set and the mean over the ten randomized runs.
row_labels = [
    "HSP\n(n={})".format(hsp_result[1]),
    "Peripheral\n(n={})".format(peri_result[1]),
    "Core\n(n={})".format(core_result[1]),
]
observed_label = "Core Genes\nn={}".format(len(allcore))
random_label = "Random Genes\nn={} (10x)".format(len(allcore))
kd_matrix_mean = pd.DataFrame(
    index=row_labels,
    data={
        observed_label: [hsp_result[0], peri_result[0], core_result[0]],
        random_label: [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)],
    },
)

# Heatmap of the transposed matrix, annotated as percentages on a fixed 0..1 colour scale.
colorbar_options = {'label': "Fraction Significant\nDifferential Perturbations", "pad": 0.01}
ax = sns.heatmap(
    kd_matrix_mean.transpose(),
    vmin=0,
    vmax=1,
    cmap="Oranges",
    annot=True,
    fmt=".1%",
    ax=ax,
    cbar_kws=colorbar_options,
)

# Re-apply the colorbar label with a smaller font.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)

ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Overexpression)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_overexpression_cad.svg", bbox_inches="tight")

In [None]:
# Recompute the overexpression results with use_min=False and put one perturbagen per row.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(
    full_oe_df, allcore, hsps, noncore, use_min=False
)
results_df = results_df.transpose()

# Keep rows flagged in at least one of the three groups ...
group_flag_sum = results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]
in_any_group = group_flag_sum.values.astype(np.bool_)
results_df = results_df[in_any_group]

# ... and of those only the ones with FDR below 0.05.
results_df = results_df[results_df["FDR"] < 0.05]

In [None]:
from speos.visualization.settings import *
from matplotlib.patches import Patch
from adjustText import adjust_text
import matplotlib.pyplot as plt

# Volcano-style plot of the filtered CAD overexpression results held in results_df
# (one row per perturbagen): x = "meandiff", y = 1 / FDR on a log axis.

fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))
all_significant = results_df["meandiff"]
core_significant = results_df["meandiff"][results_df["Core Gene"]]
hsp_significant = results_df["meandiff"][results_df["HSP"]]
peri_significant = results_df["meandiff"][results_df["Peripheral"]]

core_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Core Gene"])]
hsp_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["HSP"])]
peri_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Peripheral"])]
ax.set_yscale("log")
# Drawing order matters: peripheral first, then core, then HSP on top.
ax.scatter(x=peri_significant,y = 1 / peri_fdr, s=5, c="#8a8a8a")
ax.scatter(x=core_significant,y = 1 / core_fdr, s=5, c="#01016f")
ax.scatter(x=hsp_significant,y = 1 / hsp_fdr, s=5, c="#d8031c")


# --- Text labels -------------------------------------------------------------------------
# Rows are read by position with .iloc: the index holds the label text, so an integer key on
# the Series would rely on the deprecated positional fallback. Each loop is also capped at
# the number of available rows, so a short table no longer raises IndexError.

# 1) the 8 rows with the smallest FDR
texts = []
sorted_df = results_df.sort_values(by="FDR", ascending=True)
already_printed = []
for i in range(min(8, len(sorted_df))):
    label = sorted_df.index[i]
    texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
    already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y= (1/results_df["FDR"].values).tolist(), force_points=3, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# 2) the 3 most negative and 8 most positive mean differences, plus the 3 smallest-FDR rows
#    among the negative ones; rows that already carry a label are skipped
texts = []
sorted_df = results_df.sort_values(by="meandiff", ascending=True)
for i in range(min(3, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df.sort_values(by="meandiff", ascending=False)
for i in range(min(8, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df[results_df["meandiff"] < 0].sort_values(by="FDR", ascending=True)
for i in range(min(3, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=10, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# 3) every HSP row (labelled regardless of already_printed, as before)
texts = []
hsp_df = results_df[results_df["HSP"]]
for i in range(len(hsp_df)):
    texts.append(ax.text(hsp_df["meandiff"].iloc[i], 1/hsp_df["FDR"].iloc[i], hsp_df.index[i], size=4, va="center"))

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=0.5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# Vertical guide at x = 0, starting at the 1 / 0.05 significance level.
ax.vlines(0, 1/0.05, 10e35, color="gray", linestyles=":")

# Share of points left / right of zero: all rows first, then one line per group in its colour.
ax.text(-0.01, y=10e35, s="{:.1f}%".format(((all_significant < 0).sum() / len(all_significant)) * 100), ha="right", va="top", fontsize=8)
ax.text(+0.01, y=10e35, s="{:.1f}%".format(((all_significant > 0).sum() / len(all_significant)) * 100), ha="left", va="top", fontsize=8)

ax.text(-0.01, y=10e32, s="{:.1f}%".format(((peri_significant < 0).sum() / len(peri_significant)) * 100), ha="right", va="top", color="#8a8a8a", fontsize=8)
ax.text(+0.01, y=10e32, s="{:.1f}%".format(((peri_significant > 0).sum() / len(peri_significant)) * 100), ha="left", va="top", color="#8a8a8a", fontsize=8)

ax.text(-0.01, y=10e29, s="{:.1f}%".format(((core_significant < 0).sum() / len(core_significant)) * 100), ha="right", va="top", color="#01016f", fontsize=8)
ax.text(+0.01, y=10e29, s="{:.1f}%".format(((core_significant > 0).sum() / len(core_significant)) * 100), ha="left", va="top", color="#01016f", fontsize=8)

ax.text(-0.01, y=10e26, s="{:.1f}%".format(((hsp_significant < 0).sum() / len(hsp_significant)) * 100), ha="right", va="top", color="#d8031c", fontsize=8)
ax.text(+0.01, y=10e26, s="{:.1f}%".format(((hsp_significant > 0).sum() / len(hsp_significant)) * 100), ha="left", va="top", color="#d8031c", fontsize=8)

# Legend with one colour patch per group and the number of plotted rows in each.
legend_elements = [Patch(facecolor='black', edgecolor='black',
                         label='Any\nn={}'.format(len(all_significant))),
                   Patch(facecolor='#8a8a8a', edgecolor='#8a8a8a',
                         label='Peripheral\nn={}'.format(len(peri_significant))),
                   Patch(facecolor='#01016f', edgecolor='#01016f',
                         label='Core Gene\nn={}'.format(len(core_significant))),
                   Patch(facecolor='#d8031c', edgecolor="#d8031c",
                         label='HSP\nn={}'.format(len(hsp_significant)))]


leg = ax.legend(handles=legend_elements, loc='upper left', title="Perturbagen", fontsize=6.8, title_fontsize=7, ncol=2, columnspacing=0.5, handletextpad=-0.5)

# Reshape the legend patches into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

# Generous headroom above the data so the legend and the percentages fit.
ax.set_ylim(bottom=5, top=1e65)

ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
#plt.tight_layout()
plt.savefig("Volcano_Overexpression_cad.svg", bbox_inches="tight")


In [None]:
import pandas as pd
# Load the knockdown table (kd_df.tsv): tab-separated, first row as header, first column as index.
full_kd_df = pd.read_csv("/mnt/storage/cmap/2017/kd_df.tsv", header=0, sep="\t", index_col=0)

In [None]:
# Observed result for the actual CAD core gene set on the knockdown data (use_min=True).
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(
    full_kd_df, allcore, hsps, noncore, use_min=True
)

# Background: ten runs with randomize_core=True, one per seed 0..9.
# Only the first element of each per-group result is collected.
random_core, random_hsp, random_peri = [], [], []
for i in range(10):
    randomized_run = get_differential_percentages(
        full_kd_df, allcore, hsps, noncore, randomize_core=True, random_seed=i, use_min=True
    )
    _, _, core_result_random, hsp_result_random, peri_result_random = randomized_run
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

In [None]:
fig, ax = plt.subplots(figsize=(8*cm, 5*cm))

# One row per perturbagen group (with the second element of each result as n), and one column
# each for the real core gene set and the mean over the ten randomized runs.
row_labels = [
    "HSP" + "\n(n=%s)" % hsp_result[1],
    "Peripheral\n" + "(n=%s)" % peri_result[1],
    "Core Gene\n" + "(n=%s)" % core_result[1],
]
observed_label = "Core Genes\n" + "n={}".format(len(allcore))
random_label = "Random Genes\n" + "n={} (10x)".format(len(allcore))
kd_matrix_mean = pd.DataFrame(
    index=row_labels,
    data={
        observed_label: [hsp_result[0], peri_result[0], core_result[0]],
        random_label: [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)],
    },
)

# Heatmap of the transposed matrix, annotated as percentages on a fixed 0..1 colour scale.
colorbar_options = {'label': "Fraction Significant\nDifferential Perturbations", "pad": 0.01}
ax = sns.heatmap(
    kd_matrix_mean.transpose(),
    vmin=0,
    vmax=1,
    cmap="Purples",
    annot=True,
    fmt=".1%",
    ax=ax,
    cbar_kws=colorbar_options,
)

# Re-apply the colorbar label with a smaller font.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)

ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Knockdown)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_knockdown_cad.svg", bbox_inches="tight")

In [None]:
# Recompute the knockdown results (use_min=True) and put one perturbagen per row.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(
    full_kd_df, allcore, hsps, noncore, use_min=True
)
results_df = results_df.transpose()

# Keep rows flagged in at least one of the three groups ...
group_flag_sum = results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]
in_any_group = group_flag_sum.values.astype(np.bool_)
results_df = results_df[in_any_group]

# ... and of those only the ones with FDR below 0.05.
results_df = results_df[results_df["FDR"] < 0.05]

In [None]:
# Inspect the filtered knockdown results (rows in at least one gene group, FDR < 0.05).
results_df

In [None]:
from matplotlib.patches import Patch
import matplotlib.pyplot as plt

# Volcano-style plot of the filtered CAD knockdown results held in results_df
# (one row per perturbagen): x = "meandiff", y = 1 / FDR on a log axis.

fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))

is_significant = results_df["FDR"] < 0.05
# The three flag columns are summed and then cast to bool. AND-ing the raw sum with the boolean
# mask is a bitwise operation when the flags are not a true bool dtype, so a row that belongs
# to exactly two groups (sum == 2) would evaluate to False and silently drop out of "Any".
in_any_group = (results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]).astype(bool)

all_significant = results_df["meandiff"][is_significant & in_any_group]
core_significant = results_df["meandiff"][is_significant & (results_df["Core Gene"])]
hsp_significant = results_df["meandiff"][is_significant & (results_df["HSP"])]
peri_significant = results_df["meandiff"][is_significant & (results_df["Peripheral"])]

core_fdr = results_df["FDR"][is_significant & (results_df["Core Gene"])]
hsp_fdr = results_df["FDR"][is_significant & (results_df["HSP"])]
peri_fdr = results_df["FDR"][is_significant & (results_df["Peripheral"])]
ax.set_yscale("log")
# Drawing order matters: peripheral first, then core, then HSP on top.
ax.scatter(x=peri_significant,y = 1 / peri_fdr, s=5, c="#8a8a8a")
ax.scatter(x=core_significant,y = 1 / core_fdr, s=5, c="#01016f")
ax.scatter(x=hsp_significant,y = 1 / hsp_fdr, s=5, c="#d8031c")

# --- Text labels -------------------------------------------------------------------------
# Rows are read by position with .iloc: the index holds the label text, so an integer key on
# the Series would rely on the deprecated positional fallback. Each loop is also capped at
# the number of available rows, so a short table no longer raises IndexError.

# 1) the 8 rows with the smallest FDR
texts = []
sorted_df = results_df.sort_values(by="FDR", ascending=True)
already_printed = []
for i in range(min(8, len(sorted_df))):
    label = sorted_df.index[i]
    texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
    already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y= (1/results_df["FDR"].values).tolist(), force_points=3, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# 2) the 3 most negative and 5 most positive mean differences, plus the 5 smallest-FDR rows
#    among the negative ones; rows that already carry a label are skipped
texts = []
sorted_df = results_df.sort_values(by="meandiff", ascending=True)
for i in range(min(3, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df.sort_values(by="meandiff", ascending=False)
for i in range(min(5, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df[results_df["meandiff"] < 0].sort_values(by="FDR", ascending=True)
for i in range(min(5, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# 3) every HSP row (labelled regardless of already_printed, as before)
texts = []
hsp_df = results_df[results_df["HSP"]]
for i in range(len(hsp_df)):
    texts.append(ax.text(hsp_df["meandiff"].iloc[i], 1/hsp_df["FDR"].iloc[i], hsp_df.index[i], size=4, va="center"))

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=0.5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# Vertical guide at x = 0, starting at the 1 / 0.05 significance level.
ax.vlines(0, 1/0.05, 10e28, color="gray", linestyles=":")

# Share of points left / right of zero: all rows first, then one line per group in its colour.
ax.text(-0.01, y=10e28, s="{:.1f}%".format(((all_significant < 0).sum() / len(all_significant)) * 100), ha="right", va="top", fontsize=8)
ax.text(+0.01, y=10e28, s="{:.1f}%".format(((all_significant > 0).sum() / len(all_significant)) * 100), ha="left", va="top", fontsize=8)


ax.text(-0.01, y=10e26, s="{:.1f}%".format(((peri_significant < 0).sum() / len(peri_significant)) * 100), ha="right", va="top", color="#8a8a8a", fontsize=8)
ax.text(+0.01, y=10e26, s="{:.1f}%".format(((peri_significant > 0).sum() / len(peri_significant)) * 100), ha="left", va="top", color="#8a8a8a", fontsize=8)

ax.text(-0.01, y=10e24, s="{:.1f}%".format(((core_significant < 0).sum() / len(core_significant)) * 100), ha="right", va="top", color="#01016f", fontsize=8)
ax.text(+0.01, y=10e24, s="{:.1f}%".format(((core_significant > 0).sum() / len(core_significant)) * 100), ha="left", va="top", color="#01016f", fontsize=8)

ax.text(-0.01, y=10e22, s="{:.1f}%".format(((hsp_significant < 0).sum() / len(hsp_significant)) * 100), ha="right", va="top", color="#d8031c", fontsize=8)
ax.text(+0.01, y=10e22, s="{:.1f}%".format(((hsp_significant > 0).sum() / len(hsp_significant)) * 100), ha="left", va="top", color="#d8031c", fontsize=8)


# Legend with one colour patch per group and the number of plotted rows in each.
legend_elements = [Patch(facecolor='black', edgecolor='black',
                         label='Any\nn={}'.format(len(all_significant))),
                   Patch(facecolor='#8a8a8a', edgecolor='#8a8a8a',
                         label='Peripheral\nn={}'.format(len(peri_significant))),
                   Patch(facecolor='#01016f', edgecolor='#01016f',
                         label='Core Gene\nn={}'.format(len(core_significant))),
                   Patch(facecolor='#d8031c', edgecolor="#d8031c",
                         label='HSP\nn={}'.format(len(hsp_significant)))]


leg = ax.legend(handles=legend_elements, loc='upper left', title="Perturbagen", fontsize=6.8, title_fontsize=7, ncol=4, columnspacing=1.7, handletextpad=-0.7)

# Reshape the legend patches into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

# Generous headroom above the data so the legend and the percentages fit.
ax.set_ylim(bottom=5, top=10e38)
ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
plt.tight_layout()
plt.savefig("Volcano_Knockdown_cad.svg", bbox_inches="tight")


# SCZ Overexpression and Knockdown

In [None]:
import pandas as pd
from extensions.preprocessing import preprocess_labels
import json

trait = "scz"

# HSP list: first column of the headerless per-trait text file.
hsps = pd.read_csv("../hsps/{}.txt".format(trait), header=None, index_col=None).iloc[:, 0].tolist()

# Label genes for this trait.
mendelians = preprocess_labels("../extensions/{}_only_genes.tsv".format(trait))

# Candidate gene -> score mapping; only the first element of the JSON document is used.
with open("/mnt/storage/speos/results/{}_film_nohetioouter_results.json".format(trait), "r") as file:
    candidate2cs = json.load(file)[0]

# Split the candidates by score: exactly 11 vs. anything else
# (presumably 11 is the maximum consensus score -- TODO confirm).
coregenes = []
other_coregenes = []
for candidate, score in candidate2cs.items():
    if score == 11:
        coregenes.append(candidate)
    else:
        other_coregenes.append(candidate)

# Core set: score-11 candidates plus label genes, limited to symbols known to id2hgnc.
measured_genes = set(id2hgnc.values())
allcore = (set(coregenes) | set(mendelians)) & measured_genes

# Non-core set: every remaining known symbol that is not a lower-scored candidate either.
noncore = measured_genes - allcore - set(other_coregenes)

In [None]:
# Observed result for the actual SCZ core gene set on the overexpression data.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_oe_df, allcore, hsps, noncore)

# Background: ten runs with randomize_core=True, one per seed 0..9.
# Only the first element of each per-group result is collected.
random_core, random_hsp, random_peri = [], [], []
for i in range(10):
    randomized_run = get_differential_percentages(
        full_oe_df, allcore, hsps, noncore, randomize_core=True, random_seed=i
    )
    _, _, core_result_random, hsp_result_random, peri_result_random = randomized_run
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

import seaborn as sns
from speos.visualization.settings import *
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8*cm, 5*cm))

# One row per perturbagen group (with the second element of each result as n), and one column
# each for the real core gene set and the mean over the ten randomized runs.
row_labels = [
    "HSP\n(n={})".format(hsp_result[1]),
    "Peripheral\n(n={})".format(peri_result[1]),
    "Core\n(n={})".format(core_result[1]),
]
observed_label = "Core Genes\nn={}".format(len(allcore))
random_label = "Random Genes\nn={} (10x)".format(len(allcore))
kd_matrix_mean = pd.DataFrame(
    index=row_labels,
    data={
        observed_label: [hsp_result[0], peri_result[0], core_result[0]],
        random_label: [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)],
    },
)

# Heatmap of the transposed matrix, annotated as percentages on a fixed 0..1 colour scale.
colorbar_options = {'label': "Fraction Significant\nDifferential Perturbations", "pad": 0.01}
ax = sns.heatmap(
    kd_matrix_mean.transpose(),
    vmin=0,
    vmax=1,
    cmap="Oranges",
    annot=True,
    fmt=".1%",
    ax=ax,
    cbar_kws=colorbar_options,
)

# Re-apply the colorbar label with a smaller font.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)

ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Overexpression)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_overexpression_scz.svg", bbox_inches="tight")

In [None]:
from matplotlib.patches import Patch
from adjustText import adjust_text

# Recompute the SCZ overexpression results (use_min=False), one perturbagen per row, and keep
# only rows that are flagged in at least one group and have FDR < 0.05.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_oe_df, allcore, hsps, noncore, use_min=False)
results_df = results_df.transpose()
results_df = results_df[(results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]).values.astype(np.bool_)]
results_df = results_df[results_df["FDR"] < 0.05]

# Volcano-style plot: x = "meandiff", y = 1 / FDR on a log axis.
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))
all_significant = results_df["meandiff"]
core_significant = results_df["meandiff"][results_df["Core Gene"]]
hsp_significant = results_df["meandiff"][results_df["HSP"]]
peri_significant = results_df["meandiff"][results_df["Peripheral"]]

core_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Core Gene"])]
hsp_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["HSP"])]
peri_fdr = results_df["FDR"][(results_df["FDR"] < 0.05) & (results_df["Peripheral"])]
ax.set_yscale("log")
# Drawing order matters: peripheral first, then core, then HSP on top.
ax.scatter(x=peri_significant,y = 1 / peri_fdr, s=5, c="#8a8a8a")
ax.scatter(x=core_significant,y = 1 / core_fdr, s=5, c="#01016f")
ax.scatter(x=hsp_significant,y = 1 / hsp_fdr, s=5, c="#d8031c")


# --- Text labels -------------------------------------------------------------------------
# Rows are read by position with .iloc: the index holds the label text, so an integer key on
# the Series would rely on the deprecated positional fallback. Each loop is also capped at
# the number of available rows, so a short table no longer raises IndexError.

# 1) the 8 rows with the smallest FDR
texts = []
sorted_df = results_df.sort_values(by="FDR", ascending=True)
already_printed = []
for i in range(min(8, len(sorted_df))):
    label = sorted_df.index[i]
    texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
    already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y= (1/results_df["FDR"].values).tolist(), force_points=3, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# 2) the 3 most negative and 8 most positive mean differences, plus the 3 smallest-FDR rows
#    among the negative ones; rows that already carry a label are skipped
texts = []
sorted_df = results_df.sort_values(by="meandiff", ascending=True)
for i in range(min(3, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df.sort_values(by="meandiff", ascending=False)
for i in range(min(8, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df[results_df["meandiff"] < 0].sort_values(by="FDR", ascending=True)
for i in range(min(3, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=10, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# 3) every HSP row (labelled regardless of already_printed, as before)
texts = []
hsp_df = results_df[results_df["HSP"]]
for i in range(len(hsp_df)):
    texts.append(ax.text(hsp_df["meandiff"].iloc[i], 1/hsp_df["FDR"].iloc[i], hsp_df.index[i], size=4, va="center"))

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=0.5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# Vertical guide at x = 0, starting at the 1 / 0.05 significance level.
ax.vlines(0, 1/0.05, 10e20, color="gray", linestyles=":")

# Share of points left / right of zero: all rows first, then one line per group in its colour.
ax.text(-0.01, y=10e18, s="{:.1f}%".format(((all_significant < 0).sum() / len(all_significant)) * 100), ha="right", va="top", fontsize=8)
ax.text(+0.01, y=10e18, s="{:.1f}%".format(((all_significant > 0).sum() / len(all_significant)) * 100), ha="left", va="top", fontsize=8)

ax.text(-0.01, y=10e17, s="{:.1f}%".format(((peri_significant < 0).sum() / len(peri_significant)) * 100), ha="right", va="top", color="#8a8a8a", fontsize=8)
ax.text(+0.01, y=10e17, s="{:.1f}%".format(((peri_significant > 0).sum() / len(peri_significant)) * 100), ha="left", va="top", color="#8a8a8a", fontsize=8)

ax.text(-0.01, y=10e16, s="{:.1f}%".format(((core_significant < 0).sum() / len(core_significant)) * 100), ha="right", va="top", color="#01016f", fontsize=8)
ax.text(+0.01, y=10e16, s="{:.1f}%".format(((core_significant > 0).sum() / len(core_significant)) * 100), ha="left", va="top", color="#01016f", fontsize=8)

ax.text(-0.01, y=10e15, s="{:.1f}%".format(((hsp_significant < 0).sum() / len(hsp_significant)) * 100), ha="right", va="top", color="#d8031c", fontsize=8)
ax.text(+0.01, y=10e15, s="{:.1f}%".format(((hsp_significant > 0).sum() / len(hsp_significant)) * 100), ha="left", va="top", color="#d8031c", fontsize=8)

# Legend with one colour patch per group and the number of plotted rows in each.
legend_elements = [Patch(facecolor='black', edgecolor='black',
                         label='Any\nn={}'.format(len(all_significant))),
                   Patch(facecolor='#8a8a8a', edgecolor='#8a8a8a',
                         label='Peripheral\nn={}'.format(len(peri_significant))),
                   Patch(facecolor='#01016f', edgecolor='#01016f',
                         label='Core Gene\nn={}'.format(len(core_significant))),
                   Patch(facecolor='#d8031c', edgecolor="#d8031c",
                         label='HSP\nn={}'.format(len(hsp_significant)))]


leg = ax.legend(handles=legend_elements, loc='upper left', title="Perturbagen", fontsize=6.8, title_fontsize=7, ncol=2, columnspacing=0.5, handletextpad=-0.5)

# Reshape the legend patches into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

ax.set_ylim(bottom=5, top=1e20)

ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
#plt.tight_layout()
plt.savefig("Volcano_Overexpression_scz.svg", bbox_inches="tight")


In [None]:
# Observed result for the actual SCZ core gene set on the knockdown data (use_min=True).
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(
    full_kd_df, allcore, hsps, noncore, use_min=True
)

# Background: ten runs with randomize_core=True, one per seed 0..9.
# Only the first element of each per-group result is collected.
random_core, random_hsp, random_peri = [], [], []
for i in range(10):
    randomized_run = get_differential_percentages(
        full_kd_df, allcore, hsps, noncore, randomize_core=True, random_seed=i, use_min=True
    )
    _, _, core_result_random, hsp_result_random, peri_result_random = randomized_run
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

fig, ax = plt.subplots(figsize=(8*cm, 5*cm))

# One row per perturbagen group (with the second element of each result as n), and one column
# each for the real core gene set and the mean over the ten randomized runs.
row_labels = [
    "HSP" + "\n(n=%s)" % hsp_result[1],
    "Peripheral\n" + "(n=%s)" % peri_result[1],
    "Core Gene\n" + "(n=%s)" % core_result[1],
]
observed_label = "Core Genes\n" + "n={}".format(len(allcore))
random_label = "Random Genes\n" + "n={} (10x)".format(len(allcore))
kd_matrix_mean = pd.DataFrame(
    index=row_labels,
    data={
        observed_label: [hsp_result[0], peri_result[0], core_result[0]],
        random_label: [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)],
    },
)

# Heatmap of the transposed matrix, annotated as percentages on a fixed 0..1 colour scale.
colorbar_options = {'label': "Fraction Significant\nDifferential Perturbations", "pad": 0.01}
ax = sns.heatmap(
    kd_matrix_mean.transpose(),
    vmin=0,
    vmax=1,
    cmap="Purples",
    annot=True,
    fmt=".1%",
    ax=ax,
    cbar_kws=colorbar_options,
)

# Re-apply the colorbar label with a smaller font.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)

ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Knockdown)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_knockdown_scz.svg", bbox_inches="tight")

In [None]:
from matplotlib.patches import Patch
import matplotlib.pyplot as plt


# Recompute the SCZ knockdown results (use_min=True), one perturbagen per row, and keep only
# rows that are flagged in at least one group and have FDR < 0.05.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True)
results_df = results_df.transpose()
results_df = results_df[(results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]).values.astype(np.bool_)]
results_df = results_df[results_df["FDR"] < 0.05]


# Volcano-style plot: x = "meandiff", y = 1 / FDR on a log axis.
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 8*cm))

is_significant = results_df["FDR"] < 0.05
# The three flag columns are summed and then cast to bool. AND-ing the raw sum with the boolean
# mask is a bitwise operation when the flags are not a true bool dtype, so a row that belongs
# to exactly two groups (sum == 2) would evaluate to False and silently drop out of "Any".
in_any_group = (results_df["Core Gene"] + results_df["HSP"] + results_df["Peripheral"]).astype(bool)

all_significant = results_df["meandiff"][is_significant & in_any_group]
core_significant = results_df["meandiff"][is_significant & (results_df["Core Gene"])]
hsp_significant = results_df["meandiff"][is_significant & (results_df["HSP"])]
peri_significant = results_df["meandiff"][is_significant & (results_df["Peripheral"])]

core_fdr = results_df["FDR"][is_significant & (results_df["Core Gene"])]
hsp_fdr = results_df["FDR"][is_significant & (results_df["HSP"])]
peri_fdr = results_df["FDR"][is_significant & (results_df["Peripheral"])]
ax.set_yscale("log")
# Drawing order matters: peripheral first, then core, then HSP on top.
ax.scatter(x=peri_significant,y = 1 / peri_fdr, s=5, c="#8a8a8a")
ax.scatter(x=core_significant,y = 1 / core_fdr, s=5, c="#01016f")
ax.scatter(x=hsp_significant,y = 1 / hsp_fdr, s=5, c="#d8031c")

# --- Text labels -------------------------------------------------------------------------
# Rows are read by position with .iloc: the index holds the label text, so an integer key on
# the Series would rely on the deprecated positional fallback. Each loop is also capped at
# the number of available rows, so a short table no longer raises IndexError.

# 1) the 8 rows with the smallest FDR
texts = []
sorted_df = results_df.sort_values(by="FDR", ascending=True)
already_printed = []
for i in range(min(8, len(sorted_df))):
    label = sorted_df.index[i]
    texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
    already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y= (1/results_df["FDR"].values).tolist(), force_points=3, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# 2) the 3 most negative and 5 most positive mean differences, plus the 5 smallest-FDR rows
#    among the negative ones; rows that already carry a label are skipped
texts = []
sorted_df = results_df.sort_values(by="meandiff", ascending=True)
for i in range(min(3, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df.sort_values(by="meandiff", ascending=False)
for i in range(min(5, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

sorted_df = results_df[results_df["meandiff"] < 0].sort_values(by="FDR", ascending=True)
for i in range(min(5, len(sorted_df))):
    label = sorted_df.index[i]
    if label not in already_printed:
        texts.append(ax.text(sorted_df["meandiff"].iloc[i], 1/sorted_df["FDR"].iloc[i], label, size=4, va="center"))
        already_printed.append(label)

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)

# 3) every HSP row (labelled regardless of already_printed, as before)
texts = []
hsp_df = results_df[results_df["HSP"]]
for i in range(len(hsp_df)):
    texts.append(ax.text(hsp_df["meandiff"].iloc[i], 1/hsp_df["FDR"].iloc[i], hsp_df.index[i], size=4, va="center"))

adjust_text(texts, x=results_df["meandiff"].values.tolist(), y=(1/results_df["FDR"].values).tolist(), force_points=0.5, arrowprops=dict(arrowstyle='-', color='black', lw=0.5), ax=ax)


# Vertical guide at x = 0, starting at the 1 / 0.05 significance level.
ax.vlines(0, 1/0.05, 10e16, color="gray", linestyles=":")

# Share of points left / right of zero: all rows first, then one line per group in its colour.
ax.text(-0.01, y=10e16, s="{:.1f}%".format(((all_significant < 0).sum() / len(all_significant)) * 100), ha="right", va="top", fontsize=8)
ax.text(+0.01, y=10e16, s="{:.1f}%".format(((all_significant > 0).sum() / len(all_significant)) * 100), ha="left", va="top", fontsize=8)


ax.text(-0.01, y=10e15, s="{:.1f}%".format(((peri_significant < 0).sum() / len(peri_significant)) * 100), ha="right", va="top", color="#8a8a8a", fontsize=8)
ax.text(+0.01, y=10e15, s="{:.1f}%".format(((peri_significant > 0).sum() / len(peri_significant)) * 100), ha="left", va="top", color="#8a8a8a", fontsize=8)

ax.text(-0.01, y=10e14, s="{:.1f}%".format(((core_significant < 0).sum() / len(core_significant)) * 100), ha="right", va="top", color="#01016f", fontsize=8)
ax.text(+0.01, y=10e14, s="{:.1f}%".format(((core_significant > 0).sum() / len(core_significant)) * 100), ha="left", va="top", color="#01016f", fontsize=8)

ax.text(-0.01, y=10e13, s="{:.1f}%".format(((hsp_significant < 0).sum() / len(hsp_significant)) * 100), ha="right", va="top", color="#d8031c", fontsize=8)
ax.text(+0.01, y=10e13, s="{:.1f}%".format(((hsp_significant > 0).sum() / len(hsp_significant)) * 100), ha="left", va="top", color="#d8031c", fontsize=8)


# Legend with one colour patch per group and the number of plotted rows in each.
legend_elements = [Patch(facecolor='black', edgecolor='black',
                         label='Any\nn={}'.format(len(all_significant))),
                   Patch(facecolor='#8a8a8a', edgecolor='#8a8a8a',
                         label='Peripheral\nn={}'.format(len(peri_significant))),
                   Patch(facecolor='#01016f', edgecolor='#01016f',
                         label='Core Gene\nn={}'.format(len(core_significant))),
                   Patch(facecolor='#d8031c', edgecolor="#d8031c",
                         label='HSP\nn={}'.format(len(hsp_significant)))]


leg = ax.legend(handles=legend_elements, loc='upper left', title="Perturbagen", fontsize=6.8, title_fontsize=7, ncol=4, columnspacing=1.7, handletextpad=-0.7)

# Reshape the legend patches into narrow vertical bars.
for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)

ax.set_ylim(bottom=5, top=10e20)
ax.set_ylabel(r"$-\log(FDR)$")
ax.set_xlabel("Mean Differential Perturbation\n(Core Gene - Peripheral)")
plt.tight_layout()
plt.savefig("Volcano_Knockdown_scz.svg", bbox_inches="tight")


# Now mix Core Genes from UC, SCZ and CAD

In [None]:
import pandas as pd
from extensions.preprocessing import preprocess_labels
import json

# Per-trait inputs, processed in this order: (trait, label-file template, results-file template).
# CAD uses the "_really" file-name variants; SCZ and UC use the plain ones.
trait_sources = [
    ("cad", "../extensions/{}_really_only_genes.tsv", "/mnt/storage/speos/results/{}_really_film_nohetioouter_results.json"),
    ("scz", "../extensions/{}_only_genes.tsv", "/mnt/storage/speos/results/{}_film_nohetioouter_results.json"),
    ("uc", "../extensions/{}_only_genes.tsv", "/mnt/storage/speos/results/{}_film_nohetioouter_results.json"),
]

measured_genes = set(id2hgnc.values())

# Both sets are built up cumulatively: allcore grows with every trait, noncore shrinks.
allcore = set()
noncore = set(measured_genes)
hsps_by_trait = {}

for trait, labels_template, results_template in trait_sources:
    # HSP list: first column of the headerless per-trait text file.
    hsps_by_trait[trait] = pd.read_csv("../hsps/{}.txt".format(trait), header=None, index_col=None).iloc[:, 0].tolist()

    mendelians = preprocess_labels(labels_template.format(trait))

    # Candidate gene -> score mapping; only the first element of the JSON document is used.
    with open(results_template.format(trait), "r") as file:
        candidate2cs = json.load(file)[0]

    coregenes = [key for key, value in candidate2cs.items() if value == 11]
    other_coregenes = [key for key, value in candidate2cs.items() if value != 11]

    # Add this trait's score-11 candidates and label genes, limited to known symbols.
    allcore.update(set(coregenes))
    allcore.update(set(mendelians))
    allcore = allcore.intersection(measured_genes)

    # Remove everything that is core so far, plus this trait's lower-scored candidates.
    noncore = noncore.difference(allcore).difference(other_coregenes)

cad_hsps = hsps_by_trait["cad"]
scz_hsps = hsps_by_trait["scz"]
uc_hsps = hsps_by_trait["uc"]

# Union of the HSP lists of all three traits.
hsps = set(scz_hsps) | set(cad_hsps) | set(uc_hsps)
len(hsps)

In [None]:
# Size of the pooled core gene set: core genes and Mendelian genes of CAD, SCZ
# and UC, restricted to gene symbols present in id2hgnc.
len(allcore)

In [None]:
# Number of gene symbols in id2hgnc that are neither in allcore nor listed as a
# candidate with a score other than 11 for any of the pooled traits.
len(noncore)

In [None]:
# Number of distinct gene symbols in the id2hgnc lookup, i.e. the universe that
# allcore and noncore are restricted to.
len(set(id2hgnc.values()))

In [None]:
# Overexpression experiments evaluated against the pooled gene sets.
# Of each *_result, element 0 is plotted as a fraction and element 1 is shown
# as the "n" in the axis labels.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(
    full_oe_df, allcore, hsps, noncore
)

# Baseline: ten reruns with randomize_core=True, one per seed 0..9; only the
# fractions (element 0) are kept.
random_core, random_hsp, random_peri = [], [], []
for i in range(10):
    _, _, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(
        full_oe_df, allcore, hsps, noncore, randomize_core=True, random_seed=i
    )
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

import seaborn as sns
# NOTE(review): cm presumably comes from this star import (np is not imported
# in this cell either) — confirm and import the needed names explicitly.
from speos.visualization.settings import *
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8*cm, 5*cm))

# Columns: real core genes vs. the mean over the random reruns.
# Rows: gene set of the perturbagen.
n_core_genes = len(allcore)
kd_matrix_mean = pd.DataFrame(
    {
        f"Core Genes\nn={n_core_genes}": [hsp_result[0], peri_result[0], core_result[0]],
        f"Random Genes\nn={n_core_genes} (10x)": [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)],
    },
    index=[f"HSP\n(n={hsp_result[1]})", f"Peripheral\n(n={peri_result[1]})", f"Core\n(n={core_result[1]})"],
)

colorbar_label = "Fraction Significant\nDifferential Perturbations"
ax = sns.heatmap(
    kd_matrix_mean.T,
    vmin=0,
    vmax=1,
    cmap="Oranges",
    annot=True,
    fmt=".1%",
    ax=ax,
    cbar_kws={"label": colorbar_label, "pad": 0.01},
)
# Re-apply the colorbar label to control its font size.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel(colorbar_label, fontsize=5)
ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Overexpression)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_overexpression_mix.svg", bbox_inches="tight")

In [None]:
# Knockdown experiments evaluated against the pooled gene sets, with
# use_min=True and use_t_test=False.
# Of each *_result, element 0 is plotted as a fraction and element 1 is shown
# as the "n" in the axis labels.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True, use_t_test=False)


# Baseline: ten reruns with randomize_core=True, one per seed 0..9.
# use_t_test is passed explicitly so that the baseline is computed with the
# same settings as the real run above instead of the function's default.
random_core = []
random_hsp = []
random_peri = []
for i in range(10):
    _, _, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(full_kd_df, allcore, hsps, noncore, randomize_core=True, random_seed=i, use_min=True, use_t_test=False)
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

# NOTE(review): plt, sns, np and cm are not imported in this cell; they come
# from earlier cells.
fig, ax= plt.subplots(figsize=(8*cm,5*cm))

# Columns: real core genes vs. the mean over the random reruns.
# Rows: gene set of the perturbagen.
kd_matrix_mean = pd.DataFrame(index=["HSP\n(n=%s)" % hsp_result[1], "Peripheral\n(n=%s)" % peri_result[1], "Core Gene\n(n=%s)" % core_result[1]],
                         data={"Core Genes\nn={}".format(len(allcore)): [hsp_result[0], peri_result[0],  core_result[0]],
                               "Random Genes\nn={} (10x)".format(len(allcore)): [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)]})

ax = sns.heatmap(kd_matrix_mean.transpose(), vmin=0,  vmax=1, cmap="Purples", annot=True, fmt=".1%", ax=ax,
                 cbar_kws={'label': "Fraction Significant\nDifferential Perturbations",
                           "pad": 0.01})
# Re-apply the colorbar label to control its font size.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)
ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Knockdown)", fontsize=7)
plt.tight_layout()
# Saved under its own name: the use_t_test=True variant further down writes
# "Perturbation_knockdown_mix.svg" and would otherwise overwrite this figure.
plt.savefig("Perturbation_knockdown_mix_no_ttest.svg", bbox_inches="tight")

In [None]:
# Inspect the per-experiment table returned by the most recent
# get_differential_percentages call (knockdown, use_t_test=False when run in order).
results_df

In [None]:
# Knockdown experiments evaluated against the pooled gene sets, with
# use_min=True and use_t_test=True.
# Of each *_result, element 0 is plotted as a fraction and element 1 is shown
# as the "n" in the axis labels.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True, use_t_test=True)


# Baseline: ten reruns with randomize_core=True, one per seed 0..9.
# use_t_test is passed explicitly so that the baseline is computed with the
# same settings as the real run above instead of the function's default.
random_core = []
random_hsp = []
random_peri = []
for i in range(10):
    _, _, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(full_kd_df, allcore, hsps, noncore, randomize_core=True, random_seed=i, use_min=True, use_t_test=True)
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

# NOTE(review): plt, sns, np and cm are not imported in this cell; they come
# from earlier cells.
fig, ax= plt.subplots(figsize=(8*cm,5*cm))

# Columns: real core genes vs. the mean over the random reruns.
# Rows: gene set of the perturbagen.
kd_matrix_mean = pd.DataFrame(index=["HSP\n(n=%s)" % hsp_result[1], "Peripheral\n(n=%s)" % peri_result[1], "Core Gene\n(n=%s)" % core_result[1]],
                         data={"Core Genes\nn={}".format(len(allcore)): [hsp_result[0], peri_result[0],  core_result[0]],
                               "Random Genes\nn={} (10x)".format(len(allcore)): [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)]})

ax = sns.heatmap(kd_matrix_mean.transpose(), vmin=0,  vmax=1, cmap="Purples", annot=True, fmt=".1%", ax=ax,
                 cbar_kws={'label': "Fraction Significant\nDifferential Perturbations",
                           "pad": 0.01})
# Re-apply the colorbar label to control its font size.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)
ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Knockdown)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_knockdown_mix.svg", bbox_inches="tight")

In [None]:
# Inspect the per-experiment table returned by the most recent
# get_differential_percentages call (knockdown, use_t_test=True when run in order).
results_df

# More traits: pool Core Genes from UC, CAD, SCZ, RA and AD

In [None]:
import json

import pandas as pd

from extensions.preprocessing import preprocess_labels

# Pool the gene sets of five traits, processed in this order.
# Each entry is (trait, file prefix, contributes to intersect_allcore):
#   * the trait name selects the HSP list "../hsps/<trait>.txt" and the plot label,
#   * the file prefix selects "../extensions/<prefix>_only_genes.tsv" and
#     "<prefix>_film_nohetioouter_results.json" (CAD uses "cad_really", AD uses "alz"),
#   * only UC, CAD and SCZ are intersected into intersect_allcore.
# NOTE(review): RA and AD never entered intersect_allcore in the original cell;
# this is kept as is — confirm whether it is intended.
trait_settings = [
    ("uc", "uc", True),
    ("cad", "cad_really", True),
    ("scz", "scz", True),
    ("ra", "ra", False),
    ("ad", "alz", False),
]

# All gene symbols present in the id2hgnc lookup; every gene set below is
# restricted to this universe.
measured_genes = set(id2hgnc.values())

# All three counters start with a 0 entry for "no trait added yet".
additive_numbers = [0]    # running sum of the per-trait counts (overlaps counted repeatedly)
individual_numbers = [0]  # per-trait number of core + Mendelian genes
set_numbers = [0]         # size of the pooled set after each trait
traits = []               # upper-case trait names, in processing order

allcore = set()
noncore = set(measured_genes)
intersect_allcore = set(measured_genes)
hsps_by_trait = {}

for trait, file_prefix, in_intersection in trait_settings:
    traits.append(trait.upper())

    # First column of the header-less HSP file, as a plain list.
    hsps_by_trait[trait] = pd.read_csv("../hsps/{}.txt".format(trait), header=None, index_col=None).iloc[:, 0].tolist()

    mendelians = set(preprocess_labels("../extensions/{}_only_genes.tsv".format(file_prefix)))

    # The first element of the JSON document maps each candidate gene to a score.
    with open("/mnt/storage/speos/results/{}_film_nohetioouter_results.json".format(file_prefix), "r") as file:
        candidate2cs = json.load(file)[0]

    # Candidates with a score of exactly 11 count as core genes; every other
    # candidate is neither core nor non-core and is removed from noncore below.
    coregenes = set(key for key, value in candidate2cs.items() if value == 11)
    other_coregenes = [key for key, value in candidate2cs.items() if value != 11]

    if in_intersection:
        intersect_allcore = intersect_allcore.intersection(coregenes)

    trait_core_count = len(coregenes.union(mendelians).intersection(measured_genes))
    additive_numbers.append(additive_numbers[-1] + trait_core_count)
    individual_numbers.append(trait_core_count)

    allcore.update(coregenes)
    allcore.update(mendelians)
    allcore = allcore.intersection(measured_genes)
    set_numbers.append(len(allcore))
    noncore = noncore.difference(allcore).difference(other_coregenes)

# Keep the per-trait names available for any cell that refers to them.
uc_hsps = hsps_by_trait["uc"]
cad_hsps = hsps_by_trait["cad"]
scz_hsps = hsps_by_trait["scz"]
ra_hsps = hsps_by_trait["ra"]
ad_hsps = hsps_by_trait["ad"]

hsps = set().union(*hsps_by_trait.values())
len(hsps)

In [None]:
from copy import deepcopy
import random

# Sampling universe for the random baseline: every gene that is in either the
# non-core set or the pooled core set.
background_genes = set(noncore).union(allcore)

def get_random_sample(geneset, ns, seed):
    """Return the size of the union of independent random gene samples.

    For every ``n`` in ``ns``, ``n`` distinct genes are drawn from ``geneset``
    without replacement; the draws are independent of each other, so genes may
    be drawn more than once across draws.

    Args:
        geneset: Iterable of genes to sample from.
        ns: Iterable of sample sizes, one per draw.
        seed: Seed of the private random number generator. The same seed and
            the same genes always give the same result.

    Returns:
        Number of distinct genes over all draws (0 if ``ns`` is empty).

    Raises:
        ValueError: If a sample size exceeds the number of genes.
    """
    # A private generator honours `seed` without touching the global random state.
    rng = random.Random(seed)
    # Sets of strings have no stable iteration order between interpreter runs;
    # sorting once makes the seeded draws reproducible. key=str keeps this
    # working if the symbols are not all of one type.
    population = sorted(geneset, key=str)
    drawn = set()
    for n in ns:
        drawn.update(rng.sample(population, n))
    return len(drawn)


# Expected size of the pooled gene set if every trait contributed a random gene
# set of its own size, as a function of the number of traits added.
# NOTE(review): despite their names, q95 and q05 hold the 0.99 and 0.01
# quantiles of the random union sizes — confirm which is intended.
random_expectation, q95, q05 = [], [], []
for i, observed in enumerate(set_numbers):
    if i >= 2:
        # Union size of the first i random sets, repeated for seed values 0..999.
        random_overlaps = [get_random_sample(background_genes, individual_numbers[1:i+1], seed) for seed in range(1000)]
        random_expectation.append(np.mean(random_overlaps))
        q95.append(np.quantile(random_overlaps, q=0.99))
        q05.append(np.quantile(random_overlaps, q=0.01))
    else:
        # With zero or one trait nothing can overlap, so the observed size is
        # used for the expectation and both bounds.
        random_expectation.append(observed)
        q95.append(observed)
        q05.append(observed)

In [None]:
# Size of the pooled core gene set after each added trait (leading 0 = no trait).
set_numbers

In [None]:
# Growth of the core gene count with every added trait. Lines are drawn in this
# order: summed per-trait counts, observed pooled set size, random expectation.
fig, ax = plt.subplots(figsize=(full_width*cm*0.5, 5*cm))

for series in (additive_numbers, set_numbers, random_expectation):
    ax.plot(range(len(series)), series)

# Shade the band between the random expectation and its upper and lower bound.
x_random = range(len(random_expectation))
for bound in (q95, q05):
    ax.fill_between(x_random, random_expectation, bound, color="lightgreen")

# One tick per counter entry; the first entry is the state before any trait.
ax.set_xticks(range(len(additive_numbers)))
ax.set_xticklabels(["None"] + traits, fontsize=10)
ax.grid(axis="y", color="lightgray")
ax.set_xlabel("Core Gene Traits")
ax.set_ylabel("Genes")
plt.savefig("union_coregenes.svg", bbox_inches="tight")

In [None]:
# Spot check: overlap of two random gene sets that have the sizes of the first
# two traits.
# random.sample needs a sequence; passing a set is deprecated since Python 3.9
# and raises TypeError from Python 3.11, so the set is converted once.
background_gene_list = list(background_genes)
setA = set(random.sample(background_gene_list, individual_numbers[1]))
setB = set(random.sample(background_gene_list, individual_numbers[2]))
len(setA.intersection(setB))

In [None]:
# Mean union size of the random gene sets per number of traits; the first two
# entries are copied from set_numbers.
random_expectation

In [None]:
# Running sum of the per-trait core gene counts (genes shared between traits
# are counted once per trait).
additive_numbers

In [None]:
# Size of the pooled core gene set (core + Mendelian genes, restricted to
# id2hgnc symbols) after all traits of the loading cell above have been added.
len(allcore)

In [None]:
# Number of id2hgnc gene symbols left after removing allcore and every
# candidate gene whose score is not 11 in any of the pooled traits.
len(noncore)

In [None]:
# Overexpression experiments evaluated against the gene sets pooled over the
# five traits.
# Of each *_result, element 0 is plotted as a fraction and element 1 is shown
# as the "n" in the axis labels.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(
    full_oe_df, allcore, hsps, noncore
)

# Baseline: ten reruns with randomize_core=True, one per seed 0..9; only the
# fractions (element 0) are kept.
random_core, random_hsp, random_peri = [], [], []
for i in range(10):
    _, _, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(
        full_oe_df, allcore, hsps, noncore, randomize_core=True, random_seed=i
    )
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

import seaborn as sns
# NOTE(review): cm presumably comes from this star import (np is not imported
# in this cell either) — confirm and import the needed names explicitly.
from speos.visualization.settings import *
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8*cm, 5*cm))

# Columns: real core genes vs. the mean over the random reruns.
# Rows: gene set of the perturbagen.
n_core_genes = len(allcore)
kd_matrix_mean = pd.DataFrame(
    {
        f"Core Genes\nn={n_core_genes}": [hsp_result[0], peri_result[0], core_result[0]],
        f"Random Genes\nn={n_core_genes} (10x)": [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)],
    },
    index=[f"HSP\n(n={hsp_result[1]})", f"Peripheral\n(n={peri_result[1]})", f"Core\n(n={core_result[1]})"],
)

colorbar_label = "Fraction Significant\nDifferential Perturbations"
ax = sns.heatmap(
    kd_matrix_mean.T,
    vmin=0,
    vmax=1,
    cmap="Oranges",
    annot=True,
    fmt=".1%",
    ax=ax,
    cbar_kws={"label": colorbar_label, "pad": 0.01},
)
# Re-apply the colorbar label to control its font size.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel(colorbar_label, fontsize=5)
ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Overexpression)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_overexpression_mix5.svg", bbox_inches="tight")

In [None]:
# Knockdown experiments evaluated against the gene sets pooled over the five
# traits, with use_min=True and use_t_test=True.
# Of each *_result, element 0 is plotted as a fraction and element 1 is shown
# as the "n" in the axis labels.
results_df, total, core_result, hsp_result, peri_result = get_differential_percentages(full_kd_df, allcore, hsps, noncore, use_min=True, use_t_test=True)


# Baseline: ten reruns with randomize_core=True, one per seed 0..9.
# use_t_test is passed explicitly so that the baseline is computed with the
# same settings as the real run above instead of the function's default.
random_core = []
random_hsp = []
random_peri = []
for i in range(10):
    _, _, core_result_random, hsp_result_random, peri_result_random = get_differential_percentages(full_kd_df, allcore, hsps, noncore, randomize_core=True, random_seed=i, use_min=True, use_t_test=True)
    random_core.append(core_result_random[0])
    random_hsp.append(hsp_result_random[0])
    random_peri.append(peri_result_random[0])

# NOTE(review): plt, sns, np and cm are not imported in this cell; they come
# from earlier cells.
fig, ax= plt.subplots(figsize=(8*cm,5*cm))

# Columns: real core genes vs. the mean over the random reruns.
# Rows: gene set of the perturbagen.
kd_matrix_mean = pd.DataFrame(index=["HSP\n(n=%s)" % hsp_result[1], "Peripheral\n(n=%s)" % peri_result[1], "Core Gene\n(n=%s)" % core_result[1]],
                         data={"Core Genes\nn={}".format(len(allcore)): [hsp_result[0], peri_result[0],  core_result[0]],
                               "Random Genes\nn={} (10x)".format(len(allcore)): [np.mean(random_hsp), np.mean(random_peri), np.mean(random_core)]})

ax = sns.heatmap(kd_matrix_mean.transpose(), vmin=0,  vmax=1, cmap="Purples", annot=True, fmt=".1%", ax=ax,
                 cbar_kws={'label': "Fraction Significant\nDifferential Perturbations",
                           "pad": 0.01})
# Re-apply the colorbar label to control its font size.
cbar = ax.collections[-1].colorbar
cbar.ax.set_ylabel("Fraction Significant\nDifferential Perturbations", fontsize=5)
ax.set_ylabel("Target Gene Set", fontsize=7)
ax.set_xlabel("Perturbagen (Knockdown)", fontsize=7)
plt.tight_layout()
plt.savefig("Perturbation_knockdown_mix5.svg", bbox_inches="tight")

In [None]:
# Gene symbols from id2hgnc that have a score of 11 for each of UC, CAD and SCZ.
# NOTE(review): RA and AD are not part of this intersection — confirm this is intended.
intersect_allcore

In [None]:
# Displays whatever DataFrame is currently bound to df — presumably the
# sig_info table loaded at the top of the notebook; TODO confirm. Consider
# df.head() to keep the output small.
df