In [2]:
%pylab inline

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import os
import pandas as pd
import scipy
#from sklearn.metrics import roc_auc_score, roc_curve
import seaborn as sns
OUTDIR = "pdfs/"

Populating the interactive namespace from numpy and matplotlib


This notebook contains code to generate the following results from Zheng et al 2020:

* ED Figure 5: singletons

In [3]:
factors = ["BHLHE40", "CEBPB", "CTCF", "E2F4", "EBF1", "ELF1", "ELK1", "ETS1", "FOS", "IRF4", "JunD", "Max", "MEF2A", "MEF2C", "Mxi1", "NFIC", "NFKB", "NFYA", "NFYB", "Nrf1", "NRSF", "PAX5", "PBX3", "POU2F2", "PU1", "RFX5", "RUNX3", "SP1", "SRF", "STAT1", "STAT3", "TCF3", "USF1", "USF2", "YY1", "ZBTB33", "ZEB1", "Znf143"]
rank_perc = [950, 990, 995]

factordata = {}
DIR="/storage/pandaman/project/singletons/AgentBind/"

# load data
for f in factors:
    print(f)
    fdata = pd.read_csv(os.path.join(DIR, f, "factor_singletons_r2.tab"), sep="\t", skiprows=1,
                            names=["chrom","start", "end", "ref", "AC", "AN",
                                   "raw.score","snr.score","rank","core"])
    fdata = fdata[~np.isnan(fdata["raw.score"])]
    fdata = fdata[~np.isnan(fdata["snr.score"])]
    fdata["factor"] = f
    factordata[f] = fdata

# Set singleton definition
for f in factors:
    factordata[f]["sing"] = (factordata[f]["AC"] == 1) | \
        ((factordata[f]["AN"] - factordata[f]["AC"]) == 1)

BHLHE40


FileNotFoundError: [Errno 2] File b'/storage/pandaman/project/singletons/AgentBind/BHLHE40/factor_singletons_r2.tab' does not exist: b'/storage/pandaman/project/singletons/AgentBind/BHLHE40/factor_singletons_r2.tab'

In [None]:
data = []

for f in factors:
    d = []
    fd = factordata[f]
    
    # Get pval for core
    p_all = np.mean(fd["sing"])
    p_core = np.mean(fd[fd["core"]>0]["sing"])
    n_all = fd.shape[0]
    n_core = fd[fd["core"]>0].shape[0]
    d = [f, p_all, p_core, n_all, n_core]
    n_not_core = n_all-n_core
    p_not_core = (p_all*n_all-p_core*n_core)/(n_not_core)
    se_core = np.sqrt(p_all*(1-p_all)*(1.0/n_core+1.0/n_not_core))
    pval_core = 1-scipy.stats.norm.cdf((p_core-p_not_core)/se_core)
    d.append(pval_core)
    
    for i in range(len(rank_perc)):
        rk = (fd[fd["rank"]>=rank_perc[i]]["sing"])
        p_rk = np.mean(rk)
        diff_p_rk = p_rk - p_all
        err_rk = np.sqrt(p_rk*(1-p_rk)/len(rk))
        n_rk = len(rk)
        n_not_rk = n_all-n_rk
        p_not_rk = (p_all*n_all-np.sum(rk))*1.0/n_not_rk
        # Compute pvals
        se_rk = np.sqrt(p_all*(1-p_all)*(1.0/n_rk+1.0/n_not_rk))
        pval_rk = 1-scipy.stats.norm.cdf((p_rk-p_not_rk)/se_rk)
        # Add data
        d.extend([p_rk, diff_p_rk, err_rk, pval_rk, len(rk)])
    data.append(d)

cols = ["factor","sing.all","sing.core","n.all","n.core","pval.core"]
for i in range(len(rank_perc)):
    cols.extend(["sing.rank.%s"%rank_perc[i], "diff.rank.%s"%rank_perc[i], 
                 "sing.rank.%s.err"%rank_perc[i], "pval.rank.%s"%rank_perc[i],
                "num.%s"%rank_perc[i]])
results = pd.DataFrame(data, columns=cols)
results["sing.all.err"] = results.apply(lambda x: np.sqrt(x["sing.all"]*(1-x["sing.all"])/x["n.all"]), 1)
results["sing.core.err"] = results.apply(lambda x: np.sqrt(x["sing.core"]*(1-x["sing.core"])/x["n.core"]), 1)

In [None]:
results[["factor","sing.all","sing.rank.950","sing.rank.990","sing.rank.995"]]

In [None]:
results[["factor","sing.all","sing.rank.950","sing.rank.990","sing.rank.995"]][results["sing.all"]> results["sing.rank.995"]]

In [None]:
results = results.sort_values("diff.rank.995", ascending=False)
xvals = np.array(range(results.shape[0]))

w = 0.1
fig = plt.figure()
fig.set_size_inches((12, 5))
ax = fig.add_subplot(111)
ax.errorbar(xvals, results["sing.all"], label="All", yerr=results["sing.all.err"], marker="o", linewidth=0, elinewidth=1, color="black")
#ax.errorbar(xvals+w, results["sing.core"], label="core", yerr=results["sing.core.err"], marker="o", linewidth=0, elinewidth=1, color="g")

i=0
x = rank_perc[i]
lab = "Rank"
ax.errorbar(xvals+w*(2+i), results["sing.rank.%s"%x], label=lab, yerr=results["sing.rank.%s.err"%x], marker="o", linewidth=0, elinewidth=1, color="orange")

i=2
x = rank_perc[i]
ax.errorbar(xvals+w*(2+i), results["sing.rank.%s"%x], label=lab, yerr=results["sing.rank.%s.err"%x], marker="o", linewidth=0, elinewidth=1, color="red")
    
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

ax.set_xticks(xvals+w)
ax.set_xticklabels(results["factor"], rotation=90, size=12, fontname="Arial");
ax.set_yticklabels(["%0.2f"%item for item in ax.get_yticks()], size=12, fontname="Arial")
#ax.legend(loc="upper left")
ax.set_ylabel("Percent singletons", size=15, fontname="Arial")
plt.tight_layout()
fig.savefig(os.path.join(OUTDIR, "singleton_950_995.pdf"))

In [None]:
results = results.sort_values("diff.rank.995", ascending=False)
xvals = np.array(range(results.shape[0]))

w = 0.1
fig = plt.figure()
fig.set_size_inches((12, 5))
ax = fig.add_subplot(111)
ax.errorbar(xvals, results["sing.all"], label="All", yerr=results["sing.all.err"], marker="o", linewidth=0, elinewidth=1, color="black")
ax.errorbar(xvals+w, results["sing.core"], label="core", yerr=results["sing.core.err"], marker="o", linewidth=0, elinewidth=1, color="steelblue")
lab = True
for i in [2]:
    x = rank_perc[i]
    if not lab:
        lab = "Rank"
    else:
        lab = ""
    ax.errorbar(xvals+w*(2+i), results["sing.rank.%s"%x], label=lab, yerr=results["sing.rank.%s.err"%x], marker="o", linewidth=0, elinewidth=1, color="red")
    
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

ax.set_xticks(xvals+w)
ax.set_xticklabels(results["factor"], rotation=90, size=12);
ax.set_yticklabels(["%0.2f"%item for item in ax.get_yticks()], size=12)
#ax.legend(loc="upper left")
ax.set_ylabel("Percent singletons", size=15);
fig.savefig(os.path.join(OUTDIR, "singleton_995_core.pdf"))