In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import logging, sys, gc, os
import pandas as pd
from scipy.stats import binned_statistic
from scipy.optimize import curve_fit
from frontiers_analysis import load_tissue
os.chdir("The_single_cell_transcriptome_as_a_component_system/MouseCellAtlas/")

In [None]:
# Load the MGImarkerQuery export indexed by gene symbol, then split the symbols
# by whether their "Feature Type" is exactly "protein coding gene".
df_genes = pd.read_csv("MGImarkerQuery_20200914_050053.txt", sep="\t").set_index("Symbol")
is_protein_coding = df_genes["Feature Type"] == "protein coding gene"
pc = df_genes.loc[is_protein_coding].index.values
nc = df_genes.loc[~is_protein_coding].index.values

In [None]:
tissue = "Muscle"
# Count table; the first CSV column becomes the row index
# (presumably rows are genes and columns are cells -- TODO confirm).
df = pd.read_csv(f"mca/mainTable_{tissue}.csv", index_col=0)
#df = pd.read_csv(f"../Smartseq3.HEK.fwdprimer.UMIcounts.txt", index_col=0, sep="\t")
#df = pd.read_csv(f"../Smartseq3.Fibroblasts.NovaSeq.UMIcounts.txt", index_col=0, sep="\t")
#df = df[df.index.isin(nc)]

# M: total count of every column.
M = df.sum(axis=0)
# f: row-wise mean of the table after each column is divided by its total.
f = df.divide(M, axis=1).mean(axis=1)
# O: fraction of columns in which each row has a non-zero entry.
O = (df > 0).sum(axis=1) / float(df.shape[1])

In [None]:
#f = df.divide(df.sum(0),1).mean(1)
#f.to_csv("A_ss3_fibro.dat")

In [None]:
%load_ext autoreload
%autoreload 2
from methods import mazzolini as sampling
from methods import mazzolini_broad as poissonian_sampling
from methods import mazzolini_nbinom as nbinom_sampling
from methods import mazzolini_gaus as gaus_sampling
from methods import mazzolini_timesM as times_sampling

In [None]:
# Size parameter passed as M_tilde to the Poisson sampling model and its analytic P(0).
M_tilde = 1_000_000

# Create models

In [None]:
# Instantiate the null models on the empirical sizes (M) and frequencies (f),
# then run each one. Construction order matters: models[0] is the plain
# sampling model and models[1] the Poisson one.
models = [
    sampling(M=M, f=f),
    poissonian_sampling(M=M, f=f, M_tilde=M_tilde),
]
method_delta, method_pois = models
#method_nbinom = nbinom_sampling(M=M, f=f, M_tilde=M_tilde)
#method_gaus = gaus_sampling(M=M, f=f)

for method in models:
    print(method)
    method.run()

## Zipf

In [None]:
# Rank plot: normalised data frequencies in decreasing order, overlaid with
# whatever each model returns from get_f(), on log-log axes.
data_rank_freq = np.sort(f / f.sum())[::-1]
plt.plot(data_rank_freq, lw=10, c="gray", alpha=0.5, label="data")
for model in models:
    plt.plot(model.get_f(), lw=10, alpha=0.5, label=model.name_, c=model.color_)

plt.xscale("log")
plt.yscale("log")

plt.xlabel("i")
plt.ylabel("f")

plt.legend()

## Heaps

In [None]:
# Heaps-style plot: number of non-zero rows per column (h) against the column
# total (M), for the data and for every model, with per-bin averages overlaid.
bins = np.logspace(np.log10(M.min()), np.log10(M.max()), 35)
#bins = np.linspace(M.min(), M.max(), 35)


def _binned_heaps_stats(sizes, values, bins, min_count=10):
    """Bin `values` by `sizes` and summarise each bin.

    Returns (mean, variance, count, left_edges, right_edges). Mean, variance
    and edges are restricted to bins holding more than `min_count` points;
    `count` is returned for every bin, unmasked.
    """
    bin_mean, bin_edges, _ = binned_statistic(sizes, values, bins=bins)
    bin_std, _, _ = binned_statistic(sizes, values, statistic="std", bins=bins)
    bin_cnt, _, _ = binned_statistic(sizes, values, statistic="count", bins=bins)
    keep = bin_cnt > min_count
    return bin_mean[keep], (bin_std * bin_std)[keep], bin_cnt, bin_edges[:-1][keep], bin_edges[1:][keep]


h = (df > 0).sum(axis=0)
plt.scatter(M, h, c="gray", alpha=0.8)
# `means` / `var` hold the DATA statistics and are read again by the
# fluctuations cell, so the model loop below must not reuse these names.
means, var, cnt, l_edges, r_edges = _binned_heaps_stats(M, h, bins)
plt.hlines(means, l_edges, r_edges, lw=5, color="gray", ls="--")

for model in models:
    print(model.name_)
    model_h = model.get_h()
    model_means, model_var, model_cnt, model_l_edges, model_r_edges = _binned_heaps_stats(M, model_h, bins)

    plt.hlines(model_means, model_l_edges, model_r_edges, lw=5, color="dark"+model.color_, ls="--")

    plt.scatter(M, model_h, alpha=0.2, c=model.color_, label=model.name_)

    # Cache the per-model statistics on the model for the fluctuations plot.
    model.hmean = model_means.copy()
    model.hvar = model_var.copy()
    model.cnt = model_cnt.copy()


plt.xlabel("M")
plt.ylabel("h")

plt.legend()

### Fluctuations

In [None]:
# Variance of h against its mean, per bin, for every model, plus two
# one-parameter fits (linear and quadratic in <h>) to the first model.
def _linear_in_mean(h_mean, C):
    return C*h_mean


def _quadratic_in_mean(h_mean, C):
    return C*h_mean*h_mean


x = np.logspace(np.log10(300), np.log10(2000))

for model in models:
    plt.scatter(model.hmean, model.hvar, s=50, label=model.name_, c=model.color_)

# `means` / `var` are whatever the Heaps cell left in scope under those names.
plt.scatter(means, var, label="data", c="gray")

popt, pcov = curve_fit(_linear_in_mean, models[0].hmean, models[0].hvar)
plt.plot(x, popt[0]*x, lw=5, ls="--", c="cyan", alpha=0.8, label="C*<h>")

popt, pcov = curve_fit(_quadratic_in_mean, models[0].hmean, models[0].hvar)
plt.plot(x, popt[0]*x**2, lw=5, ls="--", c="purple", alpha=0.8, label ="C*<h>^2")

plt.xscale("log")
plt.yscale("log")

plt.xlabel("<h>")
plt.ylabel("var(h)")

plt.legend()

#plt.ylim(1e2,1e3)

## CV^2

In [None]:
# Squared coefficient of variation against the mean, computed along axis 0 of
# each model's table; entries with zero mean are dropped before dividing.
for model in models:
    means = np.mean(model.table, axis=0)
    var = np.var(model.table, axis=0)
    expressed = means > 0
    mean = means[expressed]
    cv2 = var[expressed] / mean / mean
    plt.scatter(mean, cv2, alpha=0.5, label=model.name_, c=model.color_)

# Reference curve 1/x.
x = np.linspace(1e-4, 1e2, 10)
plt.plot(x, 1/x, alpha=0.5, label="<>", c="gray", ls="--", lw=5)

plt.xlabel("<>")
plt.ylabel("CV^2")

plt.xscale("log")
plt.yscale("log")

plt.xlim(5e-5, 1e2)
plt.ylim(1e-2, 1e5)

plt.legend()

## U

In [None]:
# Histogram of occurrences: one dashed step outline per model, filled bars for the data.
bins = np.linspace(-0.05, 1.05, 20)
model_hist_style = dict(lw=10, ls="--", density=True, histtype="step", alpha=0.4, bins=bins)
for model in models:
    plt.hist(model.get_O(), label=model.name_, color=model.color_, **model_hist_style)

plt.hist(O, color="gray", label="data", density=True, bins=bins)
plt.legend()

# Sparsity

## Simulations

### Sampling

In [None]:
# Repeat the plain sampling model 50 times, recording the fraction of zero
# entries in each realisation's table and the occurrences it returns.
sparsities = []
occurrences = []
for _ in range(50):
    model = sampling(M=M, f=f)
    model.run()
    table = np.array(model.table)
    n_nonzero = (table > 0).sum()
    n_entries = table.shape[0] * table.shape[1]
    sparsities.append(1 - float(n_nonzero) / n_entries)
    occurrences.append(model.get_O())

# Average and spread of the occurrences across realisations.
method_delta.O = np.average(occurrences, 0)
method_delta.O_err = np.std(occurrences, 0)

### Sampling from Poisson

In [None]:
# Same procedure with the Poisson sampling model, over 250 realisations.
# NOTE: `occurrences` is reset here, so afterwards it holds the Poisson runs.
sparsities_pois = []
occurrences = []
for _ in range(250):
    model = poissonian_sampling(M=M, f=f, M_tilde=M_tilde)
    model.run()
    table = np.array(model.table)
    n_nonzero = (table > 0).sum().sum()
    n_entries = table.shape[0] * table.shape[1]
    sparsities_pois.append(1 - float(n_nonzero) / n_entries)
    occurrences.append(model.get_O())

method_pois.O = np.average(occurrences, 0)
method_pois.O_err = np.std(occurrences, 0)

### Sampling from gaus

In [None]:
# Sparsity (fraction of zero entries) of 50 realisations of the gaus sampling model.
sparsities_gaus = []
for _ in range(50):
    model = gaus_sampling(M=M, f=f)
    model.run()
    table = np.array(model.table)
    n_nonzero = (table > 0).sum().sum()
    n_entries = table.shape[0] * table.shape[1]
    sparsities_gaus.append(1 - float(n_nonzero) / n_entries)

## U

In [None]:
# Occurrence histogram again: dashed step outlines for the models, filled bars for the data.
bins = np.linspace(-0.05, 1.05, 20)
for model in models:
    occurrence_values = model.get_O()
    plt.hist(
        occurrence_values,
        bins=bins,
        density=True,
        histtype="step",
        ls="--",
        lw=10,
        alpha=0.4,
        color=model.color_,
        label=model.name_,
    )

plt.hist(O, bins=bins, density=True, color="gray", label="data")
plt.legend()

### Data

In [None]:
# Sparsity of the data table: one minus the fraction of strictly positive entries.
# Kept as a one-element list so it can be passed to boxplot next to the simulated lists.
E = (df > 0).values.sum()
N = df.size
data_sparsity = [1 - E / N]

## Plot

In [None]:
# Box plot comparing the sparsity of the data with the two simulated ensembles.
fig = plt.figure(figsize=(8,8))

#plt.boxplot([data_sparsity,sparsities,sparsities_pois_5, sparsities_pois], labels=["data", "sampling", "poissonian \n sampling \n (M=500000)", f"\n (M={M_tilde})"], medianprops={"lw":10, "color":"red"}, whiskerprops={"lw":5}, capprops={"lw":5})
# The third box is the Poisson-sampling ensemble (sparsities_pois), so its tick
# label must say so; it was previously labelled "gaussian".
plt.boxplot(
    [data_sparsity, sparsities, sparsities_pois],
    labels=["data", "sampling", f"poissonian \n sampling \n (M={M_tilde})"],
    medianprops={"lw":10, "color":"red"},
    whiskerprops={"lw":5},
    capprops={"lw":5},
)

plt.ylabel("Sparsity", fontsize=45)
plt.xticks(rotation=90)
plt.tick_params(size=10, width=5, labelsize=35)
plt.tight_layout()
plt.show()
fig.savefig(f"Sparsity_box_{tissue}.pdf")

# P0

In [None]:
import multiprocessing as mp
from time import time
from scipy.special import gamma, gammainc

In [None]:
def p_sampling(f, M):
    """Return the list of exp(-fi * M), one entry per fi in `f`.

    f: iterable of frequencies.
    M: size multiplying every frequency.
    """
    probabilities = []
    for freq in f:
        probabilities.append(np.exp(-freq*M))
    return probabilities

#def p_poisson_sampling(f, M, M_tilde):
#    def a(fi):
#        return fi / (fi-1.) * np.exp(-M/M_tilde)
#    return [np.power(1-fi,M_tilde) * np.power(1-a(fi),M_tilde) for fi in f]

def p_poisson_sampling(f, M, M_tilde):
    """Return exp(-fi*M_tilde) * exp(fi*M_tilde*exp(-M/M_tilde)) for each fi in `f`.

    The two exponentials are merged into a single one,
    exp(-fi * M_tilde * (1 - exp(-M/M_tilde))), before evaluation. Evaluated
    separately, the first factor underflows to 0 and the second overflows to
    inf as soon as fi*M_tilde exceeds roughly 700, and their product is NaN
    even though the true value is an ordinary number in (0, 1].

    f: iterable of frequencies.
    M: size of the realisation.
    M_tilde: size parameter of the model.
    Returns a list with one value per element of `f`.
    """
    # -expm1(-x) == 1 - exp(-x), but stays accurate when M << M_tilde.
    effective_size = -M_tilde * np.expm1(-M / M_tilde)
    return [np.exp(-fi * effective_size) for fi in f]

def p_poisson_sampling_caselle(n, x):
    """Evaluate exp(-x) * x**(2n) / gamma(n+1)**2 * sqrt(2*pi*n).

    n: sequence; only its first element is used.
    x: value (or array) at which the expression is evaluated.
    """
    order = n[0]
    numerator = np.exp(-x)*np.power(x, 2*order)
    gamma_sq = gamma(order+1)**2
    return numerator / gamma_sq * np.sqrt(2*np.pi*order)

In [None]:
# Sizes over which the analytic P(0) is averaged. All of M is used; the
# commented lines switch to a random subsample instead.
#NSample = 500
#M_sampled = M.sample(NSample, replace=False)
M_sampled = M
NSample = len(M_sampled)

In [None]:
def get_term_at(ftilde):
    """One minus the average of p_sampling([ftilde], m) over every m in M_sampled."""
    zero_probabilities = [p_sampling([ftilde], m) for m in M_sampled]
    return 1-1./NSample*np.sum(zero_probabilities)

# Evaluate get_term_at for every entry of f on 12 worker processes and time the run.
start = time()
pool = mp.Pool(12)
pending = pool.map_async(get_term_at, f)

# No further tasks are submitted; wait for the workers to finish.
pool.close()
pool.join()

O_sampling = pending.get()

time()-start

In [None]:
def get_term_at(ftilde):
    """One minus the average of p_poisson_sampling([ftilde], m, M_tilde) over every m in M_sampled."""
    zero_probabilities = [p_poisson_sampling([ftilde], m, M_tilde=M_tilde) for m in M_sampled]
    return 1-1./NSample*np.sum(zero_probabilities)

# Same parallel evaluation as for the plain sampling model, timed.
start = time()
pool = mp.Pool(12)
pending = pool.map_async(get_term_at, f)

pool.close()
pool.join()

O_poisson_sampling = pending.get()

time()-start

In [None]:
# Analytic occurrence (x) against simulated occurrence (y) with error bars:
# Poisson sampling on the left panel, plain sampling on the right.
fig, axs = plt.subplots(1, 2, figsize=(30,10))
ax_pois, ax_delta = axs[0], axs[1]
diagonal_style = dict(lw=10, c="red", ls="--", alpha=0.8)

ax_pois.scatter(x=O_poisson_sampling, y=models[1].get_O())
ax_pois.errorbar(x=O_poisson_sampling, y=models[1].get_O(), marker="o", lw=0, elinewidth=1, yerr=models[1].O_err)
ax_pois.plot([0,1], [0,1], **diagonal_style)
ax_pois.tick_params(labelsize=25)

ax_pois.set_ylabel("O_poisson_sampling", fontsize=35)
ax_pois.set_xlabel("O_poisson_sampling_teo", fontsize=35)

ax_delta.errorbar(x=O_sampling, y=models[0].get_O(), marker="o", lw=0, elinewidth=1, yerr=models[0].O_err)
ax_delta.plot([0,1], [0,1], **diagonal_style)

ax_delta.set_ylabel("O_sampling", fontsize=35)
ax_delta.set_xlabel("O_sampling_teo", fontsize=35)

ax_delta.tick_params(labelsize=25)

plt.show()
fig.savefig(f"Poisson_sampling_teo_{tissue}.pdf")

In [None]:
# Occurrence measured on the data (x) against the realisation-averaged
# occurrence stored on models[0] (y), with its spread as error bars.
fig, ax = plt.subplots(1, 1, figsize=(18,15))

predicted_O = models[0].O
predicted_O_err = models[0].O_err
ax.errorbar(O, predicted_O, yerr=predicted_O_err, marker="o", ms=8, lw=0, c="gray", elinewidth=2, alpha=0.6)

#ax.errorbar(O, O_sampling, marker="o", ms=10, lw=0, c="gray", elinewidth=2, yerr=models[0].O_err, alpha=0.8)

# Identity line.
ax.plot([0,1], [0,1], lw=10, c="red", ls="--", alpha=1)

#ax.legend(fontsize=35, loc="lower right")

ax.set_xlim(0,1)
ax.set_ylim(0,1)
ax.set_xlabel("$o_i$ from data", fontsize=35)
ax.set_ylabel("$o_i$ predicted", fontsize=35)

ax.tick_params(labelsize=25)

plt.show()
# NOTE(review): unlike the other figures saved in this notebook, this file name
# does not include {tissue}, so runs on different tissues overwrite each other.
fig.savefig(f"Poisson_sampling_err.pdf")

In [None]:
# Data occurrence (x) against predicted occurrence (y): analytic predictions
# on the left panel, single simulated realisations on the right.
fig, axs = plt.subplots(1,2, figsize=(30,10))

# O is already a fraction in [0, 1] (count of non-zero entries divided by the
# number of columns), so it is plotted as is. Dividing it by len(M) again
# squeezed every point against x = 0, far from the [0, 1] identity line.
axs[0].scatter(O, O_sampling, label="O sampling (teo)")
axs[0].scatter(O, O_poisson_sampling, label="O poisson sampling (teo)")
axs[0].plot([0,1],[0,1], lw=10, c="red", ls="--", alpha=0.8)

axs[0].legend(fontsize=35, loc="lower right")

#axs[0].set_ylabel("O_sampling (teo)")
axs[0].set_xlabel("O_data",fontsize=35)

axs[0].tick_params(labelsize=25)

#axs[1].scatter(O_sampling, O_poisson_sampling)
axs[1].scatter(O, models[0].get_O(), label="O sampling (sim)")
axs[1].scatter(O, models[1].get_O(), label="O poisson sampling (sim)")
axs[1].plot([0,1],[0,1], lw=10, c="red", ls="--", alpha=0.8)

axs[1].legend(fontsize=35, loc="lower right")
axs[1].set_xlabel("O_data",fontsize=35)
axs[1].tick_params(labelsize=25)

plt.show()

In [None]:
# Collect the measured, analytic and simulated occurrences (plus the simulated
# spread) into one frame, every column indexed like O.
findzeros_columns = [
    ("O_real", O),
    ("O_sampling_teo", O_sampling),
    ("O_sampling", models[0].get_O()),
    ("O_sampling_err", models[0].O_err),
]
df_findzeros = pd.DataFrame()
for column_name, column_values in findzeros_columns:
    column = pd.Series(name=column_name, index=O.index, data=column_values)
    df_findzeros = df_findzeros.join(column, how="outer")

In [None]:
# Residual between measured and simulated occurrence, and its size in units of
# the simulated spread (1e-10 guards against a zero spread).
residual = df_findzeros["O_real"] - df_findzeros["O_sampling"]
df_findzeros["O_real-O_pred"] = residual
df_findzeros["Z"] = df_findzeros["O_real-O_pred"].abs() / (1e-10 + df_findzeros["O_sampling_err"])

# Print every index label whose measured occurrence exceeds the simulated one by more than 3 spreads.
underpredicted = df_findzeros["O_sampling"] < df_findzeros["O_real"]
significant = df_findzeros["Z"] > 3
for g in df_findzeros[underpredicted & significant].index:
    print(g)

In [None]:
# Histogram of the measured occurrence for the entries whose measured
# occurrence exceeds the simulated one with Z > 3. The threshold matches the
# figure title and the selection printed from the same frame; the filter
# previously used Z > 0, which contradicted the title.
fig,ax = plt.subplots()
underpredicted = df_findzeros["O_sampling"] < df_findzeros["O_real"]
significant = df_findzeros["Z"] > 3
df_findzeros.loc[underpredicted & significant, "O_real"].hist(ax=ax)

ax.set_xlabel("Occurrence")
ax.set_ylabel("#")
ax.set_title("geni con Z (Opred-Oreal<0) > 3")

In [None]:
#tissue="HEK"
# Distribution of (predicted - measured) occurrence for one Poisson-sampling
# realisation (filled), compared with the spread of the repeated realisations
# around that same realisation (dashed outline).
fig,ax = plt.subplots(figsize=(18,15))
bins = np.linspace(-0.25, np.sqrt(2)/2., num=50)

# Name the model explicitly instead of relying on whichever object the last
# `for model in models:` loop happened to leave bound to `model`; under
# Restart & Run All that object is method_pois.
# NOTE(review): `occurrences` is expected to hold the Poisson-sampling
# realisations (the last simulation cell that fills it) -- confirm.
pois_O = method_pois.get_O()
pois_O_array = np.array(pois_O)

ax.hist((pois_O-O), density=True, bins=bins, color="gray")
#ax.hist(np.concatenate(diffs), density=True, histtype="step", color="blue", bins=bins, ls="--", lw=5)
ax.hist(np.concatenate([occ-pois_O_array for occ in occurrences]), density=True, histtype="step", color="blue", bins=bins, ls="--", lw=5)


ax.tick_params(labelsize=40, length=10, width=10)
ax.tick_params(which="minor", labelsize=40, length=8, width=5)

ax.set_xlabel("$o_i$ predicted - $o_i$ from data", fontsize=60)
ax.set_ylabel("pdf", fontsize=60)
#ax.legend()

#ax.set_yscale("log")

#plt.ylim(1e-3,100)

plt.tight_layout()

plt.show()
fig.savefig(f"O_pred_hist_{tissue}.pdf")

# END

In [None]:
# Analytic P(0) of both models at a fixed size of 1000, evaluated on the
# frequencies sorted in increasing order.
f_ascending = f.sort_values()
pom_sampling = p_sampling(f_ascending, 1000)
pom_poisson_sampling = p_poisson_sampling(f_ascending, 1000, M_tilde)

In [None]:
import plotly.graph_objects as go

In [None]:
# P(0|M) as a function of the frequency for both analytic models, log-log axes.
fig = go.Figure()
f_ascending = f.sort_values()
fig.add_scatter(x=f_ascending, y=pom_sampling, mode="lines", name="sampling", line=dict(dash="longdash", width=5))
fig.add_scatter(x=f_ascending, y=pom_poisson_sampling, mode="lines", name="poisson", line=dict(dash="dashdot", width=5))

fig.update_traces(opacity=0.8)
# On a log axis the range is given in log10 units.
x_range = (-5, np.log10(3e-2))
fig.update_layout(xaxis_title="f",
                  xaxis_type="log",
                  xaxis_range=x_range,
                  xaxis_exponentformat="e",
                  yaxis_title="P(0|M)",
                  yaxis_type="log")
fig.show()

In [None]:
# gc is already imported in the first cell; it is imported again here so this
# cell can be run on its own.
import gc
# Explicitly trigger a garbage-collection pass; the cell output is the value returned by collect().
gc.collect()

In [None]:
def p0m(df, M, limits = (100,110)):
    """Empirical fraction of zeros per size class for a slice of the frequency ranking.

    Columns are grouped into 9 classes by digitising `M` on its quantiles
    0, 1/9, ..., 8/9. Rows are ranked by their normalised mean (decreasing),
    and for every row at positions limits[0]:limits[1] of that ranking the
    fraction of zero entries is computed inside each class.

    df: table whose columns are labelled like the index of `M`.
    M: pandas Series with one size per column of `df`.
    limits: (start, stop) positions in the decreasing ranking.

    Returns (ret, quantiles, (f_mean, f_pos_mean)):
      ret         dict mapping row label -> list of zero fractions, one per class;
      quantiles   the class boundaries;
      f_mean      mean normalised frequency of the selected positions;
      f_pos_mean  mean over the same positions of the separately sorted ranking
                  built from the non-zero entries only.
    """
    quantiles = np.quantile(M, q=np.linspace(0,1,10)[:-1])
    #quantiles = np.linspace(M.min(), M.max(), 20)
    M_classes = pd.Series(index=M.index, data=np.digitize(M, quantiles))

    f = df.mean(axis=1)
    f = f/f.sum()
    f_ = df.apply(lambda x: x[x>0].mean(), axis=1)
    f_ = f_/f_.sum()

    # Column labels of every class, computed once instead of once per row.
    class_columns = [M_classes.index[M_classes == c] for c in np.arange(len(quantiles))+1]

    start, stop = limits[0], limits[1]
    f_ranked = f.sort_values(ascending=False)
    f_pos_ranked = f_.sort_values(ascending=False)

    ret = {}
    for g in f_ranked.index[start:stop]:
        genexpr = df.loc[g,:]
        ret[g] = [(genexpr[columns]==0).sum()/len(columns) for columns in class_columns]
    return ret, quantiles, (f_ranked.iloc[start:stop].mean(), f_pos_ranked.iloc[start:stop].mean())

In [None]:
# Zero fractions per size class for ranking positions 500-509, then their
# average over the selected rows (one value per class).
p0mdata, M_bins, f_mean = p0m(df, M, limits=(500, 510))
tmp = np.average(list(p0mdata.values()), axis=0)

In [None]:
# Analytic P(0) of both models at the class boundaries, using the first
# element of f_mean as the frequency.
xM = M_bins
selected_f = f_mean[0]
pom_sampling = np.ravel([p_sampling([selected_f], m) for m in xM])
pom_poisson_sampling = np.ravel([p_poisson_sampling([selected_f], m, M_tilde) for m in xM])

In [None]:
# P(0|M) against M: the two analytic curves and the average measured on the data.
fig = go.Figure()
fig.add_scatter(x=xM, y=pom_sampling, mode="lines", name="sampling", line_dash="dash")
fig.add_scatter(x=xM, y=pom_poisson_sampling, mode="lines", name="poisson", line=dict(dash="dashdot"))
fig.add_scatter(x=xM, y=tmp, mode="lines", name="data")

fig.update_layout(
    xaxis_title="M",
    yaxis_title="P(0|M)",
    yaxis_type="log",
)
fig.show()