In [None]:
import pandas as pd
import numpy as np
from tableanalyser import *
import os
import scipy.stats as st

In [None]:
# Directory that holds the input tables read by the cells below.
# Set the TESI_WORKING_DIR environment variable to run the notebook on another
# machine; the original hard-coded location remains the default.
working_dir = os.environ.get("TESI_WORKING_DIR", "/Users/filippo/Developer/tesi")
# Later cells open files by relative path, so they resolve against this directory.
os.chdir(working_dir)

In [None]:
# Label of the quantity being analysed; used in axis labels and passed as the
# `metric` argument to the tableanalyser helpers in later cells.
normalisation_str = 'counts'

In [None]:
# Load the main table from the working directory
# (presumably one row per gene and one column per sample file -- TODO confirm).
df = pd.read_csv(os.path.join(working_dir, "mainTable.csv"))
# Name the first column 'gene' through the public API. Writing into
# `df.columns.values` mutates the Index buffer in place, which pandas does not
# support: cached lookups can keep the old label (or the write can fail outright).
df = df.rename(columns={df.columns[0]: 'gene'})

In [None]:
# Per-gene summary table, indexed by the first column of the CSV.
df_mv = pd.read_csv("meanVariances.csv", index_col=[0])

# Occurrence values live in a separate header-less file and are attached
# positionally as the fourth column, so O.dat is assumed to list the genes in the
# same order as meanVariances.csv -- TODO confirm.
df_mv_occ = pd.read_csv("O.dat", header=None)
df_mv.insert(loc=3, column='occurrence', value=df_mv_occ.values)

# Optional restriction to a single gene type (currently disabled):
#   df_mv = df_mv.loc[df_mv['type_of_gene'] == 'protein-coding']
df_mv.head()

In [None]:
# Number of sample columns: every column of df except the leading gene column.
nfiles = len(df.columns) - 1
means = df_mv['mean'].values
variances = df_mv['variance'].values
# Turn occurrence values into integer counts out of nfiles
# (assumes 'occurrence' is a fraction of files in [0, 1] -- TODO confirm).
# Round before casting: a plain int cast truncates, so a product such as
# 56.99999999999999 would silently become 56 instead of 57.
occurrences = np.rint(df_mv['occurrence'].values * nfiles).astype(int)

## Single gene

In [None]:
# Look at one gene in isolation. geneinfo/genedistr are tableanalyser helpers;
# geneplot(gene) and genecoord(gene, means, variances) are alternative views that
# can be called here as well.
gene_id = 'ENSG00000078237'
gene = geneinfo(gene_id, df, nfiles)
genedistr(gene, 50);

## Search
### by mean variance

In [None]:
# Open-interval window on mean and variance; `pc` additionally restricts the
# result to rows whose type_of_gene is 'protein-coding'.
search_mean_max = 200
search_mean_min = 50
search_var_max = 1e12
search_var_min = 1e6
pc = True

mean_in_window = (df_mv['mean'] > search_mean_min) & (df_mv['mean'] < search_mean_max)
var_in_window = (df_mv['variance'] > search_var_min) & (df_mv['variance'] < search_var_max)
selection = mean_in_window & var_in_window
if pc:
    selection = (df_mv['type_of_gene'] == 'protein-coding') & selection

query_result = df_mv.loc[selection].sort_values(by='mean')
query_result

### by mean occurrence

In [None]:
# Genes with a large mean that also exceed the occurrence threshold.
# Note: this rebinds search_mean_min and query_result from the previous search.
search_mean_min = 2e5
search_occ_min = 0.995
high_mean = df_mv['mean'] > search_mean_min
high_occurrence = df_mv['occurrence'] > search_occ_min
query_result = df_mv.loc[high_mean & high_occurrence].sort_values(by='mean')

In [None]:
# Build the gene-info objects for (at most) the first nine hits of the last query.
# Despite its name, `genesnames` holds the objects returned by geneinfo, not strings.
top_gene_ids = query_result.index.values[:9]
genesnames = [
    geneinfo(gene_id, df, nfiles, metric=normalisation_str)
    for gene_id in top_gene_ids
]

### coordinates

In [None]:
# Mean/variance scatter of every gene, with the selected genes marked by crosses.
fig = plt.figure(figsize=(18,8))
plt.scatter(means, variances)
for g in genesnames:
    plt.scatter([g['avg']],[g['var']], marker='x', s=90, label=g['name'])
plt.xlabel("$<%s>$"%normalisation_str, fontsize=16)
# Raw string: "\s" is an invalid escape sequence in a normal string literal.
plt.ylabel(r"$\sigma^2_{%s}$"%normalisation_str, fontsize=16)
# Select log axes once, then set the limits.
plt.xscale('log')
plt.yscale('log')
# Upper limits sit one decade above the largest value; the lower y limit sits one
# decade below the smallest non-zero variance (zeros cannot be drawn on a log axis).
plt.xlim(5e-5,np.power(10,np.log10(means.max())+1))
plt.ylim((variances[variances.nonzero()].min()/10,np.power(10,np.log10(variances.max())+1)))
plt.legend()
# Make sure the target directory exists and save before show(), so the file is
# written regardless of how the active backend treats the figure after display.
out_dir = "plot/genes"
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
fig.savefig("plot/genes/subset.png")
plt.show()

## Plot all

In [None]:
# Print name, mean and variance of each selected gene and draw its distribution
# on logarithmically spaced bins. geneplot(gene) / genecoord(gene) are further
# tableanalyser views that could be called inside the loop.
# The loop variable must stay `gene`: a later cell reads it after this loop ends.
for gene in genesnames:
    summary = "%s\nmean: %f\nvar: %f" % (gene['name'], gene['avg'], gene['var'])
    print(summary)
    genedistr(gene, bins=np.logspace(0, 8), metric=normalisation_str);

In [None]:
# Grid of rescaled histograms, one panel per selected gene. Each gene's data is
# divided by b = var/mean, then compared with (i) a gamma fitted by scipy and
# (ii) the moment-matched gamma(a, loc=0, scale=1).
use_log_bins = False
use_log_scale = False
n_cols = 3
# Ceiling division with integer arithmetic: `/` yields a float on Python 3, which
# neither fig.subplots nor indexing accepts, and a gene count that is not a
# multiple of three still needs a (partially filled) last row.
n_rows = max(1, -(-len(genesnames) // n_cols))
fig = plt.figure(figsize=(25,15))
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
axs = fig.subplots(n_rows, n_cols, squeeze=False)
# NOTE(review): log_bin_x is never passed to ax.hist below, and the linear grid is
# the one selected when use_log_bins is True, which looks inverted -- confirm intent.
log_bin_x = np.logspace(0,np.log10(1e7))
if use_log_bins:
    log_bin_x = np.linspace(0,1e6)
print("ENSG", "a", "b", "loc", "var", "<>")
x_grid = np.linspace(0,10)
for i,gene in enumerate(genesnames):
    # Row-major placement: the row is the index divided by the number of columns.
    ax = axs[i // n_cols][i % n_cols]
    ax.set_title("%s [$<>$:%.0f $o_i$:%.1f]"%(gene['name'], gene['avg'],gene['occ']))
    data = gene['data']
    # Moment estimates of the gamma shape (a) and scale (b) on the raw data.
    mu = np.average(data)
    var = np.var(data)
    a = mu*mu/var
    b = var/mu
    print(gene['name'],"%.3f  %.3f  %.3f  %.3f"%(a, b, var, mu))

    # Rescale by b: the shape estimate is unchanged and the scale estimate becomes 1.
    data = data / b
    mu = np.average(data)
    var = np.var(data)
    a = mu*mu/var
    b = var/mu
    func = st.gamma
    # scipy returns (shape, loc, scale) for the gamma distribution.
    fit_alpha, fit_loc, fit_beta=func.fit(data)
    print(gene['name'],"%.3f (%.2f)  %.3f (%.2f) %.3f  %.3f  %.3f"%(fit_alpha, a, fit_beta, b, fit_loc, var, mu))
    count, bin_edges, _ = ax.hist(data, histtype='step',lw=2, density=True, label=gene['name'])
    ax.plot(x_grid, func.pdf(x_grid,fit_alpha, fit_loc, fit_beta), label='gamma')
    ax.plot(x_grid, func.pdf(x_grid,a, 0, 1), label='gamma')
    # Draw on this panel (plt.text would attach the text to the last-created axes);
    # backslashes are needed for mathtext to render Greek letters.
    ax.text(0.5, 0.8, '($\\alpha$, $\\beta$, loc):\n(%4.2f, %4.2f, %4.2f)'%(fit_alpha, fit_beta, fit_loc), horizontalalignment='left',verticalalignment='top', fontsize=16, transform=ax.transAxes)

    ax.set_xlabel(normalisation_str, fontsize=16)
    if use_log_scale:
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_xlim(5e-1,1e1)
        ax.set_ylim(count[count.nonzero()].min()/np.sum(gene['data'])*1e8,1e2)
    else:
        ax.set_ylim(0,1)
# Encode the active options in the output file name.
bins_str = ''
log_str = ''
if use_log_bins:
    bins_str = '_logbins'
# The '_log' suffix reflects the axis scale, not the binning.
if use_log_scale:
    log_str = '_log'
# Save before show() so the file is written regardless of the backend.
fig.savefig("highmean_highO%s%s_rescaled.pdf"%(bins_str,log_str))
plt.show()

In [None]:
# Grid of histograms on the original scale, one panel per selected gene, each
# overlaid with the moment-matched gamma density (shape a, scale b, loc 0).
use_log_bins = False
use_log_scale = False
n_cols = 3
# Ceiling division with integer arithmetic: `/` yields a float on Python 3, which
# neither fig.subplots nor indexing accepts, and a gene count that is not a
# multiple of three still needs a (partially filled) last row.
n_rows = max(1, -(-len(genesnames) // n_cols))
fig = plt.figure(figsize=(25,18))
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
axs = fig.subplots(n_rows, n_cols, squeeze=False)
# NOTE(review): log_bin_x is never passed to ax.hist below, and the linear grid is
# the one selected when use_log_bins is True, which looks inverted -- confirm intent.
log_bin_x = np.logspace(0,np.log10(1e7))
if use_log_bins:
    log_bin_x = np.linspace(0,1e6)
print("ENSG", "a", "b", "var", "<>")
for i,gene in enumerate(genesnames):
    # Row-major placement: the row is the index divided by the number of columns.
    ax = axs[i // n_cols][i % n_cols]
    ax.set_title("%s [$<>$:%.0f $o_i$:%.1f]"%(gene['name'], gene['avg'],gene['occ']))
    data = gene['data']
    # Moment estimates of the gamma shape (a) and scale (b).
    mu = np.average(data)
    var = np.var(data)
    a = mu*mu/var
    b = var/mu
    print(gene['name'],"%.3f  %.3f  %.3f  %.3f"%(a, b, var, mu))

    func = st.gamma
    count, bin_edges, _ = ax.hist(data, histtype='step',lw=2, density=True, label=gene['name'])
    x_grid = np.linspace(0,data.max())
    # Density of gamma(a, scale=b), written via the unit-scale pdf: f(x/b; a) / b.
    ax.plot(x_grid, func.pdf(x_grid/b,a, 0, 1)/b, label='gamma')
    # Draw on this panel (plt.text would attach the text to the last-created axes);
    # backslashes are needed for mathtext to render Greek letters.
    ax.text(0.3, 0.8, '($\\alpha$, $\\beta$, loc):\n(%4.2f, %4.2f, %4.2f)'%(a, b, 0), horizontalalignment='left',verticalalignment='top', fontsize=16, transform=ax.transAxes)

    ax.set_xlabel(normalisation_str, fontsize=16)
    if use_log_scale:
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_xlim(5e-1,1e1)
        ax.set_ylim(count[count.nonzero()].min()/np.sum(gene['data'])*1e8,1e2)
# Encode the active options in the output file name.
bins_str = ''
log_str = ''
if use_log_bins:
    bins_str = '_logbins'
# The '_log' suffix reflects the axis scale, not the binning.
if use_log_scale:
    log_str = '_log'
# Save before show() so the file is written regardless of the backend.
fig.savefig("highmean_highO%s%s.pdf"%(bins_str,log_str))
plt.show()

In [None]:
def powgaus(x, a, b, c, d, e):
    """Sum of a power-law exponential term and a scaled double-exponential bump.

    Evaluates ``exp(a * x**-b) - 1 + e * (exp(exp(-(x - c)**2 / d)) - 1)``.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which to evaluate; array input is handled element-wise.
    a, b : float
        Amplitude and (negated) exponent of the power-law term.
    c, d : float
        Centre and width divisor of the Gaussian inside the bump term.
    e : float
        Weight of the bump term.
    """
    power_term = np.exp(a * np.power(x, -b)) - 1
    offset = x - c
    bump_term = np.exp(np.exp(-offset * offset / d)) - 1
    return power_term + e * bump_term

In [None]:
# Model handed to curve_fit and fminbound in the fitting cell below; rebind this
# name to fit a different function.
fitfunc = powgaus

In [None]:
# Fit `fitfunc` to the histogram of one gene and locate the minimum of the fitted
# curve between its two components.
# NOTE: `gene` is whatever an earlier cell last bound to that name (in run-all
# order, the final element of `genesnames`).
xscale = 'log'
yscale='log'
bins = 30
fig = plt.figure(figsize=(10,4))
ax = fig.subplots()
counts, bin_edges, _ = ax.hist(gene['data'], histtype='step', bins=bins, range=(0,1e4))
bin_centres = (bin_edges[1:]+bin_edges[:-1])/2.
# Error bars are the square root of the bin counts.
ax.errorbar(bin_centres, counts, np.sqrt(counts), None, 'bo', label='data')
# (lower, upper) bounds for the five fit parameters, in fitfunc's order (a, b, c, d, e).
bounds = ([2, 0.3, 3000, 1000, 0.01],[1000, 0.7, 5000, 500000, 3])
popt, pcov = curve_fit(fitfunc, bin_centres, counts, bounds=bounds)
print(popt)
x = np.linspace(bin_centres[0],1e4)
ax.set_title(gene['name'], fontsize=18)
ax.set_xlabel("$%s$"%normalisation_str)
ax.set_ylabel("#")
ax.set_yscale(yscale)
ax.set_xscale(xscale)
# Minimum of the fitted curve, searched on the interval [100, 1500].
minimum = fminbound(fitfunc, 100, 1500, args=tuple(popt))
ax.plot(x, fitfunc(x, *popt), 'r', label='fit')
# The two components of the model, drawn separately.
ax.plot(x, np.exp(popt[0]*np.power(x,-popt[1])) - 1, label='$e^{a*x^{-b}}$')
# Raw / escaped strings below: "\m" and "\s" are invalid escape sequences in a
# normal string literal.
ax.plot(x, popt[4]*np.exp(np.exp(-(x-popt[2])*(x-popt[2])/popt[3])) - popt[4], label=r"$C(e^{e^{- (x-\mu)^2\sigma^{-2}}}-1)$")
ax.text(0.5, 0.45, 'min: %6.1f'%minimum, horizontalalignment='left',verticalalignment='top', fontsize=16, transform=ax.transAxes)
ax.text(0.5, 0.4, '(a, b, $\\mu$, $\\sigma$, C):\n(%d, %4.2f, %d, %d, %3.1f)'%(popt[0],popt[1],popt[2],popt[3],popt[4]), horizontalalignment='left',verticalalignment='top', fontsize=16, transform=ax.transAxes)
ax.set_ylim(1,2e4)
ax.legend(fontsize=14)
plt.show()