In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
from scipy.optimize import curve_fit
import os

In [None]:
# Read the per-gene statistics table and the gene annotation table; both use
# their first CSV column as the index.
df_mv, df_genes = (
    pd.read_csv(path, index_col=[0])
    for path in ("meanVariances.csv", "genes.txt")
)

# Relabel the statistics rows with the annotation index. This is a positional
# relabelling, so the two tables must have the same number of rows.
# NOTE(review): assumes both files list the genes in the same order - confirm.
df_mv.index = df_genes.index
df_mv["type_of_gene"] = df_genes["type_of_gene"]

# Optional restriction to a single class, currently disabled:
#df_mv = df_mv.loc[df_mv['type_of_gene']=='protein-coding']
df_mv.head()

In [None]:
# Pull the 'mean' and 'variance' columns out of df_mv as plain arrays.
means, variances = (df_mv[column].values for column in ("mean", "variance"))
#occurrences = np.array(df_mv['occurrence'].values, dtype=float)

# Displayed as the cell output: number of rows in the table.
len(means)

In [None]:
## Plot by class
# Scatter of variance against mean for every gene, one colour per gene type,
# with the reference lines y = x and y = x^2 drawn on top.

# np.unique returns the distinct gene types in sorted order.
types_of_gene = np.unique(df_mv['type_of_gene'].values)

xscale = 'log'
yscale = 'log'
fig = plt.figure(figsize=(10, 10))
#['ncRNA', 'rRNA', 'scRNA', 'snRNA', 'snoRNA']
# The [:-1] slice leaves the last entry of the sorted type list out of this plot.
# NOTE(review): presumably a catch-all / unknown class - confirm.
for type_of_gene in types_of_gene[:-1]:
    subdf = df_mv[df_mv['type_of_gene'] == type_of_gene]
    plt.scatter(subdf['mean'], subdf['variance'], label=type_of_gene, marker='o', alpha=0.7, linewidths=0.1)

# Reference scalings: variance = mean and variance = mean^2.
x = np.logspace(-7, 5)
plt.plot(x, x)
plt.plot(x, x*x)

plt.xlabel("$<FPKM>$", fontsize=16)
# Raw string: "\s" is not a valid Python escape and triggers a warning in
# recent Python versions; the text passed to matplotlib is unchanged.
plt.ylabel(r"$\sigma^2_{FPKM}$", fontsize=16)
plt.yscale(yscale)
plt.xscale(xscale)
plt.ylim(5e-7, 1e8)
plt.xlim(5e-6, 1.5e4)
plt.legend()
plt.show()
fig.savefig('allgenes_classes_known_'+yscale+xscale+'.png')

In [None]:
# One variance-vs-mean panel per gene type, each with the reference lines
# y = x and y = x^2, saved as a single multi-panel figure.
xscale = 'log'
yscale = 'log'

# Four panels per row. Keep at least the original 3 rows (so the layout is
# unchanged for up to 12 gene types) but add rows when there are more types,
# otherwise plt.subplot raises for an index beyond the grid.
n_cols = 4
n_rows = max(3, -(-len(types_of_gene) // n_cols))  # ceiling division
fig = plt.figure(figsize=(40, 10*n_rows))  # 10 units of height per row -> (40, 30) for 3 rows

# Reference scalings, identical for every panel.
x = np.logspace(-7, 5)

for pindex, type_of_gene in enumerate(types_of_gene, start=1):
    ax = plt.subplot(n_rows, n_cols, pindex)
    subdf = df_mv[df_mv['type_of_gene'] == type_of_gene]
    ax.scatter(subdf['mean'], subdf['variance'], label=type_of_gene, marker='o', alpha=0.7, linewidths=0.1)
    ax.set_title(type_of_gene, fontsize=20)
    ax.set_xlabel("$<FPKM>$", fontsize=14)
    # Raw string: "\s" is not a valid Python escape; the rendered text is unchanged.
    ax.set_ylabel(r"$\sigma^2_{FPKM}$", fontsize=14)
    ax.plot(x, x)
    ax.plot(x, x*x)
    ax.set_yscale(yscale)
    ax.set_xscale(xscale)
    ax.set_ylim(5e-7, 1e8)
    ax.set_xlim(5e-6, 1.5e4)
plt.show()
fig.savefig('allgenes_perclasses_'+yscale+xscale+'.png')

## Single gene class: mean–variance scaling and occurrence

In [None]:
# Settings used by the single-class cells that follow.
type_of_gene = 'microRNA'
xscale = yscale = 'log'
n_bins = 30

# Rows of the selected class only, with missing values replaced by 0.
subdf = df_mv.loc[df_mv['type_of_gene'] == type_of_gene].fillna(value=0.)

In [None]:
def power_law(mean_fpkm, a, b):
    """Power-law model a * mean_fpkm**b, used both for the fit and for the drawn curve."""
    return a * np.power(mean_fpkm, b)


# Variance vs. mean for the selected gene class, with a log-binned average,
# the reference lines y = x and y = x^2, and a power-law fit to the binned average.
fig = plt.figure(figsize=(10, 10))
x = subdf['mean'].values
y = subdf['variance'].values
plt.scatter(x, y, label=type_of_gene, marker='o', alpha=0.5, linewidths=0.1)

# Log-spaced bin edges from the smallest non-zero mean up to the largest mean.
log_bins_for_x = np.logspace(np.log10(x[x.nonzero()].min()), np.log10(np.max(x)), num=n_bins)
bin_means, bin_edges, binnumber = stats.binned_statistic(x, y, statistic='mean', bins=log_bins_for_x)
bin_centres = (bin_edges[:-1]+bin_edges[1:])/2
plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='r', lw=5, label='binned average')

plt.plot(log_bins_for_x, np.power(log_bins_for_x, 2), 'g-', lw=5, label='$<FPKM>^2$')
plt.plot(log_bins_for_x, log_bins_for_x, 'r-', lw=5, label='$<FPKM>$')

# Fit the same multiplicative model that is drawn below. The original fitted
# a + x**b but plotted a * x**b, so the curve shown was not the fitted one.
# Empty bins have a NaN mean and curve_fit rejects non-finite input, so only
# populated bins take part in the fit.
populated = np.isfinite(bin_means)
popt, pcov = curve_fit(power_law, bin_centres[populated], bin_means[populated])
plt.plot(log_bins_for_x, power_law(log_bins_for_x, *popt), color='y', lw=3, label='fit')
print(popt)  # fitted (a, b)

plt.xlabel("$<FPKM>$", fontsize=16)
# Raw string: "\s" is not a valid Python escape; the rendered text is unchanged.
plt.ylabel(r"$\sigma^2_{FPKM}$", fontsize=16)
plt.yscale(yscale)
plt.xscale(xscale)
plt.ylim(1e-5, 1e8)
plt.xlim(5e-5, 1.5e5)
plt.title(type_of_gene, fontsize=20)
plt.legend(fontsize=16)
plt.show()
fig.savefig('%s_'%type_of_gene+yscale+xscale+'.png')

In [None]:
# Normalised step histogram of occurrence/5000 for the currently selected
# gene class, drawn on log-log axes and saved as a PDF.
# NOTE(review): 5000 is presumably the number of samples - confirm.
bins = 50
# Histogram range padded by half a bin width on each side of [0, 1].
rang = (-0.5/bins, 1+0.5/bins)

fig, ax = plt.subplots(figsize=(10, 5))
# Re-select from df_mv, so the zero-filled copy made earlier is not used here.
subdf = df_mv[df_mv['type_of_gene'] == type_of_gene]
ax.hist(
    subdf['occurrence']/5000,
    label=type_of_gene,
    histtype='step',
    density=True,
    bins=bins,
    range=rang,
    lw=3,
)
ax.legend(fontsize=16, loc='upper center')
ax.set_xlabel("$o_i$", fontsize=18)
ax.set_ylabel("#")
ax.set_yscale('log')
ax.set_xscale('log')
plt.show()
fig.savefig("U_%s.pdf"%type_of_gene)

In [None]:
# Write the index label of every gene whose occurrence exceeds 0.9*5000 to
# ontology.txt, one label per line (the file is overwritten).
with open("ontology.txt", 'w') as g_file:
    g_file.write("".join(
        "%s\n" % item
        for item in df_mv[df_mv['occurrence']>0.9*5000].index.values
    ))

## U: occurrence distributions by gene type

In [None]:
# Normalised step histograms of occurrence/5000 for protein-coding and ncRNA
# genes, overlaid on one set of axes and saved as a PDF.
# NOTE(review): 5000 is presumably the number of samples - confirm.
bins = 11
# Histogram range padded by half a bin width on each side of [0, 1].
rang = (-0.5/bins, 1+0.5/bins)

fig, ax = plt.subplots(figsize=(10, 5))
for t in ('protein-coding', 'ncRNA'):
    subdf = df_mv[df_mv['type_of_gene'] == t]
    ax.hist(
        subdf['occurrence']/5000,
        label=t,
        histtype='step',
        density=True,
        bins=bins,
        range=rang,
        lw=3,
    )
ax.legend(fontsize=16, loc='upper center')
ax.set_xlabel("$o_i$", fontsize=18)
ax.set_ylabel("#")
plt.show()
fig.savefig("multiplecnc.pdf")