In [None]:
import os
import sys
import pandas as pd
import numpy as np
import regex as re
import mygene
from matplotlib import pyplot as plt
from scipy.optimize import curve_fit, fminbound
from scipy import stats
from tableanalyzer import *

In [None]:
# Query client from the `mygene` package, kept in `mg` for gene lookups.
mg = mygene.MyGeneInfo()

In [None]:
#mg.getgene("ENSG00000221782", 'name,symbol,refseq.rna,type_of_gene,exac.bp')

In [None]:
# Root directory that contains the TCGA/ and GTEX/ result tables read below.
# It can be overridden through the TESI_RESULTS_DIR environment variable so the
# notebook is not tied to a single machine; the original location is the default.
working_dir = os.environ.get(
    "TESI_RESULTS_DIR",
    "/Users/filippo/Developer/tesi/results/proteincoding_GTEX/",
)
# Relative paths used later (e.g. saved figures) resolve against this directory.
os.chdir(working_dir)

In [None]:
# Load the TCGA main table and call its first column 'gene'.
df_tcga = pd.read_csv("%s/TCGA/mainTable_TCGA.csv" % working_dir)
# Replace the labels through the public `columns` setter. Writing into
# `columns.values` mutates the Index buffer in place, which bypasses pandas
# (label lookups such as df_tcga['gene'] may not see the new name, and the
# buffer can be read-only).
df_tcga.columns = ['gene'] + list(df_tcga.columns[1:])

In [None]:
# Load the GTEX main table and call its first column 'gene'.
df_gtex = pd.read_csv("%s/GTEX/mainTable_GTEX.csv" % working_dir)
# Replace the labels through the public `columns` setter. Writing into
# `columns.values` mutates the Index buffer in place, which bypasses pandas
# (label lookups such as df_gtex['gene'] may not see the new name, and the
# buffer can be read-only).
df_gtex.columns = ['gene'] + list(df_gtex.columns[1:])

In [None]:
# Preview the first rows of the TCGA main table.
df_tcga.head()

In [None]:
# Preview the first rows of the GTEX main table.
df_gtex.head()

In [None]:
# Table sizes: number of rows (genes) and number of columns other than the
# leading 'gene' column (realizations).
# `shape` is used instead of `len(df.loc[0, :])`, which requires a row labelled 0
# and builds a whole row only to count the columns.
ngenes_tcga = df_tcga.shape[0]
nfiles_tcga = df_tcga.shape[1] - 1
ngenes_gtex = df_gtex.shape[0]
nfiles_gtex = df_gtex.shape[1] - 1
print("TCGA genes:%d\trealizations:%d"%(ngenes_tcga,nfiles_tcga))
print("GTEX genes:%d\trealizations:%d"%(ngenes_gtex,nfiles_gtex))

## Means and variances

In [None]:
# TCGA mean/variance table (first CSV column used as index), with the
# 'type_of_gene' column removed and the occurrence values read from O_TCGA.dat
# inserted as the third column.
# An optional filter on type_of_gene == 'protein-coding' was previously sketched
# here; it would have to run before the column is dropped.
df_mv_occ = pd.read_csv("%s/TCGA/O_TCGA.dat" % working_dir, header=None)
df_mv_tcga = pd.read_csv(
    "%s/TCGA/meanVariances_TCGA.csv" % working_dir,
    index_col=[0],
).drop(columns="type_of_gene")
df_mv_tcga.insert(2, 'occurrence', df_mv_occ.values)
df_mv_tcga.head()

In [None]:
# GTEX mean/variance table (first CSV column used as index), with the
# 'type_of_gene' column removed and the occurrence values read from O_GTEX.dat
# inserted as the third column.
# An optional filter on type_of_gene == 'protein-coding' was previously sketched
# here; it would have to run before the column is dropped.
df_mv_occ = pd.read_csv("%s/GTEX/O_GTEX.dat" % working_dir, header=None)
df_mv_gtex = pd.read_csv(
    "%s/GTEX/meanVariances_GTEX.csv" % working_dir,
    index_col=[0],
).drop(columns="type_of_gene")
df_mv_gtex.insert(2, 'occurrence', df_mv_occ.values)
df_mv_gtex.head()

In [None]:
# Per-gene summary arrays used by the plots below.
# The 'occurrence' column is scaled by the number of realizations of the SAME
# dataset (the GTEX values were previously multiplied by the TCGA count).
means_tcga = df_mv_tcga['mean'].values
variances_tcga = df_mv_tcga['variance'].values
occurrences_tcga = np.array(df_mv_tcga['occurrence'].values, dtype=float)*nfiles_tcga
means_gtex = df_mv_gtex['mean'].values
variances_gtex = df_mv_gtex['variance'].values
occurrences_gtex = np.array(df_mv_gtex['occurrence'].values, dtype=float)*nfiles_gtex

### plot

#### **var** versus **mean**

In [None]:
# Variance versus mean for both datasets, side by side:
# left panel with linear axes, right panel with a logarithmic y-axis.
fig = plt.figure(figsize=(15, 4))
panel_lin, panel_log = fig.subplots(1, 2)
for panel in (panel_lin, panel_log):
    panel.scatter(means_gtex, variances_gtex, c='b', label='gtex')
    panel.scatter(means_tcga, variances_tcga, c='r', label='tcga')
    panel.set_xlabel("$<FPKM>$", fontsize=16)
    # Raw string: "\s" is not a valid Python escape sequence.
    panel.set_ylabel(r"$\sigma^2_{FPKM}$", fontsize=16)
    panel.legend()
panel_log.set_yscale('log')
# panel_log.set_xlim(1e-2, 200)  # optional zoom on the x-axis
panel_log.set_ylim((1e-2, 1e9))
plt.show()

In [None]:
# Write the figure built in the previous cell to "varmean.png" (relative path).
fig.savefig("varmean.png")

In [None]:
# Variance versus mean for both datasets on log-log axes, saved to disk.
fig = plt.figure(figsize=(15, 4))
plt.scatter(means_tcga, variances_tcga, c='r', label='tcga', alpha=0.6)
plt.scatter(means_gtex, variances_gtex, c='b', label='gtex', alpha=0.6)
plt.xlabel("$<FPKM>$", fontsize=16)
# Raw string: "\s" is not a valid Python escape sequence.
plt.ylabel(r"$\sigma^2_{FPKM}$", fontsize=16)
plt.xscale('log')
plt.yscale('log')
plt.xlim(5e-2, 5e5)
plt.ylim((1e-5, 5e9))
plt.legend()
plt.show()
fig.savefig("varmean_loglog.png")

### mean versus occurrence

In [None]:
# Mean versus occurrence for both datasets on log-log axes.
fig = plt.figure(figsize=(8, 5))
plt.scatter(occurrences_tcga, means_tcga, c='r', label='tcga', alpha=0.6)
plt.scatter(occurrences_gtex, means_gtex, c='b', label='gtex', alpha=0.6)
plt.ylabel("$<FPKM>$", fontsize=16)
# Raw string: "\S" and "\T" are not valid Python escape sequences.
plt.xlabel(r"$\Sigma_j\Theta(FPKM-0.1)\Theta(1e5-FPKM)$", fontsize=16)
plt.xscale('log')
plt.yscale('log')
plt.ylim(1e-1, 5e5)
plt.xlim(1e0, 1e3)
# The series carry labels, so show them.
plt.legend()
plt.show()

In [None]:
# Write the figure built in the previous cell to "meanDiff_loglog.png" (relative path).
fig.savefig("meanDiff_loglog.png")

### Distributions

In [None]:
# Number of entries in each means array (TCGA, GTEX).
# The bare name `means` is not defined anywhere in this notebook.
len(means_tcga), len(means_gtex)

In [None]:
# Number of entries in each variances array (TCGA, GTEX).
# The bare name `variances` is not defined anywhere in this notebook.
len(variances_tcga), len(variances_gtex)

In [None]:
# Density-normalised step histograms of the means for both datasets, on a
# logarithmic y-axis. The range is padded by half a bin width on each side.
bins = 60
half_bin = 1e4*0.5/bins
_range = (0-half_bin, 1e4+half_bin)
fig = plt.figure()
for values, dataset, colour in ((means_tcga, 'tcga', 'r'), (means_gtex, 'gtex', 'b')):
    plt.hist(values, density=True, range=_range, bins=bins,
             histtype='step', label=dataset, color=colour)
plt.title("means")
plt.xlabel("$<FPKM>$")
plt.ylabel("#")
plt.yscale('log')
plt.legend()
plt.show()
fig.savefig("mean_distr.pdf")

In [None]:
# Density-normalised step histograms of the variances for both datasets, on a
# logarithmic y-axis. The range is padded by half a bin width on each side.
bins = 80
_range = (0-1e4*0.5/bins, 1e4+1e4*0.5/bins)
fig = plt.figure()
plt.hist(variances_tcga, density = True, range=_range, bins=bins, histtype='step', label='tcga', color='r')
plt.hist(variances_gtex, density = True, range=_range, bins=bins, histtype='step', label='gtex', color='b')
plt.title("vars")
# Raw string: "\s" is not a valid Python escape sequence.
plt.xlabel(r"$<\sigma_{FPKM}^2>$")
plt.ylabel("#")
plt.yscale('log')
# The histograms carry labels, so show them (as the means histogram does).
plt.legend()
plt.show()
fig.savefig("var_distr.pdf")

# single gene

In [None]:
# Overlay the PTPN6 distribution from both datasets on one set of axes.
fig = plt.figure()
ax = fig.subplots()
for table, nfiles, dataset in (
    (df_tcga, nfiles_tcga, 'tcga'),
    (df_gtex, nfiles_gtex, 'gtex'),
):
    genedistr(geneinfo('PTPN6', table, nfiles), 50, ax=ax, density=True, label=dataset, save=False)
plt.legend()
plt.show()

# subset analysis

#### query genes

In [None]:
# Select the GTEX rows whose mean and variance both fall strictly inside the
# search windows, ordered by increasing mean.
search_mean_min = 50
search_mean_max = 200
search_var_min = 1e6
search_var_max = 1e12
pc = True
mean_in_window = (df_mv_gtex['mean'] > search_mean_min) & (df_mv_gtex['mean'] < search_mean_max)
var_in_window = (df_mv_gtex['variance'] > search_var_min) & (df_mv_gtex['variance'] < search_var_max)
query_result = df_mv_gtex.loc[mean_in_window & var_in_window].sort_values(by='mean')
query_result

In [None]:
# For the first 15 query hits, collect a (TCGA info, GTEX info) pair per gene.
# Only the first 15 characters of each index label are passed to geneinfo.
genesnames = [
    (
        geneinfo(gene_id[:15], df_tcga, nfiles_tcga),
        geneinfo(gene_id[:15], df_gtex, nfiles_gtex),
    )
    for gene_id in query_result.index.values[:15]
]

### plot all

In [None]:
# One figure per selected gene, overlaying the GTEX and TCGA distributions.
# Make sure the output directory exists before the first savefig call.
os.makedirs("plot/genes", exist_ok=True)
for gene_tcga, gene_gtex in genesnames:
    print(gene_tcga['name'])
    print("mean: %f\t\t%f"%(gene_tcga['avg'],gene_gtex['avg']))
    print("var: %f\t\t%f"%(gene_tcga['var'],gene_gtex['var']))
    fig = plt.figure()
    ax=fig.subplots()
    # Each legend label now matches the dataset actually drawn (they were swapped).
    genedistr(gene_gtex, 50,ax=ax, density=True, label='gtex', save=False);
    genedistr(gene_tcga, 50,ax=ax, density=True, label='tcga', save=False);
    plt.legend()
    fig.savefig("plot/genes/%s.png"%gene_tcga['name'])
    plt.show()

In [None]:
# Log-spaced grid from 1e-1 to 1e4 (despite the `_lin` name), used as the
# x values of the reference curves in the figure below.
x_lin = np.logspace(-1,4, dtype=float)

In [None]:
def add_binned_plt(x,y, ax = None, label='', c='b'):
    """Scatter ``y`` against ``x`` and overlay log-binned statistics of ``y``.

    The x-axis is split into 30 logarithmically spaced bin edges from 1e-1 to
    ``max(x)``. For every bin the mean of ``y`` is drawn as a horizontal
    segment, and the curve ``mean + 3 * std`` is drawn through the bin centres.
    Both axes are set to logarithmic scale with fixed limits.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points (same length).
    ax : matplotlib axes, optional
        Axes to draw on. A new figure with a single axes is created when omitted.
    label : str
        Legend label of the scatter points.
    c : colour
        Colour used for the points, the bin averages and the 3-sigma curve.

    Returns
    -------
    None
    """
    # Identity test: `ax == None` would invoke the object's own __eq__.
    if ax is None:
        ax = plt.figure().subplots()
    ax.scatter(x, y, marker='o', alpha=0.2, linewidths=0.1, label=label, c=c)

    log_bins_for_x = np.logspace(-1, np.log10(np.max(x)), num=30)
    bin_means, bin_edges, _ = stats.binned_statistic(x, y, statistic='mean', bins=log_bins_for_x)
    bin_sigmas, _, _ = stats.binned_statistic(x, y, statistic=np.std, bins=log_bins_for_x)
    left_edges, right_edges = bin_edges[:-1], bin_edges[1:]
    bin_centres = (left_edges + right_edges)/2

    ax.hlines(bin_means, left_edges, right_edges, colors=c, lw=5, label='binned average')
    # Raw string: "\s" is not a valid Python escape sequence.
    ax.plot(bin_centres, bin_means+bin_sigmas*3, lw=3, color=c, label=r'binned average + $3\sigma$')

    ax.set_xlabel("$<FPKM>$", fontsize=16)
    ax.set_ylabel(r"$\sigma^2_{FPKM}$", fontsize=16)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_ylim(1e-2, 1e9)
    ax.set_xlim(5e-2,5e5)

In [None]:
# Variance versus mean with binned averages for both datasets, drawn over two
# reference curves: y = x^2 (green) and y = x (yellow).
fig = plt.figure(figsize=(12, 7))
ax = fig.subplots()

ax.plot(x_lin, np.power(x_lin, 2), 'g-', lw=5, label='$<FPKM>^2$')
ax.plot(x_lin, x_lin, 'y-', lw=5, label='$<FPKM>$')

for mean_values, var_values, dataset, colour in (
    (means_gtex, variances_gtex, 'gtex', 'b'),
    (means_tcga, variances_tcga, 'tcga', 'r'),
):
    add_binned_plt(mean_values, var_values, label=dataset, ax=ax, c=colour)
plt.legend()
plt.show()

In [None]:
# Write the figure built in the previous cell to "varmean_3sigma.png" (relative path).
fig.savefig("varmean_3sigma.png")

In [None]:
# 2-D histogram of the GTEX (mean, variance) pairs on logarithmic axes.
x = means_gtex
y = variances_gtex

fig = plt.figure(figsize=(12, 6))
ax = fig.subplots()

# Bin edges: 30 log-spaced values per axis, from a fixed lower bound up to the
# largest value in the data.
xmin = np.log10(1e-1)
xmax = np.log10(x.max())
ymin = np.log10(1e-3)
ymax = np.log10(y.max())

xbins = np.logspace(xmin, xmax, 30)  # from 10**xmin to 10**xmax
ybins = np.logspace(ymin, ymax, 30)  # from 10**ymin to 10**ymax

# Count once with NumPy and draw once with pcolormesh. Calling ax.hist2d and
# then pcolormesh drew the same mesh twice on the same axes.
counts, _, _ = np.histogram2d(x, y, bins=(xbins, ybins))

# histogram2d returns counts indexed [x, y]; pcolormesh expects rows along y.
pcm = ax.pcolormesh(xbins, ybins, counts.T)
plt.colorbar(pcm)

ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xlim(xbins[0], xbins[-1])
ax.set_ylim(ybins[0], ybins[-1])

ax.set_title("")
ax.set_xlabel("$<FPKM>$", fontsize=16)
# Raw string: "\s" is not a valid Python escape sequence.
ax.set_ylabel(r"$\sigma^2_{FPKM}$", fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# Write the figure built in the previous cell to "varmean_density_gtex.png" (relative path).
fig.savefig("varmean_density_gtex.png")

## Data size: Heaps' law check

In [None]:
# NOTE(review): `df` was never defined in this notebook, so this cell failed on a
# fresh kernel. It is bound to the TCGA table here as an assumption; switch to
# df_gtex if the check was meant for GTEX.
df = df_tcga
# Values of the second column (the first one after 'gene') and their total.
col = df.loc[:,df.keys()[1]].values
np.sum(col)

In [None]:
# Number of nonzero entries in `col`.
np.count_nonzero(col)

In [None]:
# For each column after the first (at most 449 of them), plot the number of
# nonzero entries against the column total.
# Slicing the column labels keeps the original upper bound, but no longer raises
# IndexError when the table has fewer than 450 columns.
sample_columns = df.keys()[1:450]
x = [np.sum(df.loc[:, name].values) for name in sample_columns]
y = [np.count_nonzero(df.loc[:, name].values) for name in sample_columns]
plt.scatter(x,y)
plt.xlabel("column total")
plt.ylabel("number of nonzero entries")