In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import os
from matplotlib import pyplot as plt
import scipy.stats as st
from scipy.stats import binned_statistic
from gtex import *

In [None]:
# Directory that holds the GTEx inputs read below with relative paths
# ("files.dat", the .gct expression table). The GTEX_WORKING_DIR environment
# variable overrides the original hard-coded location, so the notebook can be
# run on another machine without editing this cell.
working_dir = os.environ.get("GTEX_WORKING_DIR", "/Users/filippo/Developer/tesi/gtex/")
os.chdir(working_dir)

In [None]:
# Label inserted into figure titles and axis labels further down.
normalisation_str = "counts"

In [None]:
# Gene annotation table, indexed by its first column.
df_genes = pd.read_csv("../genes.txt", index_col=[0])

# Index values of every gene whose annotated type is one of the listed classes.
genes = df_genes.loc[
    df_genes['type_of_gene'].isin([
        'intergenic', 'protein-coding', 'ncRNA', 'snRNA',
        'pseudo', 'antisense', 'microRNA', 'snoRNA', 'rRNA', 'scRNA',
    ])
].index.values

# Display the gene types that actually occur in the table.
df_genes['type_of_gene'].unique()

In [None]:
# GTEX
common_tissues = ['Blood','Heart', 'Muscle', 'Brain', 'Skin', 'Adipose Tissue', 'Nerve', 'Thyroid', 'Testis']
more_common_tissues = np.unique(samples['primary_site'])

In [None]:
samples = pd.read_csv("files.dat", index_col=[0])
samples.head()

In [None]:
# Distinct primary sites, in order of first appearance in the sample table.
primaries = samples.loc[:, 'primary_site'].unique()

In [None]:
# Display the primary sites in sorted order.
np.sort(primaries)

In [None]:
# Display the distinct secondary sites; np.unique already returns them sorted.
np.unique(samples['secondary_site'])

In [None]:
# GTex
# Tab-separated expression table, read lazily with dask.
# NOTE(review): judging by the file name this holds the median TPM per gene
# and tissue — TODO confirm.
df = dd.read_csv("GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct", sep='\t')
df = df.drop('Description', axis=1)
# Keep only the first 15 characters of each identifier. The vectorised string
# slice replaces a row-wise Python lambda and yields the same values.
# NOTE(review): presumably this strips the version suffix of the gene id —
# TODO confirm that no two identifiers collapse onto the same prefix.
df['gene_id'] = df['gene_id'].str[:15]
df = df.set_index('gene_id')

In [None]:
# Preview the first rows of the expression table.
df.head()

In [None]:
#df.loc[:,'Description'].to_csv('gene_symbol.txt', index=True)

In [None]:
# Restrict the expression table to the genes annotated as protein-coding.
# NOTE(review): every later cell that reads `df` sees only these rows — confirm
# this is intended for the "non coding" panel plotted further down.
df = df.loc[
    df_genes.loc[df_genes['type_of_gene'] == 'protein-coding'].index.values,
    :,
]

## Tissue U

In [None]:
# Occurrence of each gene: the fraction of columns in which its value is
# non-zero, computed over the rows that have no missing value.
# The mean of the boolean mask equals count_nonzero / n_columns and avoids
# calling a Python lambda once per row.
O = (df.dropna() != 0).mean(axis=1).compute()

In [None]:
# Side-by-side histograms of the occurrence O_i for non-coding and coding genes.
fig = plt.figure(figsize=(20,8))
ax = fig.subplots(1,2)
bins = 20
# Half a bin of padding on each side so that 0 and 1 sit at bin centres.
rang = (0-0.5/20,1+0.5/20)
is_coding = df_genes['type_of_gene']=='protein-coding'
panels = (
    ("non coding", df_genes[~is_coding].index.values),
    ("coding", df_genes[is_coding].index.values),
)
for axis, (title, gene_ids) in zip(ax, panels):
    # reindex tolerates identifiers that are absent from O (they become NaN and
    # are dropped), whereas .loc raises KeyError for them in current pandas.
    # NOTE(review): if O only covers protein-coding genes (df is filtered to
    # them in an earlier cell) the "non coding" panel has no data — confirm.
    occurrences = np.array(O.reindex(gene_ids).dropna().values, dtype=float)
    axis.hist(occurrences, histtype='step', lw=4, density=True, bins=bins, range=rang)
    axis.set_title(title, fontsize=18)
    axis.set_xlabel('$O_i$', fontsize=16)
    axis.set_ylabel('#')
plt.show()
fig.savefig("U_tissue.pdf")

## average tissues

In [None]:
# Build a per-tissue table: one column per primary site holding, for each
# gene, the mean over the df columns selected for that tissue (rows with any
# missing value are dropped before averaging).
# NOTE(review): get_specific_mapping_to comes from the `gtex` star import; it
# presumably returns the df column labels belonging to `tissue` — TODO confirm.
df_tissue = pd.DataFrame(index=df.index)
for tissue in primaries:
    print(tissue)
    #GTEX TPM
    # insert(0, ...) puts each new column first, so the final column order is
    # the reverse of `primaries`.
    df_tissue.insert(0,tissue,df.loc[:,get_specific_mapping_to(tissue)].dropna().mean(axis=1))
    #GTex counts
    #df_tissue.insert(0,tissue,df.loc[:,samples[samples['primary_site']==tissue].index.values].dropna().mean(axis=1))
    #df_tissue.insert(0,tissue,df.loc[:,tissue].dropna())
df_tissue.head()

## integral

In [None]:
def get_integral_tissue(tissue):
    """Return the cumulative expression fraction of `tissue`, largest genes first.

    The values of column `tissue` of `df_tissue` are looked up for every
    identifier in `genes`; identifiers that are missing from `df_tissue` (or
    whose value is NaN) count as zero expression. The values are sorted in
    decreasing order, divided by their sum and cumulatively summed.

    Parameters
    ----------
    tissue : column label of `df_tissue`.

    Returns
    -------
    numpy.ndarray with one entry per identifier in `genes`; entry k is the
    fraction of the total carried by the k+1 most expressed genes.
    """
    # reindex yields NaN for identifiers absent from df_tissue, whereas
    # .loc[genes, tissue] raises KeyError for them in current pandas.
    expression = df_tissue[tissue].reindex(genes).fillna(0).values
    x = np.sort(expression)[::-1]
    norm = np.sum(x)
    return np.cumsum(x/float(norm))

In [None]:
# Cumulative fraction of expression versus number of genes, one curve per tissue.
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('%s sorted integral'%normalisation_str, fontsize=18)

# Alternative tissue selections kept for reference:
#for tissue in more_common_tissues:
#for tissue in primaries:
for tissue in common_tissues:
    print(tissue)
    ax.plot(get_integral_tissue(tissue), label=tissue, lw=3)

#blood = df.loc[:,samples[samples['secondary_site']=='Whole Blood'].index.values].values
#ax.plot(np.cumsum(np.sort(blood)[::-1])/np.sum(blood), label='Whole Blood', lw=2)

# Axis labels, scales and limits.
ax.set_xlabel('Number of genes', fontsize=18)
ax.set_ylabel('Fraction of total tissue expression', fontsize=18)
ax.set_xscale('log')
ax.set_xlim((1,5e4))
ax.set_yticks([0,0.25,0.5,0.75,1])
ax.set_ylim(0,1)
ax.legend(ncol=2, fontsize=16)
plt.show()
fig.savefig("fraction_of_trascriptome.pdf")

In [None]:
# List the column labels of the expression table.
df.columns

In [None]:
# Print the index labels of the ten largest values in the 'Whole Blood' column.
for ensg in df['Whole Blood'].sort_values(ascending=False)[:10].index.values:
    print(ensg)

In [None]:
# Rank the genes by their 'Brain' value once and reuse the ranking below.
x = df_tissue['Brain'].sort_values(ascending=False)
# Top fifteen values together with their labels ...
print(x[:15])
# ... followed by the labels of the top ten alone.
for g in x.index.values[:10]:
    print(g)

## Zipf & Heaps

## Zipf

In [None]:
# Tissues compared in the Zipf and Heaps figures.
variable_tissues = [
    'Heart',
    'Brain',
    'Nerve',
    'Blood',
]

In [None]:
def get_zipf_tissue(tissue):
    '''
    Return the rank-frequency (Zipf) array of `tissue`.

    The non-missing values of column `tissue` of `df_tissue` are sorted in
    decreasing order and divided by their sum, so entry k is the share of the
    gene with rank k+1.

    The values are converted to float before normalising: an in-place division
    on an integer array raises, which would happen for integer-valued data.
    `df_tissue` itself is not modified.
    '''
    A = np.sort(df_tissue.loc[:,tissue].dropna().values.astype(float))[::-1]
    return A / np.sum(A)

In [None]:
# Rank-frequency (Zipf) curves of the selected tissues on log-log axes.
fig = plt.figure(figsize=(13,8))
for tissue in variable_tissues:
    plt.plot(get_zipf_tissue(tissue), lw=2, ls='-', label=tissue)
# Reference 1/r line. The exponent is braced so that mathtext superscripts
# "-1" as a whole instead of only the minus sign.
plt.plot(np.arange(1,2e4), 1./np.arange(1,2e4), 'g--', label='$r^{-1}$')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('rank_i', fontsize=20)
plt.ylabel('$frequency_i$', fontsize=20)
plt.xlim(1,2e4)
plt.ylim(2e-8,1e0)
plt.legend(ncol=4, fontsize=20)
plt.show()
fig.savefig("Zipf_tissue.pdf")

## Heaps

In [None]:
def get_heaps_tissue(tissue):
    """Return per-sample totals and non-zero counts for the samples of `tissue`.

    Samples are the rows of `samples` whose 'secondary_site' is among the
    values returned by get_specific_mapping_to(tissue); their index labels
    select the columns of `df`.

    Returns a tuple (M, ndw) of arrays with one entry per selected column:
    M is the column sum and ndw the number of non-zero entries in the column.
    """
    secondary_sites = get_specific_mapping_to(tissue)
    in_tissue = samples['secondary_site'].isin(secondary_sites)
    sample_ids = samples.loc[in_tissue].index.values
    tissue_df = df.loc[:, sample_ids]

    column_totals = tissue_df.apply(lambda col: np.sum(col), axis=0).values
    expressed_counts = tissue_df.apply(lambda col: len(np.nonzero(col)[0]), axis=0).values
    return column_totals, expressed_counts

In [None]:
# Heaps-style scatter: number of non-zero genes against the per-sample total.
fig, ax = plt.subplots(figsize=(13,8))
for tissue in variable_tissues:
    sample_sizes, genes_expressed = get_heaps_tissue(tissue)
    ax.scatter(sample_sizes, genes_expressed, label=tissue)
ax.set_xlabel('Sample size', fontsize=20)
ax.set_ylabel('# of genes expressed', fontsize=20)
ax.legend(ncol=4, fontsize=20)
plt.show()
fig.savefig("Heaps_tissue.pdf")

## global

In [None]:
# Three-panel summary: cumulative fraction (left), Zipf curve (middle) and
# Heaps scatter with binned averages (right) for the selected tissues.
fig=plt.figure(figsize=(25,8))
ax = fig.subplots(1,3)
# colors[0]: light shades for the raw data; colors[1]: strong shades for the binned averages.
colors = (['darksalmon','lawngreen', 'lightskyblue','pink'],['r','g','b','m'])
variable_tissues = ['Heart','Brain','Nerve', 'Blood']
for i,tissue in enumerate(variable_tissues):
    ax[0].plot(get_integral_tissue(tissue), label=tissue, color=colors[0][i])
    ax[1].plot(get_zipf_tissue(tissue), label=tissue,color=colors[0][i])
    heaps = get_heaps_tissue(tissue)
    ax[2].scatter(heaps[0],heaps[1], label=tissue, c=colors[0][i], alpha=0.2)
    # Average the second array within equal-width bins of the first.
    bin_means, bin_edges, _ = binned_statistic(heaps[0], heaps[1], bins = np.linspace(0.2e8,1.5e8))
    #bin_means, bin_edges, _ = binned_statistic(heaps[0], heaps[1], bins = np.linspace(8.6e5,9.8e5,8))
    ax[2].scatter((bin_edges[:-1]+bin_edges[1:])/2., bin_means, marker='x', c=colors[1][i], label='binned[%s]'%tissue)

# Reference 1/r line. The exponent is braced so that mathtext superscripts
# "-1" as a whole instead of only the minus sign.
ax[1].plot(np.arange(1,1e4), 1./np.arange(1,1e4), 'g--', label='$r^{-1}$')
ax[0].set_xscale('log')
ax[1].set_xscale('log')
ax[1].set_yscale('log')
ax[0].legend(fontsize=16)
ax[1].legend(fontsize=16)
ax[2].legend(fontsize=16)
ax[0].set_xlabel('Number of genes', fontsize=16)
ax[0].set_ylabel('Fraction of total tissue expression', fontsize=16)
ax[1].set_xlabel('rank_i', fontsize=16)
ax[1].set_ylabel('$f_i$', fontsize=16)
ax[2].set_xlabel('Sample size', fontsize=16)
ax[2].set_ylabel('# of genes expressed', fontsize=16)
ax[2].set_xlim(0.05e8,1.5e8)
ax[1].set_ylim(1e-6,1)
plt.show()
fig.savefig("zipfheaps_tissue.pdf")

# Length

In [None]:
# Gene table restricted to protein-coding genes; only its 'lenght' column is kept.
# NOTE(review): this reads "genes.txt" from the working directory while an
# earlier cell reads "../genes.txt" — confirm both files are intended.
q_many = pd.read_csv("genes.txt", index_col=[0], header=[0])
is_protein_coding = q_many['type_of_gene'] == 'protein-coding'
q_many = q_many.loc[is_protein_coding]
lenghts = q_many.loc[:, 'lenght']

In [None]:
#from scipy.stats import binned_statistic
# Mean tissue expression as a function of gene length, in logarithmic length bins.
fig=plt.figure(figsize=(15,7))
bins_for_l = np.logspace(1,8,40)
for tissue in primaries:
    # binned_statistic pairs its two inputs by position, so the lengths are
    # looked up by gene identifier first; genes without an expression value or
    # without a known length are left out.
    expression = df_tissue.loc[:,tissue].dropna()
    gene_lengths = lenghts.reindex(expression.index)
    has_length = gene_lengths.notna().values
    bin_means, bin_edges, _ = binned_statistic(gene_lengths.values[has_length], expression.values[has_length], statistic='mean', bins=bins_for_l)
    #plt.scatter(lenghts,df_tissue.loc[:,tissue])
    plt.scatter((bin_edges[1:]+bin_edges[:-1])/2,bin_means, marker='x',label=tissue)
# The axis scales only need to be set once, after all series are drawn.
plt.yscale('log')
plt.xscale('log')
plt.xlabel('lenght (bp)', fontsize=16)
plt.ylabel('mean (counts)', fontsize=16)
plt.xlim((lenghts.min(),lenghts.max()))
plt.ylim((1e-3,1e5))
plt.legend(ncol=2)
plt.show()
fig.savefig("meanLenght_tissue.pdf")

## inter intra

In [None]:
# Per-gene variance and mean across the tissue columns of df_tissue,
# returned as plain arrays in the row order of df_tissue.
# NOTE(review): np.var and np.average may treat missing values differently
# when applied to a pandas row — TODO confirm both handle NaN as intended.
inter_vars = df_tissue.apply(np.var, axis=1).values
inter_means = df_tissue.apply(np.average, axis=1).values

In [None]:
# Squared coefficient of variation per gene (variance / mean^2); genes whose
# mean is not positive get 0.
inter_cv2s = [
    var / (m*m) if m > 0 else 0
    for var, m in zip(inter_vars, inter_means)
]

In [None]:
# CV^2 versus mean across tissues, with reference lines and binned averages.
# Everything is derived from inter_means / inter_cv2s / df_tissue, so the cell
# does not depend on names defined in other cells or in a previous session.
fig=plt.figure(figsize=(15,4))
inter_cv2_values = np.asarray(inter_cv2s, dtype=float)
expressed = inter_means > 0  # also excludes NaN means
log_lo = np.log10(inter_means[expressed].min())
log_hi = np.log10(inter_means[expressed].max())
# NOTE(review): x_lin was not defined in any earlier cell; a log-spaced grid
# over the observed means is assumed here — TODO confirm.
x_lin = np.logspace(log_lo, log_hi)

plt.scatter(inter_means, inter_cv2s, c='b')
plt.plot(x_lin[:30],1./x_lin[:30], 'g-', lw=3.5, label='Poisson')
plt.plot(x_lin[-30:],[1 for _ in x_lin[-30:]], 'r-', lw=3.5, label='Taylor')
plt.plot(x_lin,[len(df_tissue.columns)-1 for _ in x_lin], '--', lw=3.5, label='bound')

# Average CV^2 inside logarithmic bins of the mean.
bin_means, bin_edges,_=binned_statistic(inter_means[expressed], inter_cv2_values[expressed], statistic='mean', bins=np.logspace(log_lo, log_hi))

plt.scatter((bin_edges[1:]+bin_edges[:-1])/2, bin_means, marker='x', lw=2, color='orange')

plt.title('inter_tissue',fontsize=18)
plt.xlabel("$<%s>$"%normalisation_str, fontsize=16)
plt.ylabel("$cv^2$", fontsize=16)
plt.xscale('log')
plt.yscale('log')
# Axis limits come from the plotted data: a fifth of the smallest mean up to
# one decade above the largest, and a tenth of the smallest positive CV^2 up
# to ten times the number of tissue columns.
# NOTE(review): the lower y limit previously came from an undefined
# `variances` array; the smallest positive CV^2 is assumed — TODO confirm.
plt.xlim(inter_means[expressed].min()/5,np.power(10,log_hi+1))
plt.ylim(inter_cv2_values[inter_cv2_values > 0].min()/10,len(df_tissue.columns)*10)
plt.legend(fontsize=16)
plt.show()
fig.savefig("cvmean_loglog_inter_tissue.png")

In [None]:
# Mean inter-tissue CV^2 inside logarithmic bins of the mean, spanning the
# smallest non-zero mean up to the largest one.
bin_means, bin_edges, binnumber = binned_statistic(
    inter_means,
    inter_cv2s,
    statistic='mean',
    bins=np.logspace(
        np.log10(inter_means[np.nonzero(inter_means)].min()),
        np.log10(inter_means.max()),
    ),
)

def get_inter_distance_from_mean(mean, cv2):
    """Return `cv2` minus the binned average CV^2 of the bin containing `mean`.

    The bin is located in the module-level `bin_edges` with the same
    convention as scipy's binned_statistic: every bin is closed on the left
    and open on the right, except the last one, which is closed on both sides.

    A `mean` that lies outside [bin_edges[0], bin_edges[-1]] (or is NaN) has no
    bin and yields NaN, rather than being compared against the first bin.
    """
    n_bins = len(bin_edges) - 1
    # NaN compares False on both sides, so it is rejected here as well.
    if not (bin_edges[0] <= mean <= bin_edges[-1]):
        return np.nan
    # side='right' maps a value equal to an inner edge to the bin on its right;
    # the clamp keeps the upper-most edge inside the last bin.
    bin_i = min(int(np.searchsorted(bin_edges, mean, side='right')) - 1, n_bins - 1)
    return(cv2-bin_means[bin_i])

In [None]:
# Within-tissue statistics over the samples whose primary site equals `tissue`.
# NOTE(review): `tissue` is not assigned in this cell; it is whatever value an
# earlier `for tissue in ...` loop left behind.
subdf = df.loc[:, samples.loc[samples['primary_site'] == tissue].index.values]

# NaN-aware mean and variance of every gene across those samples.
intra_means = subdf.apply(np.nanmean, axis=1).values
intra_variance = subdf.apply(np.nanvar, axis=1).values

# Squared coefficient of variation; genes whose mean is not positive get 0.
intra_cv2 = [
    variance / (np.power(mean, 2)) if mean > 0 else 0
    for variance, mean in zip(intra_variance, intra_means)
]

# Mean intra-tissue CV^2 inside logarithmic bins of the mean.
bin_means_intra, bin_edges_intra, _ = binned_statistic(
    intra_means,
    intra_cv2,
    statistic='mean',
    bins=np.logspace(
        np.log10(intra_means[np.nonzero(intra_means)].min()),
        np.log10(intra_means.max()),
    ),
)

def get_intra_distance_from_mean(mean, cv2):
    """Return `cv2` minus the binned average CV^2 of the bin containing `mean`.

    Only the intra-tissue binning (`bin_edges_intra`, `bin_means_intra`) is
    consulted. The bin is located with the same convention as scipy's
    binned_statistic: every bin is closed on the left and open on the right,
    except the last one, which is closed on both sides.

    A `mean` that lies outside [bin_edges_intra[0], bin_edges_intra[-1]] (or is
    NaN) has no bin and yields NaN, rather than being compared against the
    first bin.
    """
    n_bins = len(bin_edges_intra) - 1
    # NaN compares False on both sides, so it is rejected here as well.
    if not (bin_edges_intra[0] <= mean <= bin_edges_intra[-1]):
        return np.nan
    # side='right' maps a value equal to an inner edge to the bin on its right;
    # the clamp keeps the upper-most edge inside the last bin.
    bin_i = min(int(np.searchsorted(bin_edges_intra, mean, side='right')) - 1, n_bins - 1)
    return(cv2-bin_means_intra[bin_i])

In [None]:
# Distance of every gene from the binned average, across tissues (inter) and
# within the selected tissue (intra). Both arrays follow the row order of
# df_tissue; the statistics arrays are indexed by the same row position.
gene_positions = range(len(df_tissue.index.values))
inter = np.array([
    get_inter_distance_from_mean(inter_means[i], inter_cv2s[i])
    for i in gene_positions
])
intra = np.array([
    get_intra_distance_from_mean(intra_means[i], intra_cv2[i])
    for i in gene_positions
])

In [None]:
# Scatter of the intra-tissue distance against the inter-tissue distance.
# `x` is only used by the commented-out guide lines below.
x = np.logspace(np.log10(5e-1),20)
fig, ax = plt.subplots()
ax.scatter(inter, intra, label=tissue)
#plt.plot(x,x)
#plt.plot(x,np.sqrt(x))
ax.set_xlabel('inter_(tissue+individual)')
ax.set_ylabel('intra_%s'%tissue)
#plt.yscale('log')
#plt.xscale('log')
ax.set_ylim(-50,150)
ax.set_xlim(-50,35)
plt.show()
fig.savefig("inter_intra_%s.png"%tissue)

In [None]:
# Print the index label of every gene whose intra distance is below 1 in
# absolute value and whose inter distance lies strictly between 8 and 10.
for i, inter_g in enumerate(inter):
    small_intra = np.abs(intra[i]) < 1
    inter_in_window = 8 < inter_g < 10
    if small_intra and inter_in_window:
        print(df.index[i])