In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import scipy.stats as st
from scipy.stats import binned_statistic

In [None]:
# Directory that every relative path in this notebook is resolved against.
# The TESI_WORKING_DIR environment variable overrides the machine-specific
# default, so the notebook can run on another machine without editing this cell.
working_dir = os.environ.get("TESI_WORKING_DIR", "/Users/filippo/Developer/tesi")
os.chdir(working_dir)

In [None]:
# Name of the expression unit; interpolated into axis labels, legends and figure file names.
normalisation_str = "counts"

In [None]:
# TCGA
# Primary sites shown in the per-tissue panels.
common_tissues = ['Bronchus and lung', 'Brain', 'Breast', 'Ovary', 'Kidney', 'Colon', 'Corpus uteri','Adrenal gland','Skin']
# Extended list: common_tissues followed by the extra sites not already in it
# ('Skin' is in both lists and must appear only once).
_extra_tissues = ['Blood','Heart, mediastinum, and pleura','Skin','Testis','Thyroid gland']
more_common_tissues = np.concatenate((common_tissues,[t for t in _extra_tissues if t not in common_tissues]))
# Disease types drawn in the sorted-integral figure.
common_desease = ['Adenomas and Adenocarcinomas','Epithelial Neoplasms, NOS', 'Squamous Cell Neoplasms', 'Gliomas','Nevi and Melanomas','Cystic, Mucinous and Serous Neoplasms','Mature B-Cell Lymphomas','Thymic Epithelial Neoplasms','Paragangliomas and Glomus Tumors']

In [None]:
# Load the per-sample table from files.dat (first column becomes the index) and preview it.
samples = pd.read_csv("files.dat", index_col=[0])
samples.head()

In [None]:
# Distinct values of the two annotation columns, in order of first appearance.
primaries, diseases = (samples[column].unique() for column in ('primary_site', 'disease_type'))

In [None]:
# Display the distinct disease types in sorted order.
np.sort(diseases)

In [None]:
# TCGA
# Expression table; the first CSV column becomes the index and the columns are
# later selected with the index values of `samples`.
df = pd.read_csv(("%s/mainTable.csv"%working_dir), index_col=[0])
# Store zeros implicitly. DataFrame.to_sparse only exists in old pandas
# (removed in 1.0); newer versions need the SparseDtype cast instead.
if hasattr(df, "to_sparse"):
    df = df.to_sparse(fill_value=0.)
else:
    df = df.astype(pd.SparseDtype(float, 0.))

## Tissue U

In [None]:
# Primary site analysed in the occurrence ("U") section below.
tissue = "Brain"

In [None]:
# Ids of the samples annotated with the chosen primary site, and the matching columns of df.
belongs_to_tissue = samples['primary_site'] == tissue
sample_list = samples.loc[belongs_to_tissue].index.values
subdf = df.loc[:, sample_list]

In [None]:
# Occurrence of each gene: number of samples of the tissue whose value exceeds 0.1.
# Computed on the whole matrix at once rather than with one Python loop per gene.
occurrences = (np.asarray(subdf.values, dtype=float) > 1e-1).sum(axis=1)
# Keep only genes seen in more than one sample, as plain ints in row order.
O = [int(o) for o in occurrences if o > 1]

In [None]:
# Normalised histogram of the occurrence fraction O_i / (number of samples in the tissue).
bins = 20
# Range padded by half a bin width on both sides of [0, 1].
rang = (-0.5/20, 1 + 0.5/20)
fig = plt.figure()
ax = fig.gca()
occurrence_fraction = np.array(O, dtype=float) / len(sample_list)
ax.hist(occurrence_fraction, bins=bins, range=rang, density=True, histtype='step', lw=4)
ax.set_title(tissue, fontsize=18)
ax.set_xlabel('$O_i$', fontsize=16)
ax.set_ylabel('#')
plt.show()
fig.savefig("U_%s.png" % tissue)

## average tissues

In [None]:
# Average every row of df over the samples of each group defined by samples[label],
# then store the table as mainTable_<label>.csv.
label = 'disease_type'
df_tissue = pd.DataFrame(index=df.index)
group_of_sample = samples[label]
for tissue in group_of_sample.unique():
    print(tissue)
    members = samples.loc[group_of_sample == tissue].index.values
    subdf = df.loc[:, members]
    # insert at position 0: the newest group always becomes the first column
    df_tissue.insert(0, tissue, subdf.mean(axis=1))
df_tissue.to_csv("mainTable_%s.csv" % label, index=True, header=True)

In [None]:
# Reload the per-group averages from disk, replacing missing values with 0, and preview.
df_tissue = pd.read_csv("mainTable_%s.csv" % label, index_col=[0]).fillna(value=0.)
df_tissue.head()

In [None]:
# Row label (gene id) inspected in the single-gene plots below.
gene = "ENSG00000198888"

In [None]:
# Histogram of one gene over every column of df, overlaid with a gamma density
# whose shape a and scale b are matched to the sample mean and variance.
data = np.asarray(df.loc[gene, :].values, dtype=float)
x = np.linspace(0, data.max() + 10)
mu = np.average(data)
var = np.var(data)
a = mu*mu/var
b = var/mu

fig = plt.figure()
ax = fig.subplots()
ax.set_title('Expression across all tissues: %s' % gene, fontsize=18)
ax.hist(data, bins=x, density=True, histtype='step', lw=2, label='gene')
# standard gamma evaluated at x/b and divided by b, i.e. a gamma with scale b
ax.plot(x, st.gamma.pdf(x/b, a, 0, 1)/b)
ax.set_xlabel('%s' % normalisation_str, fontsize=16)
ax.set_ylabel('#')
#ax.set_xscale('log')
#ax.set_yscale('log')
ax.set_xlim(5e-2, data.max())
plt.show()
fig.savefig("plot/%s_%s_alltissues.png" % (normalisation_str, gene))

In [None]:
# One panel per entry of common_tissues: histogram of `gene` over the samples of that
# primary site, overlaid with a gamma density matched to the sample mean and variance.
fig=plt.figure(figsize=(25,20))
N=len(common_tissues)
n_rows = 3
# ceil(N / n_rows) in integer arithmetic: "/" yields a float on Python 3, which is
# neither a valid subplot count nor a valid index.
n_cols = -(-N // n_rows)
# squeeze=False keeps axs two-dimensional whatever the grid shape
axs=fig.subplots(n_rows, n_cols, squeeze=False)
for i,tissue in enumerate(common_tissues):
    # row-major position of panel i in a grid with n_cols columns
    ax=axs[i // n_cols][i % n_cols]
    ax.set_title('%s: %s'%(tissue,gene), fontsize=13)
    sample_list = samples[samples['primary_site']==tissue].index.values
    subdf = df.loc[gene,sample_list]
    data = subdf.fillna(value=0.).values
    #data = data * np.average(data) / np.var(data)
    x = np.linspace(0,data.max()+10)
    mu = np.average(data)
    var = np.var(data)
    # gamma shape a and scale b from the first two moments
    a = mu*mu/var
    b = var/mu
    ax.hist(data,density=True,label="%s [%d]"%(tissue, len(data)),histtype='step',lw=1.5)
    ax.plot(x,st.gamma.pdf(x,a,0,b))
    ax.set_xlabel('%s'%normalisation_str,fontsize=13)
    ax.set_ylabel('#')
    ax.set_xlim(5e-2,data.max())
    ax.legend(fontsize=16)
plt.show()
fig.savefig("plot/%s_%s_per_tissue.png"%(normalisation_str,gene))

In [None]:
# Histogram of the per-group averages of one gene (a row of df_tissue), overlaid with a
# gamma density matched to their mean and variance.
fig = plt.figure()
ax = fig.subplots()
ax.set_title('Expression per tissue: %s' % gene, fontsize=18)
data = np.asarray(df_tissue.loc[gene, :].values, dtype=float)
x = np.linspace(0, data.max())
mu = np.average(data)
var = np.var(data)
a = mu*mu/var
b = var/mu
ax.hist(data, bins=10, density=True, histtype='step', lw=2, label='gene')
ax.plot(x, st.gamma.pdf(x/b, a)/b)
ax.set_xlabel('%s' % normalisation_str, fontsize=16)
ax.set_ylabel('#')
#ax.set_xscale('log')
#ax.set_yscale('log')
plt.show()
fig.savefig("plot/%s_%s_across_tissue.png" % (normalisation_str, gene))

## integral

In [None]:
#TCGA
def get_integral_tissue(tissue):
    """
    Cumulative fraction of the total carried by the largest entries of one df_tissue column.

    The column `tissue` of df_tissue is sorted in decreasing order, divided by its sum
    and cumulatively summed, so element k is the share held by the k+1 largest rows.

    tissue: column label of df_tissue (a missing label raises KeyError from the lookup).
    Returns a 1-D float array; if the column cannot be converted to numbers a message
    is printed and an empty array is returned, so a plotting loop can carry on.
    """
    subdf = df_tissue.loc[:,tissue]
    try:
        x = np.sort(np.asarray(subdf.values, dtype=float))[::-1]
        return np.cumsum(x/float(np.sum(x)))
    except (TypeError, ValueError):
        # report the column that failed (the handler used to reference an undefined name)
        print("error with. %s"%tissue)
        return np.array([], dtype=float)

In [None]:
# Sorted cumulative fraction (get_integral_tissue) for every entry of common_desease,
# drawn on a logarithmic x axis.
fig = plt.figure(figsize=(15,10))
ax = fig.subplots()
ax.set_title('%s sorted integral' % normalisation_str, fontsize=18)
#for tissue in more_common_tissues:
for tissue in common_desease:
    print(tissue)
    cumulative_fraction = get_integral_tissue(tissue)
    ax.plot(cumulative_fraction, label=tissue, lw=3)
ax.set_xscale('log')
ax.set_xlabel('Number of genes', fontsize=18)
ax.set_ylabel('Fraction of total tissue expression', fontsize=18)
ax.set_xlim((1,2e4))
ax.legend(loc='upper left', ncol=2, fontsize=16)
plt.show()
fig.savefig("fraction_of_trascriptome.pdf")

## Mean Variance

In [None]:
# Per-row mean and variance (NaN-aware) over the samples of one primary site,
# plus the per-sample column totals.
tissue = 'Blood'
in_tissue = samples['primary_site'] == tissue
subdf = df.loc[:, samples[in_tissue].index.values]
row_means = subdf.apply(np.nanmean, axis=1)
row_variances = subdf.apply(np.nanvar, axis=1)
means = row_means.dropna().values
variances = row_variances.dropna().values
distrs = subdf.sum(axis=0)

In [None]:
# Histogram of the per-sample totals (column sums of subdf), then their mean and variance.
plt.hist(distrs)
print(np.mean(distrs),np.var(distrs))

In [None]:
# Variance against mean on log-log axes for the current tissue, with two guide lines.
# x_lin: 50 log-spaced points spanning the non-zero means; later cells reuse it.
x_lin = np.logspace(np.log10(means[means.nonzero()].min()),np.log10(means[means.nonzero()].max()), dtype=float,num=50)
fig=plt.figure(figsize=(15,4))
plt.scatter(means, variances, c='b')
# variance = mean^2 over the upper 40 grid points, variance = mean over the lower 20
plt.plot(x_lin[-40:],np.power(x_lin[-40:],2), 'g-', lw=3.5, label='$<%s>^2$'%normalisation_str)
plt.plot(x_lin[:20],x_lin[:20], 'r-', lw=3.5, label='$<%s>$'%normalisation_str)

plt.xlabel("$<%s>$"%normalisation_str, fontsize=16)
# raw string: "\s" is not a valid escape sequence in an ordinary string literal
plt.ylabel(r"$\sigma^2_{%s}$"%normalisation_str, fontsize=16)
plt.xscale('log')
plt.yscale('log')
plt.title(tissue,fontsize=18)
plt.xlim(means[means.nonzero()].min()/5,np.power(10,np.log10(means.max())+1))
plt.ylim((variances[variances.nonzero()].min()/10,np.power(10,np.log10(variances.max())+1)))
plt.legend(fontsize=16)
plt.show()
fig.savefig("varmean_loglog_%s.png"%tissue)

In [None]:
# Squared coefficient of variation (variance / mean^2) against the mean for the current
# tissue, with guide lines and the log-binned average of cv^2.
cv2 = [v/(np.power(m,2)) for m,v in zip(means, variances) if m>0]
nonzero_means = means[means.nonzero()]
fig = plt.figure(figsize=(15,4))
plt.scatter(nonzero_means, cv2, c='b')
plt.plot(x_lin[:30], 1./x_lin[:30], 'g-', lw=3.5, label='Poisson')
plt.plot(x_lin[-30:], [1]*len(x_lin[-30:]), 'r-', lw=3.5, label='Taylor')
plt.plot(x_lin, [len(subdf.columns)-1]*len(x_lin), '--', lw=3.5, label='bound')

# average cv^2 inside log-spaced bins of the mean
log_bins = np.logspace(np.log10(np.min(nonzero_means)), np.log10(np.max(means)))
bin_means, bin_edges, _ = binned_statistic(nonzero_means, cv2, statistic='mean', bins=log_bins)

bin_centres = (bin_edges[1:]+bin_edges[:-1])/2
plt.scatter(bin_centres, bin_means, marker='x', lw=2, color='orange')

plt.title(tissue, fontsize=18)
plt.xlabel("$<%s>$" % normalisation_str, fontsize=16)
plt.ylabel("$cv^2$", fontsize=16)
plt.xscale('log')
plt.yscale('log')
plt.xlim(nonzero_means.min()/5, np.power(10, np.log10(means.max())+1))
plt.ylim(variances[variances.nonzero()].min()/10, len(subdf.columns)*10)
plt.legend(fontsize=16)
plt.show()
fig.savefig("cvmean_loglog_%s.png" % tissue)

## Length

In [None]:
# Gene annotation table restricted to protein-coding rows; `lenghts` is its 'lenght' column.
q_many = pd.read_csv("genes.txt", index_col=[0], header=[0])
is_protein_coding = q_many['type_of_gene'] == 'protein-coding'
q_many = q_many.loc[is_protein_coding]
lenghts = q_many.loc[:, 'lenght']

In [None]:
# Row means of subdf against gene length, with the mean per log-spaced length bin.
# binned_statistic comes from the notebook's import cell; no local re-import needed.
# NOTE(review): lenghts and means are paired by position, which assumes the rows of
# subdf are exactly the protein-coding genes of q_many in the same order — confirm.
fig=plt.figure(figsize=(15,7))
means = subdf.mean(axis=1).values
bin_means, bin_edges, _ = binned_statistic(lenghts, means, statistic='mean', bins=np.logspace(1,7,50))
plt.scatter(lenghts,means)
plt.scatter((bin_edges[1:]+bin_edges[:-1])/2., bin_means, marker='x')
plt.title(tissue, fontsize=18)
plt.yscale('log')
plt.xscale('log')
plt.xlabel('lenght (bp)', fontsize=16)
plt.ylabel('mean (counts)', fontsize=16)
plt.xlim((lenghts.min()/10,lenghts.max()*10))
plt.ylim((means[means.nonzero()].min()/10,means.max()*10))
plt.show()
fig.savefig("meanLenght_%s.pdf"%tissue)

## Zipf & Heaps

## Zipf

In [None]:
# Groups compared in the Zipf / Heaps figures below.
# Alternative selection by primary site: ['Breast','Blood','Brain', 'Adrenal gland']
variable_tissues = [
    'Paragangliomas and Glomus Tumors',
    'Adenomas and Adenocarcinomas',
    'Nevi and Melanomas',
]

In [None]:
def get_zipf_tissue(tissue):
    '''
    return array zipf

    The column `tissue` of df_tissue sorted in decreasing order and divided by its sum,
    i.e. the relative weight of each row by rank.
    '''
    # Cast to float and divide out of place: an in-place "/=" on an integer column
    # either raises or floor-divides, depending on the Python/numpy version.
    A = np.sort(np.asarray(df_tissue.loc[:,tissue].values, dtype=float))[::-1]
    return A / np.sum(A)

In [None]:
# Rank-frequency curves (get_zipf_tissue) of the selected groups on log-log axes,
# with a 1/rank guide line.
fig = plt.figure(figsize=(15,7))
for tissue in variable_tissues:
    plt.plot(get_zipf_tissue(tissue), lw=2, ls='-', label=tissue)
# braces are required so that mathtext puts the whole "-1" in the exponent
plt.plot(np.arange(1,2e4), 1./np.arange(1,2e4), 'g--', label='$r^{-1}$')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('rank_i', fontsize=16)
plt.ylabel('$f_i$', fontsize=16)
plt.xlim(1,2e4)
plt.ylim(2e-8,1e0)
plt.legend(ncol=2, fontsize=14)
plt.show()
fig.savefig("Zipf_tissue.pdf")

## Heaps

In [None]:
def get_heaps_tissue(tissue, label='primary_site'):
    """
    Per-sample totals and number of non-zero rows for the samples of one group.

    tissue: value of samples[label] selecting the samples (columns of df).
    label:  column of `samples` used for the selection.
    Returns (M, ndw): M[k] is the column sum of sample k (missing values skipped) and
    ndw[k] the number of rows of sample k that are non-zero and not missing.
    """
    subdf = df.loc[:,samples[samples[label]==tissue].index.values]
    M = subdf.sum(axis=0).dropna().values
    values = np.asarray(subdf.values, dtype=float)
    # NaN compares unequal to 0, so it has to be excluded explicitly
    ndw = ((values != 0) & ~np.isnan(values)).sum(axis=0)
    return M, ndw

In [None]:
# Number of non-zero rows against the per-sample total, one colour per selected group.
fig = plt.figure(figsize=(15,7))
ax = fig.gca()
for tissue in variable_tissues:
    heaps = get_heaps_tissue(tissue, label=label)
    sample_sizes, expressed_genes = heaps
    ax.scatter(sample_sizes, expressed_genes, label=tissue)
ax.set_xlabel('Sample size', fontsize=16)
ax.set_ylabel('# of genes expressed', fontsize=16)
ax.legend(ncol=4, fontsize=12)
ax.set_xlim(1e6,1e8)
ax.set_ylim(13e3,20e3)
plt.show()
fig.savefig("Heaps_tissue.pdf")

## global

In [None]:
# Three panels for the selected groups: sorted cumulative fraction (left),
# rank-frequency curve (middle), non-zero rows against sample total (right).
fig=plt.figure(figsize=(30,15))
ax = fig.subplots(1,3)
# colors[0]: light shades for curves and raw points, colors[1]: strong shades for binned means
colors = (['darksalmon','lawngreen', 'lightskyblue','pink'],['r','g','b','m'])
for i,tissue in enumerate(variable_tissues):
    ax[0].plot(get_integral_tissue(tissue), label=tissue, color=colors[0][i])
    ax[1].plot(get_zipf_tissue(tissue), label=tissue,color=colors[0][i])
    heaps = get_heaps_tissue(tissue, label)
    ax[2].scatter(heaps[0],heaps[1], label=tissue, c=colors[0][i], alpha=0.2)
    # binned_statistic's default statistic is the mean of heaps[1] inside each bin of heaps[0]
    bin_means, bin_edges, _ = binned_statistic(heaps[0], heaps[1], bins = np.linspace(0.2e8,1.5e8))
    ax[2].scatter((bin_edges[:-1]+bin_edges[1:])/2., bin_means, marker='x', c=colors[1][i], label='binned[%s]'%tissue)

# braces are required so that mathtext puts the whole "-1" in the exponent
ax[1].plot(np.arange(1,1e4), 1./np.arange(1,1e4), 'g--', label='$r^{-1}$')
ax[0].set_xscale('log')
ax[1].set_xscale('log')
ax[1].set_yscale('log')
ax[0].legend(fontsize=16)
ax[1].legend(fontsize=16)
ax[2].legend(fontsize=12, ncol=2)
ax[0].set_xlabel('Number of genes', fontsize=16)
ax[0].set_ylabel('Fraction of total tissue expression', fontsize=16)
ax[1].set_xlabel('rank_i', fontsize=16)
ax[1].set_ylabel('$f_i$', fontsize=16)
ax[2].set_xlabel('Sample size', fontsize=16)
ax[2].set_ylabel('# of genes expressed', fontsize=16)
ax[2].set_xlim(0.05e8,1.5e8)
ax[1].set_ylim(1e-6,1)
plt.show()
fig.savefig("zipffracheaps_tissue.pdf")

# Length

In [None]:
# Same load as in the earlier length section: annotation table restricted to
# protein-coding rows, and its 'lenght' column.
q_many = pd.read_csv("genes.txt", index_col=[0], header=[0])
coding_rows = q_many['type_of_gene'] == 'protein-coding'
q_many = q_many.loc[coding_rows]
lenghts = q_many.loc[:, 'lenght']

In [None]:
# Mean expression per log-spaced gene-length bin, one marker series per primary site.
# NOTE(review): lenghts and the expression rows are paired by position — assumes both
# cover the same genes in the same order; confirm.
fig=plt.figure(figsize=(15,7))
bins_for_l = np.logspace(1,8,40)
for tissue in primaries:
    if tissue in df_tissue.columns:
        tissue_means = df_tissue.loc[:,tissue]
    else:
        # df_tissue is keyed by samples[label]; when label is not 'primary_site' the
        # site has no column there, so average its samples directly from df.
        tissue_samples = samples[samples['primary_site']==tissue].index.values
        tissue_means = df.loc[:,tissue_samples].mean(axis=1).fillna(0.)
    bin_means, bin_edges, _ = binned_statistic(lenghts, tissue_means, statistic='mean', bins=bins_for_l)
    #plt.scatter(lenghts,df_tissue.loc[:,tissue])
    plt.scatter((bin_edges[1:]+bin_edges[:-1])/2,bin_means, marker='x',label=tissue)
# axis scales only need to be set once, not on every iteration
plt.yscale('log')
plt.xscale('log')
plt.xlabel('lenght (bp)', fontsize=16)
plt.ylabel('mean (counts)', fontsize=16)
plt.xlim((lenghts.min(),lenghts.max()))
plt.ylim((1e-3,1e5))
plt.legend(ncol=2)
plt.show()
fig.savefig("meanLenght_tissue.pdf")

## inter intra

In [None]:
# Per-row variance and mean taken across the columns of df_tissue (axis=1), as plain arrays.
inter_vars = df_tissue.apply(np.var, axis=1).values
inter_means = df_tissue.apply(np.average, axis=1).values

In [None]:
# variance / mean^2 per row; rows whose mean is not positive get 0.
inter_cv2s = [(v/(m*m) if m>0 else 0) for m, v in zip(inter_means, inter_vars)]

In [None]:
# cv^2 against the mean of the per-group averages (rows of df_tissue), with guide lines
# and the log-binned average of cv^2.
# The guide-line grid and the axis limits are derived from the inter-group arrays of this
# section, not from the single-tissue means / variances / subdf / x_lin of earlier cells.
positive_inter_means = inter_means[inter_means.nonzero()]
x_inter = np.logspace(np.log10(positive_inter_means.min()),np.log10(positive_inter_means.max()), dtype=float,num=50)
fig=plt.figure(figsize=(15,4))
plt.scatter(inter_means, inter_cv2s, c='b')
plt.plot(x_inter[:30],1./x_inter[:30], 'g-', lw=3.5, label='Poisson')
plt.plot(x_inter[-30:],[1 for _ in x_inter[-30:]], 'r-', lw=3.5, label='Taylor')
plt.plot(x_inter,[len(df_tissue.columns)-1 for _ in x_inter], '--', lw=3.5, label='bound')

bin_means, bin_edges,_=binned_statistic(inter_means, inter_cv2s, statistic='mean', bins=np.logspace(np.log10(np.min(positive_inter_means)),np.log10(np.max(inter_means))))

plt.scatter((bin_edges[1:]+bin_edges[:-1])/2, bin_means, marker='x', lw=2, color='orange')

plt.title('inter_tissue',fontsize=18)
plt.xlabel("$<%s>$"%normalisation_str, fontsize=16)
plt.ylabel("$cv^2$", fontsize=16)
plt.xscale('log')
plt.yscale('log')
plt.xlim(positive_inter_means.min()/5,np.power(10,np.log10(inter_means.max())+1))
plt.ylim(inter_vars[inter_vars.nonzero()].min()/10,len(df_tissue.columns)*10)
plt.legend(fontsize=16)
plt.show()
fig.savefig("cvmean_loglog_inter_tissue.png")

In [None]:
# Average inter-group cv^2 inside log-spaced bins of the inter-group mean.
inter_positive = inter_means[inter_means.nonzero()]
inter_log_bins = np.logspace(np.log10(np.min(inter_positive)), np.log10(np.max(inter_means)))
bin_means, bin_edges, binnumber = binned_statistic(inter_means, inter_cv2s, statistic='mean', bins=inter_log_bins)

def get_inter_distance_from_mean(mean, cv2):
    """
    Difference between cv2 and the binned average (bin_means) of the bin containing mean.

    Bins follow the convention of binned_statistic: [edge_i, edge_i+1), with the last
    bin also including its right edge. A mean below the first edge (or NaN) is compared
    with the first bin.
    """
    last_bin = len(bin_edges) - 2
    if not mean >= bin_edges[0]:
        bin_i = 0
    else:
        # side='right' assigns a value sitting on an edge to the bin starting there;
        # the clip keeps the largest edge inside the last bin
        bin_i = min(int(np.searchsorted(bin_edges, mean, side='right')) - 1, last_bin)
    return(cv2-bin_means[bin_i])

In [None]:
# Per-row mean, variance and cv^2 over the samples of one primary site, plus the
# average cv^2 inside log-spaced bins of the mean.
# NOTE(review): `tissue` is not assigned in this cell; it holds whatever the last
# executed cell left in it — confirm it names a primary site.
subdf = df.loc[:, samples[samples['primary_site']==tissue].index.values]
intra_means = subdf.apply(np.nanmean, axis=1).values
intra_variance = subdf.apply(np.nanvar, axis=1).values
# rows whose mean is not positive get cv^2 = 0
intra_cv2 = [(v/(np.power(m,2)) if m>0 else 0) for m, v in zip(intra_means, intra_variance)]

intra_positive = intra_means[intra_means.nonzero()]
intra_log_bins = np.logspace(np.log10(np.min(intra_positive)), np.log10(np.max(intra_means)))
bin_means_intra, bin_edges_intra, _ = binned_statistic(intra_means, intra_cv2, statistic='mean', bins=intra_log_bins)

def get_intra_distance_from_mean(mean, cv2):
    """
    Difference between cv2 and the binned average (bin_means_intra) of the bin containing mean.

    Only the intra-tissue edges (bin_edges_intra) are used. Bins follow the convention of
    binned_statistic: [edge_i, edge_i+1), with the last bin also including its right edge.
    A mean below the first edge (or NaN) is compared with the first bin.
    """
    last_bin = len(bin_edges_intra) - 2
    if not mean >= bin_edges_intra[0]:
        bin_i = 0
    else:
        # side='right' assigns a value sitting on an edge to the bin starting there;
        # the clip keeps the largest edge inside the last bin
        bin_i = min(int(np.searchsorted(bin_edges_intra, mean, side='right')) - 1, last_bin)
    return(cv2-bin_means_intra[bin_i])

In [None]:
# For every row of df_tissue: distance of its cv^2 from the binned average, once for the
# inter-group statistics and once for the intra-tissue statistics.
n_rows = len(df_tissue.index.values)
inter = np.array([get_inter_distance_from_mean(inter_means[k], inter_cv2s[k]) for k in range(n_rows)])
intra = np.array([get_intra_distance_from_mean(intra_means[k], intra_cv2[k]) for k in range(n_rows)])

In [None]:
# Scatter of the intra-tissue distance against the inter-group distance, one point per row.
x = np.logspace(np.log10(5e-1),20)  # only needed by the optional guide lines below
fig = plt.figure()
ax = fig.gca()
ax.scatter(inter, intra, label=tissue)
#plt.plot(x,x)
#plt.plot(x,np.sqrt(x))
ax.set_xlabel('inter_(tissue+individual)')
ax.set_ylabel('intra_%s' % tissue)
#plt.yscale('log')
#plt.xscale('log')
ax.set_ylim(-50,150)
ax.set_xlim(-50,35)
plt.show()
fig.savefig("inter_intra_%s.png" % tissue)

In [None]:
# Print the index label of every row with |intra| < 1 and 8 < inter < 10.
selected = (np.abs(intra) < 1) & (inter < 10) & (inter > 8)
for position in np.nonzero(selected)[0]:
    print(df.index[position])