In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import scipy.stats as st

In [None]:
# Project root that holds the input tables. Set the TESI_WORKING_DIR
# environment variable to run the notebook on another machine; the original
# location remains the default.
working_dir = os.environ.get("TESI_WORKING_DIR", "/Users/filippo/Developer/tesi")
# Every relative path used later in the notebook resolves against this directory.
os.chdir(working_dir)
# Entries found in the data/ sub-directory.
dirs = os.listdir("data")

In [None]:
# Name of the expression unit; it is interpolated into axis labels and
# into the file names of the saved figures.
normalisation_str = "counts"

In [None]:
# Primary sites used for the per-tissue panels.
common_tissues = ['Bronchus and lung', 'Brain', 'Breast', 'Ovary', 'Kidney', 'Colon', 'Corpus uteri','Adrenal gland','Skin']
# Additional sites for the wider comparisons. 'Brain' and 'Skin' are already in
# common_tissues, so only the names not yet present are appended: otherwise
# those tissues would be drawn twice and appear twice in every legend.
_extra_tissues = ['Blood','Heart, mediastinum, and pleura','Brain','Skin','Testis','Thyroid gland']
more_common_tissues = np.array(common_tissues + [name for name in _extra_tissues if name not in common_tissues])

In [None]:
# Sample metadata, indexed by the first column of files.dat.
samples = pd.read_csv(filepath_or_buffer="files.dat", index_col=[0])
samples.head(n=5)

In [None]:
# Distinct values of the two categorical metadata columns.
primaries, diseases = (
    samples[column].unique() for column in ('primary_site', 'disease_type')
)

In [None]:
# Main expression table; rows are looked up by gene id and columns by sample id below.
df = pd.read_csv("{}/mainTable.csv".format(working_dir), index_col=[0])
#df = df.to_sparse(fill_value=0.)
# Missing entries are replaced by 0, modifying the frame in place.
df.fillna(0., inplace=True)
df.head(n=5)

## Tissue U

In [None]:
# Primary site analysed in the cells of this section.
tissue = "Colon"

In [None]:
# Ids of the samples whose primary site is the selected tissue ...
in_tissue = samples['primary_site'] == tissue
sample_list = samples.loc[in_tissue].index.values
# ... and the matching columns of the expression table.
subdf = df.loc[:, sample_list]

In [None]:
# Occurrence of each gene: the number of samples of the selected tissue in
# which its value exceeds 0.1. A single vectorised comparison replaces the
# per-gene Python loop (one .loc lookup plus a list comprehension per row).
occurrences = (subdf > 1e-1).sum(axis=1)
# Only genes found in more than one sample are kept; the result stays a plain
# list of ints in the original row order.
O = occurrences[occurrences > 1].tolist()

In [None]:
# Histogram of the occurrences, expressed as a fraction of the tissue's samples.
bins = 20
half_bin = 0.5/20
# The range is padded by half a bin on each side of [0, 1].
rang = (0-half_bin, 1+half_bin)
fig = plt.figure()
ax = fig.subplots()
occurrence_fraction = np.array(O, dtype=float)/len(sample_list)
ax.hist(occurrence_fraction, bins=bins, range=rang, density=True, histtype='step', lw=4)
ax.set_title(tissue, fontsize=18)
ax.set_xlabel('$O_i$', fontsize=16)
ax.set_ylabel('#')
plt.show()
fig.savefig("U_%s.png"%tissue)

## average tissues

In [None]:
# Per-tissue table (columns are selected by tissue name further down);
# missing entries are replaced by 0 in place.
df_tissue = pd.read_csv(filepath_or_buffer="mainTable_tissues.csv", index_col=[0])
df_tissue.fillna(0., inplace=True)
df_tissue.head(n=5)

In [None]:
# Companion table read from mainTable_tissues_var.csv; missing entries are
# replaced by 0 in place.
df_tissue_var = pd.read_csv(filepath_or_buffer="mainTable_tissues_var.csv", index_col=[0])
df_tissue_var.fillna(0., inplace=True)
df_tissue_var.head(n=5)

In [None]:
#df_tissue_new = pd.DataFrame(index=df.index, columns=np.unique(samples['primary_site'].values), dtype=float)
#for i,g in enumerate(df_tissue_new.index.values):
#    gdf = df.loc[g,:]
#    print(g,i)
#    for tissue in df_tissue_new.columns.values:
#        sample_list = samples[samples['primary_site']==tissue].index.values
#        subdf = gdf.loc[sample_list]
#        df_tissue_new.at[g,tissue]=np.nanvar(np.array(subdf.values, dtype=float))

In [None]:
#df_tissue_new.to_csv("mainTable_tissues_var.csv")

In [None]:
# Row label (gene id) inspected in the following cells.
gene = "ENSG00000198888"

In [None]:
# Distribution of the selected gene over every column of the main table,
# overlaid with a gamma density whose shape (a) and scale (b) are derived from
# the sample mean and variance.
data = df.loc[gene].values.astype(float)
x = np.linspace(0, data.max()+10)
mu, var = np.average(data), np.var(data)
a = mu*mu/var
b = var/mu
fig = plt.figure()
ax = fig.subplots()
ax.set_title('Expression across all tissues: %s'%gene, fontsize=18)
# The grid used for the curve doubles as the histogram bin edges.
ax.hist(data, bins=x, density=True, histtype='step', lw=2, label='gene')
ax.plot(x, st.gamma.pdf(x/b, a, 0, 1)/b)
ax.set_xlabel('%s'%normalisation_str, fontsize=16)
ax.set_ylabel('#')
#ax.set_xscale('log')
#ax.set_yscale('log')
ax.set_xlim(5e-2, data.max())
plt.show()
fig.savefig("plot/%s_%s_alltissues.png"%(normalisation_str,gene))

In [None]:
# One panel per entry of common_tissues for the selected gene: a histogram of
# its values in that tissue's samples plus a gamma density whose shape/scale
# are matched to the sample mean and variance.
fig = plt.figure(figsize=(25,20))
N = len(common_tissues)
n_rows = 3
# Integer (ceiling) division: on Python 3 `N/3` is a float, which neither
# fig.subplots nor array indexing accept. Rounding up also guarantees a panel
# for every tissue when N is not a multiple of 3.
n_cols = -(-N // n_rows)
# squeeze=False keeps axs two-dimensional even for a single column.
axs = fig.subplots(n_rows, n_cols, squeeze=False)
for i, tissue in enumerate(common_tissues):
    ax = axs[i // n_cols][i % n_cols]
    ax.set_title('%s: %s'%(tissue,gene), fontsize=13)
    sample_list = samples[samples['primary_site']==tissue].index.values
    subdf = df.loc[gene, sample_list]
    data = subdf.fillna(value=0.).values
    x = np.linspace(0, data.max()+10)
    mu = np.average(data)
    var = np.var(data)
    # Gamma shape and scale from the first two moments.
    a = mu*mu/var
    b = var/mu
    ax.hist(data, density=True, label="%s [%d]"%(tissue, len(data)), histtype='step', lw=1.5)
    ax.plot(x, st.gamma.pdf(x, a, 0, b))
    ax.set_xlabel('%s'%normalisation_str, fontsize=13)
    ax.set_ylabel('#')
    # Single limits call; the earlier set_xlim(xmax=...) was overridden by it.
    ax.set_xlim(5e-2, data.max())
    ax.legend(fontsize=16)
plt.show()
fig.savefig("plot/%s_%s_per_tissue.png"%(normalisation_str,gene))

In [None]:
# Distribution of the selected gene's per-tissue values (one value per column
# of df_tissue), with a gamma density derived from their mean and variance.
fig = plt.figure()
ax = fig.subplots()
ax.set_title('Expression per tissue: %s'%gene, fontsize=18)
data = df_tissue.loc[gene].values.astype(float)
x = np.linspace(0, data.max())
mu, var = np.average(data), np.var(data)
a = mu*mu/var
b = var/mu
ax.hist(data, bins=10, density=True, histtype='step', lw=2, label='gene')
ax.plot(x, st.gamma.pdf(x/b, a)/b)
ax.set_xlabel('%s'%normalisation_str, fontsize=16)
ax.set_ylabel('#')
#ax.set_xscale('log')
#ax.set_yscale('log')
plt.show()
fig.savefig("plot/%s_%s_across_tissue.png"%(normalisation_str,gene))

## integral

In [None]:
# Cumulative share of each tissue's total, with rows ranked from the largest
# value to the smallest.
fig = plt.figure(figsize=(15,10))
ax = fig.subplots()
ax.set_title('%s sorted integral'%normalisation_str, fontsize=18)
#for tissue in ['Blood','Heart, mediastinum, and pleura','Brain','Skin','Testis','Thyroid gland']:
for tissue in more_common_tissues:
    subdf = df_tissue[tissue].sort_values(ascending=False)
    ranked = subdf.values
    norm = np.sum(ranked)
    # A tissue whose total is exactly zero cannot be normalised; skip it.
    if norm == 0:
        continue
    ax.plot(np.cumsum(ranked)/norm, label=tissue, lw=3)
ax.set_xscale('log')
ax.set_xlabel('Number of genes', fontsize=18)
ax.set_ylabel('Fraction of total tissue expression', fontsize=18)
ax.set_xlim((1,2e4))
ax.legend()
plt.show()
fig.savefig("fraction_of_trascriptome.pdf")

## Mean Variance

In [None]:
tissue = 'Breast'
# Columns of the main table belonging to samples of the selected tissue.
subdf = df.loc[:, samples[samples['primary_site']==tissue].index.values]
# Per-row mean and variance computed with the built-in reductions instead of a
# row-by-row apply. NaNs are skipped by default, and ddof=0 is passed
# explicitly so the variance is the population variance that np.nanvar
# returns (pandas defaults to ddof=1).
means = subdf.mean(axis=1).values
variances = subdf.var(axis=1, ddof=0).values
# Column totals, one per sample.
distrs = subdf.sum(axis=0)

In [None]:
# Histogram of the per-sample totals, followed by their mean and variance.
plt.hist(distrs)
distrs_mean = np.mean(distrs)
distrs_var = np.var(distrs)
print(distrs_mean, distrs_var)

In [None]:
# Variance against mean on log-log axes, with two guide lines: variance equal
# to the mean (red) and variance equal to the squared mean (green).
nonzero_means = means[means.nonzero()]
# Log-spaced grid covering the non-zero means; reused by the next cell.
x_lin = np.logspace(np.log10(nonzero_means.min()), np.log10(nonzero_means.max()), dtype=float, num=50)
fig = plt.figure(figsize=(15,4))
plt.scatter(means, variances, c='b')
plt.plot(x_lin[-40:], np.power(x_lin[-40:],2), 'g-', lw=3.5, label='$<%s>^2$'%normalisation_str)
plt.plot(x_lin[:20], x_lin[:20], 'r-', lw=3.5, label='$<%s>$'%normalisation_str)

plt.xlabel("$<%s>$"%normalisation_str, fontsize=16)
# Raw string: "\s" is not a valid escape sequence and triggers a warning in a
# normal string literal. The rendered label is unchanged.
plt.ylabel(r"$\sigma^2_{%s}$"%normalisation_str, fontsize=16)
plt.xscale('log')
plt.yscale('log')
plt.title(tissue, fontsize=18)
plt.xlim(nonzero_means.min()/5, np.power(10,np.log10(means.max())+1))
plt.ylim((variances[variances.nonzero()].min()/10, np.power(10,np.log10(variances.max())+1)))
plt.legend(fontsize=16)
plt.show()
fig.savefig("varmean_loglog_%s.png"%tissue)

In [None]:
# Squared coefficient of variation (variance / mean^2) against the mean.
# The same mask selects both coordinates, so the x and y arrays passed to
# scatter always have equal length.
positive = means > 0
cv2 = variances[positive]/np.power(means[positive], 2)
fig = plt.figure(figsize=(15,4))
plt.scatter(means[positive], cv2, c='b')
# Guide lines: 1/mean, a constant 1, and (number of samples - 1).
plt.plot(x_lin[:30], 1./x_lin[:30], 'g-', lw=3.5, label='Poisson')
plt.plot(x_lin[-30:], np.ones_like(x_lin[-30:]), 'r-', lw=3.5, label='Taylor')
plt.plot(x_lin, np.full_like(x_lin, len(subdf.columns)-1), '--', lw=3.5, label='bound')


plt.title(tissue, fontsize=18)
plt.xlabel("$<%s>$"%normalisation_str, fontsize=16)
plt.ylabel("$cv^2$", fontsize=16)
plt.xscale('log')
plt.yscale('log')
plt.xlim(means[positive].min()/5, np.power(10,np.log10(means.max())+1))
# The lower limit follows the plotted quantity (cv2); it was previously taken
# from the raw variances, which are on a different scale.
plt.ylim(cv2[cv2.nonzero()].min()/10, len(subdf.columns)*10)
plt.legend(fontsize=16)
plt.show()
fig.savefig("cvmean_loglog_%s.png"%tissue)

## Length

In [None]:
# Gene annotation table, restricted to the rows flagged as protein-coding;
# only the 'lenght' column is used afterwards.
q_many = pd.read_csv("genes.txt", index_col=[0], header=[0])
is_protein_coding = q_many['type_of_gene'] == 'protein-coding'
q_many = q_many.loc[is_protein_coding]
lenghts = q_many.loc[:, 'lenght']

In [None]:
from scipy.stats import binned_statistic
# Mean value per gene for the current tissue against gene length, together
# with the average inside logarithmically spaced length bins.
fig = plt.figure(figsize=(15,7))
means = subdf.mean(axis=1).values
# NOTE(review): binned_statistic needs `lenghts` and `means` to have the same
# length; `lenghts` was filtered to protein-coding rows while `means` covers
# every row of subdf -- confirm the two tables line up.
log_bins = np.logspace(1,7)
bin_means, bin_edges, _ = binned_statistic(lenghts, means, statistic='mean', bins=log_bins)
plt.scatter(lenghts, means)
# Binned averages are drawn at the arithmetic centre of each bin.
plt.scatter((bin_edges[1:]+bin_edges[:-1])/2., bin_means, marker='x')
plt.title(tissue, fontsize=18)
plt.yscale('log')
plt.xscale('log')
plt.xlabel('lenght (bp)', fontsize=16)
plt.ylabel('mean (counts)', fontsize=16)
plt.xlim((lenghts.min()/10, lenghts.max()*10))
plt.ylim((means[means.nonzero()].min()/10, means.max()*10))
plt.show()
# The tissue name goes into the file name (as for the other single-tissue
# figures); the fixed name "meanLenght_tissue.pdf" is also written by the
# all-tissue figure later on, which overwrote this one.
fig.savefig("meanLenght_%s.pdf"%tissue)

## frac K

In [None]:
# For every tissue: a (60, 2) array pairing each threshold k (log-spaced from 1
# to the tissue's maximum) with the fraction of rows whose value is >= k.
frak_k = []
for tissue in more_common_tissues:
    A = df_tissue.loc[:, tissue].values
    thresholds = np.logspace(0, np.log10(A.max()), 60)
    fractions = [float(np.count_nonzero(A >= k))/len(A) for k in thresholds]
    frak_k.append(np.column_stack((thresholds, fractions)))

In [None]:
# One curve per tissue: fraction of rows at or above k, against k.
fig = plt.figure(figsize=(15,7))
for label, dataset in zip(more_common_tissues, frak_k):
    k_values, fractions = dataset.T
    plt.plot(k_values, fractions, lw=1, marker='o', label=label)
plt.yscale('log')
plt.xscale('log')
plt.xlabel('counts k', fontsize=16)
plt.ylabel('fraction of genes\n with more than k counts', fontsize=12)
plt.xlim(9e-1,5e4)
plt.ylim(2e-5,1e0)
plt.legend()
plt.show()
fig.savefig("frak_k_tissue.pdf")

## length

In [None]:
# Reload the gene annotation table and keep the protein-coding rows; the
# 'lenght' column feeds the binned plot below.
q_many = pd.read_csv("genes.txt", index_col=[0], header=[0])
coding_rows = q_many['type_of_gene'] == 'protein-coding'
q_many = q_many.loc[coding_rows]
lenghts = q_many.loc[:, 'lenght']

In [None]:
# Binned average of each tissue's values against gene length, one marker series
# per entry of common_tissues. binned_statistic is reached through the `st`
# alias imported at the top of the notebook, so this cell no longer depends on
# an import executed in a different cell.
fig = plt.figure(figsize=(15,7))
log_bins = np.logspace(1,7)
for tissue in common_tissues:
    bin_means, bin_edges, _ = st.binned_statistic(lenghts, df_tissue.loc[:,tissue], statistic='mean', bins=log_bins)
    plt.scatter((bin_edges[1:]+bin_edges[:-1])/2, bin_means, marker='x', label=tissue)
# Axis scales are set once rather than on every loop iteration.
plt.yscale('log')
plt.xscale('log')
plt.xlabel('lenght (bp)', fontsize=16)
plt.ylabel('mean (counts)', fontsize=16)
plt.xlim((lenghts.min(), lenghts.max()))
plt.ylim((1e-3,1e5))
plt.legend()
plt.show()
fig.savefig("meanLenght_tissue.pdf")

## inter intra

In [None]:
# Variance of every row of df_tissue, taken across its columns (axis=1); the
# cells below plot it on the x axis against the per-tissue values of df_tissue_var.
inter_var = df_tissue.apply(np.var, axis=1)

In [None]:
# Scatter of each tissue's df_tissue_var column against inter_var, with the
# guide lines y = x and y = sqrt(x).
x = np.logspace(np.log10(5e-1), 20)
fig = plt.figure()
ax = fig.subplots()
for tissue in common_tissues:
    intra_var = df_tissue_var[tissue]
    ax.scatter(inter_var, intra_var, label=tissue)
ax.plot(x, x)
ax.plot(x, np.sqrt(x))
ax.set_xlabel('inter_(tissue+individual)')
ax.set_ylabel('intra_tissue')
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_ylim(5e-1,5e12)
ax.set_xlim(5e-1,5e21)
plt.show()
fig.savefig("inter_intra.png")

In [None]:
# Print every row label whose Kidney entry in df_tissue_var lies below the
# square root of its inter_var, restricted to rows with inter_var above 1e10.
tissue = 'Kidney'
intra_var = df_tissue_var[tissue]
for g in intra_var.index:
    within, between = intra_var[g], inter_var[g]
    if between > 1e10 and within < np.sqrt(between):
        print(g)

In [None]:
# `g` is whatever label the loop in the previous cell finished on.
inter_var.loc[g] > 1e8