In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

In [None]:
# Project root. The process changes into it, so every relative path used by the
# notebook ("data", "files.txt", "results/...", "plot/...") is resolved against it.
# The TESI_WORKING_DIR environment variable overrides the location; when it is
# not set the original hard-coded path is used, so existing setups keep working.
working_dir = os.environ.get("TESI_WORKING_DIR", "/Users/filippo/Developer/tesi")
os.chdir(working_dir)
# Entries of the "data" directory inside the project root.
dirs = os.listdir("data")

In [None]:
# Primary sites that each get their own curve in the per-tissue FPKM histogram.
common_tissues = [
    'Bronchus and lung',
    'Brain',
    'Breast',
    'Ovary',
    'Kidney',
    'Colon',
    'Corpus uteri',
    'Adrenal gland',
    'Skin',
]

In [None]:
# Sample metadata table read from "files.txt"; the first CSV column becomes the
# row index. Later cells use that index as the list of sample column labels and
# read the 'primary_site' and 'disease_type' columns.
samples = pd.read_csv("files.txt", index_col=[0])
# Show the first rows as the cell output.
samples.head()

In [None]:
# Distinct primary sites and distinct disease types present in the sample metadata.
primaries, diseases = (
    samples[column].unique() for column in ('primary_site', 'disease_type')
)

In [None]:
# Expression table: a 'gene' column plus further columns that later cells
# select through the index of `samples`.
df = pd.read_csv("%s/mainTable.csv" % working_dir)

# Keep only the first 15 characters of every gene identifier
# (presumably this strips a version suffix — TODO confirm).
genes = np.array([gene_id[:15] for gene_id in df['gene'].values])
df['gene'] = genes

# Truncate every column label to at most 36 characters
# (presumably the length of the sample identifier — TODO confirm).
df.columns = [label[:36] for label in df.columns]

df.head()

## Tissue U

In [None]:
# Primary site analysed by the occurrence cells that follow.
tissue = "Colon"

In [None]:
# Columns to extract: the 'gene' label first, then the index entries of every
# sample whose primary site equals `tissue`.
is_tissue_sample = samples['primary_site'] == tissue
sample_list = np.concatenate((['gene'], samples[is_tissue_sample].index.values))

# Expression values restricted to those columns, indexed by gene identifier.
subdf = df.loc[:, sample_list].set_index('gene')

In [None]:
# Occurrence of each gene: the number of columns of `subdf` in which its value
# exceeds 0.1. The count is done row-wise in one vectorised pass instead of a
# Python loop with one `.loc` lookup per gene; this is much faster and also
# works when a gene identifier appears more than once in the index (a label
# lookup would then return several rows and the scalar comparison would fail).
occurrence_counts = subdf.gt(1e-1).sum(axis=1)

# Keep only the genes found above threshold in more than one column, in the
# original row order, as a plain list of ints.
O = occurrence_counts[occurrence_counts > 1].tolist()

In [None]:
bins = 20
# Histogram range: [0, 1] padded on each side by half of 1/bins
# (derived from `bins` instead of repeating the literal 20).
rang = (0 - 0.5/bins, 1 + 0.5/bins)

# Number of samples of the tissue. `sample_list` starts with the 'gene' label,
# which is not a sample, so it must not be counted in the normalisation;
# dividing by len(sample_list) would make the fraction systematically too small.
n_samples = len(sample_list) - 1

fig = plt.figure()
# Distribution of the fraction of samples in which each gene occurs.
plt.hist(np.array(O, dtype=float)/n_samples, histtype='step', lw=4, density=True, bins=bins, range=rang)
plt.title(tissue, fontsize=18)
plt.xlabel('$O_i$', fontsize=16)
plt.ylabel('#')
plt.show()
fig.savefig("U_%s.png"%tissue)

## Average expression per tissue

In [None]:
# Disabled: the two commented lines below would create a float table with one
# row per gene of `df` and one column per distinct primary site, with missing
# values set to 0.
#df_tissue = pd.DataFrame(index=df.gene, columns=np.unique(samples['primary_site'].values), dtype=float)
#df_tissue.fillna(value=0.,inplace=True)
# The per-tissue table is read from a CSV file instead. No index column is
# requested, so 'gene' is available as a regular column.
df_tissue = pd.read_csv("results/proteincoding/mainTable_tissues.csv")
# Show the first rows as the cell output.
df_tissue.head()

In [None]:
# Disabled: for every gene in the index of df_tissue and every tissue column,
# this loop would select the gene's row of `df`, restrict it to the samples of
# that primary site and store the average of those values in df_tissue.
# It expects df_tissue to be indexed by gene, as built by the disabled
# constructor, not as loaded from CSV.
#for i,g in enumerate(df_tissue.index.values):
#    gdf = df[df['gene']==g]
#    print(g,i)
#    for tissue in df_tissue.columns.values:
#        sample_list = np.concatenate((['gene'],samples[samples['primary_site']==tissue].index.values))
#        subdf = gdf.loc[:,sample_list]
#        df_tissue.at[g,tissue]=np.average(np.array(subdf.values[0][1:], dtype=float))

In [None]:
# Disabled: would write df_tissue to "mainTable_tissues.csv" in the current directory.
# NOTE(review): the table is loaded from "results/proteincoding/mainTable_tissues.csv";
# confirm which location is intended before re-enabling this line.
#df_tissue.to_csv("mainTable_tissues.csv")

In [None]:
# Replace every missing value of the per-tissue table with 0; df_tissue is modified in place.
df_tissue.fillna(value=0.,inplace=True)

In [None]:
# Gene identifier inspected by the FPKM histograms that follow.
gene = "ENSG00000108849"

In [None]:
fig, ax = plt.subplots()

# Values of the selected gene over all columns of `df`: first matching row,
# with its first entry dropped (assumes 'gene' is the first column — TODO confirm).
gene_rows = df.loc[df['gene'] == gene]
gene_fpkm = np.array(gene_rows.values[0][1:], dtype=float)

ax.hist(gene_fpkm, histtype='step', lw=2, bins=10, label='gene')
ax.set_title('FPKM across all tissues: %s' % gene, fontsize=18)
ax.set_xlabel('FPKM', fontsize=16)
ax.set_ylabel('#')

# Both axes are logarithmic; the x window is fixed afterwards.
for set_scale in (ax.set_xscale, ax.set_yscale):
    set_scale('log')
ax.set_xlim(5e-2, 1e5)

plt.show()
fig.savefig("plot/proteincoding/tissue/fpkm_%s_alltissues.png" % gene)

In [None]:
fig=plt.figure(figsize=(15,10))
ax=fig.subplots()
ax.set_title('FPKM per tissue: %s'%gene, fontsize=18)

# Row(s) of the expression table for the selected gene. The selection does not
# depend on the tissue, so it is done once instead of on every iteration.
gene_row = df[df['gene']==gene]

for tissue in common_tissues:
    try:
        # 'gene' label followed by the samples whose primary site is this tissue.
        sample_list = np.concatenate((['gene'],samples[samples['primary_site']==tissue].index.values))
        subdf = gene_row.loc[:,sample_list]
        # First matching row without its leading 'gene' entry.
        ax.hist(np.array(subdf.values[0][1:],dtype=float),density=False,label=tissue,histtype='step',lw=1.5)
    except Exception as err:
        # Best effort: a tissue that cannot be drawn (e.g. its samples are not
        # columns of `df`, or the gene is absent) is skipped, but it is
        # reported instead of being dropped silently.
        print("skipping tissue %r: %s: %s" % (tissue, type(err).__name__, err))

ax.set_xlabel('FPKM',fontsize=16)
ax.set_ylabel('#')
ax.set_xscale('log')
ax.set_yscale('log')
# Single place where the x window is fixed (an earlier upper-limit-only call
# was overridden by this one and has been removed).
ax.set_xlim(5e-2,1e5)
plt.legend()
plt.show()
fig.savefig("plot/proteincoding/tissue/fpkm_%s_per_tissue.png"%gene)

In [None]:
fig=plt.figure()
ax=fig.subplots()
# NOTE(review): the title wording "across per tissue" looks like a typo; left unchanged here.
ax.set_title('FPKM across per tissue: %s'%gene, fontsize=18)
# Per-tissue values of the selected gene: first matching row of df_tissue with
# its first entry dropped (assumes 'gene' is the first column — TODO confirm).
plt.hist(np.array(df_tissue[df_tissue['gene']==gene].values[0][1:],dtype=float), histtype='step', lw=2,bins=10, label='gene')
ax.set_xlabel('FPKM',fontsize=16)
ax.set_ylabel('#')
# The log scales must be set before the figure is shown; otherwise the
# displayed figure is linear while the saved file is logarithmic.
ax.set_xscale('log')
ax.set_yscale('log')
plt.show()
fig.savefig("plot/proteincoding/tissue/fpkm_%s_across_tissue.png"%gene)

## Cumulative fraction of expression (integral)

In [None]:
fig=plt.figure(figsize=(15,10))
ax=fig.subplots()
ax.set_title('FPKM sorted integral', fontsize=18)
for tissue in ['Blood','Heart, mediastinum, and pleura','Brain','Skin','Testis','Nervous System','Thyroid gland']:
    # Values of this tissue column, largest first.
    subdf = df_tissue[tissue].sort_values(ascending=False)
    norm = np.sum(subdf.values)
    # A tissue whose values sum to zero cannot be normalised and is not drawn.
    if norm!=0:
        # Explicit 1-based gene counts on the x axis: the k-th point is the
        # share of the k highest values. With the implicit 0-based x the first
        # point falls at x=0, which a log axis cannot show, and every other
        # point is shifted by one gene.
        n_genes = np.arange(1, len(subdf) + 1)
        ax.plot(n_genes, np.cumsum(subdf.values)/norm, label=tissue, lw=3)
ax.set_xscale('log')
ax.set_xlabel('Number of genes', fontsize=18)
ax.set_ylabel('Fraction of total tissue expression', fontsize=18)
ax.set_xlim((1,2e4))
plt.legend()
plt.show()
fig.savefig("fraction_of_trascriptome.png")