In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import findspark
findspark.init()
import pyspark as spark
# Reuse the live SparkContext when this cell is run again: constructing a second
# SparkContext in the same kernel raises "Cannot run multiple SparkContexts at once".
sc = spark.SparkContext.getOrCreate()
# SQL entry point bound to that context; later cells use it to read CSV files
# and to run SQL queries.
sql = spark.SQLContext(sc)

In [None]:
# Read the tab-separated gene read table; its first column becomes the row index.
df = pd.read_csv(
    "GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct",
    sep='\t',
    index_col=[0],
)
# The 'Description' column is not used afterwards, so drop it.
df = df.drop(columns='Description')
# Keep only the first 15 characters of every row label
# (presumably strips the version suffix of the gene identifiers — TODO confirm).
df.index = [label[:15] for label in df.index]
df.head()

In [None]:
# Load the gene annotation table through Spark, taking column names from the first line.
# NOTE(review): this is an absolute, machine-specific path; another cell of this notebook
# reads "../genes.txt" with pandas — confirm whether both refer to the same file.
genes = sql.read.option("header",'true').csv('/Users/filippo/Developer/tesi/genes.txt')
# Expose the table to SQL queries under the name "genes".
# createOrReplaceTempView is the replacement for registerTempTable, which has been
# deprecated since Spark 2.0.
genes.createOrReplaceTempView("genes")

In [None]:
# Pull the protein-coding rows of the "genes" view into pandas and index them by
# the `_c0` column.
pc = (
    sql.sql("SELECT * FROM genes WHERE type_of_gene='protein-coding'")
    .toPandas()
    .set_index('_c0')
)

In [None]:
# Write the table to disk, keeping both the row labels and the column header.
df.to_csv("mainTable.csv", index=True, header=True)

## GTEx Biospecimen
[GTEX_biospecimen](https://gtexportal.org/home/samplesPage)

In [None]:
# Sample annotations are read from a local file, using its first column as the index.
# The two commented-out lines are an alternative load straight from the GTEx v7
# sample-attributes file, restricted to the SMTS / SMTSD columns.
#df_file = pd.read_csv("https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt", sep='\t', index_col=[0])
#df_file = df_file.loc[:, ['SMTS', 'SMTSD']]
df_file = pd.read_csv("files.dat", index_col=[0])
df_file.head()

In [None]:
# Per-sample tissue table. Its construction used to be entirely commented out, so
# `df_sites` was never assigned and `df_sites.head()` raised NameError on a fresh kernel.
# Build it from `df_file` instead:
#   * SMTS / SMTSD are renamed to primary_site / secondary_site when present
#     (raw GTEx annotation file);
#   * columns that are absent are ignored by `rename`, so a table that already has
#     `primary_site` / `secondary_site` passes through unchanged.
# `rename_axis` names the index 'sample' on the new frame without touching `df_file`.
df_sites = (
    df_file
    .rename(columns={'SMTS': 'primary_site', 'SMTSD': 'secondary_site'})
    .rename_axis('sample')
)
df_sites.head()

In [None]:
# Sorted array of the distinct primary-site labels found in the sample annotations.
np.unique(df_file.loc[:, 'primary_site'])

In [None]:
# Gene annotations loaded with pandas (first column as index), restricted to the
# rows whose `type_of_gene` is 'protein-coding'.
pc = (
    pd.read_csv("../genes.txt", index_col=[0])
    .loc[lambda annotations: annotations['type_of_gene'] == 'protein-coding']
)
pc.head()

In [None]:
# Primary sites whose samples are kept in the reduced table.
selected_tissues = [
    'Blood', 'Brain', 'Lung', 'Thyroid', 'Breast', 'Skin', 'Pancreas', 'Testis',
    'Adipose Tissue', 'Colon', 'Nerve', 'Heart', 'Kidney', 'Uterus', 'Adrenal Gland',
]
# Sample identifiers annotated with one of those primary sites.
selected_samples = df_file[df_file['primary_site'].isin(selected_tissues)].index.values

# Rows: protein-coding genes; columns: the selected samples.
# `.loc` with a list of labels raises KeyError as soon as one requested gene or sample
# is absent from `df` (pandas >= 1.0), which would make the `fillna(0)` below pointless.
# `reindex` returns exactly the requested rows/columns and fills absent labels with NaN,
# which are then turned into zero counts before the integer cast.
# NOTE(review): `reindex` requires the labels of `df` to be unique.
subdf = (
    df.reindex(index=pc.index.values, columns=selected_samples)
    .fillna(0)
    .astype(int)
)
subdf.to_csv("mainTable_tissue.csv", index=True, header=True)

# U

In [None]:
# Primary site used to select samples in the following cell.
tissue = "Breast"

In [None]:
# Samples annotated with the chosen primary site.
# Filter `df_file` directly — it carries the `primary_site` column, as the other cells
# that select by tissue rely on — so this cell does not depend on `df_sites` having
# been defined.
sample_list = df_file[df_file['primary_site']==tissue].index.values
# Columns of the main table for those samples. `reindex` is used instead of `.loc`
# because `.loc` raises KeyError when an annotated sample has no column in `df`;
# such samples come back as all-NaN columns here.
subdf = df.reindex(columns=sample_list)

In [None]:
# Table read from meanVariances.csv: first row is the header, first column the index.
odf = pd.read_csv("meanVariances.csv", index_col=0, header=0)
# The values of the header-less file A.dat become a new column "A" at position 3.
odf.insert(
    loc=3,
    column="A",
    value=pd.read_csv("A.dat", header=None).values,
)
odf.head()

In [None]:
# 'occurrence' values split by whether the row label appears in the protein-coding
# table `pc` (O) or not (O_nc).
O = odf.loc[odf.index.isin(pc.index), 'occurrence']
O_nc = odf.loc[~odf.index.isin(pc.index), 'occurrence']

In [None]:
# Histogram settings: 20 bins over a range widened by 0.5/20 on each side of [0, 1].
bins = 20
rang = (-0.5/20, 1 + 0.5/20)

# Normalised step histograms of the occurrences (values <= 1 only),
# one curve for coding and one for non-coding genes.
fig, ax = plt.subplots()
for occurrences, curve_label in ((O[O<=1], 'coding'), (O_nc[O_nc<=1], 'non-coding')):
    ax.hist(
        occurrences,
        bins=bins,
        range=rang,
        density=True,
        histtype='step',
        lw=4,
        label=curve_label,
    )
ax.set_xlabel('$O_i$', fontsize=16)
ax.set_ylabel('pdf', fontsize=16)
ax.legend(loc='upper left', fontsize=16)
plt.show()
fig.savefig("U_gtex_cnc.pdf")

In [None]:
# Column "A" for all rows, and split by membership in the protein-coding table `pc`.
A = odf['A'].values
# A_c is not plotted in this cell; it is kept because other cells may use it.
A_c = odf[odf.index.isin(pc.index)]['A'].values
A_nc = odf[~odf.index.isin(pc.index)]['A'].values

fig = plt.figure()

# Ranks start at 1. Starting at 0 evaluated 1/0 for the reference curve (inf plus a
# RuntimeWarning) and placed the top-ranked value at x = 0, which a logarithmic axis
# cannot display.
ranks_all = np.arange(1, len(A) + 1)
ranks_nc = np.arange(1, len(A_nc) + 1)

# Reference power law r^{-1}. The exponent needs braces: in '$r^-1$' mathtext
# raises only the minus sign.
plt.plot(ranks_all, 1./ranks_all, ls='--', c='g', label='$r^{-1}$')
# Non-coding values sorted in decreasing order, normalised by the total over all rows.
plt.plot(ranks_nc, np.sort(A_nc)[::-1]/np.sum(A), label='non coding genes')

plt.xscale('log')
plt.yscale('log')
plt.ylabel('$f_i$', fontsize=18)
plt.xlabel("Rank_i", fontsize=18)
plt.legend(fontsize=18)
fig.savefig("globalZipf_nc.pdf")