In [None]:
import os, sys
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import findspark
findspark.init()
import pyspark as spark
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
# Reuse the already-running SparkContext when there is one: calling the bare
# SparkContext() constructor a second time (e.g. when this cell is re-run in a
# live kernel) raises because only one context may be active per process.
sc=spark.SparkContext.getOrCreate()
# SQL entry point used by the cells below to read CSV files and run queries.
sql=spark.SQLContext(sc)

In [None]:
# Make the GTEx directory the working directory, so the relative output paths
# used later in the notebook (the PDF figure and the CSV export) resolve there.
workdir = "/Users/filippo/Developer/tesi/gtex/"
os.chdir(workdir)

In [None]:
# Gene annotation table; the first row of the file holds the column names.
genes = sql.read.option("header",'true').csv('/Users/filippo/Developer/tesi/genes.txt')
# Expose the table to Spark SQL under the name "genes".
# createOrReplaceTempView is the non-deprecated replacement for registerTempTable
# and is safe to call again when the cell is re-run.
genes.createOrReplaceTempView("genes")
# Quick look at which gene types are present in the annotation.
genes.select("type_of_gene").distinct().show()
# Keep only the protein-coding genes.
pc = sql.sql("SELECT * FROM genes WHERE type_of_gene='protein-coding'")
# Plain Python list of the values in column `_c0`
# (presumably the unnamed first column holding the gene identifiers - TODO confirm).
# Collecting the single column directly avoids building a pandas frame just to unpack it.
pc_list = [row[0] for row in pc.select('_c0').collect()]

In [None]:
# Tab-separated GTEx gene read-count table; the first row is used as the header.
# No schema inference is requested, so Spark loads every column as a string.
df = sql.read.option("header",True).option("delimiter",'\t').csv("/Users/filippo/Developer/tesi/gtex/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct")
# `ensg` = first 15 characters of `Name`
# (presumably the gene id with its version suffix stripped - TODO confirm).
# Column.substr is 1-based; unlike the former Python UDF it returns null for a
# null `Name` instead of raising, and it runs inside the JVM without shipping
# every row to a Python worker.
df = df.withColumn('ensg', col('Name').substr(1, 15))
# Restrict the table to the protein-coding genes listed in pc_list.
df = df.filter(col("ensg").isin(pc_list))

In [None]:
# Per-sample metadata table; the first CSV column becomes the DataFrame index.
files_table_path = "/Users/filippo/Developer/tesi/gtex/files.dat.ok"
df_files = pd.read_csv(files_table_path, index_col=0)

In [None]:
# Distinct `primary_site` values, in order of first appearance in df_files.
tissues = df_files.loc[:, 'primary_site'].unique()

In [None]:
# PCA estimator with default settings (n_components=None, spelled out here).
# The same instance is re-fitted on each tissue subset further down.
model = PCA(n_components=None)

In [None]:
# Maps "number of tissues included" -> explained-variance array of the PCA fit.
expl_var = dict()

In [None]:
# For a growing number of tissues, draw a random subset of samples belonging to
# the first `i` tissues, fit a PCA on their expression profiles and record the
# explained variance of every component in expl_var[i].

# Maximum number of samples fed to each PCA fit.
N_SAMPLES = 100
# Dedicated, seeded generator so the random subsampling (and hence the whole
# analysis) is reproducible after Restart & Run All.
rng = np.random.RandomState(42)

# NOTE(review): the upper bound len(tissues)-1 is exclusive, so the loop stops
# at len(tissues)-2 and the largest subsets (including "all tissues") are never
# fitted. Kept as in the original; confirm whether this is intended.
for i in range(2, len(tissues)-1):
    print(i)
    # Sample ids (index of df_files) whose primary_site is among the first i tissues.
    in_subset = df_files['primary_site'].isin(tissues[:i])
    # permutation() returns a shuffled copy instead of shuffling the index buffer in place.
    selected = rng.permutation(df_files[in_subset].index.values)
    # Columns of df are samples; after the transpose each row is one sample.
    # The table was read without schema inference, so the counts arrive as
    # strings: cast explicitly rather than relying on an implicit conversion
    # inside scikit-learn.
    data = df.select(list(selected[:N_SAMPLES])).toPandas().transpose().astype(float)
    model.fit(data)
    expl_var[i] = model.explained_variance_

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot, for several variance thresholds, how many principal components exceed
# the threshold as a function of the number of tissues included.

# Materialise the keys once: the list is reused as x axis for every curve and
# for the identity line, and does not depend on how matplotlib treats dict views.
x=list(expl_var.keys())
fig=plt.figure()
for thr in [1e10,2.5e10,6e10]:
    # Number of components whose explained variance is above the threshold.
    plot_data=[int((expl_var[k]>thr).sum()) for k in x]
    plt.plot(x,plot_data, marker='o', label="thr %.1e"%thr)
plt.xlabel('# tissues', fontsize=20)
plt.ylabel('intrinsic\ndimension', fontsize=20)
# Identity line (y = x) for visual reference; intentionally left out of the legend.
plt.plot(x,x)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
# Save before showing, so the file is written regardless of what the active
# backend does to the figure once show() returns.
fig.savefig("intrinsic_dimension.pdf")
plt.show()

In [None]:
# One column per tissue count, one row per principal component; written with a
# header row and without the row index.
intrdim_table = pd.DataFrame(data=expl_var)
intrdim_table.to_csv("intrdim.csv", index=False, header=True)