In [None]:
import os, sys, gc
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import findspark
findspark.init()
import pyspark as spark
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
# Single-node local Spark with the driver host pinned to the loopback address.
conf = spark.SparkConf()
conf.set('spark.driver.host', '127.0.0.1')
sc = spark.SparkContext('local', 'myAppName', conf=conf)
# SQL entry point used by the rest of the notebook for reading files and running queries.
sql = spark.SQLContext(sc)

In [None]:
# Put the TWO-NN directory on the import path so that the TwoNN module resolves.
# NOTE(review): absolute, machine-specific path — will need adjusting on another host.
sys.path.append("/home/fvalle/phd/TWO-NN/")
from TwoNN import twonn_dimension

In [None]:
# Switch the working directory; the relative output paths used further down
# (CSV and PDF files) resolve against it.
os.chdir("/home/fvalle/phd/datasets/gtex/log/10")

In [None]:
# Load the gene annotation table (first line is the header) and expose it to Spark SQL.
genes = sql.read.option("header",'true').csv('/home/fvalle/phd/master_thesis/genes.txt')
# createOrReplaceTempView is the non-deprecated replacement for registerTempTable.
genes.createOrReplaceTempView("genes")
# Show which gene types occur in the annotation.
genes.select("type_of_gene").distinct().show()
# Keep only the rows annotated as protein-coding.
pc = sql.sql("SELECT * FROM genes WHERE type_of_gene='protein-coding'")

In [None]:
# Download the headerless gene-name list and flatten it into a 1-D array of strings.
gene_names_frame = pd.read_csv(
    "http://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt",
    header=None,
)
pc_list = np.ravel(gene_names_frame.values).astype(str)

In [None]:
# Read the tab-separated TPM table; the first line provides the column names.
df = sql.read.option("header",True).option("delimiter",'\t').csv("/home/fvalle/phd/datasets/gtex/10/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz")
# First 15 characters of Name, computed with a native column expression
# (substr is 1-based) instead of a Python UDF.
df = df.withColumn('ensg', col('Name').substr(1, 15))
# Hash-based lookup: testing membership against the NumPy array would scan
# the whole array for every row.
pc_lookup = frozenset(str(name) for name in pc_list)
# Tag each row 'pc' when its truncated id is in the gene list, 'nc' otherwise.
df = df.withColumn('type',udf(lambda x: 'pc' if x in pc_lookup else 'nc', StringType())(col('ensg')))
# createOrReplaceTempView is the non-deprecated replacement for registerTempTable.
df.createOrReplaceTempView("df")
# From here on df holds only the rows tagged 'pc'.
df = sql.sql("SELECT * FROM df WHERE type='pc'")

In [None]:
# Sample annotations: keep the two tissue columns, index by sample id, align the
# rows with the columns of df and drop the rows that have no annotation at all.
df_files = (
    pd.read_csv(
        "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
        sep='\t',
    )
    .loc[:, ['SAMPID', 'SMTS', 'SMTSD']]
    .set_index('SAMPID')
    .reindex(index=df.columns)
    .dropna(how='all', axis=0)
)

In [None]:
# SMTS values ordered from the most to the least represented.
samples_per_tissue = df_files.groupby('SMTS').count()
tissues = samples_per_tissue.sort_values('SMTSD', ascending=False).index

In [None]:
# PCA estimator built with default arguments; it is re-fitted at every step of the loop below.
model=PCA()

In [None]:
# Grow the sample pool by one tissue per iteration and, at each step, record the
# PCA explained-variance spectrum and the TwoNN estimate of the pooled data.
selected = []    # sample ids accumulated so far
expl_var = {}    # loop index -> explained-variance spectrum at that step
twonn_dims = []  # TwoNN result per step, in loop order
N=100            # at most N samples are taken from each tissue
# NOTE(review): the range starts at 1 and stops before len(tissues)-1, so the
# first and the last entry of `tissues` are never added — confirm this is intended.
for i in np.arange(1,len(tissues)-1,1,dtype=int):
    print(i)
    selected=np.concatenate((selected, df_files[df_files['SMTS']==tissues[i]][:N].index.values))
    # NOTE(review): unseeded in-place shuffle, so the sample order differs between runs.
    np.random.shuffle(selected)
    expression = df.select(list(selected)).toPandas().astype(float)
    # Vectorised log(x + 1): avoids one Python call per matrix element and the
    # DataFrame.applymap API, which is deprecated in recent pandas releases.
    # Transposed so that each row is one sample.
    data = np.ascontiguousarray(np.log(expression.values + 1).T)
    model.fit(data)
    expl_var[i]=model.explained_variance_
    twonn_dims.append(twonn_dimension(data))
    # Release the per-iteration temporaries before the next, larger, selection.
    gc.collect()

In [None]:
# Best-effort dump of both estimates: a failure is reported on stderr rather
# than silently discarded, and the notebook keeps running.
try:
    # The spectra get longer as the sample pool grows, and a dict of arrays with
    # different lengths is rejected by the DataFrame constructor. Wrapping each
    # one in a Series makes pandas pad the shorter columns with NaN.
    pd.DataFrame(
        data={step: pd.Series(np.asarray(spectrum)) for step, spectrum in expl_var.items()}
    ).to_csv("intr_dim.csv")
except Exception as err:
    print("could not write intr_dim.csv: %r" % (err,), file=sys.stderr)
try:
    pd.DataFrame(data=twonn_dims).to_csv("twonn.csv")
except Exception as err:
    print("could not write twonn.csv: %r" % (err,), file=sys.stderr)

In [None]:
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

In [None]:
# Running total of the explained variance stored under key 3.
np.asarray(expl_var[3]).cumsum()

In [None]:
# Cumulative explained variance of the spectrum stored under key 3, divided by
# its NaN-ignoring total and drawn on a logarithmic y axis.
spectrum_3 = expl_var[3]
cumulative_fraction = np.cumsum(spectrum_3) / np.nansum(spectrum_3)
fig, ax = plt.subplots()
ax.plot(cumulative_fraction)
ax.set_yscale('log')
plt.show()

In [None]:
# For each variance threshold, count how many leading principal components stay
# below it and plot that count against the number of pooled tissues.
#
# The k-th spectrum stored in expl_var (in insertion order) was computed on k
# pooled tissues. The dictionary keys are not a reliable tissue count: they are
# 1-based when expl_var comes straight from the loop and 0-based once it has
# been reloaded from intrdim.csv. The x axis is therefore derived from position.
x = np.arange(1, len(expl_var) + 1)
# Quadratic model; not used by the plotting code in this cell.
parab = lambda x,a,b,c: a+ b*x+c*x*x
fig=plt.figure(figsize=(15,8))
for thr in [0.7,0.75,0.8]:
    plot_data=[]
    for k in expl_var.keys():
        var=np.array(expl_var[k])
        # nansum keeps the total finite when the spectrum carries NaN padding;
        # padded entries never satisfy `var < thr`, so they are not counted.
        var = np.cumsum(var)/np.nansum(var)
        plot_data.append(len(var[var<thr]))
    plt.plot(x,plot_data, marker='o', label="thr %.1e"%thr)
plt.xlabel('# tissues', fontsize=20)
plt.ylabel('intrinsic\ndimension', fontsize=20)
# Reference line: dimension equal to the number of tissues.
plt.plot(x,x)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
n_tissues = x
# One tick per step, labelled with the actual number of pooled tissues.
plt.xticks(ticks=n_tissues, labels=n_tissues, rotation=90, fontsize=20)
plt.show()
fig.savefig("intrinsic_dimension.pdf")

In [None]:
# Display plot_data; the list is rebuilt for every threshold in the cell above,
# so it holds the counts for the last threshold processed.
plot_data

In [None]:
# One CSV row per stored spectrum, right-padded with NaN up to 1200 entries
# (spectra that are already that long or longer are written as they are).
padded_rows = [
    np.concatenate([spectrum, [np.nan] * max(0, 1200 - len(spectrum))])
    for spectrum in expl_var.values()
]
pd.DataFrame(data=padded_rows).to_csv("intrdim.csv",index=False, header=True)

In [None]:
# Read the padded spectra back; after transposing, each column is one stored
# spectrum and expl_var is rebuilt as {integer column label: list of values}.
df_ev = pd.read_csv("intrdim.csv", header=0).T
df_ev.columns = df_ev.columns.astype(int)
expl_var = df_ev.to_dict(orient='list')

In [None]:
# TwoNN estimate at each step of the pooling loop.
fig=plt.figure()
plt.plot(twonn_dims, marker='o', label='twonn')
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.xlabel('# tissues', fontsize=20)
plt.ylabel('intrinsic\ndimension', fontsize=20)
# twonn_dims[j] was computed on j+1 pooled tissues and is drawn at x=j. Ticks are
# derived from the list itself so they do not depend on the keys of expl_var,
# whose base changes once expl_var has been reloaded from intrdim.csv.
tick_positions = np.arange(len(twonn_dims))
n_tissues = tick_positions + 1
plt.xticks(ticks=tick_positions, labels=n_tissues, rotation=90, fontsize=20)
plt.show()
fig.savefig("intrinsic_dimension_twonn.pdf")

In [None]:
# Shut down the SparkContext created in the first cell.
sc.stop()