In [None]:
import requests as rq
import json
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.cm as cm
#from ensembleAPI import geneinfo, genesinfo
from sklearn import metrics
import os, sys, gc
sys.path.append("master_thesis")
from TCGA_files import *
from hsbmpy import *

In [None]:
# Reload hsbmpy so that edits made to the module on disk are picked up without
# restarting the kernel, then refresh the names pulled in by the star import.
import importlib, hsbmpy
importlib.reload(hsbmpy)
from hsbmpy import *

## Query many: load the clusters and topics of the selected run

In [None]:
# ---- Run configuration ----
level = 1
algorithm = 'topsbm'
# Other label choices tried previously: 'disease_type', 'secondary_site', 'status', 'RPPA Clusters'.
label = 'primary_site'

# NOTE(review): absolute, machine-specific path. After the chdir below every
# relative file name used later in the notebook resolves against this directory.
directory = "/home/fvalle/phd/datasets/gtex/log/10"
os.chdir(directory)

# L is provided by the hsbmpy helper and selects which
# "<algorithm>/<algorithm>_level_<L>_clusters.csv" file gets read.
L = get_max_available_L(directory, algorithm)
clusters_csv = "%s/%s_level_%d_clusters.csv" % (algorithm, algorithm, L)
df_clusters = pd.read_csv(clusters_csv, header=[0])
df_clusters.head()

In [None]:
# Annotation column(s) of interest. The previous cell body assigned `labels`
# three times in a row, so only this last value was ever in effect; the
# overwritten alternatives were the 'cases.0.*' metadata fields and
# ['primary_site', 'disease_type'].
labels = ['SMTS']

In [None]:
# Topic table for the same algorithm and level as the clusters file.
topics_csv = "%s/%s_level_%d_topics.csv" % (algorithm, algorithm, L)
df_topics = pd.read_csv(topics_csv, header=[0])
df_topics.head()

# Files
[http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats](http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats)
## Dataset (*.txt)

In [None]:
# GTEx: gene-level TPM table; the two lines preceding the header row are skipped.
df = pd.read_csv('https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz', skiprows=2, compression='gzip', sep='\t')

# Index the rows by the first 15 characters of 'Name'.
# NOTE(review): truncation may make two rows share a label; the later
# df.reindex(...) needs unique row labels - confirm no collisions survive the
# gene-list filter below.
df['ensg'] = df['Name'].str[:15]

# Keep only the per-sample columns. `columns=` is used because recent pandas
# no longer accepts the axis as a positional argument to drop().
df = df.set_index('ensg').drop(columns=['Name', 'Description'])

# Restrict the rows to the reference gene list.
genelist = pd.read_csv("https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None).values.ravel()
df = df[df.index.isin(genelist)]

# Sample annotations, indexed by sample id, keeping the 'SMTS' and 'SMTSD' columns.
df_file = (
    pd.read_csv("https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t')
    .loc[:, ['SAMPID', 'SMTS', 'SMTSD']]
    .set_index('SAMPID')
)

In [None]:
# Put an all-NaN 'Description' column in first position.
# Uses numpy directly (the `pd.np` alias is gone from current pandas) and is
# guarded so that re-running the cell does not raise "already exists".
if 'Description' not in df.columns:
    df.insert(0, 'Description', np.nan)

In [None]:
# Resolve every non-missing entry of the clusters table through get_file and
# keep the `.name` of what it returns.
samples = []
for sample in df_clusters.values.ravel():
    if str(sample) == 'nan':
        continue
    samples.append(get_file(sample, df_file).name)

# Every non-missing entry of the topics table, cut to its first 15 characters.
genes = [gene_id[:15] for gene_id in df_topics.values.ravel() if str(gene_id) != 'nan']

# Rows = genes, columns = samples; labels absent from df come out as all-NaN.
dataset = df.reindex(index=genes, columns=samples)

In [None]:
# Preview the first rows of the genes-by-samples table.
dataset.head()

In [None]:
# Tab-separated dump; the row index and the header line are both written
# (these are the to_csv defaults).
dataset.to_csv("dataset.txt", sep='\t')

In [None]:
# Run a garbage-collection pass before continuing.
gc.collect()

## Phenotype annotation (*.cls)

In [None]:
from functools import reduce

In [None]:
# Map each sample to the first cluster column (in df_clusters column order)
# that lists it. Building the lookup once avoids rescanning the whole
# df_clusters frame for every sample.
sample_to_cluster = {}
for cluster_name in df_clusters.columns:
    for member in df_clusters[cluster_name].dropna():
        # setdefault keeps the first column in which a sample appears
        sample_to_cluster.setdefault(member, cluster_name)

# A sample that appears in no cluster column raises KeyError here (the previous
# per-sample scan failed with IndexError in that situation).
# NOTE: `clusters` is reassigned from the 'SMTS' annotation in the next cell.
clusters = [sample_to_cluster[sample] for sample in dataset.columns]

In [None]:
# One 'SMTS' value per dataset column, in column order; samples missing from
# df_file yield NaN.
smts_by_sample = df_file.reindex(index=dataset.columns)['SMTS']
clusters = smts_by_sample.values.ravel()

In [None]:
# Distinct class labels (np.unique returns them sorted) and the two counts
# that go on the first line of the .cls file.
classes = np.unique(clusters)
n_samples = len(dataset.columns)
n_classes = len(classes)

In [None]:
# Write the phenotype file: a counts line, a '#'-prefixed line of class names
# (spaces removed from each name) and one integer code per sample.
# str.join replaces the reduce-based folding, which returned a bare integer for
# a single sample (so write() failed) and raised on empty input.
class_names_line = ' '.join(name.replace(' ', '') for name in classes)

# Code of a sample = position of its class in np.unique's sorted output.
# NOTE(review): the Broad CLS description assigns labels to the names on line 2
# by order of first appearance - confirm the consumer reads these codes as
# indices into the sorted name list.
sample_codes = np.unique(clusters, return_inverse=True)[1]

with open("categorical.cls", "w") as cat_file:
    cat_file.write("%d %d 1\n" % (n_samples, n_classes))
    cat_file.write('# ' + class_names_line + '\n')
    # no trailing newline after the codes, as before
    cat_file.write(' '.join(str(code) for code in sample_codes))

## Gene sets (*.gmx)

In [None]:
# Start from an empty frame that has the same columns as df_topics.
df_sets = pd.DataFrame(columns=df_topics.columns)

In [None]:
def _trim_gene_id(value):
    """Return the first 15 characters of a gene ID; missing entries become ''."""
    return value[:15] if str(value) != 'nan' else ''


# Row labelled 'description' holding NaN for every topic column.
description_row = pd.DataFrame(
    [[np.nan] * len(df_topics.columns)],
    index=['description'],
    columns=df_topics.columns,
)
# Same shape as df_topics, each cell trimmed / blanked by _trim_gene_id.
gene_rows = df_topics.apply(lambda column: column.map(_trim_gene_id))

# pd.concat replaces DataFrame.append (no longer available in current pandas).
# The table is rebuilt from df_topics alone, so re-running the cell no longer
# stacks additional copies onto df_sets.
df_sets = pd.concat([description_row, gene_rows])

In [None]:
# Column-oriented dump: header line with the column names, then the rows of
# df_sets; the row index is not written.
df_sets.to_csv("set.gmx", sep='\t', index=False)

## Gene sets (*.gmt)

In [None]:
# GMT layout: one gene set per line - set name, description, then its genes -
# and no header row. After the transpose the set names are the row index, so
# the index must be written and the header suppressed (the previous
# index=False / header=True did the opposite and lost the set names).
df_sets.transpose().to_csv("set.gmt", index=True, header=False, sep='\t')

## Gene world (every gene that appears in the topics)

In [None]:
# One line per non-missing entry of the topics table, keeping the part of the
# ID before the first '.'. writelines consumes the generator directly instead
# of building a throwaway set out of f.write's return values.
with open("gene_world.txt", 'w') as f:
    f.writelines(
        gene_id.split('.')[0] + '\n'
        for gene_id in df_topics.values.ravel()
        if str(gene_id) != 'nan'
    )

# GSEA

In [None]:
import gseapy as gp

In [None]:
# Parse the categorical.cls file with gseapy's parser; its three return values
# are unpacked here as phenoA, phenoB and class_vector (class_vector is what
# gets passed to gp.gsea as `cls`).
phenoA, phenoB, class_vector =  gp.parser.gsea_cls_parser("categorical.cls")

In [None]:
# Keyword arguments for the gseapy run, gathered in one place.
gsea_options = dict(
    data='dataset.txt',               # expression table file written by this notebook
    gene_sets='set.gmt',              # gene-set file written by this notebook
    cls=class_vector,                 # per-sample classes parsed from categorical.cls
    permutation_type='phenotype',
    permutation_num=100,              # kept low to speed the run up
    outdir='gsea',                    # output directory handed to gseapy
    no_plot=True,                     # skip plotting
    method='signal_to_noise',
    processes=4,
    seed=42,
    format='png',
    verbose=True,
)

gs_res = gp.gsea(**gsea_options)