In [None]:
import pandas as pd
import numpy as np
import scanpy as sc
import requests
import regex as re
import json
import os, sys
sys.path.append('/home/fvalle/phd/master_thesis/hsbm/')

# Get Manifest File

In [None]:
# GDC REST endpoint that serves file-level records.
files_endpt = "https://api.gdc.cancer.gov/files"

# Metadata fields requested for every file. The 'fields' parameter is passed
# as one comma-separated string, so the names are joined as soon as they are
# listed. Fields that are currently switched off:
#   analysis.workflow_type, experimental_strategy,
#   cases.diagnoses.primary_diagnosis, cases.diagnoses.vital_status,
#   cases.diagnoses.days_to_birth
fields = ','.join((
    "file_name",
    "cases.project.primary_site",
    "cases.project.disease_type",
    "cases.submitter_id",
    "cases.samples.portions.analytes.aliquots.submitter_id",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.tumor_grade",
    "cases.diagnoses.progression_or_recurrence",
    "cases.diagnoses.prior_malignancy",
    "cases.project.project_id",
))

In [None]:
# Query filter: every clause below must hold ("and") and each clause is an
# "in" test on one field. Together they keep TCGA gene-expression
# quantification files (HTSeq - FPKM workflow, TXT format) from ten primary
# sites; the original note describes these as the top 10 tissues as of
# 18/12/2019.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed_values}}
        for field_name, allowed_values in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - FPKM"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.program.name", ["TCGA"]),
            ("cases.primary_site", [
                "bladder", "brain", "breast", "bronchus and lung", "colon",
                "corpus uteri", "kidney", "prostate gland", "skin", "thyroid gland",
            ]),
        )
    ],
}

In [None]:
# The request is sent with POST, so the filter can be handed over directly as
# a dict. "return_type": "manifest" asks for a download manifest; per the
# original note, drop that entry to get the requested file fields instead.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [None]:
# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
# Fail loudly on an HTTP error instead of saving the error payload as if it
# were a manifest.
response.raise_for_status()
# The context manager closes the file on exit, so no explicit close() is
# needed; the encoding is stated so the written bytes match the decoded text.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

Use **gdc-client** to download data files.

```bash
mkdir data
mv manifest.txt data/.
cd data
gdc-client download -m manifest.txt
```

In [None]:
# Reference gene list downloaded from the count-clustering project pages (the
# original note reads "genes filtered as Dey"). The file is read without a
# header row and its first column becomes the index.
df_genes = pd.read_csv(
    "https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt",
    index_col=0,
    header=None,
)
# Independent copy that the following cells extend with expression columns.
df = df_genes.copy()

In [None]:
#fpkm
def getFilenameFromDir(directory):
    """Return the name of the FPKM expression file found in ``directory``.

    A file qualifies when its whole name is a UUID-like identifier followed by
    ``.FPKM.txt`` and an optional ``.gz`` suffix. The complete name must match,
    so leftovers such as ``<name>.partial`` are not picked up by accident.

    Parameters
    ----------
    directory : str
        Path of the folder to scan (one download folder).

    Returns
    -------
    str
        Name (not the full path) of the first matching entry; it is also
        printed.

    Raises
    ------
    FileNotFoundError
        If no entry of ``directory`` matches. The message lists the entries
        that were seen.
    """
    # Raw string with escaped dots: '.' must be a literal dot, and "FPKM" /
    # "gz" are fixed words rather than character classes.
    fpkm_name = re.compile(
        r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9-]{4}-[a-zA-Z0-9-]{12}"
        r"\.FPKM\.txt(\.gz)?"
    )
    entries = os.listdir(directory)
    for element in entries:
        if fpkm_name.fullmatch(element):
            print(element)
            return element
    # An ordinary exception type, so generic `except Exception` handlers see it.
    raise FileNotFoundError("Not found %s"%entries)

In [None]:
# Build the genes x files expression table. The per-file columns are collected
# first and assembled once: this avoids the fragmented frame produced by
# repeated DataFrame.insert calls and makes the cell safe to re-run, because
# `df` is rebuilt from `df_genes` every time instead of being grown in place.
added = len(df_genes.columns)
expression_columns = {}
for cdirectory in os.listdir("data"):
    # The manifest sits next to the download folders; it is not one of them.
    if cdirectory.startswith("manifest.txt"):
        print("SKIPPING %s "%cdirectory)
        continue
    if "Icon" in cdirectory:
        print("SKIPPING %s "%cdirectory)
        continue
    cfile = getFilenameFromDir("data/%s"%cdirectory)
    # A repeated file name would silently overwrite an earlier column.
    if cfile in expression_columns:
        raise Exception("Not able to add: %s"%cfile)
    cdf = pd.read_csv("data/%s/%s"%(cdirectory,cfile), sep='\t', header=None)
    cdf.columns = ["gene", cfile]
    # Keep only the first 15 characters of each gene identifier so that it can
    # be matched against the index of df_genes.
    cdf['gene'] = cdf['gene'].str[:15]
    # Align to the reference gene list; genes missing from this file become NaN.
    expression_columns[cfile] = cdf.set_index('gene')[cfile].reindex(df_genes.index).to_numpy()

expression = pd.DataFrame(expression_columns, index=df_genes.index)
# insert(0, ...) used to put every new file in front of the previous ones, so
# the columns are reversed to keep that order, followed by df_genes' columns.
df = pd.concat([expression.iloc[:, ::-1], df_genes], axis=1)
# Columns present before loading, and number of files loaded.
print(added, len(expression_columns))

In [None]:
# Remove, in place, the rows (genes) whose values are missing in every column.
df.dropna(how='all', axis=0, inplace=True)

In [None]:
# Same filters and fields as before, but without "return_type": "manifest".
# The response is parsed below as a tab-separated table whose first row holds
# the column names.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "15000"
    }
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
# Stop on an HTTP error; otherwise the error payload would be parsed as a table.
response.raise_for_status()
# Accept both CRLF and LF line endings and skip blank lines (such as the one
# left after the final line terminator) rather than relying on dropna() below
# to remove the resulting padded row.
files_text = response.content.decode("utf-8").replace('\r\n', '\n')
files_data = [row.split('\t') for row in files_text.split('\n') if row]
df_files = pd.DataFrame(data=files_data[1:], columns=files_data[0])
# Index by file name; rows that came out shorter than the header are dropped.
df_files = df_files.set_index('file_name').dropna()
del files_data, files_text

# Select tissues

In [None]:
# Draw 100 files from each of the ten primary sites that have the most files.
# A single seeded RandomState is shared by all draws, so the outcome is
# reproducible but depends on the order in which the sites are visited.
rs = np.random.RandomState(seed=42)
top_sites = df_files.groupby('cases.0.project.primary_site').count().sort_values(by='cases.0.project.project_id', ascending=False).index[:10]
# DataFrame.append no longer exists in pandas 2.x: collect the per-site draws
# and concatenate them once.
sampled_per_site = []
for site in top_sites:
    sampled_per_site.append(df_files[df_files['cases.0.project.primary_site']==site].sample(100, random_state=rs))
if sampled_per_site:
    samples = pd.concat(sampled_per_site)
else:
    # No site available: keep the empty, correctly-shaped frame.
    samples = pd.DataFrame(columns=df_files.columns)

# Select HVG

In [None]:
# Wrap the sampled data in an AnnData object. df[samples.index] picks the
# columns of the sampled files and the transpose turns them into rows, so the
# genes become the columns; the file metadata from df_files, reindexed to the
# same file order, is attached as `obs`.
adata = sc.AnnData(X=df[samples.index].transpose(), obs=df_files.reindex(index=samples.index))

In [None]:
# log1p-transform the data; copy=True asks scanpy for a transformed copy, so
# `adata` keeps the untransformed values.
adata_log=sc.pp.log1p(adata, copy=True)

In [None]:
# Flag the 3000 most variable genes; the result is read further down from
# adata_log.var['highly_variable'].
# NOTE(review): the scanpy documentation says the mean/dispersion cutoffs
# (max_mean here) are ignored when n_top_genes is set - confirm that
# max_mean=8 is meant to have an effect.
sc.pp.highly_variable_genes(adata_log, n_top_genes=3000, n_bins=50, max_mean=8)

In [None]:
# Plot the highly-variable-gene selection and also save the figure.
# NOTE(review): scanpy chooses the output folder and may add a prefix to the
# given name - confirm where "hvg.svg" ends up.
sc.pl.highly_variable_genes(adata_log, save="hvg.svg")

In [None]:
# Labels of the genes whose 'highly_variable' flag is set.
hvg = adata_log.var.loc[lambda var_table: var_table['highly_variable'] == True].index
# Caution: `samples` held the sampled metadata table until here; from this
# point on it is the index of adata_log.obs, i.e. only the labels.
samples = adata_log.obs.index

# Apply SBM

In [None]:
from sbmtm import sbmtm

In [None]:
# Create an empty model object from the sbmtm class imported above.
model = sbmtm()

In [None]:
# Build the model's graph from the expression table restricted to the highly
# variable genes (rows) and the sampled files (columns).
# NOTE(review): presumably rows play the role of words and columns that of
# documents in the bag-of-words frame - confirm against sbmtm.
model.make_graph_from_BoW_df(df.loc[hvg, samples])

In [None]:
# Save the graph to disk (the file name suggests gzip-compressed XML) so that
# it can be reused without rebuilding it.
model.save_graph("graph.xml.gz")