In [None]:
import pandas as pd
import numpy as np
import scanpy as sc
import requests
import regex as re
import json
import os, sys
sys.path.append('/home/fvalle/phd/master_thesis/hsbm/')

# Get Manifest File

In [None]:
# GDC REST endpoint used for the file queries below.
files_endpt = "https://api.gdc.cancer.gov/files"

# The API takes 'fields' as a single comma-separated string, so the names are
# joined immediately. Commented-out entries are optional fields left disabled.
fields = ",".join((
    "file_name",
    #"analysis.workflow_type",
    #"experimental_strategy",
    "cases.project.primary_site",
    "cases.project.disease_type",
    #"cases.diagnoses.primary_diagnosis",
    "cases.submitter_id",
    "cases.samples.portions.analytes.aliquots.submitter_id",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.tumor_grade",
    "cases.diagnoses.progression_or_recurrence",
    "cases.diagnoses.prior_malignancy",
    "cases.project.project_id",
    #"cases.diagnoses.vital_status",
    #"cases.diagnoses.days_to_birth"
    #"cases.project.project_id"
))

In [None]:
# Query filter: only TCGA gene-expression files (HTSeq - FPKM, TXT format) from
# ten primary sites are downloaded (the top 10 tissues as of 18/12/2019).
# Every clause is an "in" test and all clauses are combined with "and".
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field, "value": values}}
        for field, values in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - FPKM"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.program.name", ["TCGA"]),
            ("cases.primary_site", [
                "bladder", "brain", "breast", "bronchus and lung", "colon",
                "corpus uteri", "kidney", "prostate gland", "skin", "thyroid gland",
            ]),
        )
    ],
}

In [None]:
# Request body for the manifest download. A POST is used, so the filter dict can
# be passed directly. Drop the "return_type" entry to receive the requested file
# fields instead of a manifest.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [None]:
# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=params)
# Fail loudly on an HTTP error instead of saving the error body as the manifest.
response.raise_for_status()
# The context manager closes the file on exit, so no explicit close() is needed.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

Use **gdc-client** to download data files.

```bash
mkdir data
mv manifest.txt data/.
cd data
gdc-client download -m manifest.txt
```

In [None]:
# Gene list filtered as in Dey (gene_names_all_gtex.txt from the count-clustering project).
# The file has no header row; its first column becomes the index.
df_genes = pd.read_csv("https://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None, index_col=0)
# Independent copy of the gene table; the following cells add one column per sample to it.
df = df_genes.copy()

In [None]:
#fpkm
def getFilenameFromDir(directory):
    """Return the name of the FPKM expression file found in ``directory``.

    A file qualifies when its name starts with a UUID-like identifier
    (8-4-4-4-12 alphanumeric groups) followed by ``.FPKM.txt``, optionally
    with a ``.gz`` suffix. Entries are scanned in sorted order so the result
    does not depend on the order returned by ``os.listdir``.

    Parameters
    ----------
    directory : str
        Path of the directory to scan.

    Returns
    -------
    str
        The matching file name (not the full path). The name is also printed.

    Raises
    ------
    FileNotFoundError
        If no entry of ``directory`` matches the expected file name.
    """
    # Raw string: the escapes are meant for the regex engine, not for Python.
    fpkm_pattern = (
        r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9\-]{4}-[a-zA-Z0-9\-]{12}"
        r"\.FPKM\.txt(\.gz)?"
    )
    entries = sorted(os.listdir(directory))
    for element in entries:
        if re.match(fpkm_pattern, element):
            print(element)
            return element
    raise FileNotFoundError("Not found %s" % entries)

In [None]:
# Assemble the genes x samples matrix: one column per downloaded FPKM file,
# aligned on the reference gene index of df_genes.
added = len(df.columns)  # number of columns already in df before this cell runs
sample_columns = []
seen_names = set(df.columns)
for cdirectory in os.listdir("data"):
    # The manifest and "Icon" entries sit next to the per-file folders and hold no sample.
    if re.match(r"manifest\.txt", cdirectory) or "Icon" in cdirectory:
        print("SKIPPING %s " % cdirectory)
        continue
    cfile = getFilenameFromDir("data/%s" % cdirectory)
    cdf = pd.read_csv("data/%s/%s" % (cdirectory, cfile), sep='\t', header=None)
    cdf.columns = ["gene", cfile]
    # Keep the first 15 characters of each gene id (presumably drops the version
    # suffix so ids match df_genes - TODO confirm).
    cdf['gene'] = [gene[:15] for gene in cdf['gene']]
    cdf = cdf.set_index('gene').reindex(index=df_genes.index)
    if cfile in seen_names:
        raise Exception("Not able to add: %s" % cfile)
    seen_names.add(cfile)
    sample_columns.append(cdf[cfile])
# A single concat replaces one df.insert per file. The list is reversed to keep
# the column order produced by repeated insert(0, ...): last file read comes first.
if sample_columns:
    df = pd.concat(sample_columns[::-1] + [df], axis=1)
# Columns present before the loop, and number of sample columns added.
print(added, len(sample_columns))

In [None]:
# Drop genes (rows) that have no value in any column.
df = df.dropna(how='all', axis=0)

In [None]:
# Same query as for the manifest, but without "return_type": the response is a
# TSV table holding the requested metadata fields for every file.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "15000"
    }
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=params)
# Stop here on an HTTP error rather than parsing an error body as a table.
response.raise_for_status()
# splitlines() handles both "\r\n" and "\n" line endings; blank lines are skipped.
files_data = [row.split('\t') for row in response.content.decode("utf-8").splitlines() if row]
# First row is the header; the remaining rows are one file each.
df_files = pd.DataFrame(data=files_data[1:], columns=files_data[0])
df_files = df_files.set_index('file_name').dropna()
del files_data

# Select tissues

In [None]:
rs = np.random.RandomState(seed=42)
# Rank primary sites by number of files and keep the 10 largest.
site_counts = df_files.groupby('cases.0.project.primary_site').count()
top_sites = site_counts.sort_values(by='cases.0.project.project_id', ascending=False).index[:10]
# Draw 100 files per site. The shared RandomState is consumed in site order,
# exactly as in a sequential loop, so the selection stays reproducible.
sampled_per_site = [
    df_files[df_files['cases.0.project.primary_site'] == site].sample(100, random_state=rs)
    for site in top_sites
]
# A single concat replaces DataFrame.append in a loop (removed in pandas 2.x).
samples = pd.concat(sampled_per_site) if sampled_per_site else df_files.iloc[:0]

# Select HVG

In [None]:
# Transpose the selected columns so that samples are rows and genes are columns;
# obs holds the file metadata of the same samples, in the same order.
adata = sc.AnnData(X=df[samples.index].transpose(), obs=df_files.reindex(index=samples.index))

In [None]:
# Log-transform (log1p) into a copy, leaving `adata` itself unchanged.
adata_log=sc.pp.log1p(adata, copy=True)

In [None]:
# Annotate highly variable genes on adata_log (n_top_genes=3000); the flag is
# read back from adata_log.var['highly_variable'] further down.
sc.pp.highly_variable_genes(adata_log, n_top_genes=3000, n_bins=50, max_mean=8)

In [None]:
# Plot the highly-variable-gene selection; `save` also asks scanpy to write the figure to file.
sc.pl.highly_variable_genes(adata_log, save="hvg.svg")

In [None]:
# Identifiers of the genes flagged as highly variable.
hvg = adata_log.var[adata_log.var['highly_variable']==True].index
# NOTE: from here on `samples` is the index of selected sample names, no longer
# the metadata DataFrame built in the "Select tissues" section.
samples = adata_log.obs.index

# Null model

In [None]:
# A: per-gene total over the selected samples (row sums).
# M: per-sample total over all genes (column sums).
A, M = (df.loc[:, samples].sum(axis=axis) for axis in (1, 0))

In [None]:
rs = np.random.RandomState(seed=42)
n_draws = 100  # multinomial realisations averaged for each sample
# The gene probabilities are identical for every sample, so compute them once.
gene_probs = A.astype(float).values / A.sum()
null_columns = {}
for sample in M.index:
    # multinomial needs an integer number of trials; M comes from summed
    # expression values and may be float, so truncate explicitly.
    draws = rs.multinomial(int(M[sample]), gene_probs, size=n_draws)
    null_columns[sample] = draws.mean(axis=0)
# Build the frame in one go; columns are reversed to keep the order that
# repeated insert(0, ...) used to produce.
df_null = pd.DataFrame(null_columns, index=A.index)[list(M.index)[::-1]]
#df_null=df_null.astype(int)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme, then switch to the 'paper' plotting context
# for the figures that follow.
sns.set()
sns.set_context('paper')

In [None]:
# Leftover debugging probe, disabled: `ax` is only created by the plotting cell
# that follows, so on a fresh kernel ("Restart & Run All") this raised NameError.
# ax.get_yticks().dtype

In [None]:
# Rank-frequency plot: per-gene totals sorted in decreasing order, data vs null model.
fig, ax = plt.subplots()
# Label both curves so the figure can be read on its own.
A.sort_values(ascending=False).plot(logx=True, logy=True, lw=5, ax=ax, label='data')
df_null.sum(axis=1).sort_values(ascending=False).plot(logx=True, logy=True, ls='--', lw=5, ax=ax, label='null model')
ax.set_xlabel('rank, $i$', fontsize=20)
ax.set_ylabel('frequency, $f_i$', fontsize=20)
ax.tick_params(labelsize=18)
# Same values are used for the tick positions and for their labels.
rank_ticks = np.array([1,1e1,1e2,1e3,1e4], dtype=np.float64)
ax.set_xticks(ticks=rank_ticks)
ax.set_xticklabels(labels=rank_ticks)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Distribution of per-sample totals M: data (solid) vs null model (dashed).
fig, ax = plt.subplots()
M.hist(histtype='step', lw=5, ax=ax, label='data')
df_null.sum(axis=0).hist(histtype='step', ls='--', lw=5, ax=ax, label='null model')
ax.set_xlabel('Size, $M$', fontsize=20)
ax.set_ylabel('#', fontsize=20)
ax.tick_params(labelsize=18)
# Legend added so the two histograms can be told apart.
ax.legend(fontsize=18)
plt.show()

In [None]:
# Occurrence O_i: fraction of samples in which gene i has a value >= 1.
fig, ax = plt.subplots()
# Restrict the data to the selected samples so it is compared with the null
# model on the same columns (df_null only contains those samples).
(df.loc[:, samples] >= 1).mean(axis=1).hist(histtype='step', lw=5, density=True, ax=ax, label='data')
(df_null >= 1).mean(axis=1).hist(histtype='step', lw=5, ls='--', density=True, ax=ax, label='null model')
ax.set_xlabel('Occurrence, $O_i$', fontsize=20)
ax.set_ylabel('pdf', fontsize=20)
ax.tick_params(labelsize=18)
ax.legend(fontsize=18)
plt.show()

## CV2

In [None]:
def _row_mean_var_cv2(frame):
    """Return the per-row mean, variance and CV^2 (variance / mean / mean) of ``frame``."""
    row_mean = frame.mean(axis=1)
    row_var = frame.var(axis=1)
    return row_mean, row_var, row_var/row_mean/row_mean


# Data: all genes, then highly variable genes only.
means, var, cv2 = _row_mean_var_cv2(df.loc[:,samples])
means_hv, var_hv, cv2_hv = _row_mean_var_cv2(df.loc[hvg, samples])

# Null model: all genes, then highly variable genes only.
means_null, var_null, cv2_null = _row_mean_var_cv2(df_null)
means_null_hv, var_null_hv, cv2_null_hv = _row_mean_var_cv2(df_null.loc[hvg, samples])

In [None]:
# CV^2 against mean expression, on log-log axes, for the data and the null
# model; highly variable genes are overlaid with 'x' markers.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(means, cv2, c='b', label='data')
ax.scatter(means_null, cv2_null, c='orange', label='null model')
ax.scatter(means_hv, cv2_hv, c='cyan',  marker='x', label='HV')
ax.scatter(means_null_hv, cv2_null_hv, c='red',  marker='x', label='HV of null model')
ax.set_xscale('log')
ax.set_yscale('log')
# Fixed axis ranges; points outside them are not shown.
ax.set_xlim(1e-3,1e4)
ax.set_ylim(1e-2,1e3)
ax.set_xlabel('mean', fontsize=20)
ax.set_ylabel('$CV^2$', fontsize=20)
ax.tick_params(labelsize=18)
ax.legend(fontsize=20, ncol=2)
plt.show()
# The figure object is kept in `fig`, so it is written to file after being shown.
fig.savefig("cv_tcga_10.pdf")

# Apply SBM

In [None]:
from sbmtm import sbmtm

## HVG

In [None]:
# Fresh sbmtm model for the highly-variable-gene selection.
model = sbmtm()

In [None]:
# Build the model's graph from the data restricted to highly variable genes (rows)
# and the selected samples (columns).
model.make_graph_from_BoW_df(df.reindex(index=hvg, columns=samples))

In [None]:
# Save the highly-variable-gene graph to file.
model.save_graph("graph.xml.gz")

## Null model

In [None]:
# Same pipeline on the null model: highly variable genes x selected samples of df_null.
model = sbmtm()
model.make_graph_from_BoW_df(df_null.reindex(index=hvg, columns=samples))
model.save_graph("graph_null.xml.gz")

## House Keeping genes

In [None]:
# Housekeeping gene list; source page:
#https://www.genomics-online.com/resources/16/5049/housekeeping-genes/
# Read from a local CSV file that must be present in the working directory.
df_hk=pd.read_csv("HK_exons.csv")
# Unique gene names, used below to look up their Ensembl ids.
genes=df_hk['Gene Name'].unique()

In [None]:
# genenames.org custom download (tab-separated) used to map approved gene symbols
# to Ensembl gene ids; its first column becomes the index.
url="https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_pub_refseq_ids&col=md_ensembl_id&col=md_eg_id&col=md_rgd_id&col=md_mim_id&col=md_vega_id&col=md_lncipedia&col=md_gtrnadb&col=md_ucsc_id&col=md_refseq_id&col=md_prot_id&col=md_mgd_id&col=gd_pub_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit"
df_conversion=pd.read_csv(url, sep="\t", index_col=0)
# Ensembl ids of the rows whose approved symbol is in the housekeeping list.
ensgs=df_conversion[df_conversion['Approved symbol'].isin(genes)]['Ensembl gene ID'].values.ravel()

In [None]:
# NOTE: `df_hk` is rebound here, from the raw housekeeping table to the expression
# data of the housekeeping genes over the selected samples. Ids with no data in
# `df` produce all-NaN rows, which are dropped.
df_hk = df.reindex(index=ensgs, columns=samples).dropna(how='all', axis=0)

In [None]:
# Occurrence O_i: fraction of the selected samples in which gene i has a value >= 1,
# for all genes and for the housekeeping subset.
occurrence_all = (df.reindex(columns=samples) >= 1).mean(axis=1)
occurrence_hk = (df_hk >= 1).mean(axis=1)

fig, ax = plt.subplots(figsize=(10,8))
occurrence_all.hist(histtype='step', lw=5, density=True, ax=ax, label='all_genes')
occurrence_hk.hist(histtype='step', lw=5, ls='--', density=True, ax=ax, label='Housekeeping genes')
ax.set_xlabel('Occurrence, $O_i$', fontsize=20)
ax.set_ylabel('pdf', fontsize=20)
ax.tick_params(labelsize=18)
ax.legend(fontsize=18, loc='upper left')
plt.show()
fig.savefig('U_hk.svg')

In [None]:
# Graph built from the housekeeping-gene selection and saved under its own file name.
model = sbmtm()
model.make_graph_from_BoW_df(df_hk)
model.save_graph("graph_hk.xml.gz")

## Random genes

In [None]:
# Reproducible random selection of 3000 genes (rows) over the selected samples.
rs = np.random.RandomState(seed=42)
df_random = df.reindex(columns=samples).sample(3000, random_state=rs)

In [None]:
# Graph built from the random gene selection.
model = sbmtm()
model.make_graph_from_BoW_df(df_random)
model.save_graph("graph_random.xml.gz")

## Highly Expressed

In [None]:
# Mean expression of each gene over the selected samples, highest first.
highly_expressed = df.reindex(columns=samples).mean(axis=1).sort_values(ascending=False)
# Keep the 3000 genes with the highest mean.
top_expressed_genes = highly_expressed.index[:3000]
df_he = df.reindex(index=top_expressed_genes, columns=samples)

In [None]:
# Graph built from the highly expressed gene selection.
model = sbmtm()
# Use df_he here: the previous code passed df_random, so graph_he.xml.gz was
# just a copy of the random-gene graph.
model.make_graph_from_BoW_df(df_he)
model.save_graph("graph_he.xml.gz")

## Sum up

In [None]:
def _scatter_cv2(axis, frame, **style):
    """Scatter the per-row CV^2 (var / mean / mean) of ``frame`` against its per-row mean."""
    axis.scatter(frame.mean(1), frame.apply(lambda x: x.var()/x.mean()/x.mean(), axis=1), **style)


fig, ax = plt.subplots(figsize=(15,8))

# One scatter per gene selection, drawn in this order (later ones on top).
_scatter_cv2(ax, df.reindex(columns=samples), alpha=0.3, c='b', label='data')
_scatter_cv2(ax, df_null, c='orange', label='null model')
_scatter_cv2(ax, df.reindex(index=hvg, columns=samples), c='orange', marker='x', alpha=0.9, label='highly variable')
_scatter_cv2(ax, df_hk, c='cyan',  marker='x', alpha=0.8, label='house Keeping')
_scatter_cv2(ax, df_he, c='red',  marker='x', alpha=0.7, label='highly Expressed')
_scatter_cv2(ax, df_random, c='green',  marker='.', alpha=0.6, label='random')

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(1e-3,1e4)
ax.set_ylim(1e-2,1e3)
ax.set_xlabel('mean', fontsize=20)
ax.set_ylabel('$CV^2$', fontsize=20)
ax.tick_params(labelsize=18)
ax.legend(fontsize=20, ncol=2)

plt.show()
# Written in both vector formats.
fig.savefig("cv_tcga_10_selections.svg")
fig.savefig("cv_tcga_10_selections.pdf")

# LDA

# Hierarchical Clustering