In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import requests
import json
import os, sys
sys.path.append('/home/fvalle/phd/master_thesis/hsbm/')

  from pandas.core.index import RangeIndex


# Download data

In [2]:
#path/to/files/downloaded/from https://doi.org/10.6084/m9.figshare.5330593
working_dir = "/home/fvalle/phd/datasets/merged/"
os.chdir(working_dir)

In [3]:
files = [f for f in os.listdir("data") if "fpkm" in f]

In [4]:
df_gtex=pd.read_csv("https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t', index_col=0)

In [5]:
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "cases.project.program.name",
            "value": ["TCGA"]
            }
        }
        
    ]
}
params = {
    "filters": json.dumps(filters),
    "fields": "primary_site,disease_type,submitter_id",
    "format": "TSV",
    "size": "10000000"
    }
response = requests.get("https://api.gdc.cancer.gov/cases", headers = {"Content-Type": "application/json"}, params = params)
with open("files.txt","w") as file:
    file.write(response.content.decode("utf-8"))
    file.close()
df_tcga = pd.read_csv("files.txt", sep='\t').set_index("submitter_id")

In [6]:
df=pd.DataFrame()

In [7]:
for file in files:
    df = df.append(pd.read_csv("data/%s"%file, sep='\t', index_col=0).drop('Entrez_Gene_Id',1).transpose(), sort=True)
df = df.transpose()
df = df.dropna(how='any', axis=0) # drop genes not always determined

In [48]:
df_files = pd.DataFrame(index=df.columns)

In [49]:
def get_site(file):
    if 'GTEX' in file:
        return df_gtex.at[file, 'SMTS']
    if 'TCGA' in file:
        return df_tcga.at[file[:12],'primary_site']

def get_source(file):
    if 'GTEX' in file:
        return 'gtex'
    if 'TCGA' in file:
        return 'tcga'

In [50]:
df_files.insert(0, 'primary_site', [get_site(file) for file in df.columns])
df_files.insert(1, 'dataset', [get_source(file) for file in df.columns])

In [51]:
#df.to_csv("mainTable.csv", index=True, header=True)
df_files.to_csv("files.dat", index=True, header=True)

## Split / shuffle and select

In [52]:
#df = pd.read_csv("mainTable.csv", index_col=0)
df_files = pd.read_csv("files.dat", index_col=0)
df.head()

Unnamed: 0,TCGA-L5-A4OQ-11A-12R-A260-31,TCGA-IG-A3I8-11A-11R-A24K-31,TCGA-L5-A4OM-11A-11R-A260-31,TCGA-L5-A4OG-11A-12R-A260-31,TCGA-L5-A4OJ-11A-12R-A260-31,TCGA-L5-A43C-11A-11R-A24K-31,TCGA-IC-A6RF-11A-21R-A336-31,TCGA-L5-A4OF-11A-12R-A260-31,TCGA-L5-A4OO-11A-12R-A260-31,TCGA-IC-A6RE-11A-12R-A336-31,...,TCGA-KL-8329-11A-01R-2315-07,TCGA-KN-8423-11A-01R-2315-07,TCGA-KN-8437-11A-01R-2315-07,TCGA-KL-8324-11A-01R-2315-07,TCGA-KL-8336-11A-01R-2315-07,TCGA-KN-8433-11A-01R-2315-07,TCGA-KL-8339-11A-01R-2315-07,TCGA-KN-8422-11A-01R-2315-07,TCGA-KN-8429-11A-01R-2315-07,TCGA-KN-8424-11A-01R-2315-07
A1BG,141.02,118.43,158.79,171.45,159.9,189.02,118.43,53.95,156.59,166.73,...,166.73,187.71,698.41,237.86,248.0,273.37,237.86,215.77,555.41,154.42
A2M,37379.55,12502.12,8421.31,7433.4,5791.62,4450.27,4972.34,5219.6,11035.54,2520.38,...,7749.1,10512.82,30361.44,13776.25,14065.74,4152.18,8247.98,9945.68,6652.97,7331.05
A2ML1,8.58,25.91,3395.89,4.74,1066.48,3.35,40904.3,2.94,20.71,142439.51,...,1.43,2.89,1.79,0.0,0.37,0.0,0.0,1.71,0.0,0.0
A4GALT,1304.15,1313.23,463.65,836.53,753.83,669.92,1617.0,743.43,860.08,624.99,...,1119.56,559.28,1477.58,1088.92,1936.53,848.22,791.35,587.13,503.95,728.11
A4GNT,0.25,0.13,34.51,10.47,4.39,5.19,0.0,13.83,4.54,0.0,...,0.73,9.27,1.45,4.7,1.41,5.15,2.34,15.34,2.58,7.82


In [54]:
df_files.replace('Uterus, NOS', 'Uterus', inplace=True)
df_files.replace('Bronchus and lung', 'Lung', inplace=True)
df_files.replace('Liver and intrahepatic bile ducts', 'Liver', inplace=True)
df_files.replace('Prostate gland', 'Prostate', inplace=True)
df_files.replace('Thyroid gland', 'Thyroid', inplace=True)
df_files.replace('Base of Tongue', 'Salivary Gland', inplace=True)
df_files.replace('Bones, joints and articular cartilage of other and unspecified sites', 'Salivary Gland', inplace=True)
df_files.replace('Floor of mouth', 'Salivary Gland', inplace=True)
df_files.replace('Gum', 'Salivary Gland', inplace=True)
df_files.replace('Hypopharynx', 'Salivary Gland', inplace=True)
df_files.replace('Larynx', 'Salivary Gland', inplace=True)
df_files.replace('Lip', 'Salivary Gland', inplace=True)
df_files.replace('Oropharynx', 'Salivary Gland', inplace=True)
df_files.replace('Other and ill-defined sites in lip, oral cavity and pharynx', 'Salivary Gland', inplace=True)
df_files.replace('Other and unspecified parts of mouth', 'Salivary Gland', inplace=True)
df_files["tissue_hd"]=df_files["primary_site"]+"_"+df_files["dataset"]
df_files['primary_site'].unique()

array(['Esophagus', 'Bladder', 'Liver', 'Prostate', 'Thyroid',
       'Other and unspecified parts of biliary tract', 'Rectum',
       'Rectosigmoid junction', 'Colon', 'Lung', 'Uterus', 'Corpus uteri',
       'Stomach', 'Kidney', 'Gallbladder', 'Breast', 'Cervix uteri',
       'Salivary Gland', 'Other and unspecified parts of tongue',
       'Base of tongue', 'Tonsil', 'Palate',
       'Connective, subcutaneous and other soft tissues', 'Cervix Uteri'],
      dtype=object)

In [57]:
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5903355/
sites_with_enough_statistics = ['Breast', 'Colon', 'Liver', 'Esophagus', 'Prostate', 'Stomach', 'Lung', 'Uterus']

In [15]:
rs = np.random.RandomState(seed=42)
samples = pd.DataFrame(columns=df_files.columns)
for site in sites_with_enough_statistics:
    samples = samples.append(df_files[(df_files['primary_site']==site) & (df_files['dataset']=='tcga')].sample(50, random_state=rs))
    samples = samples.append(df_files[(df_files['primary_site']==site) & (df_files['dataset']=='gtex')].sample(50, random_state=rs))

In [16]:
samples.groupby('primary_site').count()

Unnamed: 0_level_0,dataset
primary_site,Unnamed: 1_level_1
Breast,100
Colon,100
Esophagus,100
Liver,100
Lung,100
Prostate,100
Stomach,100
Uterus,100


In [17]:
samples.groupby('dataset').count()

Unnamed: 0_level_0,primary_site
dataset,Unnamed: 1_level_1
gtex,400
tcga,400


In [25]:
df[df.columns[~df.columns.isin(samples.index) & df.columns.isin(df_files[df_files["primary_site"].isin(sites_with_enough_statistics)].index)]].to_csv("mainTable_test.csv", index=True, header=True)

In [28]:
df[df.columns[df.columns.isin(samples.index)]].to_csv("mainTable_train.csv", index=True, header=True)

In [56]:
df_files["tissue_hd"].unique()

array(['Esophagus_tcga', 'Bladder_tcga', 'Liver_tcga', 'Bladder_gtex',
       'Prostate_tcga', 'Thyroid_tcga',
       'Other and unspecified parts of biliary tract_tcga', 'Rectum_tcga',
       'Rectosigmoid junction_tcga', 'Colon_tcga', 'Lung_tcga',
       'Uterus_tcga', 'Corpus uteri_tcga', 'Stomach_gtex', 'Thyroid_gtex',
       'Kidney_tcga', 'Uterus_gtex', 'Gallbladder_tcga', 'Breast_tcga',
       'Colon_gtex', 'Cervix uteri_tcga', 'Salivary Gland_tcga',
       'Other and unspecified parts of tongue_tcga',
       'Base of tongue_tcga', 'Tonsil_tcga', 'Palate_tcga', 'Liver_gtex',
       'Prostate_gtex', 'Lung_gtex', 'Esophagus_gtex', 'Breast_gtex',
       'Kidney_gtex', 'Salivary Gland_gtex',
       'Connective, subcutaneous and other soft tissues_tcga',
       'Stomach_tcga', 'Cervix Uteri_gtex'], dtype=object)

In [55]:
df_files.to_csv("files.dat", index=True, header=True)

# Use scanpy to filter HVG

In [None]:
adata = sc.AnnData(df[samples.index].transpose(), obs=samples)
adata_log = sc.pp.log1p(adata, copy=True)

In [None]:
sc.pp.highly_variable_genes(adata_log, n_top_genes=3000, n_bins=50)

In [None]:
sc.pl.highly_variable_genes(adata_log, log=True, save='hvg.pdf')

In [None]:
hvg = adata_log.var[adata_log.var['highly_variable']==True].index
samples = adata_log.obs.index

# Use SBM

In [None]:
from sbmtm import sbmtm

In [None]:
model = sbmtm()

In [None]:
model.make_graph_from_BoW_df(df.loc[hvg, samples])

In [None]:
model.save_graph("graph.xml.gz")