In [None]:
import pandas as pd
import numpy as np
import os,sys
import seaborn as sns
import matplotlib.pyplot as plt
import logging
# Notebook-wide logger; everything from DEBUG upwards is emitted through a StreamHandler.
log = logging.getLogger("triSBM_cluster")
# getLogger returns the same logger object on every call, so re-running this cell
# would otherwise attach one more handler each time and print every message twice or more.
if not log.handlers:
    log.addHandler(logging.StreamHandler())
log.setLevel(logging.DEBUG)

In [None]:
# Dataset directory, given relative to the directory the kernel was started in.
work_dir = "datasets/tcga/COAD/mirna_allsamples/"
# Alternative location left commented out (note that it assigns a different variable name):
#working_dir="../cancers/breast/"
# work_dir is relative, so calling os.chdir(work_dir) a second time would be resolved
# against the already-changed working directory. Skip the call when the current
# directory already ends with work_dir so the cell can be re-run safely.
if not os.getcwd().endswith(os.sep + os.path.normpath(work_dir)):
    os.chdir(work_dir)

## Survival

In [None]:
# Read the comma-separated table "files.dat" (comma is read_csv's default delimiter)
# and print a summary of its columns and dtypes.
df_files = pd.read_csv("files.dat")
df_files.info()

In [None]:
# 20 equally spaced edges over [0, 100] -> 19 intervals, each labelled by its midpoint.
bins = np.linspace(0, 100, 20)
# Missing ages are filled with the column mean before binning.
avg = df_files["cases.0.diagnoses.0.age_at_diagnosis"].mean(skipna=True)
# The raw value is divided by 365 before binning (presumably days -> years; confirm the unit).
df_files["age_at_diagnosis"] = np.asarray(
    pd.cut(
        df_files["cases.0.diagnoses.0.age_at_diagnosis"].fillna(avg) / 365.,
        bins=bins,
        labels=(bins[:-1] + bins[1:]) / 2,
    )
)

In [None]:
# Binary indicator: 1 where the recorded gender is "male", 0 for any other value.
df_files["gender"] = df_files["cases.0.demographic.gender"].eq("male").astype(int)

In [None]:
# Collapse sub-stages onto their parent stage ("stage iia" -> "stage ii", ...).
# DataFrame.replace matches whole cell values and is applied to every column of df_files.
for letter in ("a", "b", "c"):
    for numeral in ("i", "ii", "iii", "iv"):
        df_files.replace("stage %s%s" % (numeral, letter), "stage %s" % numeral, inplace=True)
# Display the stage labels that remain after the collapse.
df_files["cases.0.diagnoses.0.tumor_stage"].unique()

In [None]:
# Ordinal encoding of the stage labels: "stage i" -> 1, "stage ii" -> 2, ... "stage x" -> 6.
# Values that are not keys of the mapping are left unchanged.
stage_codes = {
    stage: code
    for code, stage in enumerate(
        ["stage i", "stage ii", "stage iii", "stage iv", "stage v", "stage x"], start=1
    )
}
# Build the encoded column in one replace and assign the result explicitly.
# Calling .replace(..., inplace=True) on df_files["tumor_stage"] mutates a column
# selection, which does not reliably write back to the frame under pandas Copy-on-Write.
df_files["tumor_stage"] = df_files["cases.0.diagnoses.0.tumor_stage"].replace(stage_codes)

In [None]:
# Event indicator: 1 where the recorded status is "Dead", 0 otherwise (0 = Alive).
df_files["vital_status"] = df_files["cases.0.demographic.vital_status"].eq("Dead").astype(int)

In [None]:
def get_survival(case):
    """Return the survival/follow-up time for one case.

    Parameters
    ----------
    case : mapping (e.g. one DataFrame row)
        Must provide "cases.0.demographic.vital_status",
        "cases.0.demographic.days_to_death" and
        "cases.0.diagnoses.0.days_to_last_follow_up".

    Returns
    -------
    The "days_to_death" value when the case is recorded as dead, otherwise the
    "days_to_last_follow_up" value.
    """
    status = case["cases.0.demographic.vital_status"]
    # The raw status column holds the label "Dead"; comparing it with 1 alone never
    # matched, so every case fell through to the follow-up time. The numeric code 1
    # is still accepted in case the column has been recoded.
    if status == "Dead" or status == 1:
        return case["cases.0.demographic.days_to_death"]
    return case["cases.0.diagnoses.0.days_to_last_follow_up"]

# Apply get_survival row by row (axis=1) to obtain one value per case.
df_files["days_survival"] = df_files.apply(get_survival, axis=1)

# Regulatory network

In [None]:
%load_ext autoreload
%autoreload 2
from regulatory.mixed import Mixed
from regulatory.mirdip import MirDip
from regulatory.tarbase import TarBase

In [None]:
import networkx as nx
from gseapy import enrichr

In [None]:
# Paths of the MSigDB .gmt collections handed to enrichr as gene-set libraries.
gene_sets = [
    "/home/jovyan/work/phd/MSigDB/" + gmt_file
    for gmt_file in (
        'c1.all.v7.1.symbols.gmt',
        'c2.all.v7.1.symbols.gmt',
        #'c3.all.v7.1.symbols.gmt',
        'c4.all.v7.1.symbols.gmt',
        'c5.all.v7.1.symbols.gmt',
        'c6.all.v7.1.symbols.gmt',
        'c7.all.v7.1.symbols.gmt',
        'c8.all.v7.2.symbols.gmt',
        'h.all.v7.2.symbols.gmt',
    )
]

In [None]:
# Regulatory-network backend queried further down (check_input_data / get_neighborns).
# Mixed and MirDip are the alternative backends; uncomment one of them to switch.
#network = Mixed()
#network = MirDip()
network = TarBase()

In [None]:
df_topics = pd.read_csv("trisbm/trisbm_level_1_metadata.csv")
#df_key_dist = pd.read_csv("trisbm/trisbm_level_1_metadata.csv")
# Every non-missing entry of the table, as strings.
# NOTE(review): `background` is not passed to the enrichment call below — confirm
# whether it was meant to be used as the enrichr background.
background = df_topics.values.ravel()
background = list(filter(lambda x: x!="nan", background.astype(str)))

# column name -> index of the genes selected for that column
res = {}

for metadatum in df_topics.columns:
    mirnas = df_topics[metadatum].dropna().values
    # Collect the targets of every accepted miRNA in one flat list and count them once
    # at the end, instead of outer-joining a growing DataFrame inside the loop.
    targets = []
    for mirna in network.check_input_data(mirnas):
        neighbours = network.get_neighborns(mirna)
        # Ordered de-duplication: each miRNA contributes at most 1 to a gene's count.
        targets.extend(dict.fromkeys(gene for gene in neighbours if str(gene) != "nan"))
    # Per gene: number of miRNAs of this column that target it, sorted in descending order.
    regulated = pd.Series(targets, dtype=object).value_counts()
    # Keep only genes whose count is strictly above the 90th percentile of all counts.
    regulated = regulated[regulated > regulated.quantile(0.9)]
    res[metadatum] = regulated.index
    try:
        enrichment = enrichr(list(regulated.index), gene_sets).results
        print(enrichment.sort_values("Adjusted P-value", ascending=True)[["Term", "Adjusted P-value"]])
    except Exception:
        # Best effort: log the failure for this column and continue with the next one.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt stop the loop.
        log.error(sys.exc_info()[1])
    print("\n\n**************************\n\n")

In [None]:
# Size of each entry stored in res.
[len(genes) for genes in res.values()]