In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import logging
# Notebook logger: emit DEBUG-level (and above) records through a stream handler.
log = logging.getLogger("trisbm_cnv")
log.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell used to attach one more handler each time and every record was
# printed multiple times. Reuse an already attached StreamHandler if present.
hdl = next((h for h in log.handlers if isinstance(h, logging.StreamHandler)), None)
if hdl is None:
    hdl = logging.StreamHandler()
    log.addHandler(hdl)
hdl.setLevel(logging.DEBUG)

In [None]:
# Directory (relative to the kernel's start directory) that holds the input files
# read by the cells below.
work_dir="datasets/tcga/COAD/mirna_allsamples/"
#working_dir="../cancers/breast/"
# os.chdir with a relative path is not idempotent: running this cell a second
# time would look for work_dir *inside* work_dir and raise FileNotFoundError.
# Only change directory when the current one does not already end in work_dir.
if not os.path.normpath(os.getcwd()).endswith(os.sep + os.path.normpath(work_dir)):
    os.chdir(work_dir)

## CNV

In [None]:
# Read the comma-separated table "files.dat" into df_files and print a
# per-column summary (dtypes, non-null counts).
df_files = pd.read_csv("files.dat", delimiter=",")
df_files.info()

In [None]:
# Discretise age at diagnosis: 20 equally spaced edges on [0, 100] give 19 bins,
# and every bin is labelled with its midpoint.
bins = np.linspace(0, 100, 20)
# Missing ages are imputed with the column mean before binning.
avg = df_files["cases.0.diagnoses.0.age_at_diagnosis"].mean(skipna=True)
# The raw value is divided by 365 before binning (presumably days -> years; TODO confirm units).
df_files["age_at_diagnosis"] = np.asarray(
    pd.cut(
        df_files["cases.0.diagnoses.0.age_at_diagnosis"].fillna(avg) / 365.,
        bins=bins,
        labels=(bins[1:] + bins[:-1]) / 2,
    )
)

In [None]:
# Binary gender flag: 1 where the raw value equals "male", 0 for anything else
# (missing values compare unequal and therefore also map to 0).
df_files["gender"] = df_files["cases.0.demographic.gender"].eq("male").astype(int)

In [None]:
# Collapse sub-staged labels ("stage iia", "stage iiib", ...) onto their parent
# stage ("stage ii", "stage iii", ...). The replacement is an exact-value match
# applied to the whole frame, not only to the tumor_stage column.
for suffix in ["a", "b", "c"]:
    for numeral in ["i", "ii", "iii", "iv"]:
        df_files.replace("stage " + numeral + suffix, "stage " + numeral, inplace=True)
# Show which stage labels remain after collapsing.
df_files["cases.0.diagnoses.0.tumor_stage"].unique()

In [None]:
# Encode the tumor stage label as an integer code: "stage i" -> 1 ... "stage x" -> 6.
# Labels that are not listed here are left untouched.
stage_codes = {
    stage: code
    for code, stage in enumerate(
        ["stage i", "stage ii", "stage iii", "stage iv", "stage v", "stage x"], start=1
    )
}
# Assign the replaced column back instead of calling
# df_files["tumor_stage"].replace(..., inplace=True): that form mutates a
# temporary Series, which warns in recent pandas and leaves the frame unchanged
# when Copy-on-Write is enabled.
df_files["tumor_stage"] = df_files["cases.0.diagnoses.0.tumor_stage"].replace(stage_codes)

In [None]:
# Binary vital status: 1 where the raw value equals "Dead", 0 otherwise (e.g. Alive).
df_files["vital_status"] = df_files["cases.0.demographic.vital_status"].eq("Dead").astype(int)

In [None]:
def get_survival(case):
    """Return the survival time recorded for one case (one row of df_files).

    Parameters
    ----------
    case : mapping-like (e.g. a DataFrame row)
        Must provide "cases.0.demographic.vital_status",
        "cases.0.demographic.days_to_death" and
        "cases.0.diagnoses.0.days_to_last_follow_up".

    Returns
    -------
    The value of days_to_death when the case is dead, otherwise the value of
    days_to_last_follow_up.
    """
    status = case["cases.0.demographic.vital_status"]
    # The raw column stores the label "Dead" (the vital_status cell compares it
    # against that string), so the previous `== 1` test was never true and
    # days_to_death was never used. The numeric 1 is still accepted so that
    # already-encoded input keeps working.
    if status == "Dead" or status == 1:
        return case["cases.0.demographic.days_to_death"]
    return case["cases.0.diagnoses.0.days_to_last_follow_up"]

# Apply get_survival row by row (axis=1) to obtain one survival time per case.
df_files["days_survival"] = df_files.apply(get_survival, axis=1)

In [None]:
# Load the CNV table (first CSV column becomes the index) as nullable integers.
df_cnv = pd.read_csv("mainTable_cnv.csv", index_col=0).astype("Int64")
# Keep only the first 12 characters of every column label
# (presumably the patient-level part of the sample barcode; TODO confirm).
df_cnv.columns = df_cnv.columns.map(lambda sample: sample[:12])
df_cnv.head(2)

In [None]:
# Load the level-1 metadatum distribution, indexed by the second CSV column,
# and discard the "i_doc" column.
# `columns=` replaces the positional axis argument (`.drop("i_doc", 1)`), which
# pandas 2.0 no longer accepts.
df_metadata = pd.read_csv("trisbm/trisbm_level_1_metadatum-dist.csv", index_col=1).drop(columns="i_doc")
df_metadata.head()

In [None]:
# One scatter plot per metadatum: mean CNV of each df_cnv column (x) against the
# median-centred metadatum value of the same column label (y).
# Both quantities are independent of the loop variable, so compute them once
# instead of on every iteration.
cnv_mean_per_column = df_cnv.mean(0)
metadata_centered = df_metadata.subtract(df_metadata.median(0), 1)

for metadatum in df_metadata.columns:
    fig = px.scatter(x=cnv_mean_per_column,
                     # reindex so that y follows the same label order as x
                     y=metadata_centered[metadatum].reindex(index=df_cnv.columns),
                     opacity=0.8
                    )
    fig.update_layout({
        # Title each figure so the plots can be told apart.
        "title":metadatum,
        "xaxis_title":"cnv",
        "yaxis_title":"P(sample|metadatum)"
    })
    fig.show()

In [None]:
# NOTE: df_metadata is rebound here to a different table than the
# metadatum-dist frame loaded earlier under the same name.
df_metadata = pd.read_csv("trisbm/trisbm_level_1_metadata.csv")
df_keywords = pd.read_csv("trisbm/trisbm_level_1_keyword-dist.csv", index_col=0)
# miRNA annotation: one row per "miRBase ID", aligned to the df_keywords index,
# then rows with identical values are dropped.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
df_mirna = (
    pd.read_csv("/home/jovyan/work/phd/miRNA/miRNA.txt")
    .drop_duplicates("miRBase ID")
    .set_index("miRBase ID")
    .reindex(index=df_keywords.index)
    .drop_duplicates()
)
# Attach the gene identifier of each miRNA (aligned on the index) as the last column.
df_keywords["ensg"] = df_mirna["Gene stable ID"]
assert not df_keywords.index.duplicated().any()
df_keywords.head(2)

In [None]:
# One scatter plot per metadatum: keyword weight of each miRNA gene (x) against
# the mean CNV of that gene (y).

# Row de-duplication does not depend on the loop variable, so do it once.
# NOTE(review): drop_duplicates() compares row *values*, not index labels, so
# genes that share an identical CNV profile with an earlier row are removed and
# end up masked out below. Confirm this is intended (vs. de-duplicating the index).
cnv_dedup = df_cnv.drop_duplicates()

# Every column except the last one, which is the "ensg" annotation column.
for metadatum in df_keywords.columns[:-1]:
    # miRNAs listed for this metadatum
    mirnas = df_metadata[metadatum].dropna()
    # Flatten all annotation values of those miRNAs, then keep the ones that
    # occur in the CNV index (np.unique also sorts them).
    candidate_ids = df_mirna[df_mirna.index.isin(mirnas)].dropna().values.ravel()
    mirnas_ensgs = np.unique(df_cnv.index[df_cnv.index.isin(candidate_ids)])

    x = (
        df_keywords[df_keywords["ensg"].isin(mirnas_ensgs)]
        .set_index("ensg")
        .reindex(index=mirnas_ensgs)[metadatum]
    )
    y = cnv_dedup.reindex(index=mirnas_ensgs).mean(axis=1)
    # notna() handles both float NaN and the pd.NA produced by the nullable
    # Int64 CNV columns; np.isnan is not reliable on nullable/object data.
    mask = x.notna() & y.notna()

    fig = px.scatter(x=x[mask],
                     y=y[mask],
                    )

    fig.update_layout({
            "title":metadatum,
            "xaxis_title":"P(miRNA|metadatum)",
            "yaxis_title":"cnv"
        })
    fig.show()