In [51]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from dask import compute, delayed
from time import perf_counter

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Project directory layout, read from the shared dev-paths file
dirs = dev_conf.get_dev_directories("../dev_paths.txt")

# Names of the unified datasets; one of them is selected by index further down
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]

# Location of the matrisome master list inside the data directory
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [129]:
# Index into `unified_dsets`: selects which dataset every path below is built for
i = 0

In [4]:
# Matrisome master list, loaded through the project helper
matrisome_df = prep.load_matrisome_df(matrisome_list)

# Significant DESeq results for the selected dataset, cross-referenced with the matrisome
sig_deg_df = pd.read_csv(
    f"{dirs.analysis_dir}/{unified_dsets[i]}_sig_DESeq_results_xref_matrisome.tsv",
    sep="\t",
)

# Keep only the rows flagged as matrisome genes, with a fresh 0..n-1 index
matrisome_sig_deg_df = sig_deg_df.loc[
    lambda d: d["in_matrisome"] == True
].reset_index(drop=True)

# Load and filter survival data

In [5]:
# Mapping handed to the survival loader together with the data path
event_code = {"Alive": 0, "Dead": 1}

# Column groups used by the cells below
dep_cols = ["vital_status", "survival_time"]
cat_cols = ["race", "ethnicity"]                         # one-hot encoded later
covariate_cols = ["age_at_diagnosis", "bmi"] + cat_cols  # numeric covariates, then categorical ones

survival_df = prep.load_survival_df(
    f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv",
    event_code,
)

In [6]:
# Ordinal code for each major FIGO stage. An explicit mapping is used instead of
# pd.factorize: factorize numbers labels by order of first appearance, so its
# codes were only right as long as the string sort happened to put the stages in
# order AND every major stage was present in the dataset.
figo_major_codes = {"I": 1, "II": 2, "III": 3, "IV": 4}

# Build the per-sample table: sample_name, numeric major FIGO stage, covariates
# (categorical covariates one-hot encoded). Rows with any missing value are dropped.
figo_df = (
    survival_df[["sample_name", "figo_stage"] + covariate_cols]
        .dropna()
        .pipe(pd.get_dummies, columns=cat_cols)
        .sort_values("figo_stage")
        .reset_index(drop=True)
        # Alternatives are ordered so the longest numeral at a position wins
        # (e.g. "III" is not cut short to "I"); first match in the string is used.
        .assign(figo_stage_major = lambda x: x["figo_stage"].apply(lambda s: re.findall(r"IV|III|II|I", s)[0]))
        .assign(figo_stage_major_fact = lambda x: x["figo_stage_major"].map(figo_major_codes))
        .pipe(prep.cols_to_front, ["sample_name", "figo_stage_major", "figo_stage_major_fact"])
        # Replace the original string column with the numeric code under the same name
        .drop(["figo_stage_major", "figo_stage"], axis=1)
        .rename(columns={"figo_stage_major_fact": "figo_stage"})
)

print(figo_df.shape)
# Fraction of the loaded survival rows that survive the dropna filter
print(figo_df.shape[0] / survival_df.shape[0])
figo_df.head()

(216, 13)
0.833976833976834


Unnamed: 0,sample_name,figo_stage,age_at_diagnosis,bmi,race_american indian or alaska native,race_asian,race_black or african american,race_native hawaiian or other pacific islander,race_not reported,race_white,ethnicity_hispanic or latino,ethnicity_not hispanic or latino,ethnicity_not reported
0,TCGA-Q1-A73Q-01A-21R-A32P-07,1,16851.0,34.850184,0,0,0,0,0,1,0,1,0
1,TCGA-Q1-A6DW-01A-11R-A32P-07,1,16200.0,24.21875,0,0,0,0,0,1,0,1,0
2,TCGA-Q1-A73R-01A-11R-A33Z-07,1,16701.0,39.542144,0,0,0,0,0,1,0,0,1
3,TCGA-LP-A4AW-01A-11R-A24H-07,1,19079.0,20.829995,0,1,0,0,0,0,0,1,0
4,TCGA-MU-A5YI-01A-11R-A32P-07,1,21927.0,32.979592,0,0,1,0,0,0,0,1,0


# Load normalized matrisome count data

In [7]:
norm_matrisome_counts_df = pd.read_csv(
    f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv",
    sep="\t",
)

# Restrict to the samples kept in figo_df, then flip the table so that each row
# is a sample and each column is a gene.
norm_filtered_matrisome_counts_t_df = (
    norm_matrisome_counts_df
        .loc[:, ["geneID", *figo_df["sample_name"]]]
        .set_index("geneID")                        # gene IDs become the column labels after the flip
        .T
        .rename_axis(None, axis=1)                  # clear the leftover "geneID" name on the columns
        .reset_index()                              # move the sample names out of the index
        .rename(columns={"index": "sample_name"})
)

# Join survival & count data

In [8]:
# Inner-join the clinical table with the per-sample counts on sample_name,
# then use sample_name as the row index.
joined_df = figo_df.merge(
    norm_filtered_matrisome_counts_t_df,
    on="sample_name",
).set_index("sample_name")
joined_df.head()

Unnamed: 0_level_0,figo_stage,age_at_diagnosis,bmi,race_american indian or alaska native,race_asian,race_black or african american,race_native hawaiian or other pacific islander,race_not reported,race_white,ethnicity_hispanic or latino,...,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-Q1-A73Q-01A-21R-A32P-07,1,16851.0,34.850184,0,0,0,0,0,1,0,...,10.63646,7.448303,10.613982,13.5397,6.180838,7.518582,4.602649,9.346523,7.950185,16.000858
TCGA-Q1-A6DW-01A-11R-A32P-07,1,16200.0,24.21875,0,0,0,0,0,1,0,...,9.572978,7.374251,7.794489,17.226296,6.3637,6.165439,4.602649,10.928503,5.094836,14.640358
TCGA-Q1-A73R-01A-11R-A33Z-07,1,16701.0,39.542144,0,0,0,0,0,1,0,...,10.371489,7.546277,7.782646,15.06388,5.924565,11.41953,4.602649,17.293717,5.124649,16.785389
TCGA-LP-A4AW-01A-11R-A24H-07,1,19079.0,20.829995,0,1,0,0,0,0,0,...,10.734178,7.707393,9.750078,15.168182,8.984331,5.508312,4.602649,11.969267,6.953788,14.933013
TCGA-MU-A5YI-01A-11R-A32P-07,1,21927.0,32.979592,0,0,1,0,0,0,0,...,12.349214,7.258085,8.122189,14.860586,8.706173,9.430509,4.890038,11.694373,7.229737,14.409821


In [9]:
# First column of joined_df is the response; all remaining columns are predictors
y = joined_df.iloc[:, 0].to_numpy()
X = joined_df.iloc[:, 1:].to_numpy()

In [24]:
# Gene columns come after the clinical columns carried over from figo_df
# (every figo_df column except sample_name, which became the index of joined_df).
# Deriving the offset avoids a hard-coded count that changes whenever the
# one-hot encoding produces a different number of columns.
n_clinical_cols = figo_df.shape[1] - 1
mi_df = pd.DataFrame({"geneID": joined_df.columns[n_clinical_cols:]})
mi_df.head()

Unnamed: 0,geneID
0,PGF
1,TIMP4
2,C1QTNF6
3,TNC
4,PRL


In [119]:
sim_rounds = 21
base_seed = 0  # fixed so the MI estimates are reproducible from run to run

# X omits the leading figo_stage column of joined_df, so gene columns start one
# position earlier than in joined_df. joined_df carries every figo_df column
# except sample_name (its index), hence the "- 2".
gene_start = figo_df.shape[1] - 2

start = perf_counter()
# One delayed task per round. Each round gets its own seed: the estimator's
# random_state drives the small noise it adds to continuous features.
mi_delayed = [
    delayed(mutual_info_classif)(
        X[:, gene_start:],
        y,
        discrete_features=False,
        random_state=base_seed + round_idx,
    )
    for round_idx in range(sim_rounds)
]
res = compute(*mi_delayed, scheduler="processes")
stop = perf_counter()
# Wall-clock seconds for all rounds
print(stop - start)

11.385230999992928


In [120]:
# One column of estimates per simulation round. The loop variable is deliberately
# not called `i`, which is the dataset index used throughout the notebook.
est_cols = [f"MI_est_{round_idx + 1}" for round_idx in range(sim_rounds)]
res_df = pd.DataFrame(np.column_stack(res), columns=est_cols)

# Gene columns follow the clinical columns carried over from figo_df
# (all of its columns except sample_name, which is the index of joined_df).
gene_ids_df = pd.DataFrame({"geneID": joined_df.columns[figo_df.shape[1] - 1:]})

# Reuse res_df rather than rebuilding the same frame a second time
mi_df = pd.concat([gene_ids_df, res_df], axis=1)
# Per-gene median across rounds (computed over the estimate columns only)
mi_df["MI_est_median"] = res_df.median(axis=1)
mi_df.head()

Unnamed: 0,geneID,MI_est_1,MI_est_2,MI_est_3,MI_est_4,MI_est_5,MI_est_6,MI_est_7,MI_est_8,MI_est_9,...,MI_est_13,MI_est_14,MI_est_15,MI_est_16,MI_est_17,MI_est_18,MI_est_19,MI_est_20,MI_est_21,MI_est_median
0,PGF,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,...,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027,0.083027
1,TIMP4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C1QTNF6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TNC,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,...,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202,0.023202
4,PRL,0.0,0.0,0.0,0.0,0.0,0.003727,0.0,0.0,0.0,...,0.029682,0.010991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
# Genes whose median MI estimate is strictly positive
dep_mi_df = mi_df.loc[
    lambda d: d["MI_est_median"] > 0,
    ["geneID", "MI_est_median"],
].reset_index(drop=True)
print(dep_mi_df.shape)

(483, 2)


In [131]:
coxph_df = pd.read_csv(
    f"{dirs.analysis_dir}/{unified_dsets[i]}_coxph_results.tsv",
    sep="\t",
)
# Rows whose gene p-value is below 0.05
sig_coxph_df = coxph_df.loc[lambda d: d["gene_pval"] < 0.05]

In [132]:
# Gene ID sets from the Cox results and from the positive-MI table
sig_coxph_genes = set(sig_coxph_df["geneID"])
mi_genes = set(dep_mi_df["geneID"])
# Number of genes present in both sets
len(mi_genes & sig_coxph_genes)

56

In [133]:
# Significant DESeq results for the selected dataset, restricted to rows flagged
# as matrisome genes and re-indexed from 0.
sig_deg_matrisome_df = pd.read_csv(
    f"{dirs.analysis_dir}/{unified_dsets[i]}_sig_DESeq_results_xref_matrisome.tsv",
    sep="\t",
)
sig_deg_matrisome_df = sig_deg_matrisome_df[
    sig_deg_matrisome_df["in_matrisome"] == True
].reset_index(drop=True)

In [134]:
sig_deg_genes = set(sig_deg_matrisome_df["geneID"])
# Number of positive-MI genes that are also in the DEG matrisome set
len(mi_genes & sig_deg_genes)

268

In [135]:
# Number of genes common to all three sets
len(mi_genes & sig_coxph_genes & sig_deg_genes)

38