In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from dask import compute, delayed
from time import perf_counter

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
dset_idx = 0

In [4]:
seed = 123
rand = np.random.RandomState()

# Load and filter survival data

In [5]:
event_code = {"Alive": 0, "Dead": 1}
covariate_cols = ["age_at_diagnosis", "race", "ethnicity"]
dep_cols = ["figo_stage"]
cat_cols = ["race", "ethnicity"]
survival_df = prep.load_survival_df(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/survival_data.tsv", event_code)

In [6]:
filtered_survival_df = (
    prep.decode_figo_stage(survival_df[["sample_name"] + dep_cols + covariate_cols].dropna(), to="n")
        .pipe(lambda df: df[["sample_name", "figo_num"]])
        .reset_index(drop=True)
)

print(filtered_survival_df.shape)
print(filtered_survival_df.shape[0] / survival_df.shape[0])
filtered_survival_df.head()

(255, 2)
0.9845559845559846


Unnamed: 0,sample_name,figo_num
0,TCGA-C5-A1BF-01B-11R-A13Y-07,1
1,TCGA-EK-A2RM-01A-21R-A18M-07,1
2,TCGA-Q1-A73P-01A-11R-A32P-07,1
3,TCGA-C5-A8YT-01A-11R-A37O-07,1
4,TCGA-UC-A7PI-01A-11R-A42S-07,1


# Load normalized matrisome count data

In [7]:
norm_matrisome_counts_df = pd.read_csv(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv", sep='\t')
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID"] + list(filtered_survival_df.sample_name)], "geneID", "sample_name"
)

In [8]:
norm_filtered_matrisome_counts_t_df.shape

(255, 1009)

# Joint survival & count data

In [9]:
joined_df = (
    pd.merge(filtered_survival_df, norm_filtered_matrisome_counts_t_df, on="sample_name")
        .set_index("sample_name")
)
joined_df.head()

Unnamed: 0_level_0,figo_num,PGF,TIMP4,C1QTNF6,TNC,PRL,OGN,C1QL3,FGB,NDNF,...,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-C5-A1BF-01B-11R-A13Y-07,1,10.800637,6.228003,11.669331,13.002928,5.063964,4.869744,5.063964,8.834522,6.410767,...,9.013453,8.190325,9.503647,14.077995,6.569726,7.315604,4.602649,12.0623,5.649441,16.558407
TCGA-EK-A2RM-01A-21R-A18M-07,1,9.674879,7.277164,10.712783,13.003138,4.602649,5.086466,5.63082,5.086466,5.761877,...,10.854224,6.581217,8.437154,15.816261,7.644559,6.406766,4.998296,11.731128,6.028879,17.119594
TCGA-Q1-A73P-01A-11R-A32P-07,1,8.036801,5.247645,9.894159,13.321633,4.602649,5.769802,7.289183,6.336043,9.84385,...,10.854487,5.629541,9.602922,14.174748,6.987468,6.731154,4.602649,9.293089,4.893018,16.649488
TCGA-C5-A8YT-01A-11R-A37O-07,1,7.830611,5.733875,12.445548,13.765468,5.455125,13.049104,5.146455,5.074289,10.569544,...,9.453187,6.398956,12.288955,13.396332,10.228758,8.542025,4.602649,11.765396,5.318924,13.556322
TCGA-UC-A7PI-01A-11R-A42S-07,1,7.243036,5.328548,9.392965,14.24357,4.879491,5.583359,5.862713,5.377532,9.604209,...,10.655786,7.368694,8.444696,14.402125,5.940529,9.163491,4.602649,8.118925,5.889309,16.314001


In [10]:
X = joined_df.iloc[:, 1:].values
y = joined_df.iloc[:, 0].values

In [11]:
rand.seed(seed)
sim_rounds = 101
start = perf_counter()
mi_delayed = [delayed(mutual_info_classif)(X, y, discrete_features=False, random_state=rand) for _ in range(sim_rounds)]
res = compute(*mi_delayed, scheduler="processes")
stop = perf_counter()
print(stop - start)

55.570829599979334


In [12]:
mi_df = pd.concat([
    pd.DataFrame({"geneID": joined_df.columns[12:]}),
    pd.DataFrame(np.column_stack(res), columns=[f"MI_est_{i + 1}" for i in range(sim_rounds)])
], axis=1)
mi_df["MI_est_median"] = mi_df.iloc[:, 1:].median(axis=1)
mi_df.head()

Unnamed: 0,geneID,MI_est_1,MI_est_2,MI_est_3,MI_est_4,MI_est_5,MI_est_6,MI_est_7,MI_est_8,MI_est_9,...,MI_est_93,MI_est_94,MI_est_95,MI_est_96,MI_est_97,MI_est_98,MI_est_99,MI_est_100,MI_est_101,MI_est_median
0,CYR61,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,...,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313,0.059313
1,ECM1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ANGPT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SERPINF2,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,...,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719,0.009719
4,SCUBE3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
sum(mi_df.MI_est_median > 0)

506

In [14]:
mi_df[["geneID", "MI_est_median"]].to_csv(f"{dirs.analysis_dir}/{unified_dsets[dset_idx]}_MI_figo_results.tsv", sep="\t", index=False)