In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from dask import compute, delayed
from time import perf_counter

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
dset_idx = 2

In [4]:
seed = 123
rand = np.random.RandomState()

# Load and filter survival data

In [5]:
event_code = {"Alive": 0, "Dead": 1}
covariate_cols = ["age_at_diagnosis", "bmi", "race", "ethnicity"]
dep_cols = ["vital_status", "survival_time"]
cat_cols = ["race", "ethnicity"]
survival_df = prep.load_survival_df(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/survival_data.tsv", event_code)

In [6]:
print(survival_df.shape)
survival_df.head()

(140, 13)


Unnamed: 0,sample_name,survival_time,vital_status,figo_stage,days_to_last_follow_up,days_to_death,age_at_diagnosis,age_at_index,height,weight,bmi,race,ethnicity
0,TCGA-AJ-A3OJ-01A-11R-A22K-07,467.0,0,Stage II,467.0,,19826.0,54.0,157.0,104.0,42.192381,white,not hispanic or latino
1,TCGA-QF-A5YS-01A-11R-A31O-07,689.0,0,Stage II,689.0,,20841.0,57.0,162.0,79.0,30.102119,black or african american,not hispanic or latino
2,TCGA-A5-A2K2-01A-11R-A18M-07,3595.0,0,Stage IA,3595.0,,28456.0,77.0,155.0,50.0,20.811655,white,not hispanic or latino
3,TCGA-BK-A6W4-01A-12R-A34R-07,300.0,0,Stage IA,300.0,,22766.0,62.0,157.0,107.0,43.409469,black or african american,not hispanic or latino
4,TCGA-BK-A26L-01A-11R-A277-07,734.0,0,Stage IIIC1,734.0,,26002.0,71.0,152.0,75.0,32.461911,white,not hispanic or latino


In [7]:
filtered_survival_df = (
    survival_df[["sample_name"] + dep_cols]
        .query("vital_status == 1")
        .dropna()
        .reset_index(drop=True)
)
print(filtered_survival_df.shape)
print(filtered_survival_df.shape[0] / survival_df.shape[0])
filtered_survival_df.head()

(24, 3)
0.17142857142857143


Unnamed: 0,sample_name,vital_status,survival_time
0,TCGA-A5-A2K4-01A-11R-A18M-07,1,871.0
1,TCGA-AJ-A23N-01A-11R-A22K-07,1,439.0
2,TCGA-EY-A3QX-01A-11R-A22K-07,1,989.0
3,TCGA-AJ-A3I9-01A-11R-A22K-07,1,519.0
4,TCGA-EY-A2ON-01A-21R-A18M-07,1,610.0


# Load normalized matrisome count data

In [8]:
norm_matrisome_counts_df = pd.read_csv(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv", sep='\t')
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID"] + list(filtered_survival_df.sample_name)], "geneID", "sample_name"
)

In [9]:
joined_df = (
    pd.merge(filtered_survival_df, norm_filtered_matrisome_counts_t_df, on="sample_name")
        .drop("vital_status", axis=1)
        .set_index("sample_name")
)

print(joined_df.shape)
joined_df.head()

(24, 1009)


Unnamed: 0_level_0,survival_time,PGF,TIMP4,C1QTNF6,TNC,PRL,OGN,C1QL3,FGB,NDNF,...,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A5-A2K4-01A-11R-A18M-07,871.0,7.60723,4.169268,11.25537,12.235577,4.856636,4.405101,4.914654,7.523761,9.151365,...,10.468971,5.479441,8.614214,12.573146,7.562306,7.088642,3.586865,9.456699,7.277034,15.671777
TCGA-AJ-A23N-01A-11R-A22K-07,439.0,7.523942,4.161205,10.830059,11.411592,4.393914,4.83987,4.83987,3.586865,5.433021,...,10.035907,5.741274,10.296439,13.104221,7.351969,7.391397,4.248627,13.159221,6.602252,14.813381
TCGA-EY-A3QX-01A-11R-A22K-07,989.0,7.816775,4.066913,9.444013,13.437827,8.365137,4.865358,4.781944,3.586865,3.92709,...,9.832998,5.048594,7.964041,13.320121,5.31761,8.156598,3.92709,9.277586,4.340787,16.145243
TCGA-AJ-A3I9-01A-11R-A22K-07,519.0,7.321404,5.142235,7.355847,11.544473,3.586865,7.249919,5.142235,3.586865,5.869257,...,9.577459,4.656818,6.480327,11.288854,6.305301,9.278474,3.586865,10.459636,3.586865,11.848008
TCGA-EY-A2ON-01A-21R-A18M-07,610.0,8.274716,3.586865,10.434548,9.930904,4.951923,3.586865,5.57426,6.485578,3.586865,...,10.581401,5.536235,7.092757,12.675114,5.013691,6.466455,3.586865,8.756968,5.07215,14.35194


# Examine mutual information

In [10]:
X = joined_df.iloc[:, 1:].values
y = joined_df.iloc[:, 0].values

In [11]:
rand.seed(seed)
sim_rounds = 101
start = perf_counter()
mi_delayed = [delayed(mutual_info_regression)(X, y, discrete_features=False, random_state=rand) for _ in range(sim_rounds)]
res = compute(*mi_delayed, scheduler="processes")
stop = perf_counter()
print(stop - start)

40.23346959997434


In [12]:
mi_df = pd.concat([
    pd.DataFrame({"geneID": joined_df.columns[1:]}),
    pd.DataFrame(np.column_stack(res), columns=[f"MI_est_{i + 1}" for i in range(sim_rounds)])
], axis=1)
mi_df["MI_est_median"] = mi_df.iloc[:, 1:].median(axis=1)
mi_df.head()

Unnamed: 0,geneID,MI_est_1,MI_est_2,MI_est_3,MI_est_4,MI_est_5,MI_est_6,MI_est_7,MI_est_8,MI_est_9,...,MI_est_93,MI_est_94,MI_est_95,MI_est_96,MI_est_97,MI_est_98,MI_est_99,MI_est_100,MI_est_101,MI_est_median
0,PGF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TIMP4,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,...,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189,0.074189
2,C1QTNF6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TNC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PRL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
sum(mi_df.MI_est_median > 0)

458

In [14]:
mi_df[["geneID", "MI_est_median"]].to_csv(f"{dirs.analysis_dir}/{unified_dsets[dset_idx]}_MI_survival_results.tsv", sep="\t", index=False)