In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from dask import compute, delayed
from time import perf_counter

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Directory handle built from the developer path file; later cells read
# its `data_dir` and `analysis_dir` attributes.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")

# Dataset names; one of them is picked by position further down.
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]

# Path of the matrisome master list under the data directory.
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
def min_max_norm(x, a, b):
    """Linearly rescale ``x`` so its minimum maps to ``a`` and its maximum to ``b``.

    Parameters
    ----------
    x : object exposing ``.min()``, ``.max()`` and element-wise arithmetic
        Values to rescale (for example a NumPy array or a pandas Series).
    a, b : scalar
        Target values for the minimum and the maximum of ``x``.

    Returns
    -------
    The rescaled values, in whatever container the arithmetic on ``x`` yields.
    When the range of ``x`` is a scalar zero (constant input) every element
    maps to ``a`` instead of the NaN a 0/0 division would produce. Non-scalar
    ranges (e.g. per-column ranges) are used unchanged.
    """
    x_min = x.min()
    x_max = x.max()
    span = x_max - x_min
    # Constant input: the numerator below is all zeros, so any non-zero divisor
    # maps every value to `a` and avoids the 0/0 -> NaN result.
    if np.ndim(span) == 0 and span == 0:
        span = 1
    return ((x - x_min) * (b - a)) / span + a

In [4]:
# Position in `unified_dsets` of the dataset to analyse; the cells below use
# `unified_dsets[i]` to build every input and output path.
i = 0

In [5]:
# Matrisome master list, read through the project helper.
matrisome_df = prep.load_matrisome_df(matrisome_list)

# Tab-separated results table for the selected dataset.
sig_deg_path = f"{dirs.analysis_dir}/{unified_dsets[i]}_sig_DESeq_results_xref_matrisome.tsv"
sig_deg_df = pd.read_csv(sig_deg_path, sep="\t")

# Keep only the rows flagged `in_matrisome`, renumbered from zero.
in_matrisome_rows = sig_deg_df.query("in_matrisome == True")
matrisome_sig_deg_df = in_matrisome_rows.reset_index(drop=True)

# Load and filter survival data

In [6]:
# Column groups used when the survival table is filtered below.
dep_cols = ["vital_status", "survival_time"]
cat_cols = ["race", "ethnicity"]
covariate_cols = ["age_at_diagnosis", "bmi"] + cat_cols

# Label-to-code mapping handed to the survival loader.
event_code = {"Alive": 0, "Dead": 1}

survival_path = f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv"
survival_df = prep.load_survival_df(survival_path, event_code)

In [7]:
# Select the columns of interest, keep rows with vital_status == 1
# (presumably the "Dead" code from `event_code` -- confirm against the loader),
# and drop rows with any missing value.
survival_cols = ["sample_name", *dep_cols, *covariate_cols]
complete_events_df = (
    survival_df.loc[:, survival_cols]
        .query("vital_status == 1")
        .dropna()
        .reset_index(drop=True)
)

# One-hot encode the categorical covariates.
filtered_survival_df = pd.get_dummies(complete_events_df, columns=cat_cols)

# Shape after filtering, then the fraction of the original rows retained.
print(filtered_survival_df.shape)
print(len(filtered_survival_df) / len(survival_df))
filtered_survival_df.head()

(48, 13)
0.18532818532818532


Unnamed: 0,sample_name,vital_status,survival_time,age_at_diagnosis,bmi,race_american indian or alaska native,race_asian,race_black or african american,race_not reported,race_white,ethnicity_hispanic or latino,ethnicity_not hispanic or latino,ethnicity_not reported
0,TCGA-C5-A2LZ-01A-11R-A213-07,1,3046,24059.0,31.992171,0,0,0,0,1,0,1,0
1,TCGA-VS-A9V1-01A-11R-A42T-07,1,157,17001.0,18.730489,0,0,0,0,1,0,0,1
2,TCGA-C5-A1BE-01B-11R-A13Y-07,1,2094,23727.0,34.232692,0,0,0,0,1,0,0,1
3,TCGA-C5-A8XH-01A-11R-A37O-07,1,1394,14444.0,22.582709,0,0,0,0,1,0,1,0
4,TCGA-DS-A7WF-01A-11R-A352-07,1,492,15319.0,24.609375,0,0,0,1,0,1,0,0


# Load normalized matrisome count data

In [8]:
counts_path = f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv"
norm_matrisome_counts_df = pd.read_csv(counts_path, sep="\t")

# Gene identifier column followed by one column per retained sample.
kept_cols = ["geneID", *filtered_survival_df["sample_name"]]

# Flip to samples-as-rows: gene IDs become the column labels, the leftover
# "geneID" axis name is cleared, and the sample labels move out of the index
# into a "sample_name" column.
norm_matrisome_survival_counts_t_df = (
    norm_matrisome_counts_df.loc[:, kept_cols]
        .set_index("geneID")
        .T
        .rename_axis(None, axis="columns")
        .reset_index()
        .rename(columns={"index": "sample_name"})
)

In [9]:
# Attach the per-sample counts to the survival table, then drop the
# `vital_status` column and index the rows by sample name.
joined_df = (
    filtered_survival_df
        .merge(norm_matrisome_survival_counts_t_df, on="sample_name")
        .drop(columns="vital_status")
        .set_index("sample_name")
)

# Examine mutual information

In [10]:
# # The one-hot columns are the only remaining discrete variables
# discr_cols = np.where(filtered_joined_survival_counts_df.columns.str.contains("race|ethnicity"))[0]

# # The first column is the response variable, so we shift these indices
# discr_cols -= 1

In [11]:
# Response: the first column of `joined_df`; predictors: every remaining column.
y = joined_df.iloc[:, 0].to_numpy()
X = joined_df.iloc[:, 1:].to_numpy()
# X_mm = mm_norm_df.iloc[:, 1:].values
# y_mm = mm_norm_df.iloc[:, 0].values

In [12]:
sim_rounds = 101

# Round k is seeded with base_seed + k. Without an explicit `random_state`
# every call draws from NumPy's global generator, so results are not
# reproducible, and worker processes that start from identical copies of that
# state can return identical "independent" estimates.
base_seed = 0

# Pick the gene columns by name rather than by a hard-coded offset, so the
# selection stays correct if the number of clinical / one-hot columns changes.
gene_cols = norm_matrisome_survival_counts_t_df.columns.drop("sample_name")
X_genes = joined_df[gene_cols].values

start = perf_counter()
mi_delayed = [
    delayed(mutual_info_regression)(
        X_genes, y, discrete_features=False, random_state=base_seed + k
    )
    for k in range(sim_rounds)
]
res = compute(*mi_delayed, scheduler="processes")
stop = perf_counter()
# Wall-clock seconds spent on all rounds.
print(stop - start)

21.787716299993917


In [13]:
# Gene identifiers taken by name from the transposed count table, instead of a
# hard-coded column offset into `joined_df` that silently depends on how many
# clinical / one-hot columns precede the genes.
gene_ids = norm_matrisome_survival_counts_t_df.columns.drop("sample_name")

# One row per feature, one column per simulation round.
mi_estimates = np.column_stack(res)
assert mi_estimates.shape[0] == len(gene_ids), (
    "number of MI estimates per round does not match the number of gene columns"
)

# `k` (not `i`) numbers the rounds so the dataset index `i` is never shadowed.
mi_df = pd.concat([
    pd.DataFrame({"geneID": gene_ids}),
    pd.DataFrame(mi_estimates, columns=[f"MI_est_{k + 1}" for k in range(sim_rounds)])
], axis=1)

# Per-gene median across all rounds.
mi_df["MI_est_median"] = mi_df.iloc[:, 1:].median(axis=1)
mi_df.head()

Unnamed: 0,geneID,MI_est_1,MI_est_2,MI_est_3,MI_est_4,MI_est_5,MI_est_6,MI_est_7,MI_est_8,MI_est_9,...,MI_est_93,MI_est_94,MI_est_95,MI_est_96,MI_est_97,MI_est_98,MI_est_99,MI_est_100,MI_est_101,MI_est_median
0,PGF,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,...,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563,0.071563
1,TIMP4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C1QTNF6,0.122624,0.123849,0.126865,0.123849,0.12809,0.123849,0.12809,0.123849,0.123849,...,0.122624,0.122624,0.122624,0.123849,0.123849,0.126865,0.12809,0.12809,0.126865,0.126865
3,TNC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PRL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Number of genes whose median MI estimate is strictly positive.
(mi_df["MI_est_median"] > 0).sum()

493

In [15]:
# Write the gene ID and median MI estimate for the selected dataset as a
# tab-separated file without the row index.
mi_results_path = f"{dirs.analysis_dir}/{unified_dsets[i]}_MI_survival_results.tsv"
mi_df.to_csv(mi_results_path, sep="\t", index=False, columns=["geneID", "MI_est_median"])