In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Directory configuration is read from ../dev_paths.txt via the project helper.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")

# Names of the unified datasets; one of them is selected by index further down.
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]

# Path of the matrisome master list inside the data directory.
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
# Position in `unified_dsets` of the dataset that the following cells load.
i = 0

In [4]:
# Matrisome master list, loaded through the project helper.
matrisome_df = prep.load_matrisome_df(matrisome_list)

# The `*_sig_DESeq_results_xref_matrisome.tsv` table for the selected dataset.
sig_deg_path = f"{dirs.analysis_dir}/{unified_dsets[i]}_sig_DESeq_results_xref_matrisome.tsv"
sig_deg_df = pd.read_csv(sig_deg_path, sep="\t")

# Keep only the rows flagged `in_matrisome`, renumbered from 0.
matrisome_sig_deg_df = sig_deg_df.query("in_matrisome == True").reset_index(drop=True)

# Load and filter survival data

In [5]:
# Numeric code for each vital-status label (passed to the survival loader).
event_code = dict(Alive=0, Dead=1)

# Survival-table columns grouped by role.
dep_cols = ["vital_status", "survival_time"]
cat_cols = ["race", "ethnicity"]  # categorical covariates, dummy-encoded when filtering
covariate_cols = ["age_at_diagnosis", "bmi"] + cat_cols

In [6]:
# Survival table of the selected dataset; `event_code` is handed to the loader.
survival_path = f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv"
survival_df = prep.load_survival_df(survival_path, event_code)

In [7]:
# 1) Narrow the survival table to the sample name, outcome and covariate columns.
survival_subset_df = survival_df[["sample_name", *dep_cols, *covariate_cols]]

# 2) Keep rows with vital_status == 1 (the value `event_code` assigns to "Dead"),
#    discard rows with any missing value, and renumber the index from 0.
complete_event_df = (
    survival_subset_df
        .loc[lambda frame: frame["vital_status"] == 1]
        .dropna()
        .reset_index(drop=True)
)

# 3) Replace each categorical covariate by its dummy (indicator) columns.
filtered_survival_df = pd.get_dummies(complete_event_df, columns=cat_cols)

In [8]:
# Only the final expression of a cell is rendered, so the ratio below is what
# gets displayed; the row count on its own is computed but not shown.
n_rows_kept = len(filtered_survival_df)
n_rows_kept / len(survival_df)

0.18532818532818532

# Load normalized matrisome count data

In [9]:
counts_path = f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv"
norm_matrisome_counts_df = pd.read_csv(counts_path, sep="\t")

# Keep only the count columns of the samples present in the filtered survival table.
kept_samples = filtered_survival_df["sample_name"].tolist()
gene_by_sample_df = norm_matrisome_counts_df[["geneID", *kept_samples]].set_index("geneID")

# Flip to one row per sample and one column per gene:
#   - after the transpose the columns axis is still named "geneID"; clear that name
#   - reset_index() moves the sample names into a column called "index"
#   - rename that column to "sample_name"
norm_matrisome_survival_counts_t_df = (
    gene_by_sample_df.T
        .rename_axis(None, axis=1)
        .reset_index()
        .rename(columns={"index": "sample_name"})
)

In [118]:
# Join survival covariates with the per-sample gene counts on the sample name,
# drop the vital_status column, and index the result by sample.
joined_survival_counts_df = (
    filtered_survival_df
        .merge(norm_matrisome_survival_counts_t_df, on="sample_name")
        .drop(columns="vital_status")
        .set_index("sample_name")
        # age_at_diagnosis is treated as integer-valued, so give it an integer dtype
        .astype({"age_at_diagnosis": "int"})
)

In [119]:
# One flag per column: True where the column dtype is an integer type.
# The leading entry is sliced off so the mask does not cover the y variable
# (the first column of the joined frame).
column_dtypes = joined_survival_counts_df.dtypes.values
discr_mask = np.array([np.issubdtype(dtype, np.integer) for dtype in column_dtypes])[1:]

In [120]:
# Positional split of the joined frame: column 0 is the response, columns from
# position 11 onward are the features.
# NOTE(review): presumably 11 skips the response plus ten covariate/dummy
# columns; the number of dummy columns depends on the data - confirm before
# pointing the notebook at another dataset.
response_pos, first_feature_pos = 0, 11
y = joined_survival_counts_df.iloc[:, response_pos].values
X = joined_survival_counts_df.iloc[:, first_feature_pos:].values

In [121]:
# Estimate the mutual information between each feature column of X and y.
# `discr_mask` was built without the y column, so dropping its first 10 entries
# leaves one discrete/continuous flag per column of X (X starts at column 11).
# mutual_info_regression adds small random noise to continuous features, so the
# scores change from run to run unless random_state is fixed.
res = mutual_info_regression(
    X,
    y,
    discrete_features=discr_mask[10:],
    random_state=0,
)

In [122]:
# Pair each feature column name (columns from position 11 onward) with its MI
# score, in column order.
gene_ids = joined_survival_counts_df.columns[11:]
mi_df = pd.DataFrame({"geneID": gene_ids, "MI": res})

In [123]:
# Show the genes ranked by mutual information, highest score first.
mi_df.sort_values(by="MI", ascending=False)

Unnamed: 0,geneID,MI
247,CLEC11A,0.362796
311,LOXL2,0.334747
112,SERPIND1,0.329892
367,LAMB1,0.255152
882,FGF11,0.252130
...,...,...
432,FREM3,0.000000
436,SEMA3D,0.000000
437,OIT3,0.000000
438,MBL2,0.000000


In [124]:
# Cox PH results of the selected dataset, restricted to rows with gene_pval below 0.05.
coxph_path = f"{dirs.analysis_dir}/{unified_dsets[i]}_coxph_results.tsv"
coxph_res_df = pd.read_csv(coxph_path, sep="\t")
sig_coxph_res_df = coxph_res_df.loc[lambda res_df: res_df["gene_pval"] < 0.05]