In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import f1_score
from sklearn.inspection import permutation_importance

import utils.dev_config as dev_conf
import utils.preprocessing as prep
import utils.optimization as opt
import utils.feature_selection as feat_sel

In [2]:
# Project directory handles read from the developer paths file; `dirs.data_dir`
# and `dirs.analysis_dir` are used below for inputs and saved results.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
# Dataset folder names under the data directory; `dset_idx` selects one of them.
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
# Location of the matrisome master list TSV.
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
# Index into `unified_dsets`; 0 selects "unified_cervical_data" for the whole notebook.
dset_idx = 0

In [4]:
# Load the matrisome master list.
# NOTE(review): `matrisome_df` is not referenced again in the cells shown here — confirm it is still needed.
matrisome_df = prep.load_matrisome_df(matrisome_list)

In [5]:
# Single seed for the whole analysis.
seed = 123
# Shared generator handed to every model and to the CV/permutation helpers.
# It is seeded at construction so that anything drawing from it is deterministic
# even when the later explicit `rand.seed(seed)` cell has not been run yet
# (e.g. cells executed out of order); re-seeding with the same value is a no-op.
rand = np.random.RandomState(seed)

# Load and filter survival data

In [6]:
# Mapping from vital-status labels to 0/1 event codes, handed to the survival loader.
event_code = {"Alive": 0, "Dead": 1}
# Covariates kept alongside the dependent variable when filtering the survival table.
covariate_cols = ["age_at_diagnosis", "race", "ethnicity"]
# Dependent variable column(s).
dep_cols = ["figo_stage"]
# Covariates that get one-hot encoded in the next cell.
cat_cols = ["race", "ethnicity"]
survival_df = prep.load_survival_df(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/survival_data.tsv", event_code)

In [7]:
# Keep only the columns needed for modelling and drop samples with any missing value.
complete_cases_df = survival_df[["sample_name", *dep_cols, *covariate_cols]].dropna()
# Decode the FIGO stage (the "figo_num" column referenced below presumably comes
# from this step — confirm in utils.preprocessing).
staged_df = prep.decode_figo_stage(complete_cases_df, to="n")
# One-hot encode the categorical covariates and renumber the rows.
encoded_df = pd.get_dummies(staged_df, columns=cat_cols).reset_index(drop = True)
filtered_survival_df = prep.cols_to_front(encoded_df, ["sample_name", "figo_num"])
# Dummy column names can contain spaces; normalise them to underscores.
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')
print(filtered_survival_df.shape)
# filtered_survival_df.head()

(255, 12)


# Load normalized matrisome count data

In [8]:
norm_counts_path = f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv"
norm_matrisome_counts_df = pd.read_csv(norm_counts_path, sep='\t')
# Restrict the count matrix to the samples that survived filtering, then transpose it.
kept_count_cols = ["geneID"] + list(filtered_survival_df.sample_name)
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[kept_count_cols],
    "geneID",
    "sample_name",
)
print(norm_filtered_matrisome_counts_t_df.shape)
# norm_filtered_matrisome_counts_t_df.head()

(255, 1009)


# Join survival and count data

In [9]:
# Attach the per-sample counts to the survival covariates, keyed and indexed by sample.
joined_df = filtered_survival_df.merge(
    norm_filtered_matrisome_counts_t_df, on="sample_name"
).set_index("sample_name")
print(joined_df.shape)
# joined_df.head()

(255, 1019)


# Build models

In [10]:
# Every column of the transposed count table except the first one
# (presumably "sample_name", the merge key used above — confirm).
matrisome_genes = norm_filtered_matrisome_counts_t_df.columns[1:]
# NOTE(review): `n_matrisome_genes` is not used in the cells shown here.
n_matrisome_genes = len(matrisome_genes)

In [11]:
def make_lr(h_params):
    """Build a scaling + logistic-regression pipeline from a hyperparameter mapping.

    The pipeline standardises "age_at_diagnosis" and every column listed in the
    module-level `matrisome_genes`, passes all remaining columns through
    untouched, and feeds the result to a `LogisticRegression`.

    Args:
        h_params: mapping with the keys "C", "class_weight", "solver",
            "penalty" and "random_state". A missing (NaN/None) "class_weight"
            is treated as `None`. The mapping is not modified.

    Returns:
        The unfitted pipeline.
    """
    # Resolve the class weight into a local instead of writing back into the
    # caller's dict, so the same h_params mapping can be reused safely.
    class_weight = h_params["class_weight"]
    if pd.isna(class_weight):
        class_weight = None
    model = make_pipeline(
        ColumnTransformer([
            ("standard", StandardScaler(), ["age_at_diagnosis"] + list(matrisome_genes))
        ], remainder="passthrough"),
        LogisticRegression(
            C=h_params["C"],
            class_weight=class_weight,
            solver=h_params["solver"],
            penalty=h_params["penalty"],
            random_state=h_params["random_state"],
            n_jobs=-1
        )
    )
    return model

In [12]:
l1_lr_h_param_df = pd.read_csv(f"{unified_dsets[dset_idx]}_opt_lr_h_params_l1_f1_weighted.tsv", sep="\t")
# Every column but the last is a hyperparameter (the last one is dropped —
# presumably the optimisation score; confirm against the file).
l1_param_names = l1_lr_h_param_df.columns[:-1]
l1_lrs = []
# One L1-penalised pipeline per stored hyperparameter row, all sharing `rand`.
for i in range(len(l1_lr_h_param_df)):
    l1_lr_h_params = dict(zip(l1_param_names, l1_lr_h_param_df.iloc[i, :-1]))
    l1_lr_h_params.update(penalty="l1", random_state=rand)
    l1_lrs.append(make_lr(l1_lr_h_params))

In [13]:
l2_lr_h_param_df = pd.read_csv(f"{unified_dsets[dset_idx]}_opt_lr_h_params_l2_f1_weighted.tsv", sep="\t")
# Every column but the last is a hyperparameter (the last one is dropped —
# presumably the optimisation score; confirm against the file).
l2_param_names = l2_lr_h_param_df.columns[:-1]
l2_lrs = []
# One L2-penalised pipeline per stored hyperparameter row, all sharing `rand`.
for i in range(len(l2_lr_h_param_df)):
    l2_lr_h_params = dict(zip(l2_param_names, l2_lr_h_param_df.iloc[i, :-1]))
    l2_lr_h_params.update(penalty="l2", random_state=rand)
    l2_lrs.append(make_lr(l2_lr_h_params))

In [14]:
gbc_h_param_df = pd.read_csv(f"{unified_dsets[dset_idx]}_opt_gbc_h_params_f1_weighted.tsv", sep="\t")
# One classifier per stored hyperparameter row (all columns but the last are
# passed as keyword arguments), all sharing `rand`.
# `loss` is deliberately left at the estimator default: that default is the
# binomial/multinomial deviance under both its old name ("deviance") and its
# new one ("log_loss"), whereas passing loss="deviance" explicitly is rejected
# by scikit-learn >= 1.3.
gbcs = [
    GradientBoostingClassifier(
        **dict(zip(gbc_h_param_df.columns[:-1], gbc_h_param_df.iloc[i, :-1])), random_state=rand
    ) for i in range(gbc_h_param_df.shape[0])
]

In [15]:
rfc_h_param_df = pd.read_csv(f"{unified_dsets[dset_idx]}_opt_rfc_h_params_f1_weighted.tsv", sep="\t")
# Every column but the last is forwarded to the estimator as a keyword argument.
rfc_param_names = rfc_h_param_df.columns[:-1]
rfcs = []
# One forest per stored hyperparameter row, all sharing `rand`.
for rfc_row_idx in range(len(rfc_h_param_df)):
    rfc_h_params = dict(zip(rfc_param_names, rfc_h_param_df.iloc[rfc_row_idx, :-1]))
    rfcs.append(RandomForestClassifier(**rfc_h_params, random_state=rand, n_jobs=-1))

In [16]:
def collect_feature_perm_results(models, x_df, y_df, r, gene_cols, score, verbose=True, to_array=True):
    """Run cross-validated permutation importance for each model and tabulate it.

    For every model, `opt.cv_permutation_importance` is called with k=5. The
    `importances` arrays of the returned results are concatenated along axis 1
    and averaged per row; `feat_sel.gather_perm_res` then turns those means and
    the mean reference score into a frame whose "mean_imp" and
    "score_pct_improvement" columns are suffixed with the model's position.

    Args:
        models: iterable of estimators to evaluate.
        x_df: feature data, forwarded unchanged to both helpers.
        y_df: target data, forwarded unchanged to the CV helper.
        r: random state forwarded as `random_state`.
        gene_cols: forwarded to `feat_sel.gather_perm_res`.
        score: scoring identifier forwarded to the CV helper.
        verbose: when true, print one progress line per model.
        to_array: forwarded to the CV helper (its meaning is defined there).

    Returns:
        Three lists with one entry per model: the mean importances, the
        reference scores, and the result frames.
    """
    mean_importances_per_model = []
    ref_scores_per_model = []
    result_frames = []

    for model_idx, model in enumerate(models):
        if verbose:
            print(f"Running feature perm for model {model_idx}")

        fold_results, fold_ref_scores = opt.cv_permutation_importance(
            model, x_df, y_df, score, k=5, random_state=r, to_array=to_array
        )
        # Put all per-result importance columns side by side, then average each row.
        stacked_importances = np.concatenate(
            [fold_result.importances for fold_result in fold_results], axis=1
        )
        mean_importances = np.mean(stacked_importances, axis=1)

        mean_importances_per_model.append(mean_importances)
        ref_scores_per_model.append(fold_ref_scores)

        # Suffix the value columns so frames from different models can be merged later.
        suffixed_names = {
            "mean_imp": f"mean_imp_{model_idx}",
            "score_pct_improvement": f"score_pct_improvement_{model_idx}",
        }
        model_frame = feat_sel.gather_perm_res(
            x_df, mean_importances, np.mean(fold_ref_scores), gene_cols
        ).rename(columns=suffixed_names)
        result_frames.append(model_frame)

    return mean_importances_per_model, ref_scores_per_model, result_frames


def merge_perm_results(perm_res_dfs, importance_thresh=0):
    """Combine per-model permutation-importance frames into one consensus table.

    The frames are inner-joined on "geneID" one after another. Across all
    columns whose name contains "mean_imp", the row-wise mean, standard
    deviation and their ratio (std / mean) are added as "consensus_imp_mean",
    "consensus_imp_std" and "consensus_imp_cv". "consensus_vote" is True for a
    gene only when every one of those columns exceeds `importance_thresh`.

    Args:
        perm_res_dfs: non-empty sequence of frames, each with a "geneID" column.
        importance_thresh: value every per-model mean importance must exceed.

    Returns:
        A new DataFrame holding the merged columns plus the four consensus columns.
    """
    combined = perm_res_dfs[0]
    for next_frame in perm_res_dfs[1:]:
        combined = combined.merge(next_frame, on="geneID", how="inner")

    # Per-model importance columns, taken before any consensus column exists.
    per_model_imp = combined.filter(regex="mean_imp")
    imp_mean = per_model_imp.mean(axis=1)
    imp_std = per_model_imp.std(axis=1)

    consensus = combined.assign(
        consensus_imp_mean=imp_mean,
        consensus_imp_std=imp_std,
        consensus_imp_cv=imp_std / imp_mean,
    )

    # A gene gets the vote only if it clears the threshold in every model.
    votes = (
        consensus.set_index("geneID")
        .filter(regex="mean_imp", axis=1)
        .gt(importance_thresh)
        .all(axis=1)
        .to_numpy()
    )
    consensus["consensus_vote"] = votes
    return consensus

In [17]:
# (Re-)seed the shared generator immediately before shuffling. The models built
# above hold a reference to this same `rand` object, so this also fixes the
# random stream they and the permutation runs below draw from.
rand.seed(seed)
# Shuffle the joined table and split it into features and target
# (the exact split is defined in utils.preprocessing).
x_df, y_df = prep.shuffle_data(joined_df, rand)

## LR (L1 penalty)

In [18]:
# The LR pipelines select columns by name, so to_array=False presumably keeps the
# features as a DataFrame for them — confirm in utils.optimization.
l1_lr_mean_perm_res, l1_lr_ref_scores, l1_lr_perm_res_dfs = collect_feature_perm_results(
    models=l1_lrs,
    x_df=x_df,
    y_df=y_df,
    r=rand,
    gene_cols=matrisome_genes,
    score="f1_weighted",
    to_array=False,
)

Running feature perm for model 0




Running feature perm for model 1




Running feature perm for model 2




Running feature perm for model 3




Running feature perm for model 4




In [19]:
l1_lr_merge_df = merge_perm_results(l1_lr_perm_res_dfs)
# Number of genes whose importance cleared the threshold in every L1 LR model.
len(l1_lr_merge_df.query("consensus_vote == True"))

58

In [20]:
# Mean reference score of each L1 LR model.
np.mean(l1_lr_ref_scores, axis=1)

array([0.50164974, 0.50897904, 0.49894685, 0.4960666 , 0.49307874])

## LR (L2 penalty)

In [21]:
# The LR pipelines select columns by name, so to_array=False presumably keeps the
# features as a DataFrame for them — confirm in utils.optimization.
l2_lr_mean_perm_res, l2_lr_ref_scores, l2_lr_perm_res_dfs = collect_feature_perm_results(
    models=l2_lrs,
    x_df=x_df,
    y_df=y_df,
    r=rand,
    gene_cols=matrisome_genes,
    score="f1_weighted",
    to_array=False,
)

Running feature perm for model 0
Running feature perm for model 1
Running feature perm for model 2
Running feature perm for model 3
Running feature perm for model 4


In [22]:
l2_lr_merge_df = merge_perm_results(l2_lr_perm_res_dfs)
# Number of genes whose importance cleared the threshold in every L2 LR model.
len(l2_lr_merge_df.query("consensus_vote == True"))

843

In [23]:
# Mean reference score of each L2 LR model.
np.mean(l2_lr_ref_scores, axis=1)

array([0.50974458, 0.50974458, 0.50974458, 0.50974458, 0.50778366])

## GBC

In [24]:
# The boosted models are bare estimators (no name-based column transformer), so
# to_array=True is used here — see utils.optimization for what it changes.
gbc_mean_perm_res, gbc_ref_scores, gbc_perm_res_dfs = collect_feature_perm_results(
    models=gbcs,
    x_df=x_df,
    y_df=y_df,
    r=rand,
    gene_cols=matrisome_genes,
    score="f1_weighted",
    to_array=True,
)

Running feature perm for model 0
Running feature perm for model 1
Running feature perm for model 2
Running feature perm for model 3
Running feature perm for model 4


In [25]:
gbc_merge_df = merge_perm_results(gbc_perm_res_dfs)
# Number of genes whose importance cleared the threshold in every GBC model.
len(gbc_merge_df.query("consensus_vote == True"))

11

In [26]:
# Mean reference score of each GBC model.
np.mean(gbc_ref_scores, axis=1)

array([0.48539758, 0.47589069, 0.47466133, 0.48404418, 0.4743963 ])

## RFC

In [27]:
# The forests are bare estimators (no name-based column transformer), so
# to_array=True is used here — see utils.optimization for what it changes.
rfc_mean_perm_res, rfc_ref_scores, rfc_perm_res_dfs = collect_feature_perm_results(
    models=rfcs,
    x_df=x_df,
    y_df=y_df,
    r=rand,
    gene_cols=matrisome_genes,
    score="f1_weighted",
    to_array=True,
)

Running feature perm for model 0
Running feature perm for model 1
Running feature perm for model 2
Running feature perm for model 3
Running feature perm for model 4


In [28]:
rfc_merge_df = merge_perm_results(rfc_perm_res_dfs)
# Number of genes whose importance cleared the threshold in every RFC model.
len(rfc_merge_df.query("consensus_vote == True"))

7

In [29]:
# Mean reference score of each RFC model.
np.mean(rfc_ref_scores, axis=1)

array([0.40481892, 0.40483626, 0.39257452, 0.40856094, 0.41197281])

# Save findings

In [30]:
# Consensus table of each model family paired with its output file in the analysis directory.
result_targets = [
    (l1_lr_merge_df, f"{dirs.analysis_dir}/{unified_dsets[dset_idx]}_l1_lr_results.tsv"),
    (l2_lr_merge_df, f"{dirs.analysis_dir}/{unified_dsets[dset_idx]}_l2_lr_results.tsv"),
    (gbc_merge_df, f"{dirs.analysis_dir}/{unified_dsets[dset_idx]}_gbc_results.tsv"),
    (rfc_merge_df, f"{dirs.analysis_dir}/{unified_dsets[dset_idx]}_rfc_results.tsv"),
]
# Write each table as a tab-separated file without the row index.
for results_df, results_path in result_targets:
    results_df.to_csv(results_path, sep="\t", index=False)