In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import f1_score
from sklearn.inspection import permutation_importance

import utils.dev_config as dev_conf
import utils.preprocessing as prep
import utils.optimization as opt
import utils.feature_selection as feat_sel



In [2]:
def make_lr(h_params, matrisome_genes):
    """Build a standardize-then-classify pipeline around ``LogisticRegression``.

    Parameters
    ----------
    h_params : mapping
        Hyper-parameters for the classifier. The keys read are ``"C"``,
        ``"class_weight"``, ``"solver"``, ``"penalty"`` and ``"random_state"``.
        A ``"class_weight"`` value that ``pd.isna`` reports as missing is
        treated as ``None``. The mapping itself is never modified.
    matrisome_genes : iterable of column labels
        Columns that are standardized before reaching the classifier.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted two-step pipeline: a ``ColumnTransformer`` followed by a
        ``LogisticRegression``.
    """
    # Resolve the class weight into a local so the caller's dict is left intact.
    class_weight = h_params["class_weight"]
    if pd.isna(class_weight):
        class_weight = None

    # Only the gene columns are standardized; every other column of the input
    # is passed through unchanged (remainder="passthrough").
    scaler = ColumnTransformer(
        [("standard", StandardScaler(), list(matrisome_genes))],
        remainder="passthrough",
    )
    classifier = LogisticRegression(
        C=h_params["C"],
        class_weight=class_weight,
        solver=h_params["solver"],
        penalty=h_params["penalty"],
        random_state=h_params["random_state"],
        n_jobs=-1,
    )
    return make_pipeline(scaler, classifier)


In [3]:
# Define constants and load data
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"
seed = 123
# Seed the generator when it is created so that every use of `rand` is
# reproducible, not only the uses that follow an explicit `rand.seed(seed)`.
rand = np.random.RandomState(seed)
# Mapping from the two vital-status labels to integer event codes.
event_code = {"Alive": 0, "Dead": 1}
# covariate_cols = ["age_at_diagnosis", "race", "ethnicity"]
# Column(s) of the survival table used as the prediction target.
dep_cols = ["figo_stage"]
# cat_cols = ["race", "ethnicity"]
# Scoring name; it is also embedded in the hyper-parameter file name read later.
scoring_method = "f1_macro"

In [4]:
# Position in `unified_dsets` of the dataset analysed by the cells below.
dset_idx = 0

In [5]:
# --- Survival data ----------------------------------------------------------
# Keep only the sample name and the dependent column(s), drop rows with missing
# values, then let the project helper decode the FIGO stage (to="n").
survival_df = prep.load_survival_df(
    f"{dirs.data_dir}/{unified_dsets[dset_idx]}/survival_data.tsv",
    event_code,
)
filtered_survival_df = prep.decode_figo_stage(
    survival_df[["sample_name", *dep_cols]].dropna(), to="n"
)
filtered_survival_df = filtered_survival_df.reset_index(drop=True)
filtered_survival_df = prep.cols_to_front(filtered_survival_df, ["sample_name", "figo_num"])
# Column labels must not contain spaces.
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')

# --- Normalized matrisome counts ---------------------------------------------
# Restrict the count table to the retained samples and transpose it with the
# project helper.
norm_matrisome_counts_df = pd.read_csv(
    f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv",
    sep='\t',
)
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID", *filtered_survival_df.sample_name]],
    "geneID",
    "sample_name",
)

# --- Join stage labels with expression values --------------------------------
joined_df = filtered_survival_df.merge(
    norm_filtered_matrisome_counts_t_df, on="sample_name"
).set_index("sample_name")

# Reseed right before shuffling so the split does not depend on earlier draws.
rand.seed(seed)
x_df, y_df = prep.shuffle_data(joined_df, rand)

# Every column of the transposed count table except the first one
# (presumably the sample-name column -- TODO confirm against prep.transpose_df).
matrisome_genes = norm_filtered_matrisome_counts_t_df.columns[1:]

# --- One L1-penalized model per row of the tuned hyper-parameter table -------
l1_lr_h_param_df = pd.read_csv(
    f"{dirs.analysis_dir}/model_opt/{unified_dsets[dset_idx]}_opt_lr_h_params_l1_{scoring_method}.tsv",
    sep="\t",
)
l1_lrs = []
for i in range(len(l1_lr_h_param_df)):
    # The last column of the table is not a hyper-parameter and is left out.
    l1_lr_h_params = dict(zip(l1_lr_h_param_df.columns[:-1], l1_lr_h_param_df.iloc[i, :-1]))
    l1_lr_h_params.update(penalty="l1", random_state=rand)
    l1_lrs.append(make_lr(l1_lr_h_params, matrisome_genes))


In [6]:
# Per-model gene selections, one entry per fitted model:
#   any_model_genes -> genes with a non-zero coefficient for at least one class
#   all_model_genes -> genes with a non-zero coefficient for every class
any_model_genes, all_model_genes = [], []

In [7]:
# Start from empty lists so re-running this cell does not append duplicate
# entries to the results of a previous run.
any_model_genes = []
all_model_genes = []

for p_line in l1_lrs:
    p_line.fit(x_df, y_df.squeeze())

    # Step 1 of the pipeline is the fitted LogisticRegression; its coef_ has
    # one row per set of class coefficients and one column per input feature.
    coef = p_line[1].coef_
    n_coef_rows = coef.shape[0]
    nonzero_rows_per_feature = np.sum(coef != 0, axis=0)

    # Genes kept (non-zero) in every coefficient row. The row count is taken
    # from the model instead of being hard-coded, so the test stays correct
    # for a dataset with a different number of classes.
    all_model_genes.append(
        matrisome_genes[np.flatnonzero(nonzero_rows_per_feature == n_coef_rows)]
    )
    # Genes kept (non-zero) in at least one coefficient row.
    any_model_genes.append(
        matrisome_genes[np.flatnonzero(nonzero_rows_per_feature > 0)]
    )



In [10]:
# Number of genes common to every model, first for the "any class" selection
# and then for the "all classes" selection.
for gene_lists in (any_model_genes, all_model_genes):
    shared_genes = set.intersection(*(set(genes) for genes in gene_lists))
    print(len(shared_genes))

854
54


In [11]:
# Number of selected genes per model, for each of the two selections.
print([len(genes) for genes in any_model_genes])
print([len(genes) for genes in all_model_genes])

[867, 888, 874, 889, 856]
[62, 85, 69, 83, 54]
