In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Directory settings come from the project helper; `dirs.data_dir` is the
# root used for every data path built in this notebook.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")

# Dataset folder names; one of them is picked by index when data is loaded.
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]

# Path to the matrisome master list (tab-separated).
matrisome_list = "{}/matrisome/matrisome_hs_masterlist.tsv".format(dirs.data_dir)

In [3]:
# Index into `unified_dsets` choosing which dataset the loading cells below
# read (0 -> "unified_cervical_data").
i = 0

In [4]:
# Load the matrisome master list through the project helper.
# NOTE(review): `matrisome_df` is not referenced again in the visible part of
# this notebook -- confirm it is still needed.
matrisome_df = prep.load_matrisome_df(matrisome_list)

# Load and filter survival data

In [5]:
# Column groups used when building the modelling table.
cat_cols = ["race", "ethnicity"]                        # one-hot encoded later
covariate_cols = ["age_at_diagnosis", "bmi"] + cat_cols
dep_cols = ["vital_status", "survival_time"]

# Numeric coding of vital status handed to the survival loader.
event_code = {"Alive": 0, "Dead": 1}

# Survival table of the dataset selected by `i`.
survival_path = f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv"
survival_df = prep.load_survival_df(survival_path, event_code)

In [6]:
# Fixed ordinal code for each major FIGO stage. An explicit mapping keeps the
# encoding stable: numbering stages by order of first appearance (as
# pd.factorize does) silently shifts the codes whenever one stage is absent
# from the selected dataset.
FIGO_MAJOR_STAGE_CODES = {"I": 1, "II": 2, "III": 3, "IV": 4}

figo_df = (
    survival_df[["sample_name", "figo_stage"] + covariate_cols]
        .dropna()                                   # keep only samples with stage and all covariates
        .pipe(pd.get_dummies, columns=cat_cols)     # one-hot encode the categorical covariates
        .sort_values("figo_stage")
        .reset_index(drop=True)
        # Collapse sub-stages onto the major stage: the first roman numeral
        # found in the label. Longer numerals are listed first in the pattern
        # so that e.g. "III" is not matched as "I".
        .assign(figo_stage_major = lambda x: x["figo_stage"].apply(lambda s: re.findall(r"IV|III|II|I", s)[0]))
        .assign(figo_stage_major_fact = lambda x: x["figo_stage_major"].map(FIGO_MAJOR_STAGE_CODES))
        .pipe(prep.cols_to_front, ["sample_name", "figo_stage_major", "figo_stage_major_fact"])
        # Only the integer code is kept, under the name "figo_stage".
        .drop(["figo_stage_major", "figo_stage"], axis=1)
        .rename(columns={"figo_stage_major_fact": "figo_stage"})
)

# Table size, and the fraction of survival records that were retained.
print(figo_df.shape)
print(figo_df.shape[0] / survival_df.shape[0])

(216, 13)
0.833976833976834


# Load normalized matrisome count data

In [7]:
# Normalized matrisome counts of the selected dataset.
norm_counts_path = f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv"
norm_matrisome_counts_df = pd.read_csv(norm_counts_path, sep="\t")

# Restrict to the samples kept in figo_df, then flip the table so that each
# row is a sample and each column is a gene.
kept_samples = figo_df["sample_name"].tolist()
gene_by_sample_df = norm_matrisome_counts_df.loc[:, ["geneID"] + kept_samples].set_index("geneID")
sample_by_gene_df = gene_by_sample_df.T.rename_axis(None, axis=1)   # drop the leftover "geneID" columns name

# The sample identifiers sit in the index after the transpose; expose them as
# an ordinary "sample_name" column.
norm_filtered_matrisome_counts_t_df = (
    sample_by_gene_df
        .reset_index()
        .rename(columns={"index": "sample_name"})
)

In [8]:
# Combine stage/covariates with the per-sample gene counts on the shared
# sample identifier, then use that identifier as the row index.
joined_df = figo_df.merge(
    norm_filtered_matrisome_counts_t_df,
    on="sample_name",
).set_index("sample_name")

In [9]:
# Seed for the row shuffle. The cross-validation below uses unshuffled KFold,
# so this permutation decides fold membership; fixing it makes the reported
# scores reproducible under Restart & Run All.
SHUFFLE_SEED = 0

# Select label and features by name rather than by column position so the
# split does not depend on "figo_stage" being the first column.
y = joined_df["figo_stage"].values
X = joined_df.drop(columns="figo_stage").values

# One-hot encode the labels. Labels start at 1, so the identity matrix has
# y.max() rows (not y.max() + 1) and is indexed with y - 1.
y_one_hot = np.eye(y.max())[y - 1]

# Shuffle data: apply one random permutation of the rows to all three arrays.
n = y.shape[0]
perm = np.random.default_rng(SHUFFLE_SEED).permutation(n)
X = X[perm, :]
y = y[perm]
y_one_hot = y_one_hot[perm, :]

# Cross validated feature permutation importance

In [55]:
def cv_permutation_importance(pipeline, X, y, k, scoring="f1_weighted", n_repeats=5, n_jobs=-1, random_state=None):
    """Compute permutation importances on the held-out part of each CV fold.

    For every fold of an unshuffled ``KFold(n_splits=k)`` the pipeline is
    refit on the training rows and ``permutation_importance`` is evaluated on
    the test rows.

    Parameters
    ----------
    pipeline : estimator
        Object with ``fit``; it is refit in place on every fold, so after the
        call it holds the fit from the last fold.
    X, y : array-like supporting integer-array indexing (e.g. numpy arrays)
        Features and labels; rows are selected with the fold index arrays.
    k : int
        Number of folds.
    scoring : str, default "f1_weighted"
        Scorer passed to ``permutation_importance``.
    n_repeats : int, default 5
        Number of permutations per feature and fold.
    n_jobs : int, default -1
        Parallelism passed to ``permutation_importance``.
    random_state : int, RandomState instance or None, default None
        Seed for the feature permutations. The default keeps the previous
        unseeded behaviour; pass an int for reproducible importances.

    Returns
    -------
    list
        One ``permutation_importance`` result per fold, in fold order.
    """
    kf = KFold(n_splits=k)
    results = []
    for train_idx, test_idx in kf.split(X):
        # Train/retrain from scratch on this fold's training rows.
        pipeline.fit(X[train_idx], y[train_idx])
        fold_result = permutation_importance(
            pipeline,
            X[test_idx],
            y[test_idx],
            scoring=scoring,
            n_jobs=n_jobs,
            n_repeats=n_repeats,
            random_state=random_state,
        )
        results.append(fold_result)
    return results

In [11]:
# Read the saved SVC results table; its "min" column is renamed to
# "min_loss" so it can be accessed without clashing with DataFrame.min.
svc_df = pd.read_csv("svc_opt_res.csv").rename(columns={"min": "min_loss"})

# Take the first row that attains the smallest loss and keep it as a plain
# dict of parameter name -> value.
lowest_loss = svc_df["min_loss"].min()
best_rows = svc_df[svc_df["min_loss"] == lowest_loss]
opt_params = best_rows.to_dict("records")[0]

In [12]:
# Build the SVC from the selected parameter row; only these five entries of
# opt_params are forwarded to the constructor.
svc_hyperparams = {
    name: opt_params[name]
    for name in ("kernel", "C", "gamma", "degree", "coef0")
}
svc_model = SVC(**svc_hyperparams)

# Features are standardized before they reach the classifier.
svc_pipeline = make_pipeline(StandardScaler(), svc_model)

In [13]:
# 5-fold cross-validated weighted F1 of the SVC pipeline. The folds are not
# shuffled here; X and y were already permuted when they were built.
svc_folds = KFold(n_splits=5, shuffle=False)
kv_score = cross_val_score(svc_pipeline, X, y, cv=svc_folds, scoring="f1_weighted", n_jobs=-1)

# Per-fold scores followed by their mean.
print(kv_score)
print(kv_score.mean())

[0.38502674 0.50733647 0.39147287 0.28297011 0.31819115]
0.376999467369998


In [14]:
# Permutation importances of the SVC pipeline over 5 folds.
svc_results = cv_permutation_importance(svc_pipeline, X, y, 5)

# Put the per-fold importance matrices side by side (one row per feature) and
# average each feature's importance over all folds and repeats.
svc_fold_importances = np.hstack([fold.importances for fold in svc_results])
svc_kv_importance_means = svc_fold_importances.mean(axis=1)

In [71]:
# Number of features whose mean permutation importance is positive for the SVC.
print((svc_kv_importance_means > 0).sum())

82


In [56]:
# Class-weighted multinomial logistic regression with an L2 penalty.
# (Alternative left by the author: penalty="l1" together with solver="saga".)
lrg_settings = dict(
    penalty="l2",
    class_weight="balanced",
    multi_class="multinomial",
    C=1,
    solver="newton-cg",
)
lrg_model = LogisticRegression(**lrg_settings)

# Same preprocessing as the SVC pipeline: standardize, then classify.
lrg_pipeline = make_pipeline(StandardScaler(), lrg_model)

In [57]:
# 5-fold cross-validated weighted F1 of the logistic-regression pipeline.
# This rebinds `kv_score`; the relative-importance cell further down reads
# the value assigned here.
lrg_folds = KFold(n_splits=5, shuffle=False)
kv_score = cross_val_score(lrg_pipeline, X, y, cv=lrg_folds, scoring="f1_weighted", n_jobs=-1)

# Per-fold scores followed by their mean.
print(kv_score)
print(kv_score.mean())

[0.50029709 0.55395757 0.45200915 0.37928924 0.46269616]
0.469649841791854


In [60]:
# Permutation importances of the logistic-regression pipeline over 5 folds.
lrg_results = cv_permutation_importance(lrg_pipeline, X, y, 5)

# Put the per-fold importance matrices side by side (one row per feature) and
# average each feature's importance over all folds and repeats.
lrg_fold_importances = np.hstack([fold.importances for fold in lrg_results])
lrg_kv_importance_means = lrg_fold_importances.mean(axis=1)

In [75]:
# Number of features whose mean permutation importance is positive for the
# logistic regression.
print((lrg_kv_importance_means > 0).sum())

418


In [69]:
# Sorted mean importances expressed as a fraction of the mean cross-validated
# score. `kv_score` must hold the logistic-regression scores at this point.
np.divide(np.sort(lrg_kv_importance_means), kv_score.mean())

array([-0.0206125 , -0.02056103, -0.01932727, ...,  0.01651336,
        0.01664334,  0.01709878])

# How does the naive model (guess the label with highest prevalence) perform?

In [None]:
# Number of samples per encoded FIGO stage.
joined_df["figo_stage"].value_counts()

In [None]:
# Naive baseline: predict the most frequent stage for every sample. Both the
# majority label and the number of predictions are derived from `y`, so the
# cell stays correct when another dataset is selected or when the most
# common stage is not the one encoded as 1.
stage_labels, stage_counts = np.unique(y, return_counts=True)
majority_stage = stage_labels[stage_counts.argmax()]
f1_score(y, np.full(y.shape, majority_stage), average="weighted")