In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Resolve the project's directories from the developer path file one level up.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
# Dataset folder names under dirs.data_dir; dset_idx picks which one this notebook analyses.
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
# Path to the matrisome master list (not referenced again in the cells visible here).
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
# Index into unified_dsets selecting the dataset to analyse (0 -> "unified_cervical_data").
dset_idx = 0

In [4]:
# Fixed seed shared by the stochastic steps of this notebook.
seed = 123
# Seed the generator at construction so it is deterministic from the moment it
# exists, instead of only after a later cell calls rand.seed(seed).
rand = np.random.RandomState(seed)

# Load and filter survival data

In [5]:
# Numeric coding for vital status, handed to the survival-data loader below.
event_code = {"Alive": 0, "Dead": 1}
# Covariates and outcome columns kept when subsetting the survival table.
covariate_cols = ["figo_stage", "age_at_diagnosis", "race", "ethnicity"]
dep_cols = ["vital_status", "survival_time"]
# Columns that get one-hot encoded with pd.get_dummies.
# NOTE(review): "figo_chr" is not in covariate_cols; presumably it is created by
# prep.decode_figo_stage(..., to="c") — confirm against utils.preprocessing.
cat_cols = ["race", "ethnicity", "figo_chr"]
# Load the survival table of the dataset selected by dset_idx.
survival_df = prep.load_survival_df(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/survival_data.tsv", event_code)

In [6]:
# Keep complete cases for the identifier, outcome, and covariate columns only.
survival_cols = ["sample_name"] + dep_cols + covariate_cols
complete_survival_df = survival_df[survival_cols].dropna()

# Decode FIGO stage, keep rows with vital_status == 1 (event_code maps "Dead" to 1),
# drop the now-constant status column, and one-hot encode the categorical covariates.
filtered_survival_df = (
    prep.decode_figo_stage(complete_survival_df, to="c")
    .query("vital_status == 1")
    .drop(columns=["vital_status"])
    .pipe(pd.get_dummies, columns=cat_cols)
    .reset_index(drop=True)
)
# Replace spaces in column names with underscores
# (presumably introduced by category labels in the dummy column names).
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')

# print(filtered_survival_df.shape)
# filtered_survival_df.head()

# Load normalized matrisome count data

In [7]:
# Read the normalized matrisome count table of the selected dataset.
counts_path = f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv"
norm_matrisome_counts_df = pd.read_csv(counts_path, sep='\t')

# Restrict to the gene identifier plus the samples that passed survival filtering,
# then hand the subset to the project-local transpose helper.
kept_count_cols = ["geneID"] + list(filtered_survival_df["sample_name"])
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[kept_count_cols], "geneID", "sample_name"
)
# print(norm_filtered_matrisome_counts_t_df.shape)
# norm_filtered_matrisome_counts_t_df.head()

# Join survival and count data

In [8]:
# Join survival rows to their count rows on the shared sample identifier,
# then index the combined table by sample.
merged_df = pd.merge(filtered_survival_df, norm_filtered_matrisome_counts_t_df, on="sample_name")
joined_df = merged_df.set_index("sample_name")
# print(joined_df.shape)
# joined_df.head()

# Partial least squares regression (PLSR)

In [9]:
# Re-seed immediately before shuffling so re-running this cell gives the same result.
rand.seed(seed)
# prep.shuffle_data is project-local; presumably it row-shuffles joined_df with `rand`
# and splits it into predictors (x_df) and target (y_df) — TODO confirm.
x_df, y_df = prep.shuffle_data(joined_df, rand)

In [10]:
# PLS regression without internal scaling. Per the author's note, results were
# worse with pre-processing, so the pipeline contains the regressor alone.
# plsr_model = PLSRegression()
plsr_model = PLSRegression(scale=False)
plsr_pipeline = make_pipeline(plsr_model)
# Wrap the pipeline in a target-transforming regressor (no transformer is configured here).
ttr = TransformedTargetRegressor(regressor=plsr_pipeline)

# Search the number of PLS components from 2 to 19, scored by (negated) MAE over 5 folds.
h_params = {"regressor__plsregression__n_components": range(2, 20)}
cv_grid_search = GridSearchCV(
    estimator=ttr,
    param_grid=h_params,
    scoring="neg_mean_absolute_error",
    cv=KFold(5),
    n_jobs=-1,
    verbose=1,
)
cv_grid_search.fit(x_df, y_df)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.8s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=TransformedTargetRegressor(regressor=Pipeline(steps=[('plsregression',
                                                                             PLSRegression(scale=False))])),
             n_jobs=-1,
             param_grid={'regressor__plsregression__n_components': range(2, 20)},
             scoring='neg_mean_absolute_error', verbose=1)

In [11]:
# Refit winner of the grid search, and the PLS step inside its fitted pipeline.
best_ttr = cv_grid_search.best_estimator_
best_plsr = best_ttr.regressor_["plsregression"]

# Re-score the winning configuration with 5-fold CV and report the mean negated MAE.
cv_score = cross_val_score(
    best_ttr,
    x_df,
    y_df,
    cv=KFold(5),
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)
print(cv_score.mean())
# coef_df = pd.DataFrame({"var": joined_df.columns[1:], "coef": best_plsr.coef_[:, 0]}).assign(abs_coef = lambda x: np.abs(x.coef))

-581.5718117714823


In [12]:
# My code, adapted from Mehmood, T. et al.: https://www.sciencedirect.com/science/article/pii/S0169743912001542
def get_VIP(plsr_model):
    """Compute Variable Importance in Projection (VIP) scores for a fitted PLS model.

    For predictor j:

        VIP_j = sqrt( p * sum_a( SS_a * (w_ja / ||w_a||)^2 ) / sum_a( SS_a ) )

    where SS_a is the product of the summed squared x-scores and the summed
    squared y-loadings of component a, and w_a is the a-th column of the
    x-weight matrix.

    Parameters
    ----------
    plsr_model : object
        Fitted model exposing ``x_scores_``, ``x_weights_`` and ``y_loadings_``
        as 2-D arrays whose columns index the components. ``x_weights_`` has
        one row per predictor.

    Returns
    -------
    numpy.ndarray
        1-D array with one VIP score per row of ``x_weights_``.

    Notes
    -----
    Each weight column is divided by its L2 norm (the square root of its sum of
    squares). Dividing by the sum of squares itself only gives the same result
    when the columns are already unit length.
    """
    T = plsr_model.x_scores_
    W = plsr_model.x_weights_
    Q = plsr_model.y_loadings_
    p = W.shape[0]

    # SSa for each component a
    SSA = np.sum(T ** 2, axis=0) * np.sum(Q ** 2, axis=0)
    # Column-wise l2 norm of W: sqrt of the per-column sum of squares
    W_norm = np.sqrt(np.einsum("ij, ij -> j", W, W))

    return np.sqrt(p * np.sum(SSA * (W / W_norm) ** 2, axis=1) / np.sum(SSA, axis=0))

In [13]:
# VIP scores of the best PLS model, one per x-weight row.
vip = get_VIP(best_plsr)

In [14]:
# Every column of the transposed count table after the first is removed from x_df;
# whatever remains is a non-gene predictor.
gene_cols = list(norm_filtered_matrisome_counts_t_df.columns[1:])
non_gene_vars = list(x_df.drop(columns=gene_cols).columns)

In [15]:
# Display the fitted regression coefficients with singleton dimensions removed.
best_plsr.coef_.squeeze()

array([ 1.35976492e-03, -1.35139553e-01, -7.78647688e-02, ...,
       -1.92199849e+00,  4.87591427e-01, -1.58765786e+00])

In [16]:
# One row per predictor: its name, VIP score, and regression coefficient.
all_vars_df = pd.DataFrame(
    {"geneID": x_df.columns, "vip_scores": vip, "coeff": best_plsr.coef_.squeeze()}
)
# Keep gene predictors only by masking out the non-gene covariates.
is_gene = ~all_vars_df["geneID"].isin(non_gene_vars)
plsr_res_df = all_vars_df[is_gene].reset_index(drop=True)

In [17]:
# Write the per-gene results as a tab-separated file named after the selected dataset.
plsr_res_df.to_csv(f"{dirs.analysis_dir}/{unified_dsets[dset_idx]}_plsr_results.tsv", sep="\t", index=False)

In [18]:
# Number of genes whose VIP score is greater than 1.
np.sum(plsr_res_df.vip_scores > 1)

238