In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from scipy.stats import pearsonr

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
i = 0

In [4]:
matrisome_df = prep.load_matrisome_df(matrisome_list)

In [5]:
rand = np.random.RandomState()

# Load and filter survival data

In [6]:
event_code = {"Alive": 0, "Dead": 1}
covariate_cols = ["figo_stage", "age_at_diagnosis", "race", "ethnicity"]
dep_cols = ["vital_status", "survival_time"]
cat_cols = ["race", "ethnicity", "figo_chr"]
survival_df = prep.load_survival_df(f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv", event_code)

In [7]:
filtered_survival_df = (
    prep.decode_figo_stage(survival_df[["sample_name"] + dep_cols + covariate_cols].dropna(), to="c")
        .query("vital_status == 1")
        .drop(["vital_status"], axis=1)
        .pipe(pd.get_dummies, columns=cat_cols)
        .reset_index(drop = True)
)
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')

print(filtered_survival_df.shape)
filtered_survival_df.head()

(66, 16)


Unnamed: 0,sample_name,survival_time,age_at_diagnosis,race_american_indian_or_alaska_native,race_asian,race_black_or_african_american,race_native_hawaiian_or_other_pacific_islander,race_not_reported,race_white,ethnicity_hispanic_or_latino,ethnicity_not_hispanic_or_latino,ethnicity_not_reported,figo_chr_figo_stage_1,figo_chr_figo_stage_2,figo_chr_figo_stage_3,figo_chr_figo_stage_4
0,TCGA-C5-A1BF-01B-11R-A13Y-07,570,16975.0,0,0,0,0,0,1,0,0,1,1,0,0,0
1,TCGA-C5-A8YT-01A-11R-A37O-07,633,13253.0,0,0,0,0,0,1,0,1,0,1,0,0,0
2,TCGA-C5-A1BE-01B-11R-A13Y-07,2094,23727.0,0,0,0,0,0,1,0,0,1,1,0,0,0
3,TCGA-C5-A8XH-01A-11R-A37O-07,1394,14444.0,0,0,0,0,0,1,0,1,0,1,0,0,0
4,TCGA-DS-A7WF-01A-11R-A352-07,492,15319.0,0,0,0,0,1,0,1,0,0,1,0,0,0


# Load normalized matrisome count data

In [8]:
norm_matrisome_counts_df = pd.read_csv(f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv", sep='\t')
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID"] + list(filtered_survival_df.sample_name)], "geneID", "sample_name"
)
print(norm_filtered_matrisome_counts_t_df.shape)
norm_filtered_matrisome_counts_t_df.head()

(66, 1009)


Unnamed: 0,sample_name,PGF,TIMP4,C1QTNF6,TNC,PRL,OGN,C1QL3,FGB,NDNF,...,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
0,TCGA-C5-A1BF-01B-11R-A13Y-07,10.800637,6.228003,11.669331,13.002928,5.063964,4.869744,5.063964,8.834522,6.410767,...,9.013453,8.190325,9.503647,14.077995,6.569726,7.315604,4.602649,12.0623,5.649441,16.558407
1,TCGA-C5-A8YT-01A-11R-A37O-07,7.830611,5.733875,12.445548,13.765468,5.455125,13.049104,5.146455,5.074289,10.569544,...,9.453187,6.398956,12.288955,13.396332,10.228758,8.542025,4.602649,11.765396,5.318924,13.556322
2,TCGA-C5-A1BE-01B-11R-A13Y-07,10.642039,5.348449,8.94522,13.419225,4.602649,5.867905,5.646251,4.602649,5.673617,...,11.696884,6.38498,9.774029,15.381944,6.205261,7.163053,4.957257,10.113788,5.213815,15.564969
3,TCGA-C5-A8XH-01A-11R-A37O-07,9.633752,5.908552,11.672191,13.863766,4.602649,5.339887,5.702581,5.3896,5.634507,...,11.148165,7.52614,8.561116,14.404419,6.462928,6.10257,4.602649,9.104547,5.088257,15.19735
4,TCGA-DS-A7WF-01A-11R-A352-07,13.292479,5.620704,12.867887,16.646519,4.602649,10.377267,6.177498,4.602649,5.198452,...,10.809104,7.877841,6.615459,13.894278,7.058159,10.809104,4.602649,15.271686,6.519692,16.820793


# Join survival and count data

In [9]:
joined_df = (
    pd.merge(filtered_survival_df, norm_filtered_matrisome_counts_t_df, on="sample_name")
        .set_index("sample_name")
)
print(joined_df.shape)
joined_df.head()

(66, 1023)


Unnamed: 0_level_0,survival_time,age_at_diagnosis,race_american_indian_or_alaska_native,race_asian,race_black_or_african_american,race_native_hawaiian_or_other_pacific_islander,race_not_reported,race_white,ethnicity_hispanic_or_latino,ethnicity_not_hispanic_or_latino,...,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-C5-A1BF-01B-11R-A13Y-07,570,16975.0,0,0,0,0,0,1,0,0,...,9.013453,8.190325,9.503647,14.077995,6.569726,7.315604,4.602649,12.0623,5.649441,16.558407
TCGA-C5-A8YT-01A-11R-A37O-07,633,13253.0,0,0,0,0,0,1,0,1,...,9.453187,6.398956,12.288955,13.396332,10.228758,8.542025,4.602649,11.765396,5.318924,13.556322
TCGA-C5-A1BE-01B-11R-A13Y-07,2094,23727.0,0,0,0,0,0,1,0,0,...,11.696884,6.38498,9.774029,15.381944,6.205261,7.163053,4.957257,10.113788,5.213815,15.564969
TCGA-C5-A8XH-01A-11R-A37O-07,1394,14444.0,0,0,0,0,0,1,0,1,...,11.148165,7.52614,8.561116,14.404419,6.462928,6.10257,4.602649,9.104547,5.088257,15.19735
TCGA-DS-A7WF-01A-11R-A352-07,492,15319.0,0,0,0,0,1,0,1,0,...,10.809104,7.877841,6.615459,13.894278,7.058159,10.809104,4.602649,15.271686,6.519692,16.820793


# PLSR

In [10]:
rand.seed(123)
shuffled_df = joined_df.sample(frac=1, random_state=rand)
X_df = shuffled_df.iloc[:, 1:]
# Y must be a matrix
Y_df = shuffled_df.iloc[:, [0]]

In [11]:
plsr_model = PLSRegression(scale=False)
plsr_pipeline = make_pipeline(
    ColumnTransformer([
        ("standard", StandardScaler(), ["age_at_diagnosis"] + list(norm_filtered_matrisome_counts_t_df.columns[1:]))
    ], remainder="passthrough"),
    plsr_model
)
# ttr = TransformedTargetRegressor(regressor=plsr_pipeline, transformer=StandardScaler())
# ttr = TransformedTargetRegressor(regressor=plsr_pipeline)
# h_params = {"regressor__plsregression__n_components": range(2, 20)}
h_params = {"plsregression__n_components": range(2, 20)}
# h_params = {"n_components": range(2, 20)}
cv_grid_search = GridSearchCV(plsr_pipeline, h_params, scoring="neg_mean_absolute_error", cv=KFold(5), n_jobs=-1, verbose=1)
cv_grid_search.fit(X_df, Y_df)
pd.DataFrame(cv_grid_search.cv_results_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.6s finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_plsregression__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.046482,0.004069,0.021357,0.002333,2,{'plsregression__n_components': 2},-621.712847,-720.63591,-457.439657,-442.309911,-777.231697,-603.866004,135.314767,1
1,0.045593,0.001878,0.023245,0.004601,3,{'plsregression__n_components': 3},-705.862905,-757.449482,-595.197928,-479.902247,-816.169158,-670.916344,120.022739,18
2,0.046722,0.005657,0.02335,0.00703,4,{'plsregression__n_components': 4},-614.866826,-713.880134,-511.995088,-415.823548,-823.532817,-616.019683,143.902263,2
3,0.052336,0.010032,0.018306,0.001532,5,{'plsregression__n_components': 5},-650.496845,-715.882447,-499.136503,-449.090967,-827.729863,-628.467325,139.093207,13
4,0.042323,0.003728,0.017871,0.000503,6,{'plsregression__n_components': 6},-660.195452,-725.114709,-515.013941,-442.474924,-834.605446,-635.480895,141.463217,17
5,0.043296,0.004881,0.016936,0.001209,7,{'plsregression__n_components': 7},-660.670746,-726.421129,-514.13621,-415.331477,-836.735565,-630.659026,149.988095,16
6,0.039964,0.001067,0.016962,0.003044,8,{'plsregression__n_components': 8},-660.408974,-722.321061,-511.721265,-406.253148,-839.403184,-628.021526,153.1865,3
7,0.034573,0.003914,0.015659,0.002064,9,{'plsregression__n_components': 9},-664.01742,-723.147023,-507.582144,-411.078239,-839.40599,-629.046163,152.70412,15
8,0.033962,0.00324,0.014372,0.002412,10,{'plsregression__n_components': 10},-663.782891,-725.390274,-504.976676,-409.023281,-839.906485,-628.615921,154.112481,14
9,0.035903,0.003866,0.018721,0.000494,11,{'plsregression__n_components': 11},-663.233342,-725.718097,-505.184577,-408.059001,-839.23552,-628.286107,154.186849,12


In [12]:
plsr_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('standard', StandardScaler(),
                                                  ['age_at_diagnosis', 'PGF',
                                                   'TIMP4', 'C1QTNF6', 'TNC',
                                                   'PRL', 'OGN', 'C1QL3', 'FGB',
                                                   'NDNF', 'CCL22', 'ELSPBP1',
                                                   'CYR61', 'ECM1', 'ANGPT2',
                                                   'SERPINF2', 'SCUBE3',
                                                   'CRELD2', 'KITLG', 'THSD4',
                                                   'MEPE', 'CELA2B', 'CLEC4G',
                                                   'ANGPTL7', 'CSF3', 'LOXL1',
                                                   'CLEC18A', 'MUC3A', 'PXDNL',
                                      

In [13]:
best_plsr = cv_grid_search.best_estimator_["plsregression"]
cv_score = cross_val_score(cv_grid_search.best_estimator_, X_df, Y_df, cv=KFold(5), scoring="neg_mean_absolute_error", n_jobs=-1)
print(cv_score.mean())
coef_df = pd.DataFrame({"var": shuffled_df.columns[1:], "coef": best_plsr.coef_[:, 0]}).assign(abs_coef = lambda x: np.abs(x.coef))

-603.8660044913055


In [14]:
Y_hat = cv_grid_search.best_estimator_.predict(X_df)

In [15]:
Y_df.assign(pred = Y_hat)

Unnamed: 0_level_0,survival_time,pred
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-VS-A9V1-01A-11R-A42T-07,157,202.784327
TCGA-VS-A9UM-01A-11R-A42T-07,829,1051.832779
TCGA-C5-A7CJ-01A-11R-A32P-07,3097,1919.553351
TCGA-UC-A7PF-01A-11R-A352-07,2859,3067.487311
TCGA-JW-A5VH-01A-11R-A28H-07,100,308.360676
...,...,...
TCGA-DS-A0VK-01A-21R-A10U-07,1118,1507.069817
TCGA-Q1-A6DT-01A-11R-A32P-07,275,126.497853
TCGA-C5-A7UC-01A-11R-A352-07,523,145.829274
TCGA-C5-A7CK-01A-11R-A32P-07,4086,2586.172223


In [16]:
# Citation: Kevin Milton Mendez, https://github.com/scikit-learn/scikit-learn/pull/13492/files
def get_VIP(plsr_model):
    T = plsr_model.x_scores_
    W = plsr_model.x_weights_
    Q = plsr_model.y_loadings_
    w0, w1 = W.shape
    s = np.sum(T ** 2, axis=0) * np.sum(Q ** 2, axis=0)
    s_sum = np.sum(s, axis=0)
    w_norm = np.array([(W[:, i] / np.linalg.norm(W[:, i]))
                       for i in range(w1)])
    return np.sqrt(w0 * np.sum(s * w_norm.T ** 2, axis=1) / s_sum)

# My code, adapted from Mehmood, T. et al.: https://www.sciencedirect.com/science/article/pii/S0169743912001542
def get_VIP_2(plsr_model):
    T = plsr_model.x_scores_
    W = plsr_model.x_weights_
    Q = plsr_model.y_loadings_
    p, a = W.shape
    
    vip = np.zeros(p)
    # SSa for each A
    SSA = np.sum(T ** 2, axis=0) * np.sum(Q ** 2, axis=0)
    # Column-wise l2 norm of W
    Wa_norm = np.einsum("ij, ij -> j", W, W)
    
    for j in range(p):
        Wj2 = (W[j, :] / Wa_norm) ** 2
        vip[j] = np.sqrt(p * np.sum(SSA * Wj2) / np.sum(SSA))
    return vip

In [17]:
vip = get_VIP(best_plsr)

In [18]:
vip2 = get_VIP_2(best_plsr)

In [19]:
np.allclose(vip2, vip)

True

In [20]:
non_gene_vars = list(X_df.drop(list(norm_filtered_matrisome_counts_t_df.columns[1:]), axis=1).columns)

In [21]:
print(len(non_gene_vars))
non_gene_vars

14


['age_at_diagnosis',
 'race_american_indian_or_alaska_native',
 'race_asian',
 'race_black_or_african_american',
 'race_native_hawaiian_or_other_pacific_islander',
 'race_not_reported',
 'race_white',
 'ethnicity_hispanic_or_latino',
 'ethnicity_not_hispanic_or_latino',
 'ethnicity_not_reported',
 'figo_chr_figo_stage_1',
 'figo_chr_figo_stage_2',
 'figo_chr_figo_stage_3',
 'figo_chr_figo_stage_4']

In [22]:
best_plsr.coef_.squeeze()

array([ 1.04150576, -2.18477502, -0.47481875, ...,  1.45702514,
       -1.01356656, -3.12309068])

In [29]:
plsr_res_df = (
    pd.DataFrame({"geneID": X_df.columns, "vip_scores": vip2, "coeff": best_plsr.coef_.squeeze()})
        .pipe(lambda x: x[~x.geneID.isin(non_gene_vars)])
        .reset_index(drop=True)
)

In [30]:
plsr_res_df.to_csv(f"{dirs.analysis_dir}/{unified_dsets[i]}_plsr_results.tsv", sep="\t", index=False)

In [31]:
np.sum(plsr_res_df.vip_scores > 1)

394