In [1]:
import pandas as pd
import numpy as np
import re
from time import perf_counter
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import f1_score, accuracy_score
from skopt.space import Real, Integer
from skopt import gp_minimize

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
i = 0

In [4]:
matrisome_df = prep.load_matrisome_df(matrisome_list)
sig_deg_df = pd.read_csv(f"{dirs.analysis_dir}/{unified_dsets[i]}_sig_DESeq_results_xref_matrisome.tsv", sep = '\t')
matrisome_sig_deg_df = (
    sig_deg_df.query("in_matrisome == True")
        .reset_index(drop=True)
)

# Load and filter survival data

In [5]:
event_code = {"Alive": 0, "Dead": 1}
covariate_cols = ["age_at_diagnosis", "bmi", "race", "ethnicity"]
dep_cols = ["vital_status", "survival_time"]
cat_cols = ["race", "ethnicity"]
survival_df = prep.load_survival_df(f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv", event_code)

In [6]:
figo_df = (
    survival_df[["sample_name", "figo_stage"] + covariate_cols]
        .dropna()
        .pipe(pd.get_dummies, columns=cat_cols)
        .sort_values("figo_stage")
        .reset_index(drop=True)
        .assign(figo_stage_major = lambda x: x["figo_stage"].apply(lambda s: re.findall(r"IV|III|II|I", s)[0]))
        .assign(figo_stage_major_fact = lambda x: pd.factorize(x["figo_stage_major"])[0] + 1)
        .pipe(prep.cols_to_front, ["sample_name", "figo_stage_major", "figo_stage_major_fact"])
        .drop(["figo_stage_major", "figo_stage"], axis=1)
        .rename(columns={"figo_stage_major_fact": "figo_stage"})
)

print(figo_df.shape)
print(figo_df.shape[0] / survival_df.shape[0])
figo_df.head()

(216, 13)
0.833976833976834


Unnamed: 0,sample_name,figo_stage,age_at_diagnosis,bmi,race_american indian or alaska native,race_asian,race_black or african american,race_native hawaiian or other pacific islander,race_not reported,race_white,ethnicity_hispanic or latino,ethnicity_not hispanic or latino,ethnicity_not reported
0,TCGA-Q1-A73Q-01A-21R-A32P-07,1,16851.0,34.850184,0,0,0,0,0,1,0,1,0
1,TCGA-Q1-A6DW-01A-11R-A32P-07,1,16200.0,24.21875,0,0,0,0,0,1,0,1,0
2,TCGA-Q1-A73R-01A-11R-A33Z-07,1,16701.0,39.542144,0,0,0,0,0,1,0,0,1
3,TCGA-LP-A4AW-01A-11R-A24H-07,1,19079.0,20.829995,0,1,0,0,0,0,0,1,0
4,TCGA-MU-A5YI-01A-11R-A32P-07,1,21927.0,32.979592,0,0,1,0,0,0,0,1,0


# Load normalized matrisome count data

In [7]:
norm_matrisome_counts_df = pd.read_csv(f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv", sep='\t')
norm_filtered_matrisome_counts_t_df = (
    norm_matrisome_counts_df[["geneID"] + list(figo_df.sample_name)]
        .set_index("geneID")                        # set as index so will be column names
        .transpose()
        .rename_axis(None, axis=1)                  # column.name will be set to "geneID", we don't want this
        .reset_index()                              # "sample_name" should now be its own column
        .rename({"index": "sample_name"}, axis=1)
)

In [8]:
joined_df = (
    pd.merge(figo_df, norm_filtered_matrisome_counts_t_df, on="sample_name")
        .set_index("sample_name")
)
joined_df.head()

Unnamed: 0_level_0,figo_stage,age_at_diagnosis,bmi,race_american indian or alaska native,race_asian,race_black or african american,race_native hawaiian or other pacific islander,race_not reported,race_white,ethnicity_hispanic or latino,...,PIK3IP1,C1QTNF2,PCSK5,ANXA1,HGF,VWA2,FGF3,POSTN,NTF3,S100A6
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-Q1-A73Q-01A-21R-A32P-07,1,16851.0,34.850184,0,0,0,0,0,1,0,...,10.63646,7.448303,10.613982,13.5397,6.180838,7.518582,4.602649,9.346523,7.950185,16.000858
TCGA-Q1-A6DW-01A-11R-A32P-07,1,16200.0,24.21875,0,0,0,0,0,1,0,...,9.572978,7.374251,7.794489,17.226296,6.3637,6.165439,4.602649,10.928503,5.094836,14.640358
TCGA-Q1-A73R-01A-11R-A33Z-07,1,16701.0,39.542144,0,0,0,0,0,1,0,...,10.371489,7.546277,7.782646,15.06388,5.924565,11.41953,4.602649,17.293717,5.124649,16.785389
TCGA-LP-A4AW-01A-11R-A24H-07,1,19079.0,20.829995,0,1,0,0,0,0,0,...,10.734178,7.707393,9.750078,15.168182,8.984331,5.508312,4.602649,11.969267,6.953788,14.933013
TCGA-MU-A5YI-01A-11R-A32P-07,1,21927.0,32.979592,0,0,1,0,0,0,0,...,12.349214,7.258085,8.122189,14.860586,8.706173,9.430509,4.890038,11.694373,7.229737,14.409821


# Optimize a SVM

## First, remove columns where std(X) $\approx$ 0

In [9]:
# filtered_joined_df = joined_df.iloc[:, np.where(joined_df.min(axis=0) != joined_df.max(axis=0))[0]]
# sum(joined_df.std(axis=0) < 1e-6)
filtered_joined_df = joined_df.iloc[:, np.where(joined_df.std(axis=0) > 1e-6)[0]]

In [10]:
def objective(h_params, svc_kernel, X, y):
    if svc_kernel == "linear":
        model = SVC(
            kernel=svc_kernel,
            C=h_params[0]
        )
    elif svc_kernel == "poly":
        model = SVC(
            kernel=svc_kernel,
            C=h_params[0],
            gamma=h_params[1],
            degree=h_params[2],
            coef0=h_params[3]
        )
    elif svc_kernel == "rbf":
        model = SVC(
            kernel=svc_kernel,
            C=h_params[0],
            gamma=h_params[1]
        )
    pipeline = make_pipeline(StandardScaler(), model)
    return -np.mean(cross_val_score(
        model,
        X,
        y,
        cv=KFold(n_splits=5, shuffle=True),
        n_jobs=-1,
        scoring="f1_weighted")
    )

In [11]:
X = filtered_joined_df.iloc[:, 1:].values
y = filtered_joined_df.iloc[:, 0].values

In [12]:
space_lin = [
    Real(1e-1, 1e1, name="C")
]

space_poly = [
    Real(1e-1, 1e1, name="C"),
    Real(1e-3, 1e-1, name="gamma"),
    Integer(1, 3, name="degree"),
    Real(-1, 1, name="coef0")
]

space_rbf = [
    Real(1e-1, 1e1, name="C"),
    Real(1e-3, 1e-1, name="gamma")
]

In [13]:
res_gp_linear = gp_minimize(lambda h_ps: objective(h_ps, "linear", X, y), space_lin, verbose=True)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 21.4846
Function value obtained: -0.4255
Current minimum: -0.4255
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 19.9105
Function value obtained: -0.4498
Current minimum: -0.4498
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 18.7219
Function value obtained: -0.4266
Current minimum: -0.4498
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 19.9443
Function value obtained: -0.4231
Current minimum: -0.4498
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 22.3353
Function value obtained: -0.4104
Current minimum: -0.4498
Iteration No: 6 star

In [14]:
print(f"Best values:\n\tC: {res_gp_linear.x[0]}")
res_gp_linear.fun

Best values:
	C: 6.8292536402462


-0.4663022068292486

In [15]:
res_gp_poly = gp_minimize(lambda h_ps: objective(h_ps, "poly", X, y), space_poly, verbose=True)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 15.1024
Function value obtained: -0.4135
Current minimum: -0.4135
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 21.5046
Function value obtained: -0.4180
Current minimum: -0.4180
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 14.1413
Function value obtained: -0.4265
Current minimum: -0.4265
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 11.5843
Function value obtained: -0.4292
Current minimum: -0.4292
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 14.6171
Function value obtained: -0.3921
Current minimum: -0.4292
Iteration No: 6 star

In [16]:
print(f"Best values:\n\tC: {res_gp_poly.x[0]}\n\tgamma: {res_gp_poly.x[1]}\n\tdegree: {res_gp_poly.x[2]}\n\tcoef0: {res_gp_poly.x[3]}")
res_gp_poly.fun

Best values:
	C: 6.014034071409524
	gamma: 0.05829891261752253
	degree: 3
	coef0: 0.038020107503995426


-0.47395253886659344

In [17]:
res_gp_rbf = gp_minimize(lambda h_ps: objective(h_ps, "rbf", X, y), space_rbf, verbose=True)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1606
Function value obtained: -0.3340
Current minimum: -0.3340
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1723
Function value obtained: -0.3339
Current minimum: -0.3340
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1597
Function value obtained: -0.3372
Current minimum: -0.3372
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.1617
Function value obtained: -0.3402
Current minimum: -0.3402
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1604
Function value obtained: -0.3350
Current minimum: -0.3402
Iteration No: 6 started. 

In [18]:
print(f"Best values:\n\tC: {res_gp_rbf.x[0]}\n\tgamma: {res_gp_rbf.x[1]}")
res_gp_rbf.fun

Best values:
	C: 3.5490033207494798
	gamma: 0.001


-0.35922424607905723

In [19]:
svm_opt_results_df = pd.DataFrame.from_dict(
    dict(linear = res_gp_linear.x, poly = res_gp_poly.x, rbf = res_gp_rbf.x),
    orient="index",
    columns=["C", "gamma", "degree", "coef0"]
)
svm_opt_results_df["min"] = [res_gp_linear.fun, res_gp_poly.fun, res_gp_rbf.fun]
svm_opt_results_df = svm_opt_results_df.reset_index().rename(columns={"index": "kernel"})

In [20]:
svm_opt_results_df.to_csv("svc_opt_res.csv", index=False)