This file preprocesses mRNA and Clinical data and saves, X_train, y_train, X_test, y_test files for future use

In [1]:
import pandas as pd
import config
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import joblib

from preprocessing_utils import load_clinical_data, load_mrna_data, load_mutation_data, generate_recurrence_labels, drop_patients_missing_data, MrnaPreprocessorWrapper, ClinicalPreprocessorWrapper, MutationPreprocessorWrapper

In [None]:
clinical_df = load_clinical_data(config.CLINICAL_DATA_PATH)
mrna_df = load_mrna_data(config.MRNA_DATA_PATH)
mutation_df = load_mutation_data(config.MUTATION_DATA_PATH)
labels = generate_recurrence_labels(
    treatment_file=config.TREATMENT_DATA_PATH,
    status_file=config.STATUS_DATA_PATH,
    clinical_file=config.CLINICAL_DATA_PATH,
)

print("Clinical data shape:", clinical_df.shape)
print("mRNA data shape:", mrna_df.shape)
print("Mutation data shape:", mutation_df.shape)
print("Labels shape:", labels.shape)


clinical_df, mrna_df, mutation_df, labels = drop_patients_missing_data(clinical_df, mrna_df, mutation_df, labels)

clinical_cols = clinical_df.columns.tolist()
mrna_cols = mrna_df.columns.tolist()
mutation_cols = mutation_df.columns.tolist()

print("Clinical data shape:", clinical_df.shape)
print("mRNA data shape:", mrna_df.shape)
print("Mutation data shape:", mutation_df.shape)
print("Labels shape:", labels.shape)


full_df = clinical_df.join(mrna_df, how="inner").join(mutation_df, how="inner")

X_train, X_test, y_train, y_test = train_test_split(full_df, labels, test_size=0.2, random_state=config.SEED, stratify=labels)

preprocessor = ColumnTransformer(
    transformers=[
        ("clinical", ClinicalPreprocessorWrapper(
            cols_to_remove=config.CLINICAL_COLS_TO_REMOVE,
            categorical_cols=config.CATEGORICAL_COLS,
            max_null_frac=config.CLINICAL_MAX_NULL_FRAC,
            uniform_thresh=config.CLINICAL_UNIFORM_THRESH
        ), clinical_cols),

        ("mrna", MrnaPreprocessorWrapper(
            max_null_frac=config.MAX_NULL_FRAC,
            uniform_thresh=config.UNIFORM_THRESHOLD,
            corr_thresh=config.CORRELATION_THRESHOLD,
            var_thresh=config.VARIANCE_THRESHOLD,
            re_run_pruning=config.RE_RUN_PRUNING,
            literature_genes=config.LITERATURE_GENES,
            correlated_genes_path=config.CORRELATED_GENES_PATH,
            use_stability_selection=config.USE_STABILITY_SELECTION,
            n_boots=config.N_BOOTS_FPR,
            fpr_alpha=config.FPR_ALPHA,
            stability_threshold=config.STABILITY_THRESHOLD_FPR,
            random_state=config.SEED
        ), mrna_cols),

        ("mutation", MutationPreprocessorWrapper(
            max_null_frac=config.MUTATION_MAX_NULL_FRAC,
            uniform_thresh=config.MUTATION_UNIFORM_THRESH
        ), mutation_cols),
    ]
)

preprocessor.set_output(transform="pandas") # otherwise, output is converted to numpy array


Clinical data shape: (529, 37)
mRNA data shape: (527, 20531)
Mutation data shape: (515, 19112)
Labels shape: (529,)
Clinical data shape: (452, 37)
mRNA data shape: (452, 20531)
Mutation data shape: (452, 19112)
Labels shape: (452,)


AttributeError: module 'config' has no attribute 'N_BOOTS'

In [None]:
print(len(set(X_train.columns) - set(mutation_cols)))
print(len(set(X_train.columns) - set(mrna_cols)))
print(len(set(X_train.columns) - set(clinical_cols)))
print(len(X_train.columns))

20568
19149
39643
39680


In [None]:
preprocessor.fit(X_train, y_train)

X_train_preprocessed = pd.DataFrame(
    preprocessor.transform(X_train),
    index=X_train.index,
    columns=preprocessor.get_feature_names_out()
)

X_test_preprocessed = pd.DataFrame(
    preprocessor.transform(X_test),
    index=X_test.index,
    columns=preprocessor.get_feature_names_out()
)




Dropped 3024 columns with >25.0% nulls
Dropped 0 highly uniform columns
Dropped 0 low variance columns (<1e-05)
dropping 3183 columns total
Dropped 0 columns with >30.0% nulls
Dropped 1378 highly uniform columns
Dropped 0 low variance columns (<1e-05)
saving correlated genes to  ../data/correlated_genes_to_remove.pkl
Dropped 0 correlated genes (>0.9 correlation)


  f = msb / msw
 16753 17675] are constant.
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
 13410 13870 14259 15757 17146] are constant.
  f = msb / msw
 15919 17068 17129 17661] are constant.
  f = msb / msw
 12848 13575 13810 14579 15435 15574 15728 16419 16813 17146 17463] are constant.
  f = msb / msw
 16195 16234 17010] are constant.
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
 10095 11449 13870 15534 16325] are constant.
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
 16593 17389 17515] are constant.
  f = msb / msw
  8645  8648  8803 10553 11258 12595 12945 13550 13985 15152 15859 15905
 17009 17257] are constant.
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  4886  6085  6630  6634  6710  7305  8102 10380 11881 12611 12959 13272
 15859 16325 16593 16722 16837] are constant.
  f = 

Stability selection: kept 754 / 17734 features (80% stability threshold)Used 100 boots
dropping 18358 columns total
dropping 3183 columns total
dropping 18358 columns total
dropping 3183 columns total
dropping 18358 columns total


In [None]:
# # --- Step 8: Save preprocessed data and labels ---
joblib.dump(X_train_preprocessed, "../data/mutation/X_train.pkl")
joblib.dump(X_test_preprocessed, "../data/mutation/X_test.pkl")
joblib.dump(y_train, "../data/mutation/y_train.pkl")
joblib.dump(y_test, "../data/mutation/y_test.pkl")
joblib.dump(preprocessor, "../data/mutation/preprocessor.pkl")  # Save preprocessor for future use



['../data/mutation/preprocessor.pkl']

In [None]:
kept_mutation_features = set(X_train_preprocessed.columns).intersection(mutation_cols)
kept_clinical_features = set(X_train_preprocessed.columns).intersection(clinical_cols)
kept_mrna_features = set(X_train_preprocessed.columns).intersection(mrna_cols)
print("Number of mutation features kept:", len(kept_mutation_features))
print("Number of clinical features kept:", len(kept_clinical_features))


Number of mutation features kept: 0
Number of clinical features kept: 0
Number of mRNA features kept: 0


In [None]:
# Convert to sets
all_cols = set(X_train.columns)
mutation_cols = set(mutation_cols)
clinical_cols = set(clinical_cols)
mrna_cols = set(mrna_cols)

# Intersections
mut_in_X = all_cols & mutation_cols
clin_in_X = all_cols & clinical_cols
mrna_in_X = all_cols & mrna_cols

print("Mutation features in X:", len(mut_in_X))
print("Clinical features in X:", len(clin_in_X))
print("mRNA features in X:", len(mrna_in_X))

# Optional: check if any columns are unaccounted for
unclassified = all_cols - (mutation_cols | clinical_cols | mrna_cols)
print("Unclassified features:", len(unclassified))


Mutation features in X: 19112
Clinical features in X: 37
mRNA features in X: 20531
Unclassified features: 0
