This notebook preprocesses the mRNA and clinical data and saves the resulting X_train, y_train, X_test, and y_test files, along with the fitted preprocessor, for future use.

In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import config
from preprocessing_utils import load_clinical_data, load_mrna_data, generate_recurrence_labels, drop_patients_missing_data, MrnaPreprocessorWrapper, ClinicalPreprocessorWrapper

In [2]:
# --- Load the raw inputs and build the recurrence labels ---
clinical_df = load_clinical_data(config.CLINICAL_DATA_PATH)
mrna_df = load_mrna_data(config.MRNA_DATA_PATH)
labels = generate_recurrence_labels(
    treatment_file=config.TREATMENT_DATA_PATH,
    status_file=config.STATUS_DATA_PATH,
    clinical_file=config.CLINICAL_DATA_PATH,
)

# NOTE(review): presumably this restricts all three objects to the same set of
# patients in the same order -- train_test_split below pairs rows of full_df
# and labels by position, so confirm against preprocessing_utils.
clinical_df, mrna_df, labels = drop_patients_missing_data(clinical_df, mrna_df, labels)

# Capture each modality's column names before joining so that every
# ColumnTransformer branch is routed only its own columns.
clinical_cols = clinical_df.columns.tolist()
mrna_cols = mrna_df.columns.tolist()
full_df = clinical_df.join(mrna_df, how="inner")

# --- Split first, so the preprocessor is fitted on training rows only ---
X_train, X_test, y_train, y_test = train_test_split(
    full_df,
    labels,
    test_size=0.2,
    random_state=config.SEED,
    stratify=labels,
)

# --- Build one preprocessing branch per modality ---
preprocessor = ColumnTransformer(
    transformers=[
        ("clinical", ClinicalPreprocessorWrapper(
            cols_to_remove=config.CLINICAL_COLS_TO_REMOVE,
            categorical_cols=config.CATEGORICAL_COLS,
            max_null_frac=config.CLINICAL_MAX_NULL_FRAC,
            uniform_thresh=config.CLINICAL_UNIFORM_THRESH
        ), clinical_cols),

        ("mrna", MrnaPreprocessorWrapper(
            max_null_frac=config.MAX_NULL_FRAC,
            uniform_thresh=config.UNIFORM_THRESHOLD,
            corr_thresh=config.CORRELATION_THRESHOLD,
            var_thresh=config.VARIANCE_THRESHOLD,
            re_run_pruning=config.RE_RUN_PRUNING,
            literature_genes=config.LITERATURE_GENES,
            correlated_genes_path=config.CORRELATED_GENES_PATH,
            use_stability_selection=config.USE_STABILITY_SELECTION,
            n_boots=config.N_BOOTS,
            fpr_alpha=config.FPR_ALPHA,
            stability_threshold=config.STABILITY_THRESHOLD,
            random_state=config.SEED
        ), mrna_cols),
    ]
)

preprocessor.set_output(transform="pandas") # otherwise, output is converted to numpy array

# y_train is passed through to the branches during fitting.
preprocessor.fit(X_train, y_train)

# Re-wrap the transformed data so the result is always a DataFrame carrying the
# original row index and the fitted feature names.
# NOTE(review): if transform() already returns a DataFrame, `columns=` selects
# by label rather than renaming, so any name mismatch would surface as all-NaN
# columns -- confirm the wrappers' output names match get_feature_names_out().
X_train_preprocessed = pd.DataFrame(
    preprocessor.transform(X_train),
    index=X_train.index,
    columns=preprocessor.get_feature_names_out()
)

X_test_preprocessed = pd.DataFrame(
    preprocessor.transform(X_test),
    index=X_test.index,
    columns=preprocessor.get_feature_names_out()
)

# --- Save preprocessed data, labels, and the fitted preprocessor ---
OUTPUT_DIR = "../data/no_boot"
# joblib.dump does not create missing parent directories; make sure the target
# exists so a fresh checkout does not fail after the fit has already run.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

joblib.dump(X_train_preprocessed, f"{OUTPUT_DIR}/X_train.pkl")
joblib.dump(X_test_preprocessed, f"{OUTPUT_DIR}/X_test.pkl")
joblib.dump(y_train, f"{OUTPUT_DIR}/y_train.pkl")
joblib.dump(y_test, f"{OUTPUT_DIR}/y_test.pkl")
joblib.dump(preprocessor, f"{OUTPUT_DIR}/preprocessor.pkl")  # Save preprocessor for future use



Dropped 3024 columns with >25.0% nulls
Dropped 0 highly uniform columns
Dropped 0 low variance columns (<1e-05)
dropping 3158 columns total
dropping 3158 columns total
dropping 3158 columns total


['../data/no_boot/preprocessor.pkl']