This file preprocesses mRNA and Clinical data and saves, X_train, y_train, X_test, y_test files for future use

In [1]:
import pandas as pd
import config
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import joblib

from preprocessing_utils import load_clinical_data, load_mrna_data, load_mutation_data, generate_recurrence_labels, drop_patients_missing_data, MrnaPreprocessorWrapper, ClinicalPreprocessorWrapper, MutationPreprocessorWrapper

In [None]:
clinical_df = load_clinical_data(config.CLINICAL_DATA_PATH)
mrna_df = load_mrna_data(config.MRNA_DATA_PATH)
mutation_df = load_mutation_data(config.MUTATION_DATA_PATH)
labels = generate_recurrence_labels(
    treatment_file=config.TREATMENT_DATA_PATH,
    status_file=config.STATUS_DATA_PATH,
    clinical_file=config.CLINICAL_DATA_PATH,
)

print("Clinical data shape:", clinical_df.shape)
print("mRNA data shape:", mrna_df.shape)
print("Mutation data shape:", mutation_df.shape)
print("Labels shape:", labels.shape)


clinical_df, mrna_df, mutation_df, labels = drop_patients_missing_data(clinical_df, mrna_df, mutation_df, labels)

clinical_cols = clinical_df.columns.tolist()
mrna_cols = mrna_df.columns.tolist()
mutation_cols = mutation_df.columns.tolist()

print("Clinical data shape:", clinical_df.shape)
print("mRNA data shape:", mrna_df.shape)
print("Mutation data shape:", mutation_df.shape)
print("Labels shape:", labels.shape)


full_df = clinical_df.join(mrna_df, how="inner").join(mutation_df, how="inner")

X_train, X_test, y_train, y_test = train_test_split(full_df, labels, test_size=0.2, random_state=config.SEED, stratify=labels)

preprocessor = ColumnTransformer(
    transformers=[
        ("clinical", ClinicalPreprocessorWrapper(
            cols_to_remove=config.CLINICAL_COLS_TO_REMOVE,
            categorical_cols=config.CATEGORICAL_COLS,
            max_null_frac=config.CLINICAL_MAX_NULL_FRAC,
            uniform_thresh=config.CLINICAL_UNIFORM_THRESH
        ), clinical_cols),

        ("mrna", MrnaPreprocessorWrapper(
            max_null_frac=config.MAX_NULL_FRAC,
            uniform_thresh=config.UNIFORM_THRESHOLD,
            corr_thresh=config.CORRELATION_THRESHOLD,
            var_thresh=config.VARIANCE_THRESHOLD,
            re_run_pruning=config.RE_RUN_PRUNING,
            literature_genes=config.LITERATURE_GENES,
            correlated_genes_path=config.CORRELATED_GENES_PATH,
            use_stability_selection=config.USE_STABILITY_SELECTION,
            n_boots=config.N_BOOTS,
            fpr_alpha=config.FPR_ALPHA,
            stability_threshold=config.STABILITY_THRESHOLD,
            random_state=config.SEED
        ), mrna_cols),

        ("mutation", MutationPreprocessorWrapper(
            max_null_frac=config.MUTATION_MAX_NULL_FRAC,
            uniform_thresh=config.MUTATION_UNIFORM_THRESH
        ), clinical_cols),
    ]
)

preprocessor.set_output(transform="pandas") # otherwise, output is converted to numpy array


Clinical data shape: (529, 37)
mRNA data shape: (527, 20531)
Mutation data shape: (515, 19112)
Labels shape: (529,)
Clinical data shape: (452, 37)
mRNA data shape: (452, 20531)
Mutation data shape: (452, 19112)
Labels shape: (452,)


In [5]:
preprocessor.fit(X_train, y_train)

X_train_preprocessed = pd.DataFrame(
    preprocessor.transform(X_train),
    index=X_train.index,
    columns=preprocessor.get_feature_names_out()
)

X_test_preprocessed = pd.DataFrame(
    preprocessor.transform(X_test),
    index=X_test.index,
    columns=preprocessor.get_feature_names_out()
)


# # # --- Step 8: Save preprocessed data and labels ---
# joblib.dump(X_train_preprocessed, "../data/mutation/X_train.pkl")
# joblib.dump(X_test_preprocessed, "../data/mutation/X_test.pkl")
# joblib.dump(y_train, "../data/mutation/y_train.pkl")
# joblib.dump(y_test, "../data/mutation/y_test.pkl")
# joblib.dump(preprocessor, "../data/mutation/preprocessor.pkl")  # Save preprocessor for future use



Dropped 3024 columns with >25.0% nulls
Dropped 0 highly uniform columns
Dropped 0 low variance columns (<1e-05)
saving correlated genes to  ../new_data/correlated_genes_to_remove.pkl
Dropped 159 correlated genes (>0.9 correlation)
dropping 3183 columns total
Dropped 5 columns with >30.0% nulls
Dropped 5 highly uniform columns


TypeError: Cannot convert [['UCEC_MSI' 'UCEC_CN_HIGH' 'UCEC_CN_LOW' ... 'UCEC_MSI' 'UCEC_MSI'
  'UCEC_MSI']
 ['a766d113-bc3f-4448-a4e5-f391fb0c0e22'
  'ac216bb5-15e4-4462-955e-8a522380690e'
  '9cde90c8-2be0-4328-a032-ff1dc19e66fd' ...
  '2714af93-e200-4b4a-9df8-7d837b37489f'
  'a438dce7-6592-4ea6-a401-57f5a8fe8ba6'
  'e5d74e47-b63f-410a-8b7a-c9aacc9aa337']
 ['1731' '911' '1477' ... '802' '991' '833']
 ...
 ['0:DiseaseFree' nan '0:DiseaseFree' ... nan '1:Recurred/Progressed'
  '0:DiseaseFree']
 ['0:CENSORED' '1:PROGRESSION' '0:CENSORED' ... '0:CENSORED'
  '1:PROGRESSION' '0:CENSORED']
 ['EUR' 'EUR' 'EUR' ... 'EUR' 'EUR' 'EUR']] to numeric