In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn import set_config
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from joblib import dump, load

In [2]:
df = pd.read_csv("data_separated/train_data.csv", low_memory=False)
df

Unnamed: 0,age_at_diagnosis,type_of_breast_surgery,cancer_type,cancer_type_detailed,cellularity,chemotherapy,pam50_+_claudin-low_subtype,cohort,er_status_measured_by_ihc,er_status,...,mtap_mut,ppp2cb_mut,smarcd1_mut,nras_mut,ndfip1_mut,hras_mut,prps2_mut,smarcb1_mut,stmn2_mut,siah1_mut
0,66.27,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumB,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
1,57.15,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,1,Her2,1.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
2,77.22,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,1,claudin-low,1.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
3,63.58,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Low,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
4,44.95,BREAST CONSERVING,Breast Cancer,Breast Invasive Lobular Carcinoma,Low,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,30.95,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,1,Basal,2.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
781,57.98,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
782,84.22,MASTECTOMY,Breast Cancer,Breast Invasive Lobular Carcinoma,High,0,Her2,1.0,Negative,Positive,...,0,0,0,0,0,0,0,0,0,0
783,43.80,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,Normal,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0


## Encoding

Se hace uso de de encoding y estandarizacion para ciertos atributos:
- **Atributos clinicos:** Encoding + Estandarizacion
- **mRNA z-score:** Nada
- **Mutaciones:** One-Hot + Estandarización

Cuando usar:
- **Ordinal encoding:** Los datos tienen un orden con sentido como 'low', 'medium', 'high' o 'primero', 'segundo', 'tercero'
- **One-Hot encoding:** cuando los datos no tienen un orden entre categorias, ejemplo: 'rojo', 'verde', 'azul'

Importante: Dado que los clinical attributes deben ser procesados con un scaler, las mutaciones con ordinal encoder + scaler y hay data sin tocar, entonces se utilizará: Sklearn.compose.ColumnTransformer
https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

¿Por qué escalar?
- En algoritmos basados en árboles (RF, DT, GBM) no es necesario escalar ya que no son suceptibles al este cambio.
- En algoritmos basados en distancias (KNN, SVM, Logistic Regression) se benefician de las métricas de distancias, por ello se recomienda escalar

https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf

In [3]:
set_config(display="diagram")

In [12]:
def encoding_scaler_pipeline(df: pd.DataFrame, scaler: str):
    ordinal_columns = ["cellularity", "integrative_cluster"]
    one_hot_columns = ["type_of_breast_surgery", "cancer_type", "cancer_type_detailed", "pam50_+_claudin-low_subtype", "er_status_measured_by_ihc", "er_status", "her2_status_measured_by_snp6", "her2_status", "tumor_other_histologic_subtype", "inferred_menopausal_state", "primary_tumor_laterality", "oncotree_code", "pr_status", "3-gene_classifier_subtype", "death_from_cancer"]
    mut_columns = [col for col in df.columns if "_mut" in col]
    clinical_attributes = ["age_at_diagnosis", "type_of_breast_surgery", "cancer_type", "cancer_type_detailed", "cellularity", "chemotherapy", "pam50_+_claudin-low_subtype", "cohort", "er_status_measured_by_ihc", "er_status", "neoplasm_histologic_grade", "her2_status_measured_by_snp6", "her2_status", "tumor_other_histologic_subtype", "hormone_therapy", "inferred_menopausal_state", "integrative_cluster", "primary_tumor_laterality", "lymph_nodes_examined_positive", "mutation_count", "nottingham_prognostic_index", "oncotree_code", "overall_survival_months", "overall_survival", "pr_status", "radio_therapy", "3-gene_classifier_subtype", "tumor_size", "tumor_stage", "death_from_cancer"]
    z_score_cols = [col for col in df.columns if col not in clinical_attributes and "mut" not in col ]
    # Valores que ya son numeros
    encoding_ignore_list = z_score_cols + list(set(clinical_attributes) - (set(one_hot_columns) | set(ordinal_columns)))

    print("Encoding data:")
    print(f"- One_hot (mut):  {len(mut_columns)}")
    print(f"- Ordinal (hand): {len(ordinal_columns)}")
    print(f"- One_hot (hand): {len(one_hot_columns)}")
    print(f"- ignored:        {len(encoding_ignore_list)}")

    encoding_step = ColumnTransformer(
        transformers = [
            ('mutations', OneHotEncoder(handle_unknown='ignore', sparse_output=False), mut_columns),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1), ordinal_columns),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_columns),
            # Todas las que no quiero escalar porque ya son numeros
            # Se encuentran 2:
            # - z_score
            # - Aquellos en clinical_attributes que ya son numeros
            ('passthrough', 'passthrough', encoding_ignore_list)
        ]
    )

    # Para extraer las columnas resultantes
    encoding_step.fit(df)

    scaler_columns = encoding_step.get_feature_names_out()
    # El nombre cambia a passthrough__algo
    scaler_columns = [i for i, name in enumerate(scaler_columns) if name.replace("passthrough__","") not in z_score_cols]
    
    print("Resulting encoding:")
    print(f"Number of Z-score columns: {len(z_score_cols)}")
    print(f"New columns count: {len(scaler_columns)}")
    

    if scaler == "StandardScaler":
        scaler_step = ColumnTransformer(
            transformers = [
                ("scaling", StandardScaler(), scaler_columns)
            ],
            remainder='passthrough'
        )
    elif scaler == "MinMaxScaler":
        scaler_step = ColumnTransformer(
            transformers = [
                ("scaling", MinMaxScaler(), scaler_columns)
            ],
            remainder='passthrough'
        )
    elif scaler == "RobustScaler":
        scaler_step = ColumnTransformer(
            transformers = [
                ("scaling", RobustScaler(), scaler_columns)
            ],
            remainder='passthrough'
        )
    else:
        return None, None

    pipeline = Pipeline(
        steps=[
            ("preprocessor", encoding_step),
            ("scaler", scaler_step)
        ]
    )

    return pipeline, pipeline.fit(X=df), (
        list(encoding_step.named_transformers_['mutations'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['ordinal'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['onehot'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['passthrough'].get_feature_names_out())
    )


In [13]:
pipeline, standard_scaler, columns = encoding_scaler_pipeline(df, "StandardScaler")
std_df = pd.DataFrame(data = standard_scaler.transform(df), columns=columns)
std_df

Encoding data:
- One_hot (mut):  173
- Ordinal (hand): 2
- One_hot (hand): 15
- ignored:        502
Resulting encoding:
Number of Z-score columns: 489
New columns count: 3663


Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,cohort,neoplasm_histologic_grade,overall_survival_months,lymph_nodes_examined_positive,chemotherapy,radio_therapy,nottingham_prognostic_index,tumor_stage,tumor_size,age_at_diagnosis
0,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.5559,-0.2513,0.6768,-0.4699,-0.8043,-0.1200,-0.2491,-0.1981,-0.5930,-0.8780
1,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.0549,0.6913,2.5801,-0.9218,-0.8076,-2.1920,-2.2269,2.6627,0.5872,0.9482
2,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.1460,0.3313,0.0037,-0.5823,0.7512,-0.9808,-1.4851,0.1895,0.0004,0.8290
3,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.8698,-0.2068,0.8378,-0.1821,0.6702,1.0958,-0.5434,-0.5250,-0.5375,0.0854
4,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.0866,-0.3135,0.3093,-0.4607,-1.3723,0.7862,0.6970,0.7847,-0.0541,2.3132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,0.7196,1.0929,-0.8605,1.0466,-0.2438,0.5274,-0.2833,-0.3524,-0.0032,-0.7324
781,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,0.1539,0.4315,-0.0844,-1.1365,-0.4443,0.6097,2.0752,-0.1821,-0.2287,-0.8529
782,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.3020,-0.4788,3.6125,1.9925,0.2253,-0.8414,1.7093,-0.6921,0.6906,-0.8006
783,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.2806,-0.5389,-0.5759,-1.4376,1.5154,-0.9568,0.7952,-0.0290,-0.0375,0.1497


In [14]:
pipeline

In [103]:
minmax_scaler, columns = encoding_scaler_pipeline(df, "MinMaxScaler")
minmax_df = pd.DataFrame(data = minmax_scaler.transform(df), columns=columns)
minmax_df

Encoding data:
- One_hot (mut):  173
- Ordinal (hand): 2
- One_hot (hand): 15
- ignored:        502
Resulting encoding:
Number of Z-score columns: 489
New columns count: 3663


Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,tumor_size,nottingham_prognostic_index,radio_therapy,overall_survival_months,overall_survival,neoplasm_histologic_grade,age_at_diagnosis,chemotherapy,lymph_nodes_examined_positive,mutation_count
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.5559,-0.2513,0.6768,-0.4699,-0.8043,-0.1200,-0.2491,-0.1981,-0.5930,-0.8780
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0549,0.6913,2.5801,-0.9218,-0.8076,-2.1920,-2.2269,2.6627,0.5872,0.9482
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.1460,0.3313,0.0037,-0.5823,0.7512,-0.9808,-1.4851,0.1895,0.0004,0.8290
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.8698,-0.2068,0.8378,-0.1821,0.6702,1.0958,-0.5434,-0.5250,-0.5375,0.0854
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0866,-0.3135,0.3093,-0.4607,-1.3723,0.7862,0.6970,0.7847,-0.0541,2.3132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.7196,1.0929,-0.8605,1.0466,-0.2438,0.5274,-0.2833,-0.3524,-0.0032,-0.7324
781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1539,0.4315,-0.0844,-1.1365,-0.4443,0.6097,2.0752,-0.1821,-0.2287,-0.8529
782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.3020,-0.4788,3.6125,1.9925,0.2253,-0.8414,1.7093,-0.6921,0.6906,-0.8006
783,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.2806,-0.5389,-0.5759,-1.4376,1.5154,-0.9568,0.7952,-0.0290,-0.0375,0.1497


In [104]:
robust_scaler, columns = encoding_scaler_pipeline(df, "RobustScaler")
robust_df = pd.DataFrame(data = robust_scaler.transform(df), columns=columns)
robust_df

Encoding data:
- One_hot (mut):  173
- Ordinal (hand): 2
- One_hot (hand): 15
- ignored:        502
Resulting encoding:
Number of Z-score columns: 489
New columns count: 3663


Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,tumor_size,nottingham_prognostic_index,radio_therapy,overall_survival_months,overall_survival,neoplasm_histologic_grade,age_at_diagnosis,chemotherapy,lymph_nodes_examined_positive,mutation_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.5559,-0.2513,0.6768,-0.4699,-0.8043,-0.1200,-0.2491,-0.1981,-0.5930,-0.8780
1,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0549,0.6913,2.5801,-0.9218,-0.8076,-2.1920,-2.2269,2.6627,0.5872,0.9482
2,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.1460,0.3313,0.0037,-0.5823,0.7512,-0.9808,-1.4851,0.1895,0.0004,0.8290
3,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.8698,-0.2068,0.8378,-0.1821,0.6702,1.0958,-0.5434,-0.5250,-0.5375,0.0854
4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0866,-0.3135,0.3093,-0.4607,-1.3723,0.7862,0.6970,0.7847,-0.0541,2.3132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.7196,1.0929,-0.8605,1.0466,-0.2438,0.5274,-0.2833,-0.3524,-0.0032,-0.7324
781,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1539,0.4315,-0.0844,-1.1365,-0.4443,0.6097,2.0752,-0.1821,-0.2287,-0.8529
782,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.3020,-0.4788,3.6125,1.9925,0.2253,-0.8414,1.7093,-0.6921,0.6906,-0.8006
783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.2806,-0.5389,-0.5759,-1.4376,1.5154,-0.9568,0.7952,-0.0290,-0.0375,0.1497


In [105]:
std_df.to_csv("data_separated/train_data_std.csv", index=False)
minmax_df.to_csv("data_separated/train_data_minmax.csv", index=False)
robust_df.to_csv("data_separated/train_data_robust.csv", index=False)

In [114]:
if not os.path.exists("results"):
    os.mkdir("results")

dump(standard_scaler, "results/std_scaler.joblib")
dump(minmax_scaler, "results/minmax_scaler.joblib")
dump(robust_scaler, "results/robust_scaler.joblib")

['results/robust_scaler.joblib']

### Testings

In [115]:
rt = load("results/robust_scaler.joblib")

In [116]:
data = pd.read_csv("data_separated/test_data.csv")
data

Unnamed: 0,age_at_diagnosis,type_of_breast_surgery,cancer_type,cancer_type_detailed,cellularity,chemotherapy,pam50_+_claudin-low_subtype,cohort,er_status_measured_by_ihc,er_status,...,mtap_mut,ppp2cb_mut,smarcd1_mut,nras_mut,ndfip1_mut,hras_mut,prps2_mut,smarcb1_mut,stmn2_mut,siah1_mut
0,63.53,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumB,2.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
1,51.74,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,0,LumA,1.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
2,66.75,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,LumA,1.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
3,80.17,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumB,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
4,67.15,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,Her2,3.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,61.88,BREAST CONSERVING,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
106,65.48,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,claudin-low,3.0,Positve,Negative,...,0,0,0,0,0,0,0,0,0,0
107,38.86,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,1,LumA,1.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
108,44.36,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0


In [117]:
data.dtypes

age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                    int64
prps2_mut                   int64
smarcb1_mut                 int64
stmn2_mut                   int64
siah1_mut                   int64
Length: 692, dtype: object

In [118]:
mut_columns = [col for col in data.columns if "_mut" in col]
for col in mut_columns:
    data[col] = data[col].astype('object')

print("Adjusted test dataframe data types:")
print(data.dtypes)

Adjusted test dataframe data types:
age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                   object
prps2_mut                  object
smarcb1_mut                object
stmn2_mut                  object
siah1_mut                  object
Length: 692, dtype: object


In [119]:
rt.transform(data)

array([[ 0.    ,  0.    ,  0.    , ...,  3.2955, -0.68  ,  1.0835],
       [ 0.    ,  0.    ,  0.    , ..., -0.528 ,  0.2123, -0.7469],
       [-1.    ,  0.    ,  0.    , ..., -0.7529, -0.4853, -0.6189],
       ...,
       [ 0.    ,  0.    ,  0.    , ...,  0.0554,  0.7301, -0.0947],
       [ 0.    ,  0.    ,  0.    , ..., -0.2562,  0.8312, -0.1766],
       [-1.    ,  0.    ,  0.    , ...,  8.6199,  0.2396,  1.8495]])

# DEPRECATED

In [12]:
std_scaler_instance = StandardScaler()
std_scaler_instance.fit(X=df_data.values)
data_scaled_std = std_scaler_instance.transform(X=df_data.values)

In [13]:
minmax_scaler_instance = MinMaxScaler()
minmax_scaler_instance.fit(X=df_data.values)
data_scaled_minmax = minmax_scaler_instance.transform(X=df_data.values)

In [14]:
robust_scaler_instance = RobustScaler()
robust_scaler_instance.fit(X=df_data.values)
data_scaled_robust = robust_scaler_instance.transform(X=df_data.values)

In [17]:
df_scaled_std = pd.DataFrame(data=data_scaled_std, columns=df_data.columns)
df_scaled_minmax = pd.DataFrame(data=data_scaled_minmax, columns=df_data.columns)
df_scaled_robust = pd.DataFrame(data=data_scaled_robust, columns=df_data.columns)

In [23]:
df_scaled_std.to_csv("data_separated/train_data_std.csv", index=False)
df_scaled_minmax.to_csv("data_separated/train_data_minmax.csv", index=False)
df_scaled_robust.to_csv("data_separated/train_data_robust.csv", index=False)

In [22]:
dump(std_scaler_instance, "results/scaler_std.joblib")
dump(minmax_scaler_instance, "results/scaler_std.joblib")
dump(robust_scaler_instance, "results/scaler_std.joblib")

['results/scaler_std.joblib']