In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from joblib import dump, load

In [22]:
df = pd.read_csv("data_separated/train_data.csv", low_memory=False)
df

Unnamed: 0,age_at_diagnosis,type_of_breast_surgery,cancer_type,cancer_type_detailed,cellularity,chemotherapy,pam50_+_claudin-low_subtype,cohort,er_status_measured_by_ihc,er_status,...,mtap_mut,ppp2cb_mut,smarcd1_mut,nras_mut,ndfip1_mut,hras_mut,prps2_mut,smarcb1_mut,stmn2_mut,siah1_mut
0,66.27,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumB,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
1,57.15,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,1,Her2,1.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
2,77.22,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,1,claudin-low,1.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
3,63.58,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Low,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
4,44.95,BREAST CONSERVING,Breast Cancer,Breast Invasive Lobular Carcinoma,Low,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,30.95,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,1,Basal,2.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
781,57.98,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
782,84.22,MASTECTOMY,Breast Cancer,Breast Invasive Lobular Carcinoma,High,0,Her2,1.0,Negative,Positive,...,0,0,0,0,0,0,0,0,0,0
783,43.80,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,Normal,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0


## Encoding

Se hace uso de de encoding y estandarizacion para ciertos atributos:
- **Atributos clinicos:** Encoding + Estandarizacion
- **mRNA z-score:** Nada
- **Mutaciones:** One-Hot + Estandarización

Cuando usar:
- **Ordinal encoding:** Los datos tienen un orden con sentido como 'low', 'medium', 'high' o 'primero', 'segundo', 'tercero'
- **One-Hot encoding:** cuando los datos no tienen un orden entre categorias, ejemplo: 'rojo', 'verde', 'azul'

Importante: Dado que los clinical attributes deben ser procesados con un scaler, las mutaciones con ordinal encoder + scaler y hay data sin tocar, entonces se utilizará: Sklearn.compose.ColumnTransformer
https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

¿Por qué escalar?
- En algoritmos basados en árboles (RF, DT, GBM) no es necesario escalar ya que no son suceptibles al este cambio.
- En algoritmos basados en distancias (KNN, SVM, Logistic Regression) se benefician de las métricas de distancias, por ello se recomienda escalar

https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf

In [36]:
def encoding_scaler_pipeline(df: pd.DataFrame, scaler: str):
    ordinal_columns = ["cellularity", "integrative_cluster"]
    one_hot_columns = ["type_of_breast_surgery", "cancer_type", "cancer_type_detailed", "pam50_+_claudin-low_subtype", "er_status_measured_by_ihc", "er_status", "her2_status_measured_by_snp6", "her2_status", "tumor_other_histologic_subtype", "inferred_menopausal_state", "primary_tumor_laterality", "oncotree_code", "pr_status", "3-gene_classifier_subtype", "death_from_cancer"]
    mut_columns = [col for col in df.columns if "_mut" in col]
    clinical_attributes = ["age_at_diagnosis", "type_of_breast_surgery", "cancer_type", "cancer_type_detailed", "cellularity", "chemotherapy", "pam50_+_claudin-low_subtype", "cohort", "er_status_measured_by_ihc", "er_status", "neoplasm_histologic_grade", "her2_status_measured_by_snp6", "her2_status", "tumor_other_histologic_subtype", "hormone_therapy", "inferred_menopausal_state", "integrative_cluster", "primary_tumor_laterality", "lymph_nodes_examined_positive", "mutation_count", "nottingham_prognostic_index", "oncotree_code", "overall_survival_months", "overall_survival", "pr_status", "radio_therapy", "3-gene_classifier_subtype", "tumor_size", "tumor_stage", "death_from_cancer"]
    z_score_cols = [col for col in df.columns if col not in clinical_attributes and "mut" not in col ]

    encoding_step = ColumnTransformer(
        transformers = [
            ('mutations', OneHotEncoder(handle_unknown='ignore', sparse_output=False), mut_columns),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_columns),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_columns),
            # Todas las que no quiero escalar porque ya son numeros
            # Se encuentran 2:
            # - z_score
            # - Aquellos en clinical_attributes que ya son numeros
            ('passthrough', 'passthrough', z_score_cols + list(set(clinical_attributes) - (set(one_hot_columns) | set(ordinal_columns))))
        ],
        remainder='passthrough'
    )
    
    if scaler == "StandardScaler":
        pipeline = Pipeline(steps=[
            ('preprocessor', encoding_step),
            ('scaler', StandardScaler())
        ])
    elif scaler == "MinMaxScaler":
        pipeline = Pipeline(steps=[
            ('preprocessor', encoding_step),
            ('scaler', MinMaxScaler())
        ])
    elif scaler == "RobustScaler":
        pipeline = Pipeline(steps=[
            ('preprocessor', encoding_step),
            ('scaler', RobustScaler())
        ])
    else:
        return None, None
    
    
    return pipeline.fit(X=df), (
        list(encoding_step.named_transformers_['mutations'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['ordinal'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['onehot'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['passthrough'].get_feature_names_out())
    )


In [37]:
standard_scaler, columns = encoding_scaler_pipeline(df, "StandardScaler")
std_df = pd.DataFrame(data = standard_scaler.transform(df), columns=columns)
std_df

Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,nottingham_prognostic_index,neoplasm_histologic_grade,radio_therapy,cohort,hormone_therapy,age_at_diagnosis,chemotherapy,overall_survival_months,overall_survival,mutation_count
0,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,0.867233,0.844813,0.707782,0.816962,0.790733,0.451118,-0.545455,2.079405,-0.876366,0.096791
1,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,0.853989,0.844813,0.707782,-1.278839,-1.264649,-0.248938,1.833333,-0.359194,-0.876366,1.806965
2,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,1.822680,0.844813,0.707782,-1.278839,-1.264649,1.291646,1.833333,-1.144447,-0.876366,-0.880452
3,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-1.039875,-0.737951,-1.412863,0.816962,-1.264649,0.244632,-0.545455,1.139690,-0.876366,-0.391831
4,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-1.072039,-0.737951,0.707782,0.816962,-1.264649,-1.185416,-0.545455,1.417980,1.141075,0.096791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,0.871017,0.844813,0.707782,-0.230939,-1.264649,-2.260063,1.833333,1.933140,1.141075,0.096791
781,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,1.775380,0.844813,-1.412863,0.816962,0.790733,-0.185227,-0.545455,-0.174530,-0.876366,-0.147520
782,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-1.037983,-0.737951,-1.412863,-1.278839,-1.264649,1.828969,-0.545455,-1.149193,-0.876366,-0.147520
783,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-1.058795,-0.737951,0.707782,0.816962,0.790733,-1.273691,-0.545455,0.654731,1.141075,-0.880452


In [25]:
minmax_scaler, columns = encoding_scaler_pipeline(df, "MinMaxScaler")
minmax_df = pd.DataFrame(data = minmax_scaler.transform(df), columns=columns)
minmax_df

Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,nottingham_prognostic_index,neoplasm_histologic_grade,radio_therapy,cohort,hormone_therapy,age_at_diagnosis,chemotherapy,overall_survival_months,overall_survival,mutation_count
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.703993,1.0,1.0,0.50,1.0,0.596288,0.0,0.847040,0.0,0.111111
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.700780,1.0,1.0,0.00,0.0,0.473642,1.0,0.285658,0.0,0.266667
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.935750,1.0,1.0,0.00,0.0,0.743545,1.0,0.104887,0.0,0.022222
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.241395,0.5,0.0,0.50,0.0,0.560113,0.0,0.630711,0.0,0.066667
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.233593,0.5,1.0,0.50,0.0,0.309575,0.0,0.694776,1.0,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.704911,1.0,1.0,0.25,0.0,0.121302,1.0,0.813369,1.0,0.111111
781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.924277,1.0,0.0,0.50,1.0,0.484804,0.0,0.328168,0.0,0.088889
782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.241854,0.5,0.0,0.00,0.0,0.837682,0.0,0.103794,0.0,0.088889
783,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.236806,0.5,1.0,0.50,1.0,0.294110,0.0,0.519070,1.0,0.022222


In [26]:
robust_scaler, columns = encoding_scaler_pipeline(df, "RobustScaler")
robust_df = pd.DataFrame(data = robust_scaler.transform(df), columns=columns)
robust_df

Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,nottingham_prognostic_index,neoplasm_histologic_grade,radio_therapy,cohort,hormone_therapy,age_at_diagnosis,chemotherapy,overall_survival_months,overall_survival,mutation_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.514573,0.0,0.0,0.5,0.0,0.272251,0.0,1.271682,0.0,0.25
1,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.507538,0.0,0.0,-0.5,-1.0,-0.205236,1.0,-0.140965,0.0,2.00
2,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.022111,0.0,0.0,-0.5,-1.0,0.845550,1.0,-0.595851,0.0,-0.75
3,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.498492,-1.0,-1.0,0.5,-1.0,0.131414,0.0,0.727318,0.0,-0.25
4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.515578,-1.0,0.0,0.5,-1.0,-0.843979,0.0,0.888528,1.0,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.516583,0.0,0.0,0.0,-1.0,-1.576963,1.0,1.186953,1.0,0.25
781,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.996985,0.0,-1.0,0.5,0.0,-0.161780,0.0,-0.033992,0.0,0.00
782,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.497487,-1.0,-1.0,-0.5,-1.0,1.212042,0.0,-0.598600,0.0,0.00
783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.508543,-1.0,0.0,0.5,0.0,-0.904188,0.0,0.446388,1.0,-0.75


In [27]:
std_df.to_csv("data_separated/train_data_std.csv", index=False)
minmax_df.to_csv("data_separated/train_data_minmax.csv", index=False)
robust_df.to_csv("data_separated/train_data_robust.csv", index=False)

In [28]:
if not os.path.exists("results"):
    os.mkdir("results")

dump(standard_scaler, "results/std_scaler.joblib")
dump(minmax_scaler, "results/minmax_scaler.joblib")
dump(robust_df, "results/robust_scaler.joblib")

['results/robust_scaler.joblib']

In [29]:
loaded = load("results/std_scaler.joblib")

In [41]:
data = pd.read_csv("data_separated/test_data.csv")

In [49]:
na_counts = data.apply(pd.isna).sum()
na_counts[na_counts > 0]

Series([], dtype: int64)

In [42]:
loaded.transform(X=data)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

# DEPRECATED

In [12]:
std_scaler_instance = StandardScaler()
std_scaler_instance.fit(X=df_data.values)
data_scaled_std = std_scaler_instance.transform(X=df_data.values)

In [13]:
minmax_scaler_instance = MinMaxScaler()
minmax_scaler_instance.fit(X=df_data.values)
data_scaled_minmax = minmax_scaler_instance.transform(X=df_data.values)

In [14]:
robust_scaler_instance = RobustScaler()
robust_scaler_instance.fit(X=df_data.values)
data_scaled_robust = robust_scaler_instance.transform(X=df_data.values)

In [17]:
df_scaled_std = pd.DataFrame(data=data_scaled_std, columns=df_data.columns)
df_scaled_minmax = pd.DataFrame(data=data_scaled_minmax, columns=df_data.columns)
df_scaled_robust = pd.DataFrame(data=data_scaled_robust, columns=df_data.columns)

In [23]:
df_scaled_std.to_csv("data_separated/train_data_std.csv", index=False)
df_scaled_minmax.to_csv("data_separated/train_data_minmax.csv", index=False)
df_scaled_robust.to_csv("data_separated/train_data_robust.csv", index=False)

In [22]:
dump(std_scaler_instance, "results/scaler_std.joblib")
dump(minmax_scaler_instance, "results/scaler_std.joblib")
dump(robust_scaler_instance, "results/scaler_std.joblib")

['results/scaler_std.joblib']