In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from joblib import dump, load

In [2]:
df = pd.read_csv("data_separated/train_data.csv", low_memory=False)
df

Unnamed: 0,age_at_diagnosis,type_of_breast_surgery,cancer_type,cancer_type_detailed,cellularity,chemotherapy,pam50_+_claudin-low_subtype,cohort,er_status_measured_by_ihc,er_status,...,mtap_mut,ppp2cb_mut,smarcd1_mut,nras_mut,ndfip1_mut,hras_mut,prps2_mut,smarcb1_mut,stmn2_mut,siah1_mut
0,66.27,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumB,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
1,57.15,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,1,Her2,1.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
2,77.22,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,1,claudin-low,1.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
3,63.58,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Low,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
4,44.95,BREAST CONSERVING,Breast Cancer,Breast Invasive Lobular Carcinoma,Low,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,30.95,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,1,Basal,2.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
781,57.98,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
782,84.22,MASTECTOMY,Breast Cancer,Breast Invasive Lobular Carcinoma,High,0,Her2,1.0,Negative,Positive,...,0,0,0,0,0,0,0,0,0,0
783,43.80,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,Normal,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0


## Encoding

Se hace uso de de encoding y estandarizacion para ciertos atributos:
- **Atributos clinicos:** Encoding + Estandarizacion
- **mRNA z-score:** Nada
- **Mutaciones:** One-Hot + Estandarización

Cuando usar:
- **Ordinal encoding:** Los datos tienen un orden con sentido como 'low', 'medium', 'high' o 'primero', 'segundo', 'tercero'
- **One-Hot encoding:** cuando los datos no tienen un orden entre categorias, ejemplo: 'rojo', 'verde', 'azul'

Importante: Dado que los clinical attributes deben ser procesados con un scaler, las mutaciones con ordinal encoder + scaler y hay data sin tocar, entonces se utilizará: Sklearn.compose.ColumnTransformer
https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

¿Por qué escalar?
- En algoritmos basados en árboles (RF, DT, GBM) no es necesario escalar ya que no son suceptibles al este cambio.
- En algoritmos basados en distancias (KNN, SVM, Logistic Regression) se benefician de las métricas de distancias, por ello se recomienda escalar

https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf

In [30]:
def encoding_scaler_pipeline(df: pd.DataFrame, scaler: str):
    ordinal_columns = ["cellularity", "integrative_cluster"]
    one_hot_columns = ["type_of_breast_surgery", "cancer_type", "cancer_type_detailed", "pam50_+_claudin-low_subtype", "er_status_measured_by_ihc", "er_status", "her2_status_measured_by_snp6", "her2_status", "tumor_other_histologic_subtype", "inferred_menopausal_state", "primary_tumor_laterality", "oncotree_code", "pr_status", "3-gene_classifier_subtype", "death_from_cancer"]
    mut_columns = [col for col in df.columns if "_mut" in col]
    clinical_attributes = ["age_at_diagnosis", "type_of_breast_surgery", "cancer_type", "cancer_type_detailed", "cellularity", "chemotherapy", "pam50_+_claudin-low_subtype", "cohort", "er_status_measured_by_ihc", "er_status", "neoplasm_histologic_grade", "her2_status_measured_by_snp6", "her2_status", "tumor_other_histologic_subtype", "hormone_therapy", "inferred_menopausal_state", "integrative_cluster", "primary_tumor_laterality", "lymph_nodes_examined_positive", "mutation_count", "nottingham_prognostic_index", "oncotree_code", "overall_survival_months", "overall_survival", "pr_status", "radio_therapy", "3-gene_classifier_subtype", "tumor_size", "tumor_stage", "death_from_cancer"]
    z_score_cols = [col for col in df.columns if col not in clinical_attributes and "mut" not in col ]

    encoding_step = ColumnTransformer(
        transformers = [
            ('mutations', OneHotEncoder(handle_unknown='ignore', sparse_output=False), mut_columns),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1), ordinal_columns),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_columns),
            # Todas las que no quiero escalar porque ya son numeros
            # Se encuentran 2:
            # - z_score
            # - Aquellos en clinical_attributes que ya son numeros
            ('passthrough', 'passthrough', z_score_cols + list(set(clinical_attributes) - (set(one_hot_columns) | set(ordinal_columns))))
        ],
        remainder='passthrough'
    )

    if scaler == "StandardScaler":
        pipeline = Pipeline(steps=[
            ('preprocessor', encoding_step),
            ('scaler', StandardScaler())
        ])
    elif scaler == "MinMaxScaler":
        pipeline = Pipeline(steps=[
            ('preprocessor', encoding_step),
            ('scaler', MinMaxScaler())
        ])
    elif scaler == "RobustScaler":
        pipeline = Pipeline(steps=[
            ('preprocessor', encoding_step),
            ('scaler', RobustScaler())
        ])
    else:
        return None, None
    
    return pipeline.fit(X=df), (
        list(encoding_step.named_transformers_['mutations'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['ordinal'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['onehot'].get_feature_names_out()) +
        list(encoding_step.named_transformers_['passthrough'].get_feature_names_out())
    )


In [31]:
standard_scaler, columns = encoding_scaler_pipeline(df, "StandardScaler")
std_df = pd.DataFrame(data = standard_scaler.transform(df), columns=columns)
std_df

Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,overall_survival_months,hormone_therapy,mutation_count,neoplasm_histologic_grade,overall_survival,age_at_diagnosis,tumor_stage,cohort,chemotherapy,radio_therapy
0,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,2.079405,0.790733,0.096791,0.844813,-0.876366,0.451118,0.372317,0.816962,-0.545455,0.707782
1,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.359194,-1.264649,1.806965,0.844813,-0.876366,-0.248938,0.372317,-1.278839,1.833333,0.707782
2,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-1.144447,-1.264649,-0.880452,0.844813,-0.876366,1.291646,1.943657,-1.278839,1.833333,0.707782
3,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,1.139690,-1.264649,-0.391831,-0.737951,-0.876366,0.244632,0.372317,0.816962,-0.545455,-1.412863
4,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,1.417980,-1.264649,0.096791,-0.737951,1.141075,-1.185416,-1.199022,0.816962,-0.545455,0.707782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,1.933140,-1.264649,0.096791,0.844813,1.141075,-2.260063,0.372317,-0.230939,1.833333,0.707782
781,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-0.174530,0.790733,-0.147520,0.844813,-0.876366,-0.185227,0.372317,0.816962,-0.545455,-1.412863
782,-1.147011,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,-1.149193,-1.264649,-0.147520,-0.737951,-0.876366,1.828969,0.372317,-1.278839,-0.545455,-1.412863
783,0.871831,-0.087762,-0.035714,-0.035714,-0.061938,-0.035714,-0.035714,-0.188733,-0.061938,-0.035714,...,0.654731,0.790733,-0.880452,-0.737951,1.141075,-1.273691,-1.199022,0.816962,-0.545455,0.707782


In [32]:
minmax_scaler, columns = encoding_scaler_pipeline(df, "MinMaxScaler")
minmax_df = pd.DataFrame(data = minmax_scaler.transform(df), columns=columns)
minmax_df

Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,overall_survival_months,hormone_therapy,mutation_count,neoplasm_histologic_grade,overall_survival,age_at_diagnosis,tumor_stage,cohort,chemotherapy,radio_therapy
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.847040,1.0,0.111111,1.0,0.0,0.596288,0.333333,0.50,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.285658,0.0,0.266667,1.0,0.0,0.473642,0.333333,0.00,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.104887,0.0,0.022222,1.0,0.0,0.743545,0.666667,0.00,1.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630711,0.0,0.066667,0.5,0.0,0.560113,0.333333,0.50,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.694776,0.0,0.111111,0.5,1.0,0.309575,0.000000,0.50,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.813369,0.0,0.111111,1.0,1.0,0.121302,0.333333,0.25,1.0,1.0
781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.328168,1.0,0.088889,1.0,0.0,0.484804,0.333333,0.50,0.0,0.0
782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.103794,0.0,0.088889,0.5,0.0,0.837682,0.333333,0.00,0.0,0.0
783,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.519070,1.0,0.022222,0.5,1.0,0.294110,0.000000,0.50,0.0,1.0


In [33]:
robust_scaler, columns = encoding_scaler_pipeline(df, "RobustScaler")
robust_df = pd.DataFrame(data = robust_scaler.transform(df), columns=columns)
robust_df

Unnamed: 0,pik3ca_mut_0,pik3ca_mut_C420R,pik3ca_mut_E110_I112delinsD,pik3ca_mut_E110_R115delinsG,pik3ca_mut_E110del,pik3ca_mut_E453_G460delinsD,pik3ca_mut_E453_L455del,pik3ca_mut_E542K,pik3ca_mut_E542K E726K,pik3ca_mut_E542K N345K,...,overall_survival_months,hormone_therapy,mutation_count,neoplasm_histologic_grade,overall_survival,age_at_diagnosis,tumor_stage,cohort,chemotherapy,radio_therapy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.271682,0.0,0.25,0.0,0.0,0.272251,0.0,0.5,0.0,0.0
1,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.140965,-1.0,2.00,0.0,0.0,-0.205236,0.0,-0.5,1.0,0.0
2,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.595851,-1.0,-0.75,0.0,0.0,0.845550,1.0,-0.5,1.0,0.0
3,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.727318,-1.0,-0.25,-1.0,0.0,0.131414,0.0,0.5,0.0,-1.0
4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.888528,-1.0,0.25,-1.0,1.0,-0.843979,-1.0,0.5,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.186953,-1.0,0.25,0.0,1.0,-1.576963,0.0,0.0,1.0,0.0
781,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.033992,0.0,0.00,0.0,0.0,-0.161780,0.0,0.5,0.0,-1.0
782,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.598600,-1.0,0.00,-1.0,0.0,1.212042,0.0,-0.5,0.0,-1.0
783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.446388,0.0,-0.75,-1.0,1.0,-0.904188,-1.0,0.5,0.0,0.0


In [34]:
std_df.to_csv("data_separated/train_data_std.csv", index=False)
minmax_df.to_csv("data_separated/train_data_minmax.csv", index=False)
robust_df.to_csv("data_separated/train_data_robust.csv", index=False)

In [35]:
if not os.path.exists("results"):
    os.mkdir("results")

dump(standard_scaler, "results/std_scaler.joblib")
dump(minmax_scaler, "results/minmax_scaler.joblib")
dump(robust_df, "results/robust_scaler.joblib")

['results/robust_scaler.joblib']

### Testings

In [36]:
rt = load("results/robust_scaler.joblib")

In [38]:
data = pd.read_csv("data_separated/test_data.csv")
data

Unnamed: 0,age_at_diagnosis,type_of_breast_surgery,cancer_type,cancer_type_detailed,cellularity,chemotherapy,pam50_+_claudin-low_subtype,cohort,er_status_measured_by_ihc,er_status,...,mtap_mut,ppp2cb_mut,smarcd1_mut,nras_mut,ndfip1_mut,hras_mut,prps2_mut,smarcb1_mut,stmn2_mut,siah1_mut
0,63.53,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumB,2.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
1,51.74,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,0,LumA,1.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
2,66.75,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,LumA,1.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
3,80.17,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumB,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
4,67.15,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,Her2,3.0,Negative,Negative,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,61.88,BREAST CONSERVING,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
106,65.48,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,0,claudin-low,3.0,Positve,Negative,...,0,0,0,0,0,0,0,0,0,0
107,38.86,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,1,LumA,1.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0
108,44.36,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,0,LumA,3.0,Positve,Positive,...,0,0,0,0,0,0,0,0,0,0


In [43]:
rt.transform(data)

SpecificationError: nested renamer is not supported

In [40]:
train_dtypes = df.dtypes
test_dtypes = data.dtypes

In [41]:
comparison = pd.DataFrame({
    'Train_dtype': train_dtypes,
    'Test_dtype': test_dtypes
})

# Find columns where data types differ
different_types = comparison[comparison['Train_dtype'] != comparison['Test_dtype']]

print(different_types)

              Train_dtype Test_dtype
shank2_mut         object      int64
ptprd_mut          object      int64
brca1_mut          object      int64
lifr_mut           object      int64
l1cam_mut          object      int64
chd1_mut           object      int64
arid5b_mut         object      int64
zfp36l1_mut        object      int64
smarcc1_mut        object      int64
cdkn1b_mut         object      int64
gps2_mut           object      int64
stk11_mut          object      int64
ttyh1_mut          object      int64
or6a2_mut          object      int64
large1_mut         object      int64
clk3_mut           object      int64
lipi_mut           object      int64
ppp2r2a_mut        object      int64
prkce_mut          object      int64
gh1_mut            object      int64
kras_mut           object      int64
clrn2_mut          object      int64
cdkn2a_mut         object      int64
ctnna1_mut         object      int64
prr16_mut          object      int64
dtwd2_mut          object      int64
a

In [42]:
# Convert test dataframe columns to match train dataframe columns
for col in different_types.index:
    target_dtype = train_dtypes[col]
    if target_dtype == 'int64':
        data[col] = pd.to_numeric(data[col], errors='coerce').astype('int64')
    elif target_dtype == 'float64':
        data[col] = pd.to_numeric(data[col], errors='coerce').astype('float64')
    elif target_dtype == 'object':
        data[col] = data[col].astype('object')
    elif target_dtype == 'category':
        data[col] = data[col].astype('category')
    # Add more cases if needed for other data types

# Verify if data types are now the same
print("Adjusted test dataframe data types:")
print(data.dtypes)

Adjusted test dataframe data types:
age_at_diagnosis          float64
type_of_breast_surgery     object
cancer_type                object
cancer_type_detailed       object
cellularity                object
                           ...   
hras_mut                    int64
prps2_mut                   int64
smarcb1_mut                 int64
stmn2_mut                   int64
siah1_mut                   int64
Length: 692, dtype: object


# DEPRECATED

In [12]:
std_scaler_instance = StandardScaler()
std_scaler_instance.fit(X=df_data.values)
data_scaled_std = std_scaler_instance.transform(X=df_data.values)

In [13]:
minmax_scaler_instance = MinMaxScaler()
minmax_scaler_instance.fit(X=df_data.values)
data_scaled_minmax = minmax_scaler_instance.transform(X=df_data.values)

In [14]:
robust_scaler_instance = RobustScaler()
robust_scaler_instance.fit(X=df_data.values)
data_scaled_robust = robust_scaler_instance.transform(X=df_data.values)

In [17]:
df_scaled_std = pd.DataFrame(data=data_scaled_std, columns=df_data.columns)
df_scaled_minmax = pd.DataFrame(data=data_scaled_minmax, columns=df_data.columns)
df_scaled_robust = pd.DataFrame(data=data_scaled_robust, columns=df_data.columns)

In [23]:
df_scaled_std.to_csv("data_separated/train_data_std.csv", index=False)
df_scaled_minmax.to_csv("data_separated/train_data_minmax.csv", index=False)
df_scaled_robust.to_csv("data_separated/train_data_robust.csv", index=False)

In [22]:
dump(std_scaler_instance, "results/scaler_std.joblib")
dump(minmax_scaler_instance, "results/scaler_std.joblib")
dump(robust_scaler_instance, "results/scaler_std.joblib")

['results/scaler_std.joblib']