In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn import set_config
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from joblib import dump, load

In [2]:
# Read the four step-03 tables in a fixed order; the "patient_id" column is
# discarded from each table right after it is loaded.
clinical_attributes, mutation, z_score, response = (
    pd.read_csv(csv_path).drop(columns=["patient_id"])
    for csv_path in (
        'step_03/clinical_attributes.csv',
        'step_03/mutation.csv',
        'step_03/z_score.csv',
        'step_03/response.csv',
    )
)

In [3]:
# Column labels of each table with the first column left out.
# NOTE(review): the reason for skipping column 0 is not visible here — confirm.
ca_columns, mutation_columns, z_score_columns = (
    frame.columns[1:] for frame in (clinical_attributes, mutation, z_score)
)

# Clinical categorical columns, grouped by the encoding meant for each group.
ordinal_columns = [
    "cellularity",
    "integrative_cluster",
]
one_hot_columns = [
    "type_of_breast_surgery",
    "cancer_type",
    "cancer_type_detailed",
    "pam50_+_claudin-low_subtype",
    "er_status_measured_by_ihc",
    "er_status",
    "her2_status_measured_by_snp6",
    "her2_status",
    "tumor_other_histologic_subtype",
    "inferred_menopausal_state",
    "primary_tumor_laterality",
    "oncotree_code",
    "pr_status",
    "3-gene_classifier_subtype",
]

## Encoding

Se hace uso de encoding y estandarización para ciertos atributos:
- **Atributos clínicos:** Encoding + Estandarización
- **mRNA z-score:** Nada
- **Mutaciones:** One-Hot + Estandarización

Cuando usar:
- **Ordinal encoding:** Los datos tienen un orden con sentido como 'low', 'medium', 'high' o 'primero', 'segundo', 'tercero'
- **One-Hot encoding:** cuando los datos no tienen un orden entre categorias, ejemplo: 'rojo', 'verde', 'azul'

Importante: Dado que los atributos clínicos deben ser procesados con encoders + scaler, las mutaciones con one-hot encoder + scaler y hay datos que se dejan sin tocar (z-score), se utilizará `sklearn.compose.ColumnTransformer`:
https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

¿Por qué escalar?
- En algoritmos basados en árboles (RF, DT, GBM) no es necesario escalar, ya que no son susceptibles a este cambio.
- Los algoritmos basados en distancias (KNN, SVM) y los modelos lineales regularizados o entrenados por descenso de gradiente (Logistic Regression) son sensibles a la escala de los atributos; por ello se recomienda escalar.

https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf

### Clinical

In [4]:
# Clinical categorical columns, grouped by the encoder applied to each group.
ordinal_columns = ["cellularity", "integrative_cluster"]
one_hot_columns = [
    "type_of_breast_surgery",
    "cancer_type",
    "cancer_type_detailed",
    "pam50_+_claudin-low_subtype",
    "er_status_measured_by_ihc",
    "er_status",
    "her2_status_measured_by_snp6",
    "her2_status",
    "tumor_other_histologic_subtype",
    "inferred_menopausal_state",
    "primary_tumor_laterality",
    "oncotree_code",
    "pr_status",
    "3-gene_classifier_subtype",
]

# Dense output keeps this step consistent with the mutation encoder and makes
# the ColumnTransformer result a NumPy array regardless of its density: a SciPy
# sparse result cannot be centred by the RobustScaler applied afterwards.
onehot_encoder = OneHotEncoder(sparse_output=False)
# NOTE(review): with default settings OrdinalEncoder derives the category order
# from the data (sorted values), not from a domain-specific ranking — confirm
# this is acceptable for these columns or pass `categories=` explicitly.
ordinal_encoder = OrdinalEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_encoder, one_hot_columns),
        ('ordinal', ordinal_encoder, ordinal_columns),
    ],
    # Columns not listed above are forwarded unchanged.
    remainder='passthrough',
)

# Fit and apply in one step; the output holds the one-hot block first, then the
# ordinal block, then the passthrough columns.
export_ca = preprocessor.fit_transform(clinical_attributes)

In [5]:
# Scale every encoded clinical column; the scaler is fitted on and applied to
# the same matrix. The name `std_scaler` is kept although the scaler in use is
# a RobustScaler.
std_scaler = RobustScaler()
std_export_ca = std_scaler.fit_transform(export_ca)

In [6]:
# This is the first write into the output folder: create it when missing so the
# export does not fail on a fresh checkout.
os.makedirs('step_04', exist_ok=True)

# The scaled matrix carries no column labels, so the frame (and the CSV header)
# uses the default integer column names.
export_df = pd.DataFrame(std_export_ca)
export_df.to_csv('step_04/clinical_attributes.csv', index=False)

### Z-Score

In [7]:
# The z-score table is written out as loaded (minus "patient_id"): no encoding
# or scaling is applied to it in this notebook.
z_score.to_csv(path_or_buf='step_04/z_score.csv', index=False)

### Mutation

In [8]:
# One-hot encode every column of the mutation table. The encoder is fitted on
# the full frame, so no ColumnTransformer or column subset is needed here.
# Dense output is requested because the result is centred by a RobustScaler
# afterwards, which does not accept sparse input when centring.
onehot_encoder = OneHotEncoder(sparse_output=False)
export_mut = onehot_encoder.fit_transform(mutation)

In [9]:
# Scale the one-hot encoded mutation matrix with a fresh RobustScaler, fitted
# on and applied to the same data. This rebinds `std_scaler`.
std_scaler = RobustScaler()
std_export_mut = std_scaler.fit_transform(export_mut)

In [10]:
# Wrap the scaled mutation matrix in a frame (default integer column names)
# and persist it without the row index.
export_df = pd.DataFrame(data=std_export_mut)
export_df.to_csv(path_or_buf='step_04/mutation.csv', index=False)

In [11]:
# Only the "overall_survival" column of the response table is exported.
response["overall_survival"].to_csv(path_or_buf='step_04/response.csv', index=False)