# Pipeline - Health Twin

## Imports

In [125]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import timedelta
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

# Modelo ML
from sklearn.ensemble import RandomForestClassifier

# Guardado del modelo ML
from joblib import dump, load

# Imports relativos a la pipeline
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

## Lectura de los datos

In [2]:
patients_data_df = pd.read_csv("patient_data_after_first_pipeline.csv")

In [3]:
patients_data_df.head()

Unnamed: 0,Edad,Sexo,altura,peso,ALT-GPT,AST-GOT,BEecf_amax,BEecf_amin,BEecf_mean,Basófilos,...,VT_espirado_max,VT_espirado_mean,VT_espirado_min,pCO2_amax,pCO2_amin,pCO2_mean,pO2_amax,pO2_amin,pO2_mean,days_of_stay
0,63.0,0.0,156.8,73.0,46.0,28.0,4.25,3.25,3.75,0.025,...,451.75,343.779427,75.0,40.25,38.0,39.0625,105.0,82.75,93.5,5
1,60.0,0.0,156.8,41.0,55.302326,23.604651,10.976744,7.534884,9.261434,0.034419,...,565.953488,476.269601,434.116279,54.744186,44.023256,48.847287,130.767442,88.44186,108.410271,44
2,38.0,0.0,156.8,103.0,14.444444,26.4,8.944444,8.277778,8.62037,0.066667,...,428.055556,329.14417,191.5,51.055556,46.111111,48.569444,119.055556,67.444444,93.310185,21
3,46.6,0.0,156.8,55.0,59.98,37.26,21.38,18.92,20.088333,0.104,...,485.9,354.365843,266.02,80.5,69.1,75.34,71.0,58.7,64.846667,11
4,62.0,0.0,152.0,60.0,73.333333,74.111111,6.222222,3.333333,5.037037,0.044444,...,605.711111,487.444732,381.622222,46.666667,36.444444,41.555556,130.777778,93.555556,113.638889,19


In [4]:
test_patient = patients_data_df.iloc[[0]]

In [5]:
test_patient

Unnamed: 0,Edad,Sexo,altura,peso,ALT-GPT,AST-GOT,BEecf_amax,BEecf_amin,BEecf_mean,Basófilos,...,VT_espirado_max,VT_espirado_mean,VT_espirado_min,pCO2_amax,pCO2_amin,pCO2_mean,pO2_amax,pO2_amin,pO2_mean,days_of_stay
0,63.0,0.0,156.8,73.0,46.0,28.0,4.25,3.25,3.75,0.025,...,451.75,343.779427,75.0,40.25,38.0,39.0625,105.0,82.75,93.5,5


## 1. Transformer NAs

## 2. Transformer etiquetas edad

In [6]:
bins_age = [0, 4, 14, 20, 29, 39, 49, 59, 69, 79, 89, 120]
labels_age = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"]

In [7]:
class AgeToLabelTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, bins_age, labels_age):
        self.bins_age = bins_age
        self.labels_age = labels_age
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        X["age_range"] = pd.cut(X.Edad, self.bins_age, labels = self.labels_age, include_lowest = True)
        X.drop("Edad", axis=1, inplace=True)
        
        return X

## 3. Transformer normalización de los datos

In [119]:
class CustomMinMaxScaler(BaseEstimator, TransformerMixin):
    def __init__(self, scaler=MinMaxScaler()):
        self.scaler = scaler
        self.non_vitals_columns = ["age_range", "Sexo", "altura", "peso", "days_of_stay"]
        
    def fit(self, X, y=None):
        X = X.copy()
        
        self.non_vitals_df = X[self.non_vitals_columns].copy()
        
        X.drop(self.non_vitals_columns, axis=1, inplace=True)
        
        self.scaler.fit(X)
        
        return self
    
    def transform(self, X):
        X = X.copy()
        
        #self.non_vitals_df = X[self.non_vitals_columns].copy()
        
        X.drop(self.non_vitals_columns, axis=1, inplace=True)
        
        X = pd.DataFrame(self.scaler.transform(X), index=X.index, columns=X.columns)
        #X = (X-X.mean())/X.std()
        
        X[self.non_vitals_columns] = self.non_vitals_df
        
        return X
        

In [30]:
class NormalizeVitalsTransformer(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y=None):
        self.non_vitals_df = X[["age_range", "Sexo", "altura", "peso", "days_of_stay"]].copy()
        
        return self
    
    def transform(self, X):
        X = X.copy()
        
        X.drop(["age_range", "Sexo", "altura", "peso", "days_of_stay"], axis=1, inplace=True)
        
        X = (X-X.mean())/X.std()
        
        X[["age_range", "Sexo", "altura", "peso", "days_of_stay"]] = self.non_vitals_df
        
        return X

## 4. Transformer etiquetas rangos días para el alta

In [9]:
bins_length_stay = [0,5,10,20,30,50,120]
labels_length_stay = [5,10,20,30,50,120]

In [10]:
class LengthOfStayToLabelTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, bins_length_stay, labels):
        self.bins_length_stay = bins_length_stay
        self.labels_length_stay = labels_length_stay
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        X['stay_bin']=pd.cut(x = X['days_of_stay'], #encoded df is the raw dataframe following one-hot encoding
                             bins = self.bins_length_stay)
        X['stay_label']=pd.cut(x = X['days_of_stay'],
                               bins = self.bins_length_stay,
                               labels = self.labels_length_stay) #lets also rename our bins to be more descriptive since now they are much larger
        
        X['stay_bin'] = X['stay_bin'].apply(lambda x: str(x).replace(',',' -'))
        X['stay_bin'] = X['stay_bin'].apply(lambda x: str(x).replace('120','120+')) 
        
        return X

In [11]:
class FinalDFTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        X.drop(['days_of_stay','stay_bin'], axis=1, inplace=True)
        
        return X

In [120]:
stay_range_pipe = Pipeline(steps=[
    ("AgeToLabelTransformer", AgeToLabelTransformer(bins_age, labels_age)),
    ("CustomMinMaxScaler", CustomMinMaxScaler()),
#    ("NormalizeVitalsTransformer", NormalizeVitalsTransformer()),
    ("LengthOfStayToLabelTransformer", LengthOfStayToLabelTransformer(bins_length_stay, labels_length_stay)),
    ("FinalDFTransformer", FinalDFTransformer())
])

In [134]:
df_copy = patients_data_df.copy()

In [136]:
X = df_copy.drop("days_of_stay", axis=1)
y = df_copy["days_of_stay"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [139]:
df_copy = pipe.fit_transform(df_copy)

In [140]:
df_copy.head()

Unnamed: 0,ALT-GPT,AST-GOT,BEecf_amax,BEecf_amin,BEecf_mean,Basófilos,Bicarbonato,Bicarbonato Real_amax,Bicarbonato Real_amin,Bicarbonato Real_mean,...,pCO2_amin,pCO2_mean,pO2_amax,pO2_amin,pO2_mean,age_range,Sexo,altura,peso,stay_label
0,0.033097,0.027912,0.511682,0.538031,0.527635,0.166667,0.512638,0.433872,0.479058,0.457434,...,0.269953,0.241691,0.408124,0.307899,0.361502,8,0.0,156.8,73.0,5
1,0.042086,0.016843,0.700283,0.664354,0.686978,0.229457,0.089108,0.655505,0.615853,0.638781,...,0.411344,0.446223,0.707166,0.369212,0.529505,8,0.0,156.8,41.0,50
2,0.002604,0.023882,0.643302,0.686255,0.668444,0.444444,0.192735,0.606756,0.65096,0.630698,...,0.460355,0.440415,0.571244,0.143028,0.359364,5,0.0,156.8,103.0,30
3,0.046606,0.051232,0.991963,1.0,1.0,0.693333,0.258541,1.0,0.999581,1.0,...,1.0,1.0,0.01354,0.048833,0.038648,6,0.0,156.8,55.0,20
4,0.059509,0.144036,0.566978,0.540487,0.564845,0.296296,0.27155,0.505353,0.488656,0.505128,...,0.233438,0.293803,0.707286,0.424297,0.588419,8,0.0,152.0,60.0,20


In [132]:
pipe.transform(test_patient)

Unnamed: 0,ALT-GPT,AST-GOT,BEecf_amax,BEecf_amin,BEecf_mean,Basófilos,Bicarbonato,Bicarbonato Real_amax,Bicarbonato Real_amin,Bicarbonato Real_mean,...,pCO2_amin,pCO2_mean,pO2_amax,pO2_amin,pO2_mean,age_range,Sexo,altura,peso,stay_label
0,0.033097,0.027912,0.511682,0.538031,0.527635,0.166667,0.512638,0.433872,0.479058,0.457434,...,0.269953,0.241691,0.408124,0.307899,0.361502,8,0.0,156.8,73.0,5


In [173]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier

In [175]:
pipe_model = Pipeline(steps=[
    # Modelo ML
    ("RandomForest", OneVsRestClassifier(RandomForestClassifier(n_estimators=100, max_depth=15, class_weight='balanced')))
])

## Separar el dataset en train y test

In [176]:
X = df_copy.drop("stay_label", axis=1)
y = df_copy["stay_label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [177]:
pipe_model.fit(X_train, y_train)

Pipeline(steps=[('RandomForest',
                 OneVsRestClassifier(estimator=RandomForestClassifier(class_weight='balanced',
                                                                      max_depth=15)))])

In [178]:
y_score = pipe_model.predict_proba(X_train)

In [179]:
n_classes = len(df_copy.stay_label.unique())

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

ValueError: key of type tuple not found and not a MultiIndex

In [169]:
def plot_roc_curve(fpr, tpr, roc_auc, n_classes):
    # Plot of a ROC curve for a specific class
    plt.figure()
    plt.plot(fpr[5], tpr[5], label='ROC curve (area = %0.2f)' % roc_auc[5])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic class 6')
    plt.legend(loc="lower right")
    plt.show()

    # Plot ROC curve
    plt.figure(figsize=(10, 8), dpi=80)
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
            color='deeppink', linestyle=':', linewidth=4)
    '''plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)'''
    
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                       ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
plot_roc_curve(fpr, tpr, roc_auc, n_classes)