# Clase Supervised Classification ML con evaluación Cross-Validation

## Importar librerías

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump, load


## Importar los datos

In [7]:
# Tu código aquí debajo
df = pd.read_excel('diabetes_clean.xlsx')
df.sample(8)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job,Outcome
433,8,100,74,40,215,39.4,661.0,43,Unsatisfied,Human Resources,1
201,9,106,52,0,0,31.2,0.38,42,Extremely satisfied,High School Teacher,0
266,2,102,86,36,120,45.5,127.0,23,Satisfied,Market Research Analyst,1
577,1,149,68,29,127,29.3,349.0,42,Satisfied,Human Resources,1
168,8,196,76,29,280,37.5,605.0,57,Extremely satisfied,Human Resources,1
61,7,62,78,0,0,32.6,391.0,41,Unsatisfied,High School Teacher,0
455,1,99,72,30,18,38.6,412.0,21,Extremely unsatisfied,Statistician,0
302,1,107,72,30,82,30.8,821.0,24,Unsatisfied,Legislator,0


## Detectar y tratar duplicados

In [8]:
# Tu código aquí debajo
df.duplicated().sum()


0

## Detectar y tratar valores nulos

In [9]:
# Tu código aquí debajo
df.isnull().sum()


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Satisfaction                0
Job                         0
Outcome                     0
dtype: int64

In [None]:
# Tu código aquí debajo



## Detectar y tratar valores atípicos (ouliers)

In [10]:
# Tu código aquí debajo
numericas=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']

# FUNCIÓN QUE DEVUELVE LOS NÚMEROS DE FILA EN LOS QUE LA VARIABLE EN CUESTIÓN ES OULIER
def outliers(var):
  q1=var.quantile(0.25)
  q3=var.quantile(0.75)
  riq=q3-q1
  sup=q3+1.5*(riq)
  inf=q1-1.5*(riq)
  outl=(var>sup) | (var<inf)
  return outl

for i in df.loc[:,numericas]:
  print("Cantida de oultliers de la variable",i,":",outliers(df[i]).sum())
# Eliminar outliers
# Tu código
for i in df.loc[:,numericas]:
  df=df[~outliers(df[i])]
df.sample(5)

Cantida de oultliers de la variable Pregnancies : 0
Cantida de oultliers de la variable Glucose : 9
Cantida de oultliers de la variable BloodPressure : 4
Cantida de oultliers de la variable SkinThickness : 0
Cantida de oultliers de la variable Insulin : 4
Cantida de oultliers de la variable BMI : 4
Cantida de oultliers de la variable DiabetesPedigreeFunction : 10
Cantida de oultliers de la variable Age : 10


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job,Outcome
354,2,83,66,23,50,32.2,497.0,22,Unsatisfied,High School Teacher,0
368,9,120,72,22,56,20.8,733.0,48,Unsatisfied,High School Teacher,0
546,3,121,52,0,0,36.0,127.0,25,Extremely unsatisfied,Market Research Analyst,1
253,2,112,68,22,94,34.1,315.0,26,Unsatisfied,Physicians,0
416,7,125,86,0,0,37.6,304.0,51,Extremely unsatisfied,Financial Analyst,0


# Preparamos los datos



## Transformar variables ordinales en numéricas

Como vemos tenemos la variable Satisfaction que es una variable ordinal y podemos suponer que el impacto que podrá tener sobre la variable a predecir (diabetes si/no) permanecerá constante al pasar de una categoría a la siguiente. por eso, la codificaremos como variable ordinal

In [11]:
# Creamos el objeto que realizará la transformación dándole el orden en una lista con doble corchete
enc=OrdinalEncoder(categories=[['Extremely unsatisfied','Unsatisfied','Satisfied','Extremely satisfied']],dtype='int')
# Particularizamos el codificador en nuestros datos
# Tu código
enc.fit(df[['Satisfaction']])
# Aplicamos el codificador ya particularizado a nuestros datos y lo guardamos machacando la misma variable
# Tu código
df['Satisfaction']=enc.transform(df[['Satisfaction']])
print(df.shape)
df.sample(5)


(576, 11)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job,Outcome
597,9,140,94,0,0,32.7,734.0,45,2,Financial Analyst,1
205,12,92,62,7,258,27.6,926.0,44,3,High School Teacher,1
278,8,95,72,0,0,36.8,485.0,57,2,Human Resources,0
284,1,90,62,12,43,27.2,0.58,24,1,Firefighter,0
201,9,106,52,0,0,31.2,0.38,42,3,High School Teacher,0


## Transformar variables nominales en numéricas

Como vemos tenemos la variable Job que es una variable nominal (sin orden) y debenmos codificarla mediante el One-Hot encoder

In [12]:
# Tu código
# Creamos el objeto que realizará la transformación

#Instancio el OneHot
onehot=OneHotEncoder()

# Aplico el OneHot a la columna job y guardo el resultao en a
a=onehot.fit_transform(df[['Job']])

# Convierto el a en data frame y lo llamo encoded_df
encoded_df = pd.DataFrame(a.toarray(), columns=onehot.get_feature_names_out(['Job']))

# Reseteo el índice de los dos data frames antes de concatenarlos
df.reset_index(drop=True, inplace=True)
encoded_df.reset_index(drop=True, inplace=True)

# concateno los dos data frames y los guardo machacando el df
df=pd.concat([df,encoded_df],axis=1)

# Elimino la columna Job original antigua, la categórica
df.drop(columns='Job',inplace=True)

# Muestro tamaño yu sample del data frame df transformado
print(df.shape)
df.sample(5)



(576, 20)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Outcome,Job_Database Administrator,Job_Epidemiologist,Job_Financial Analyst,Job_Firefighter,Job_High School Teacher,Job_Human Resources,Job_Legislator,Job_Market Research Analyst,Job_Physicians,Job_Statistician
299,3,96,56,34,115,24.7,944.0,39,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
86,3,171,72,33,135,33.3,199.0,24,3,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
221,5,108,72,43,75,36.1,263.0,33,1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
168,5,109,62,41,129,35.8,514.0,25,2,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
200,4,95,70,32,0,32.1,612.0,24,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# GENERACIÓN Y EVALUACIÓN DE DIFERENTES MODELOS

## 0. Separar la variable a predecir y las predictoras

In [13]:
X=df.drop(columns=['Outcome'],inplace=False)
y=df['Outcome']


In [14]:
# Imprime la dimensión de los datos que vayas creando
# Tu código aquí debajo

print("Tamaño X",X.shape)
print("Tamaño y",y.shape)

Tamaño X (576, 19)
Tamaño y (576,)


## 1. CROSS-VALIDATION. Crear los diferentes grupos K-FOLD

In [16]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

## 2. Bucles de los K-Fold con todos los pasos para cada K_Fold PARA VARIOS MODELOS

### 2.1. Primer Modelo Regresión Logística

In [17]:
LR = LogisticRegression(max_iter=10000)
accuracies=[]
precisions=[]
recalls=[]
f1s=[]
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    estandarizador = StandardScaler()
    estandarizador.fit(X_train)
    X_train_std=estandarizador.transform(X_train)
    X_test_std=estandarizador.transform(X_test)
    LR.fit(X_train,y_train)
    predictions = LR.predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))
    precisions.append(precision_score(y_test, predictions))
    recalls.append(recall_score(y_test, predictions))
    f1s.append(f1_score(y_test, predictions))

In [21]:
# MODELO LR
metricas_LR=[]
metricas_LR.append(sum(accuracies)/len(accuracies))
metricas_LR.append(sum(precisions)/len(precisions))
metricas_LR.append(sum(recalls)/len(recalls))
metricas_LR.append(sum(f1s)/len(f1s))

print("Métricas LR")
print("Accuracy",sum(accuracies)/len(accuracies))
print("Precision",sum(precisions)/len(precisions))
print("Recall",sum(recalls)/len(recalls))
print("F1",sum(f1s)/len(f1s))

Métricas LR
Accuracy 0.689236111111111
Precision 0.45384615384615384
Recall 0.4805898937323791
F1 0.46570627276412085


### 2.2. Segundo Modelo Decision Tree SIN ESTANDARIZAR

In [19]:
DT = DecisionTreeClassifier()
accuracies=[]
precisions=[]
recalls=[]
f1s=[]
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    DT.fit(X_train,y_train)
    predictions = DT.predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))
    precisions.append(precision_score(y_test, predictions))
    recalls.append(recall_score(y_test, predictions))
    f1s.append(f1_score(y_test, predictions))

In [22]:
# MODELO DT
metricas_DT=[]
metricas_DT.append(sum(accuracies)/len(accuracies))
metricas_DT.append(sum(precisions)/len(precisions))
metricas_DT.append(sum(recalls)/len(recalls))
metricas_DT.append(sum(f1s)/len(f1s))

print("Métricas DT")
print("Accuracy",sum(accuracies)/len(accuracies))
print("Precision",sum(precisions)/len(precisions))
print("Recall",sum(recalls)/len(recalls))
print("F1",sum(f1s)/len(f1s))

Métricas DT
Accuracy 0.689236111111111
Precision 0.45384615384615384
Recall 0.4805898937323791
F1 0.46570627276412085


### 2.3. Tercer Modelo Random Forest

In [25]:
RF=RandomForestClassifier()
accuracies=[]
precisions=[]
recalls=[]
f1s=[]
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    estandarizador = StandardScaler()
    estandarizador.fit(X_train)
    X_train_std=estandarizador.transform(X_train)
    X_test_std=estandarizador.transform(X_test)
    RF.fit(X_train,y_train)
    predictions = RF.predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))
    precisions.append(precision_score(y_test, predictions))
    recalls.append(recall_score(y_test, predictions))
    f1s.append(f1_score(y_test, predictions))

In [26]:
# MODELO RF
metricas_RF=[]
metricas_RF.append(sum(accuracies)/len(accuracies))
metricas_RF.append(sum(precisions)/len(precisions))
metricas_RF.append(sum(recalls)/len(recalls))
metricas_RF.append(sum(f1s)/len(f1s))

print("Métricas RF")
print("Accuracy",sum(accuracies)/len(accuracies))
print("Precision",sum(precisions)/len(precisions))
print("Recall",sum(recalls)/len(recalls))
print("F1",sum(f1s)/len(f1s))

Métricas RF
Accuracy 0.7916666666666666
Precision 0.7010407694516222
Recall 0.4685534591194968
F1 0.557637779907023


### 2.4. Cuarto Modelo k-Nearest Neighbor

In [31]:
KNN=RandomForestClassifier()
accuracies=[]
precisions=[]
recalls=[]
f1s=[]
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    estandarizador = StandardScaler()
    estandarizador.fit(X_train)
    X_train_std=estandarizador.transform(X_train)
    X_test_std=estandarizador.transform(X_test)
    KNN.fit(X_train,y_train)
    predictions = KNN.predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))
    precisions.append(precision_score(y_test, predictions))
    recalls.append(recall_score(y_test, predictions))
    f1s.append(f1_score(y_test, predictions))

In [32]:
# MODELO KNN
metricas_KNN=[]
metricas_KNN.append(sum(accuracies)/len(accuracies))
metricas_KNN.append(sum(precisions)/len(precisions))
metricas_KNN.append(sum(recalls)/len(recalls))
metricas_KNN.append(sum(f1s)/len(f1s))

print("Métricas KNN")
print("Accuracy",sum(accuracies)/len(accuracies))
print("Precision",sum(precisions)/len(precisions))
print("Recall",sum(recalls)/len(recalls))
print("F1",sum(f1s)/len(f1s))

Métricas KNN
Accuracy 0.7725694444444443
Precision 0.6466838931955211
Recall 0.4313597918022121
F1 0.5161212208942567


In [33]:
dict={"Logistic Regression":metricas_LR, "DecisionTree":metricas_DT, "RandomForest":metricas_RF , "K-NN":metricas_KNN}
Resultados=pd.DataFrame(dict,index=["Acuracy","Precision","Recall","F1"])
Resultados

Unnamed: 0,Logistic Regression,DecisionTree,RandomForest,K-NN
Acuracy,0.689236,0.689236,0.791667,0.772569
Precision,0.453846,0.453846,0.701041,0.646684
Recall,0.48059,0.48059,0.468553,0.43136
F1,0.465706,0.465706,0.557638,0.516121
