# Clase Supervised Classification ML

## Importar librerías

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from joblib import dump, load
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Importar los datos

In [6]:
# Tu código aquí debajo
df = pd.read_excel('diabetes_clean.xlsx')
df.sample(8)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job,Outcome
191,7,181,84,21,192,35.9,586.0,51,Unsatisfied,High School Teacher,1
189,3,74,68,28,45,29.7,293.0,23,Extremely satisfied,High School Teacher,0
242,5,77,82,41,42,35.8,156.0,35,Extremely satisfied,High School Teacher,0
501,4,90,88,47,54,37.7,362.0,29,Unsatisfied,High School Teacher,0
117,4,154,62,31,284,32.8,237.0,23,Extremely unsatisfied,Epidemiologist,0
16,7,196,90,0,0,39.8,451.0,41,Extremely unsatisfied,Human Resources,1
388,0,84,82,31,125,38.2,233.0,23,Extremely satisfied,Physicians,0
120,5,147,78,0,0,33.7,218.0,65,Extremely unsatisfied,Database Administrator,0


## Detectar y tratar duplicados

In [7]:
# Tu código aquí debajo
df.duplicated().sum()

0

## Detectar y tratar valores nulos

In [8]:
# Tu código aquí debajo
df.isnull().sum()


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Satisfaction                0
Job                         0
Outcome                     0
dtype: int64

In [None]:
# Tu código aquí debajo



## Detectar y tratar valores atípicos (ouliers)

In [9]:
# Tu código aquí debajo
numericas=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']

# FUNCIÓN QUE DEVUELVE LOS NÚMEROS DE FILA EN LOS QUE LA VARIABLE EN CUESTIÓN ES OULIER
def outliers(var):
  q1=var.quantile(0.25)
  q3=var.quantile(0.75)
  riq=q3-q1
  sup=q3+1.5*(riq)
  inf=q1-1.5*(riq)
  outl=(var>sup) | (var<inf)
  return outl

for i in df.loc[:,numericas]:
  print("Cantida de oultliers de la variable",i,":",outliers(df[i]).sum())
# Eliminar outliers
# Tu código
for i in df.loc[:,numericas]:
  df=df[~outliers(df[i])]
df.sample(5)

Cantida de oultliers de la variable Pregnancies : 0
Cantida de oultliers de la variable Glucose : 9
Cantida de oultliers de la variable BloodPressure : 4
Cantida de oultliers de la variable SkinThickness : 0
Cantida de oultliers de la variable Insulin : 4
Cantida de oultliers de la variable BMI : 4
Cantida de oultliers de la variable DiabetesPedigreeFunction : 10
Cantida de oultliers de la variable Age : 10


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job,Outcome
349,12,140,85,33,0,37.4,244.0,41,Extremely unsatisfied,High School Teacher,0
363,0,91,68,32,210,39.9,381.0,25,Extremely unsatisfied,High School Teacher,0
291,5,99,54,28,83,34.0,499.0,30,Extremely satisfied,Human Resources,0
13,1,115,70,30,96,34.6,529.0,32,Extremely satisfied,Human Resources,1
602,6,162,62,0,0,24.3,178.0,50,Extremely satisfied,Financial Analyst,1


# Preparamos los datos



## Transformar variables ordinales en numéricas

Como vemos tenemos la variable Satisfaction que es una variable ordinal y podemos suponer que el impacto que podrá tener sobre la variable a predecir (diabetes si/no) permanecerá constante al pasar de una categoría a la siguiente. por eso, la codificaremos como variable ordinal

In [10]:
# Creamos el objeto que realizará la transformación dándole el orden en una lista con doble corchete
enc=OrdinalEncoder(categories=[['Extremely unsatisfied','Unsatisfied','Satisfied','Extremely satisfied']],dtype='int')
# Particularizamos el codificador en nuestros datos
# Tu código
enc.fit(df[['Satisfaction']])
# Aplicamos el codificador ya particularizado a nuestros datos y lo guardamos machacando la misma variable
# Tu código
df['Satisfaction']=enc.transform(df[['Satisfaction']])
print(df.shape)
df.sample(5)


(576, 11)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job,Outcome
220,2,100,70,52,57,40.5,677.0,25,3,High School Teacher,0
602,6,162,62,0,0,24.3,178.0,50,3,Financial Analyst,1
520,1,167,74,17,144,23.4,447.0,33,0,High School Teacher,1
118,9,57,80,37,0,32.8,96.0,41,2,Epidemiologist,0
204,0,86,68,32,0,35.8,238.0,25,3,High School Teacher,0


## Transformar variables nominales en numéricas

Como vemos tenemos la variable Job que es una variable nominal (sin orden) y debenmos codificarla mediante el One-Hot encoder

In [11]:
# Tu código
# Creamos el objeto que realizará la transformación

#Instancio el OneHot
onehot=OneHotEncoder()

# Aplico el OneHot a la columna job y guardo el resultao en a
a=onehot.fit_transform(df[['Job']])

# Convierto el a en data frame y lo llamo encoded_df
encoded_df = pd.DataFrame(a.toarray(), columns=onehot.get_feature_names_out(['Job']))

# Reseteo el índice de los dos data frames antes de concatenarlos
df.reset_index(drop=True, inplace=True)
encoded_df.reset_index(drop=True, inplace=True)

# concateno los dos data frames y los guardo machacando el df
df=pd.concat([df,encoded_df],axis=1)

# Elimino la columna Job original antigua, la categórica
df.drop(columns='Job',inplace=True)

# Muestro tamaño yu sample del data frame df transformado
print(df.shape)
df.sample(5)



(576, 20)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Outcome,Job_Database Administrator,Job_Epidemiologist,Job_Financial Analyst,Job_Firefighter,Job_High School Teacher,Job_Human Resources,Job_Legislator,Job_Market Research Analyst,Job_Physicians,Job_Statistician
248,1,157,72,21,168,25.6,123.0,24,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
417,1,84,64,23,115,36.9,471.0,28,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
30,4,111,72,47,207,37.1,0.42,56,2,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
559,9,140,94,0,0,32.7,734.0,45,2,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
465,2,112,86,42,160,38.4,246.0,28,1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## 0. Separar la variable a predecir y las predictoras

In [12]:
X=df.drop(columns=['Outcome'],inplace=False)
y=df['Outcome']

## 1. Split. Separar los datos en conjunto de entrenamiento (train) y conjunto de evaluación o test(test)

In [9]:
# Tu código aquí debajo
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## 2. Estandarización controlada.

* Fit en el train
* Transform en el train
* Transform en el test

In [10]:
# Imprime la dimensión de los datos que vayas creando
# Tu código aquí debajo
estandarizador = StandardScaler()
estandarizador.fit(X_train)
X_train_std=estandarizador.transform(X_train)
X_test_std=estandarizador.transform(X_test)

# REGRESIÓN LOGÍSTICA. Usamos la estandarización



In [16]:
LR = LogisticRegression()
LR.fit(X_train_std,y_train)
predictions = LR.predict(X_test_std)

In [20]:
metricas_LR=[]
metricas_LR.append(accuracy_score(y_test, predictions))
metricas_LR.append(precision_score(y_test, predictions))
metricas_LR.append(recall_score(y_test, predictions))
metricas_LR.append(f1_score(y_test, predictions))
metricas_LR

[0.8055555555555556,
 0.5925925925925926,
 0.48484848484848486,
 0.5333333333333333]

# DECISION TREE. NO Usamos la estandarización

In [22]:
DT=DecisionTreeClassifier()
DT.fit(X_train,y_train)
predictions = DT.predict(X_test)
metricas_DT=[]
metricas_DT.append(accuracy_score(y_test, predictions))
metricas_DT.append(precision_score(y_test, predictions))
metricas_DT.append(recall_score(y_test, predictions))
metricas_DT.append(f1_score(y_test, predictions))
metricas_DT



[0.7847222222222222, 0.53125, 0.5151515151515151, 0.5230769230769231]

# RANDOM FOREST. Usamos la estandarización

In [40]:
RF=RandomForestClassifier()
RF.fit(X_train_std,y_train)
predictions = RF.predict(X_test_std)
metricas_RF=[]
metricas_RF.append(accuracy_score(y_test, predictions))
metricas_RF.append(precision_score(y_test, predictions))
metricas_RF.append(recall_score(y_test, predictions))
metricas_RF.append(f1_score(y_test, predictions))
metricas_RF

[0.8055555555555556, 0.6086956521739131, 0.42424242424242425, 0.5]

In [41]:
dict={"Logistic Regression":metricas_LR, "DecisionTree":metricas_DT, "RandomForest":metricas_RF}
Resultados=pd.DataFrame(dict,index=["Acuracy","Precision","Recall","F1"])
Resultados

Unnamed: 0,Logistic Regression,DecisionTree,RandomForest
Acuracy,0.805556,0.784722,0.805556
Precision,0.592593,0.53125,0.608696
Recall,0.484848,0.515152,0.424242
F1,0.533333,0.523077,0.5
