# Clase Supervised Classification ML

## Importar librerías

In [2]:
# Recuerda importar:
# 1. Las librerías típicas como pandas
# 2. Las que necesites para las transformaciones
# 3. Las que necesites para los modelos
# 4. Las que necesites para guardar y recuperar modelos

# Ves añadiéndolas conforme las necesites

# Tu código aquí debajo
import pandas as pd
import sklearn as sk
from joblib import dump 
from joblib import load
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from seaborn import heatmap
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score



## Importar los datos

In [3]:
# Tu código aquí debajo
df = pd.read_excel('diabetes_clean.xlsx')
df2 = df.copy()

## Detectar y tratar duplicados

In [4]:
# Tu código aquí debajo

df.duplicated().sum()

0

## Detectar y tratar valores nulos

In [5]:
# Tu código aquí debajo

df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Satisfaction                0
Job                         0
Outcome                     0
dtype: int64

## Detectar y tratar valores atípicos (ouliers)

In [6]:
# Tu código aquí debajo
def outliers(d,v):
    import numpy as np
    q3=np.quantile(d[v],0.75)
    q1=np.quantile(d[v],0.25)
    ric=q3-q1
    df_outliers=d.loc[(d[v]>=q3+1.5*ric) | (d[v]<=q1-1.5*ric), : ]
    df_limpio= d.loc[(d[v]<q3+1.5*ric) & (d[v]>q1-1.5*ric), : ]
    return df_outliers,df_limpio

In [7]:
num = ["Pregnancies","Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"]
for i in num:
    a,b=outliers(df2,i)
    df2 = b

In [8]:
df2.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job,Outcome
496,2,68,62,13,15,20.1,257.0,23,Unsatisfied,High School Teacher,0
88,0,95,85,25,36,37.4,247.0,24,Extremely satisfied,High School Teacher,1
235,0,107,62,30,74,36.6,757.0,25,Extremely satisfied,High School Teacher,1
25,3,88,58,11,54,24.8,267.0,22,Extremely unsatisfied,Human Resources,0
345,1,95,82,25,180,35.0,233.0,43,Extremely satisfied,High School Teacher,1


# Preparamos los datos



## Transformar variables ordinales en numéricas

Como vemos tenemos la variable Satisfaction que es una variable ordinal y podemos suponer que el impacto que podrá tener sobre la variable a predecir (diabetes si/no) permanecerá constante al pasar de una categoría a la siguiente. por eso, la codificaremos como variable ordinal

In [9]:
df2["Satisfaction"].value_counts()

Unsatisfied              175
Extremely unsatisfied    149
Extremely satisfied      141
Satisfied                105
Name: Satisfaction, dtype: int64

In [10]:
# Creamos el objeto que realizará la transformación dándole el orden en una lista con doble corchete
# Tu código aquí debajo

# Particularizamos el codificador en nuestros datos
# Tu código

# Aplicamos el codificador ya particularizado a nuestros datos y lo guardamos machacando la misma variable
# Tu código aquí debajo

# Imprime la dimensión de los datos y un sample
# Tu código aquí debajo

ordinal_encoder = OrdinalEncoder(categories = [['Extremely unsatisfied','Unsatisfied','Satisfied','Extremely satisfied']],dtype= 'int')

df2['Satisfaction'] = ordinal_encoder.fit_transform(df2[["Satisfaction"]])


In [11]:
# Guardamos el codificador
# Tu código aquí debajo

## Transformar variables nominales en numéricas

Como vemos tenemos la variable Job que es una variable nominal (sin orden) y debenmos codificarla mediante el One-Hot encoder

In [12]:
# Tu código

OneHot = OneHotEncoder()
aux = OneHot.fit_transform(df2[['Job']])

df2_one_hot = pd.DataFrame(aux.toarray(),columns=OneHot.get_feature_names_out(['Job']))
df2.reset_index(drop=True,inplace=True)
df2_one_hot.reset_index(drop=True,inplace=True)
df2=pd.concat([df2,df2_one_hot],axis=1)
df2.drop(columns='Job',inplace=True)
df2.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Outcome,Job_Database Administrator,Job_Epidemiologist,Job_Financial Analyst,Job_Firefighter,Job_High School Teacher,Job_Human Resources,Job_Legislator,Job_Market Research Analyst,Job_Physicians,Job_Statistician
560,1,128,88,39,110,36.5,1057.0,37,3,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
388,3,125,58,0,0,31.6,151.0,24,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
419,0,91,80,0,0,32.4,601.0,27,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
138,0,119,64,18,92,34.9,725.0,23,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
386,3,124,80,33,130,33.2,305.0,26,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Guardamos el codificador

# REGRESIÓN LOGÍSTICA. Usamos la estandarización

## 0. Separar la variable a predecir y las predictoras

In [14]:
# Tu código aquí debajo
X = df2.drop(columns=["Outcome"],inplace=False)
y = df2["Outcome"]

In [15]:
# Imprime la dimensión de los datos que vayas creando
# Tu código aquí debajo

X.shape

(570, 19)

## 1. Split. Separar los datos en conjunto de entrenamiento (train) y conjunto de evaluación o test(test)

In [16]:
# Tu código aquí debajo

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Imprime la dimensión de los datos que vayas creando
# Tu código aquí debajo
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(456, 19)
(114, 19)
(456,)
(114,)


## 2. Estandarización controlada.

* Fit en el train
* Transform en el train
* Transform en el test

In [18]:
# Imprime la dimensión de los datos que vayas creando
# Tu código aquí debajo

scaler = StandardScaler()

X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)



In [19]:
# Imprime la dimensión de los datos que vayas creando
# Tu código aquí debajo

print(X_train_std.shape)
print(X_test_std.shape)

(456, 19)
(114, 19)


## 3. Fit. Entrenar el modelo

* Creamos el modelo
* Entrenamos el modelo sobre los datos de train y obtenemos el modelo entrenado

In [20]:
# Cargamos el modelo y lo creamos
# Imprime la dimensión de los datos que vayas creando
# Tu código aquí debajo

LR = DecisionTreeClassifier()

LR.fit(X_train_std,y_train)

# Ahora LR ya es un modelo que se puede entrenar (fit)

DecisionTreeClassifier()

## 4. Evaluar el Modelo

* Hacemos predicciones sobre el conjunto de test
* Comparamos esas predicciones con los valores reales. Calculamos la precisión (accuracy)

In [21]:
# Hacemos predicciones sobre el conjunto de test y la guardamos en predictions
# Tu código aquí debajo

predicciones = LR.predict(X_test_std)

accuracy_score(y_test,predicciones)

0.7543859649122807

In [22]:
# Calculamos la Accuracy
# Tu código aquí debajo
metricas_LR = []

metricas_LR.append(accuracy_score(y_test, predicciones))
metricas_LR.append(precision_score(y_test, predicciones))
metricas_LR.append(recall_score(y_test, predicciones))
metricas_LR.append(f1_score(y_test, predicciones))

## 5. DECISION TREE

* Iniciamos libreria "Decision Tree"
* Creamos codigo

In [23]:
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)

prediction_tree = DT.predict(X_test)
metricas_DT = []

metricas_DT.append(accuracy_score(y_test, prediction_tree))

metricas_DT.append(precision_score(y_test, prediction_tree))

metricas_DT.append(recall_score(y_test, prediction_tree))

metricas_DT.append(f1_score(y_test, prediction_tree))

metricas_DT

[0.8070175438596491, 0.7272727272727273, 0.5, 0.5925925925925926]

In [24]:
est_final = StandardScaler()
est_final.fit(X)

X_std = est_final.transform(X)

DT_final = DecisionTreeClassifier()
DT_final.fit(X_std, y)

DecisionTreeClassifier()

In [25]:
dump(est_final,'standard_final.std')
dump(OneHot, 'one_hot_final.std')
dump(ordinal_encoder, "ordinal_final.std")
dump(DT_final, 'DT_final.std')

['DT_final.std']

In [26]:
model = load('DT_final.std')
ohf = load('one_hot_final.std')
odf = load('ordinal_final.std')
stf = load('standard_final.std')

In [27]:
new_dat = pd.read_excel('diabetes_new_data.xlsx')

In [28]:
# Ordinal encoder

new_dat["Satisfaction"] = odf.transform(new_dat[["Satisfaction"]])

In [29]:
# One Hot Encoder

OneHot = OneHotEncoder()
aux = ohf.transform(new_dat[["Job"]])
encoded_df = pd.DataFrame(aux.toarray(), columns = ohf.get_feature_names(["Job"]))
new_dat.reset_index(drop = True, inplace = True)
encoded_df.reset_index(drop = True, inplace = True)

new_dat = pd.concat([new_dat, encoded_df], axis = 1)
new_dat.drop(columns= "Job", inplace = True)



In [30]:
new_dat

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Satisfaction,Job_Database Administrator,Job_Epidemiologist,Job_Financial Analyst,Job_Firefighter,Job_High School Teacher,Job_Human Resources,Job_Legislator,Job_Market Research Analyst,Job_Physicians,Job_Statistician
0,10,100,72,35,0,33.6,627,50,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,85,70,29,0,26.6,351,31,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,183,64,5,10,30.0,672,32,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,89,66,23,94,28.1,167,21,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [31]:
dict = {"Logistic Regression": metricas_LR, "Decision Tree": metricas_DT}
results = pd.DataFrame(dict,index = ["Acuracy","Precision","Recall","F1"])

In [32]:
results

Unnamed: 0,Logistic Regression,Decision Tree
Acuracy,0.754386,0.807018
Precision,0.590909,0.727273
Recall,0.40625,0.5
F1,0.481481,0.592593


* Mostramos el decision tree graficamente:

In [33]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(16,12))
plot_tree(DT, feature_names=new_data.columns, fontsize=12, filled=True,
class_names=['NO','SI'])

NameError: name 'new_data' is not defined

<Figure size 1600x1200 with 0 Axes>

## 6. RANDOM FOREST

In [None]:
# Importamos libreria "Random Forest"

from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()

RF.fit(X_train, y_train)

prediction_RF = RF.predict(X_test)

metricas_RF = []

metricas_RF.append(accuracy_score(y_test, prediction_RF))

metricas_RF.append(precision_score(y_test, prediction_RF))

metricas_RF.append(recall_score(y_test, prediction_RF))

metricas_RF.append(f1_score(y_test, prediction_RF))

metricas_RF

dict = {"Logistic Regression": metricas_LR, "Decision Tree": metricas_DT, "Random Forest": metricas_RF}

results = pd.DataFrame(dict,index = ["Acuracy","Precision","Recall","F1"])

results