In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline # Pipeline es una estructura 
# que encadena múltiples transformaciones y un estimador final en un único objeto entrenable.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score

import joblib

In [3]:
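# Sketch (not executed): one-hot encode the categorical "Sex" column.
# NOTE: it relies on `processor`, which is only created in a later cell.
#encoder = OneHotEncoder(sparse_output = False)
#encoded = encoder.fit_transform(processor.data[["Sex"]])
#encoded
# The same encoding can also be produced with pandas' `pd.get_dummies`.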

In [4]:
class MachineLearningProcessor:
    """Load the heart dataset, keep its non-categorical columns and train/evaluate a classifier.

    Typical usage is ``train(model)`` followed by ``predict()``.

    Attributes set by the instance:
        model_name: Label used in the progress messages printed by ``train``.
        data: Raw DataFrame exactly as read from the CSV file.
        processed_data: ``data`` without the columns in ``CATEGORICAL_COLUMNS``.
        X_train, X_test, y_train, y_test: Created by ``split_data``.
        fitted_model: Estimator returned by ``model.fit``; ``None`` until ``train`` runs.
        predictions: Predictions for ``X_test``; ``None`` until ``predict`` runs.
    """

    # Object-dtype (text) columns; they are dropped because no encoding is applied here.
    CATEGORICAL_COLUMNS = ("Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope")
    # Column holding the label to predict.
    TARGET_COLUMN = "HeartDisease"
    # Fraction of rows held out for evaluation and the seed that makes the split reproducible.
    TEST_SIZE = 0.3
    RANDOM_STATE = 100

    def __init__(self, model_name: str, data_path: str = "../data/heart.csv"):
        """Read the dataset and pre-process it immediately.

        Args:
            model_name: Human-readable name shown in the training log messages.
            data_path: Location of the CSV file; defaults to the path the notebook
                has always used, so existing calls keep working unchanged.
        """
        self.model_name = model_name
        self.data = pd.read_csv(data_path)
        # Defined up front so that calling predict() before train() can be detected.
        self.fitted_model = None
        self.predictions = None
        self.pre_process_data()

    def pre_process_data(self):
        """Store in ``processed_data`` a copy of ``data`` without the categorical columns.

        Raises:
            KeyError: If any of ``CATEGORICAL_COLUMNS`` is missing from ``data``.
        """
        self.processed_data = self.data.drop(columns=list(self.CATEGORICAL_COLUMNS))

    def split_data(self):
        """Create a stratified train/test split of ``processed_data``.

        The split is stratified on the target so both partitions keep the class
        proportions, and it is seeded so repeated calls yield the same partitions.
        """
        X = self.processed_data.drop(columns=[self.TARGET_COLUMN])
        y = self.processed_data[self.TARGET_COLUMN]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X,
            y,
            random_state=self.RANDOM_STATE,
            test_size=self.TEST_SIZE,
            stratify=y,
        )

    def train(self, model):
        """Split the data and fit ``model`` on the training partition.

        Args:
            model: Unfitted estimator exposing ``fit(X, y)``; whatever ``fit``
                returns is stored in ``fitted_model``.
        """
        print("1. Separando datos de train y test...")
        # The split is recomputed on every call; the fixed seed keeps it identical.
        self.split_data()
        print(f"2. Entrenando modelo {self.model_name}")
        self.fitted_model = model.fit(self.X_train, self.y_train)
        print("3. Entrenamiento finalizado")

    def predict(self):
        """Predict on the test partition and print a classification report.

        Raises:
            AttributeError: If ``train`` has not been called yet (same exception
                type as before, now with an explicit message).
        """
        if self.fitted_model is None:
            raise AttributeError(
                "El modelo no ha sido entrenado: llama a train() antes de predict()."
            )
        self.predictions = self.fitted_model.predict(self.X_test)
        print(classification_report(self.y_test, self.predictions))

In [5]:
# Baseline experiment: creating the processor reads the CSV and drops the categorical columns.
processor = MachineLearningProcessor(model_name = "Model Baseline")

In [6]:
# Inspect column names, dtypes and non-null counts of the raw (unprocessed) data.
processor.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [7]:
# Fit a DummyClassifier with default settings as the baseline to compare other models against.
processor.train(model = DummyClassifier())

1. Separando datos de train y test...
2. Entrando model Model Baseline
3. Entrenamiento finalizado


In [8]:
# Display the fitted estimator together with its parameters.
processor.fitted_model

0,1,2
,strategy,'prior'
,random_state,
,constant,


In [9]:
# Predict on the test split and print the baseline's classification report.
processor.predict()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       123
           1       0.55      1.00      0.71       153

    accuracy                           0.55       276
   macro avg       0.28      0.50      0.36       276
weighted avg       0.31      0.55      0.40       276



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
# Same data and split, now fitting a LogisticRegression with default hyperparameters.
logistic_regression_processor = MachineLearningProcessor(model_name = "Regresion Logistica")
logistic_regression_processor.train(model = LogisticRegression())

1. Separando datos de train y test...
2. Entrando model Regresion Logistica
3. Entrenamiento finalizado


In [11]:
# Predict on the test split and print the logistic regression's classification report.
logistic_regression_processor.predict()

              precision    recall  f1-score   support

           0       0.72      0.72      0.72       123
           1       0.77      0.78      0.78       153

    accuracy                           0.75       276
   macro avg       0.75      0.75      0.75       276
weighted avg       0.75      0.75      0.75       276



### Guardar modelos

In [12]:
def save_model(ml_object, name):
    """Serialize ``ml_object`` with joblib to ``../models/<name>``.

    Args:
        ml_object: Object to persist. Presumably a fitted estimator or a
            ``MachineLearningProcessor``; anything joblib can pickle works.
        name: File name (including any extension) to create inside ``../models``.

    Returns:
        None. A confirmation message is printed once the file has been written.
    """
    # Function-scope import keeps this cell self-contained.
    from pathlib import Path

    target = Path("../models") / name
    # joblib does not create missing folders, so make sure the destination exists.
    target.parent.mkdir(parents=True, exist_ok=True)
    # joblib.dump takes the value first and the destination second; the original
    # call passed only the path, so it raised TypeError and never saved ml_object.
    joblib.dump(ml_object, target)
    print("Modelo guardado")