In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import os
import mlflow
from mlflow.models import infer_signature
from mlflow import MlflowClient

In [10]:
def _train_and_log_model(estimator, model_name, params,
                         X_train, X_test, y_train, y_test, experiment_id):
    """Fit one classifier, evaluate it, and log/register it in MLflow.

    Args:
        estimator: Unfitted scikit-learn classifier, already built from ``params``.
        model_name: Short label used in the run name, metric keys, artifact
            path and registered-model name.
        params: Hyperparameters used to build ``estimator``; logged as-is.
        X_train, y_train: Training features/labels.
        X_test, y_test: Hold-out features/labels used for the metrics.
        experiment_id: MLflow experiment that owns the run.

    Returns:
        The registered-model name (``tracking-readmitted-<model_name>``).
    """
    run_name = f'Readmitted Classifier Experiment {model_name}'
    registered_name = f"tracking-readmitted-{model_name}"

    # The context manager ends the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        # Hyperparameters and evaluation metrics.
        mlflow.log_params(params)
        mlflow.log_metric(f"{model_name}_accuracy", accuracy_score(y_test, predictions))
        mlflow.log_metric(f"{model_name}_f1",
                          f1_score(y_test, predictions, average='weighted'))

        # Tag describing what this run was for.
        mlflow.set_tag("Training Info", f"{model_name} model for Readmitted")

        signature = infer_signature(X_train, estimator.predict(X_train))

        # Log and register the model. Only a handful of rows are stored as the
        # input example; storing the whole training matrix bloats the artifact.
        mlflow.sklearn.log_model(
            sk_model=estimator,
            artifact_path=f"readmitted_{model_name}_model",
            signature=signature,
            input_example=X_train[:5],
            registered_model_name=registered_name,
        )

    return registered_name


def tranformaciones():
    """Train and register the readmission classifiers in MLflow.

    Steps:
      1. Load ``clean.csv`` and keep the selected feature columns plus the
         ``readmitted`` target.
      2. Split into train/test *before* any fitting, so the scalers and SMOTE
         only ever see training rows (fitting them on the full dataset leaks
         test information and puts synthetic rows in the evaluation set).
      3. Scale (MinMax then Standard), fitted on the training split only.
      4. Balance the training split with SMOTE; the test split is left as-is.
      5. Train a Decision Tree and a Random Forest, logging params, metrics and
         the model of each to MLflow, and registering each model.
      6. Tag both registered models with ``task=classification``.

    Both estimators are seeded (``random_state=42``) so runs are reproducible;
    the seed is logged together with the other hyperparameters.

    Returns:
        None. All results are side effects on the MLflow tracking server and
        its artifact store.
    """
    df = pd.read_csv('clean.csv', sep=',', decimal='.', header=0, encoding='utf-8')

    feature_columns = [
        'age', 'discharge_disposition_id', 'time_in_hospital', 'num_lab_procedures',
        'num_procedures', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
        'number_diagnoses',
    ]

    # Separate the features (X) from the target variable (y).
    X = df.loc[:, feature_columns]
    y = df['readmitted']

    # Split first so that every fitted transformation is learned from training data only.
    X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Scale the features: fit on train, apply the same transform to test.
    min_max_scaler = MinMaxScaler()
    standard_scaler = StandardScaler()
    X_train_scaled = standard_scaler.fit_transform(min_max_scaler.fit_transform(X_train_raw))
    X_test = standard_scaler.transform(min_max_scaler.transform(X_test_raw))

    # Class balancing on the training split only.
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_scaled, y_train_raw)

    # SECURITY: these fallbacks are development defaults for the local MinIO
    # store. Values already present in the environment take precedence; supply
    # real credentials through the environment rather than editing this file.
    os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', "http://Minio:9000")
    os.environ.setdefault('AWS_ACCESS_KEY_ID', 'admin')
    os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'supersecret')

    mlflow.set_tracking_uri("http://Mlflow:5000")

    EXPERIMENT_NAME = "Readmitted-Survived-Classifier-Experiment"
    # set_experiment creates the experiment if needed and returns it.
    experiment_id = mlflow.set_experiment(EXPERIMENT_NAME).experiment_id

    # (label, estimator class, hyperparameters) for every model to train.
    model_specs = [
        ('Decision Tree', DecisionTreeClassifier,
         {'max_depth': 3, 'min_samples_split': 2, 'random_state': 42}),
        ('RF', RandomForestClassifier,
         {'n_estimators': 50, 'random_state': 42}),
    ]

    registered_names = [
        _train_and_log_model(
            estimator_cls(**params), model_name, params,
            X_train, X_test, y_train, y_test, experiment_id,
        )
        for model_name, estimator_cls, params in model_specs
    ]

    # Each model is already registered by log_model(registered_model_name=...),
    # so no separate mlflow.register_model call is needed; only tag them.
    client = MlflowClient()
    for registered_name in registered_names:
        client.set_registered_model_tag(registered_name, "task", "classification")

In [11]:
# Run the pipeline: load clean.csv, train both classifiers, and log/register them in MLflow.
tranformaciones()

2024/05/06 04:17:57 INFO mlflow.tracking.fluent: Experiment with name 'Readmitted-Survived-Classifier-Experiment' does not exist. Creating a new experiment.
Successfully registered model 'tracking-readmitted-Decision Tree'.
2024/05/06 04:18:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-readmitted-Decision Tree, version 1
Created version '1' of model 'tracking-readmitted-Decision Tree'.
Successfully registered model 'tracking-readmitted-RF'.
2024/05/06 04:18:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-readmitted-RF, version 1
Created version '1' of model 'tracking-readmitted-RF'.


In [None]:
# Prints a fixed marker string; presumably a manual check that the notebook ran
# to the end — TODO confirm it is still needed or remove this cell.
print('ok_')