In [1]:
import os
from datetime import datetime, timedelta

import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from mlflow import MlflowClient
from mlflow.models import infer_signature
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine, MetaData, Table
# from airflow import DAG
# from airflow.operators.python_operator import PythonOperator
# from airflow.operators.dummy_operator import DummyOperator

# Shared preprocessing objects, created once at module level and used by clean_data().
# `le` turns the text `state` column into integer labels (stored as `states`).
le = LabelEncoder()
# Outlier detector fitted inside clean_data(); rows it predicts as 1 are kept.
# random_state is fixed at 42.
isolation_forest = IsolationForest(random_state=42)

In [2]:
# Default arguments for the DAG.
default_args = dict(
    owner='Oscar C',
    depends_on_past=False,
    email_on_failure=False,
    email=['oecorrechag@gmail.com'],
    retries=1,
    start_date=datetime(2024, 5, 20),
    retry_delay=timedelta(minutes=1),
)

In [3]:
def drop_table(table_name):
    """Drop the table called ``table_name`` from the MySQL database.

    Parameters
    ----------
    table_name : str
        Name of the table to drop.

    Notes
    -----
    No ``checkfirst`` argument is passed to ``Table.drop``, so SQLAlchemy's
    default applies and any database error (for example a missing table)
    propagates to the caller.
    """
    # MySQL connection (running in docker)
    engine = create_engine('mysql+pymysql://root:airflow@mysql:3306/db')
    # engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')
    try:
        Table(table_name, MetaData()).drop(engine)
        # Alternative way to drop it:
        # metadata.drop_all(engine, tables=[mi_tabla])
    finally:
        # Release the pooled connections even when the drop fails, so repeated
        # calls from the notebook do not accumulate open connections.
        engine.dispose()

# drop_table('raw_data')
# drop_table('test_data')
# drop_table('penguin_data')

In [None]:
def raw_data():
    """Load one parquet batch and append it to the ``raw_data`` MySQL table.

    The parquet file's columns are renamed positionally to the twelve names
    assigned below; pandas raises ``ValueError`` if the file does not have
    exactly that many columns.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` of the batch that was written.
    """
    # MySQL connection (running in docker)
    engine = create_engine('mysql+pymysql://root:airflow@mysql:3306/db')
    # engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')

    try:
        # Load the batch
        # df = pd.read_csv('data/realtor-data.csv', sep = ',', decimal = '.', header = 0, encoding = 'utf-8')
        df = pd.read_parquet('data/df_g1_b0.parquet.gzip')
        df.columns = ['brokered_by','status','price','bed','bath','acre_lot','street','city','state',
                      'zip_code','house_size','prev_sold_date']
        print(df.shape)

        # Store the batch in MySQL. 'append' means re-running this function
        # inserts the same rows again.
        df.to_sql('raw_data', con=engine, if_exists='append', index=False)
    finally:
        # Always release the connection pool, including on failure.
        engine.dispose()

    print("Datos raw_data guardados en MySQL")

    return df.head()

raw_data()

In [None]:
def clean_data():
    """Clean the ``raw_data`` table and append the result to ``clean_data``.

    Steps:

    1. Read every row of ``raw_data``.
    2. Keep the modelling columns and drop rows with missing values.
    3. Apply fixed range filters (bed, bath, price, acre_lot, house_size and
       decade of ``prev_sold_date``).
    4. Encode ``state`` as integers in ``states`` with the module-level ``le``.
    5. Fit the module-level ``isolation_forest`` on the feature columns and
       keep only rows it predicts as ``1``.
    6. Append the feature columns to the ``clean_data`` table.

    Returns
    -------
    pandas.DataFrame
        ``head()`` of the frame that was written.
    """
    # Columns used for outlier detection and written to MySQL.
    feature_cols = ['price', 'bed', 'bath', 'acre_lot', 'states', 'house_size']

    # MySQL connection (running in docker)
    engine = create_engine('mysql+pymysql://root:airflow@mysql:3306/db')
    # engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')
    try:
        # Read the raw rows from MySQL
        query = "SELECT * FROM raw_data"
        df = pd.read_sql(query, con=engine)

        # Keep the modelling columns (numeric ones plus `state` and
        # `prev_sold_date`, which are needed for encoding / filtering) and
        # drop rows with missing values.
        df = df.loc[:, ['price','bed','bath','acre_lot','state','house_size','prev_sold_date']]
        df = df.dropna()

        # Decade of the previous sale, used only as a filter.
        sold_year = pd.to_datetime(df['prev_sold_date']).dt.year
        sold_decade = (sold_year // 10) * 10

        keep = (
            (df['bed'] < 7)
            & (df['bath'] < 5)
            & (df['price'] < 300000)
            & (df['acre_lot'] <= 0.0894211)
            & (df['house_size'] < 3500)
            & (sold_decade >= 1980)
        )
        # Explicit copy: adding a column to a filtered slice would otherwise
        # trigger pandas' SettingWithCopyWarning.
        df = df[keep].copy()

        # NOTE(review): the encoder is re-fitted on every call, so the integer
        # assigned to a state depends on which states are present in this run.
        df['states'] = le.fit_transform(df['state'])

        # Outlier removal: predict() returns 1 for rows to keep.
        features = df.loc[:, feature_cols]
        isolation_forest.fit(features)
        df = features[isolation_forest.predict(features) == 1]

        # Store the cleaned rows in MySQL.
        # NOTE(review): this reads all of raw_data and appends, so calling it
        # again without resetting the tables re-inserts earlier rows; confirm
        # whether 'replace' is the intended mode.
        df.to_sql('clean_data', con=engine, if_exists='append', index=False)
    finally:
        # Always release the connection pool, including on failure.
        engine.dispose()

    print("Datos limpios guardados en MySQL")

    return df.head()

clean_data()

In [4]:
def load_and_slip(test_size=0.05, random_state=42):
    """Read the ``clean_data`` table and split it into train and test sets.

    Parameters
    ----------
    test_size : float or int, default 0.05
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where ``y`` is the ``price``
        column and ``X`` is every other column of the table.
    """
    # MySQL connection (running in docker)
    engine = create_engine('mysql+pymysql://root:airflow@mysql:3306/db')
    # engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')
    try:
        # Read the cleaned rows from MySQL
        query = "SELECT * FROM clean_data"
        df = pd.read_sql(query, con=engine)
    finally:
        # The data is fully in memory now; release the connection pool.
        engine.dispose()

    # Features (X) and target (y)
    X = df.drop(columns='price')
    y = df['price']
    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("Datos limpios cargados desde MySQL")

    return X_train, X_test, y_train, y_test

load_and_slip()

Datos limpios cargados desde MySQL


(      bed  bath  acre_lot  states  house_size prev_sold_date
 3759  3.0   3.0      0.04      24      1607.0     2005-12-16
 414   3.0   2.0      0.03      37      1900.0     2021-02-24
 505   2.0   3.0      0.06      31      1764.0     2002-07-09
 296   3.0   3.0      0.04      37      1520.0     2019-05-09
 1261  3.0   1.0      0.06      37      1534.0     2004-07-30
 ...   ...   ...       ...     ...         ...            ...
 3444  3.0   3.0      0.05      13      1653.0     2007-07-06
 466   3.0   2.0      0.08      29      1758.0     1990-02-20
 3092  3.0   3.0      0.06      13      1732.0     2019-12-17
 3772  3.0   3.0      0.05      24      1865.0     2009-11-02
 860   3.0   2.0      0.07      37      1810.0     2021-06-11
 
 [4036 rows x 6 columns],
       bed  bath  acre_lot  states  house_size prev_sold_date
 2269  3.0   3.0      0.04       9      1564.0     2007-07-05
 1192  3.0   2.0      0.06      20      1790.0     2017-03-31
 2623  3.0   2.0      0.07      17      17

In [None]:
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV


# Train/test split read from the clean_data table
X_train, X_test, y_train, y_test = load_and_slip()

# Search space. The keys must be the estimator's own parameter names: a
# "xgb__" prefix is Pipeline-step syntax and, on a bare XGBRegressor, does not
# address the real hyperparameters, so every candidate would be trained with
# the defaults.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3)
}

model = xgb.XGBRegressor()

# Randomized search: 10 sampled candidates, 3-fold CV, all cores
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=42,
)

# Fit the search on the training split
random_search.fit(X_train, y_train)

# Best estimator found by the search
best_model = random_search.best_estimator_
best_model

In [6]:
def model_train():
    """Train an XGBoost regressor on the clean data and log it to MLflow.

    Loads the train/test split with ``load_and_slip()``, fits
    ``xgb.XGBRegressor(n_estimators=18)``, logs r2 / mae / mse / rmse on the
    test split, logs the model under the artifact path ``house_XGB_model`` and
    registers it as ``tracking-house-XGB`` with the tag ``task=regression``.
    """
    # Connect to MLflow and MinIO
    mlflow.set_tracking_uri("http://Mlflow:5000")

    # NOTE(review): credentials are hard-coded; consider injecting them through
    # the environment of the container instead.
    os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://Minio:9000"
    os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
    os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

    X_train, X_test, y_train, y_test = load_and_slip()

    EXPERIMENT_NAME = "Classifier-Experiment"
    mlflow.set_experiment(EXPERIMENT_NAME)

    # NOTE(review): 'modelo_test' looks like a leftover name, and it is unclear
    # whether sklearn autologging applies to xgb.XGBRegressor - confirm.
    mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True, registered_model_name='modelo_test')

    print('inicia el experimento')

    model_name = 'XGB'
    # Single source of truth for the registry name used by log_model and the tag call.
    registered_name = f"tracking-house-{model_name}"

    # The context manager ends the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run(run_name="tracking-house-XGB"):

        model = xgb.XGBRegressor(n_estimators=18)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Metrics on the held-out split
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        mlflow.log_metric(f"{model_name}_r2", r2)
        mlflow.log_metric(f"{model_name}_mae", mae)
        mlflow.log_metric(f"{model_name}_mse", mse)
        mlflow.log_metric(f"{model_name}_rmse", rmse)

        # Tag reminding what this run was for
        mlflow.set_tag("Training Info", f"{model_name} model for regression")

        # Input/output schema inferred from the training data
        signature = infer_signature(X_train, model.predict(X_train))

        # Log and register the model. Only a few rows are stored as the input
        # example; logging the whole training frame would bloat the artifact.
        # Registration happens here, against the artifact path actually
        # written, rather than through a separate register_model() call.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=f"house_{model_name}_model",
            signature=signature,
            input_example=X_train.head(),
            registered_model_name=registered_name
        )

        print('finaliza el experimento')

    client = MlflowClient()
    client.set_registered_model_tag(registered_name, "task", "regression")

    print("Trained")

model_train()

Datos limpios cargados desde MySQL
inicia el experimento


Successfully registered model 'tracking-house-XGB'.
2024/05/30 21:17:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-house-XGB, version 1


finaliza el experimento
Trained successfully.


Created version '1' of model 'tracking-house-XGB'.


In [9]:
def predic_model_train(data_predict, model_name="tracking-house-XGB", model_version=1):
    """Load a registered model from MLflow and predict on ``data_predict``.

    Parameters
    ----------
    data_predict : pandas.DataFrame
        Rows to score; passed unchanged to the loaded model's ``predict``.
    model_name : str, default "tracking-house-XGB"
        Registered model name, interpolated into the ``models:/`` URI.
    model_version : int or str, default 1
        Registered model version, interpolated into the ``models:/`` URI.

    Returns
    -------
    Whatever the loaded pyfunc model's ``predict`` returns.
    """
    # Connect to MLflow and MinIO
    mlflow.set_tracking_uri("http://Mlflow:5000")

    # NOTE(review): credentials are hard-coded; consider injecting them through
    # the environment of the container instead.
    os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://Minio:9000"
    os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
    os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

    loaded_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

    return loaded_model.predict(data_predict)


# Example request with one row, in the feature order used for training.
# NOTE(review): `states` is the LabelEncoder output from clean_data(); 302
# looks out of range for an encoded state - confirm the intended value.
user_input = [3.0, 2.0, 0.09, 302, 1409.0]
columns = ['bed','bath','acre_lot', 'states', 'house_size']
df_pred = pd.DataFrame([user_input], columns=columns)
out_model = predic_model_train(df_pred)
out_model

array([194975.64], dtype=float32)

In [None]:
# End-of-notebook marker: prints a fixed string once every cell above has run.
print('ok_')