In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sqlalchemy import create_engine, MetaData, Table
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import mlflow
from mlflow import MlflowClient
from mlflow.models import infer_signature
# from airflow import DAG
# from airflow.operators.python_operator import PythonOperator
# from airflow.operators.dummy_operator import DummyOperator

In [None]:
# Default arguments for the DAG (the Airflow imports are currently commented out).
default_args = dict(
    owner='Oscar C',
    depends_on_past=False,
    email_on_failure=False,
    email=['oecorrechag@gmail.com'],
    retries=1,                           # a single retry ...
    start_date=datetime(2024, 5, 20),
    retry_delay=timedelta(minutes=1),    # ... attempted one minute later
)

In [None]:
def drop_table(table_name):
    """Drop the table named ``table_name`` from the MySQL database.

    Args:
        table_name: Name of the table to drop.

    Returns:
        None.
    """
    # MySQL connection (running in docker).
    db_engine = create_engine('mysql+pymysql://root:airflow@mysql:3306/db')
    # A bare Table attached to a fresh MetaData is enough to address the table.
    target_table = Table(table_name, MetaData())
    target_table.drop(db_engine)
    # Another way to drop it:
    # MetaData().drop_all(db_engine, tables=[target_table])

# drop_table('iris_table')
# drop_table('raw')
# drop_table('clean_data')

In [None]:
# drop_table('penguin_data')

In [None]:
def raw_data():
    """Store a 10% sample of the realtor CSV in the MySQL ``raw_data`` table.

    Reads ``data/realtor-data.csv``, renames its columns, keeps a seeded 10%
    random sample, prints the sample's shape and appends it to ``raw_data``.

    Returns:
        pandas.DataFrame: ``head()`` of the sample that was written.
    """
    column_names = ['brokered_by','status','price','bed','bath','acre_lot','street','city','state',
                    'zip_code','house_size','prev_sold_date']

    # MySQL connection through localhost.
    # Docker-network alternative: 'mysql+pymysql://root:airflow@mysql:3306/db'
    db_engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')

    # Load the data and give the columns their working names.
    listings = pd.read_csv('data/realtor-data.csv', sep=',', decimal='.', header=0, encoding='utf-8')
    listings.columns = column_names

    # Keep only 10% of the rows for storage (fixed seed, so the sample is repeatable).
    sample = listings.sample(frac=0.1, random_state=42)
    print(sample.shape)

    # Save the data in MySQL.
    # NOTE(review): if_exists='append' adds the sample again on every run - confirm this is intended.
    sample.to_sql('raw_data', con=db_engine, if_exists='append', index=False)
    print("Datos raw_data guardados en MySQL")

    return sample.head()

# Run the ingestion step; the preview it returns is shown as the cell output.
raw_data()

In [None]:
def clean_data():
    """Build the MySQL ``clean_data`` table from the rows stored in ``raw_data``.

    Reads every row of ``raw_data``, keeps ``price`` plus the five columns
    used as model features, drops rows with missing values and appends the
    result to ``clean_data``.

    Returns:
        pandas.DataFrame: ``head()`` of the cleaned frame that was written.
    """
    kept_columns = ['price','bed','bath','acre_lot','street','house_size']

    # MySQL connection through localhost.
    # Docker-network alternative: 'mysql+pymysql://root:airflow@mysql:3306/db'
    engine = create_engine('mysql+pymysql://root:airflow@127.0.0.1:3306/db')

    # Read the raw rows back from MySQL.
    raw = pd.read_sql("SELECT * FROM raw_data", con=engine)

    # As a first trial keep only the numeric variables, then drop incomplete rows.
    # The former `df['zip_code'].astype(str)` step was removed: `zip_code` is not
    # among the kept columns, so that lookup raised KeyError on every call and
    # nothing was ever written to `clean_data`.
    cleaned = raw.loc[:, kept_columns].dropna()

    # Save the data in MySQL.
    # NOTE(review): the whole of raw_data is re-cleaned and appended on each run,
    # so repeated runs duplicate rows - confirm whether 'replace' is wanted.
    cleaned.to_sql('clean_data', con=engine, if_exists='append', index=False)

    print("Datos limpios guardados en MySQL")

    return cleaned.head()

# Run the cleaning step; the value it returns is shown as the cell output.
clean_data() 

In [None]:
def load_and_slip():
    """Load ``clean_data`` from MySQL and split it into train and test sets.

    ``price`` is the target; every other column of the table is a feature.
    The split holds out 20% of the rows and uses a fixed seed.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    # Connection to the MySQL database.
    # NOTE(review): this uses the docker hostname `mysql`, while the ingestion and
    # cleaning cells of this notebook connect through 127.0.0.1 - confirm which
    # host is reachable from where the notebook runs.
    db_engine = create_engine('mysql+pymysql://root:airflow@mysql:3306/db')

    # Read the cleaned rows back from MySQL.
    cleaned = pd.read_sql("SELECT * FROM clean_data", con=db_engine)

    # Separate the features (X) from the label (y).
    features = cleaned.drop(columns='price')
    target = cleaned['price']

    # Split into training and test sets.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    print("Datos limpios cargados desde MySQL")

    return features_train, features_test, target_train, target_test

# Run the load-and-split step once; the returned tuple is shown as the cell output.
load_and_slip()

In [None]:
def model_train():
    """Train a linear regression on the house data and register it in MLflow.

    Points MLflow at the tracking server and the MinIO artifact store, loads
    the train/test split from ``load_and_slip``, fits a ``LinearRegression``,
    logs its parameters and test metrics (MSE, RMSE, MAE, R2), logs and
    registers the model, and finally tags the registered model.

    Returns:
        None.
    """
    # Connect to MLflow and MinIO.
    mlflow.set_tracking_uri("http://Mlflow:5000")

    # NOTE(review): credentials are hard-coded here; consider supplying them
    # through the environment instead of the notebook source.
    os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://Minio:9000"
    os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
    os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

    X_train, X_test, y_train, y_test = load_and_slip()

    EXPERIMENT_NAME = "Classifier-Experiment"
    mlflow.set_experiment(EXPERIMENT_NAME)
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

    print('inicia el experimento')

    model_name = 'Lineal model'
    # Single source of truth for the registry name, reused when tagging below.
    registered_model_name = f"tracking-house-{model_name}"
    RUN_NAME = f'Regression Experiment {model_name}'

    # The context manager ends the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME):

        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics on the held-out test set.
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Log the hyperparameters. log_params() requires the dict of parameters;
        # calling it with no argument raised TypeError and aborted the run.
        mlflow.log_params(model.get_params())

        # Log the loss metrics.
        mlflow.log_metric(f"{model_name}_mse", mse)
        mlflow.log_metric(f"{model_name}_rmse", rmse)
        mlflow.log_metric(f"{model_name}_mae", mae)
        mlflow.log_metric(f"{model_name}_r2", r2)

        # Set a tag that we can use to remind ourselves what this run was for.
        mlflow.set_tag("Training Info", f"{model_name} model for regression")

        # Infer the model signature from the training inputs and predictions.
        signature = infer_signature(X_train, model.predict(X_train))

        # Log and register the model. Only a few rows are stored as the input
        # example; passing the full training frame would serialise all of it
        # into the model artifacts.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=f"house_{model_name}_model",
            signature=signature,
            input_example=X_train.head(),
            registered_model_name=registered_model_name
        )

        print('finaliza el experimento')

    client = MlflowClient()
    client.set_registered_model_tag(registered_model_name, "task", "regression")

    print("Trained successfully.")

# Run the training step (fits, logs and registers the model in MLflow).
model_train()

In [None]:
def predic_model_train(data_predict):
    """Predict with version 1 of the registered house-price model.

    Points MLflow at the tracking server and the MinIO artifact store, loads
    the registered model through the ``pyfunc`` flavor and runs it on the input.

    Args:
        data_predict: Input handed unchanged to the loaded model's ``predict``.

    Returns:
        Whatever the loaded model's ``predict`` returns for ``data_predict``.
    """
    # Connect to MLflow and MinIO.
    mlflow.set_tracking_uri("http://Mlflow:5000")
    os.environ.update({
        'MLFLOW_S3_ENDPOINT_URL': "http://Minio:9000",
        'AWS_ACCESS_KEY_ID': 'admin',
        'AWS_SECRET_ACCESS_KEY': 'supersecret',
    })

    # Registry coordinates of the model to serve.
    registered_name = "tracking-house-Lineal model"
    version = 1
    registry_uri = f"models:/{registered_name}/{version}"

    loaded_model = mlflow.pyfunc.load_model(model_uri=registry_uri)
    return loaded_model.predict(data_predict)


# Example request: one house described by the five feature columns kept for the model.
user_input = [3, 2, 0.09, 892999.0, 1409.0]
columns = ['bed','bath','acre_lot','street','house_size']
df_pred = pd.DataFrame([user_input], columns=columns)
# Pass the whole one-row frame. `df_pred[0]` looked up a column labelled 0, which
# does not exist (the labels are the strings above), so it raised KeyError before
# the model was ever called.
# NOTE(review): 'bed' and 'bath' are built as integers here; if the registered
# model's signature expects floating-point columns, MLflow's schema check may
# reject them - confirm, and cast with df_pred.astype(float) if needed.
out_model = predic_model_train(df_pred)
out_model

In [None]:
# Print a marker string; presumably signals that the notebook ran to its last cell - confirm.
print('ok_')