In [1]:
import mlflow
from datetime import datetime
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib
import os

Funciones

In [2]:
# ------------------------------------ Load raw data --------------------------------------
def load_csv_to_mysql():
    try:
        csv_path = 'penguins_lter.csv'
        df = pd.read_csv(csv_path)

        engine = create_engine('mysql+pymysql://user:password@db:3306/db_jupyter')

        # Guardar en MySQL
        df.to_sql("penguins_original", con=engine, if_exists='replace', index=False)
        print("✅ Datos originales cargados en MySQL.")
    
    except Exception as e:
        print(f"❌ Error en load_csv_to_mysql: {e}")

# ------------------------------------ Load transformed data --------------------------------------
def preprocesamiento():
    try:
        engine = create_engine("mysql+pymysql://user:password@db:3306/db_jupyter")

        # Leer los datos de MySQL con pd.read_sql en lugar de pd.read_sql_table
        df = pd.read_sql("SELECT * FROM penguins_original", con=engine)

        # Transformación de "Date Egg" a día del año (manejo de NaN)
        if "Date Egg" in df.columns:
            df["Date Egg"] = pd.to_datetime(df["Date Egg"], errors="coerce").dt.dayofyear
            df["Date Egg"].fillna(df["Date Egg"].median(), inplace=True)

        # Eliminar columnas irrelevantes si existen
        cols_to_drop = ["studyName", "Sample Number", "Individual ID", "Comments"]
        df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)

        # Manejo de valores nulos en columnas numéricas
        num_cols = ["Culmen Length (mm)", "Culmen Depth (mm)", "Flipper Length (mm)", "Body Mass (g)", "Delta 15 N (o/oo)", "Delta 13 C (o/oo)"]
        for col in num_cols:
            if col in df.columns:
                df[col].fillna(df[col].mean(), inplace=True)

        # Manejo de valores nulos en columnas categóricas
        cat_cols = ["Species", "Region", "Island", "Stage", "Clutch Completion", "Sex"]
        for col in cat_cols:
            if col in df.columns:
                df[col].fillna("Desconocido", inplace=True)

        # One-Hot Encoding para variables categóricas si existen
        df = pd.get_dummies(df, columns=[col for col in cat_cols if col in df.columns], drop_first=True)

        # Selección de variables predictoras y objetivo
        feature_cols = ["Culmen Length (mm)", "Culmen Depth (mm)", "Flipper Length (mm)"]
        target_col = "Body Mass (g)"

        if not all(col in df.columns for col in feature_cols + [target_col]):
            raise ValueError("⚠️ Algunas columnas clave no están en los datos procesados.")

        X = df[feature_cols]
        y = df[target_col]

        # Dividir en entrenamiento y prueba (80%-20%)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Guardar los conjuntos en MySQL
        X_train.to_sql("train_data_X", con=engine, if_exists="replace", index=False)
        X_test.to_sql("test_data_X", con=engine, if_exists="replace", index=False)
        y_train.to_sql("train_data_y", con=engine, if_exists="replace", index=False)
        y_test.to_sql("test_data_y", con=engine, if_exists="replace", index=False)

        print("✅ Preprocesamiento completo. Tablas 'train_data_X', 'train_data_y', 'test_data_X' y 'test_data_y' creadas en MySQL.")

    except Exception as e:
        print(f"❌ Error en preprocesamiento: {e}")

In [3]:
#truncate_all_tables()
load_csv_to_mysql()
preprocesamiento()

✅ Datos originales cargados en MySQL.


  df["Date Egg"] = pd.to_datetime(df["Date Egg"], errors="coerce").dt.dayofyear
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Date Egg"].fillna(df["Date Egg"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This i

✅ Preprocesamiento completo. Tablas 'train_data_X', 'train_data_y', 'test_data_X' y 'test_data_y' creadas en MySQL.


In [26]:
engine = create_engine("mysql+pymysql://user:password@db:3306/db_jupyter")

    
# Separar variables predictoras y objetivo
X_train = pd.read_sql_table("train_data_X", engine)
y_train = pd.read_sql_table("train_data_y", engine).values.ravel()
X_test = pd.read_sql_table("test_data_X", engine)
y_test = pd.read_sql_table("test_data_y", engine).values.ravel()

In [27]:
# run description (just metadata)
desc = "the simplest possible example"

# connects to the Mlflow tracking server that you started above
mlflow.set_tracking_uri("http://192.168.1.10:5000")
mlflow.set_experiment("mlflow_tracking_pinguins_proofs")

mlflow.autolog(log_model_signatures=True, log_input_examples=True)


# executes the run
with mlflow.start_run(run_name="Random_forest_no_params", description=desc) as run:
    # Entrenar RandomForest
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train, y_train)

2025/03/14 03:00:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [10]:
# run description (just metadata)
desc = "the simplest possible example"

# connects to the Mlflow tracking server that you started above
mlflow.set_tracking_uri("http://192.168.1.10:5000")
mlflow.set_experiment("mlflow_tracking_pinguins_proofs")

mlflow.autolog(log_model_signatures=True, log_input_examples=True)


# executes the run
with mlflow.start_run(run_name="no_artifacts_logged2", description=desc) as run:
    # Entrenar GradientBoostingRegressor
    print("Entrenando GradientBoostingRegressor...")
    gb_model = GradientBoostingRegressor()
    gb_model.fit(X_train, y_train)

2025/03/14 02:52:07 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_tracking_pinguins_proofs' does not exist. Creating a new experiment.
2025/03/14 02:52:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Entrenando GradientBoostingRegressor...


NameError: name 'X_train' is not defined

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# loads the diabetes dataset
db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# run description (just metadata)
desc = "the simplest possible example"

# connects to the Mlflow tracking server that you started above
mlflow.set_tracking_uri("http://10.43.101.149:5000")
mlflow.set_experiment("mlflow_tracking_examples_class")

# executes the run
with mlflow.start_run(run_name="no_artifacts_logged", description=desc) as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# loads the diabetes dataset
db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# run description (just metadata)
desc = "the simplest possible example"

# connects to the Mlflow tracking server that you started above
mlflow.set_tracking_uri("http://10.43.101.149:5000")
mlflow.set_experiment("mlflow_tracking_examples_class")


with mlflow.start_run(run_name="params_no_artifacts_logged") as run:

    params = {"n_estimators":100, "max_depth":6, "max_features":3}

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)

    mlflow.log_params(params)
    mlflow.log_param("my_extra_param", "extra_param_value")
    mlflow.log_metric("my_metric", 0.8)
    mlflow.set_tag("my_tag", "my_tag_value")

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.149:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

# loads the diabetes dataset
db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# run description (just metadata)
desc = "the simplest possible example"

# connects to the Mlflow tracking server that you started above
mlflow.set_tracking_uri("http://10.43.101.149:5000")
mlflow.set_experiment("mlflow_tracking_examples_class")

with mlflow.start_run(run_name="logged_artifacts") as run:
    params = {"n_estimators":100, "max_depth":6, "max_features":3}

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)

    mlflow.log_params(params)
    mlflow.sklearn.log_model(
      sk_model=rf,
      artifact_path="random_forest_regressor"
    )

In [None]:
print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', mlflow.get_artifact_uri())

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.149:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# connect to mlflow
mlflow.set_tracking_uri("http://10.43.101.149:5000")
mlflow.set_experiment("mlflow_tracking_examples_class2")

# this is the magical stuff
mlflow.autolog(log_input_examples=True, log_model_signatures=True)

# train the model
rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
rf.fit(X_train, y_train)

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.149:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# connect to mlflow
mlflow.set_tracking_uri("http://10.43.101.149:5000")
mlflow.set_experiment("mlflow_tracking_examples_class")

mlflow.autolog(log_model_signatures=True, log_input_examples=True)

with mlflow.start_run(run_name="autolog_with_named_run") as run:
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.149:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# connect to mlflow
mlflow.set_tracking_uri("http://10.43.101.149:5000")
mlflow.set_experiment("mlflow_tracking_examples_class")

mlflow.autolog(log_model_signatures=True, log_input_examples=True)

with mlflow.start_run(run_name="main_run_for_nested") as run:
    for estimators in range(20, 100, 20):
        with mlflow.start_run(run_name=f"nested_{estimators}_estimators", nested=True) as nested:
            rf = RandomForestRegressor(n_estimators=estimators, max_depth=6, max_features=3)
            rf.fit(X_train, y_train)

In [None]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.149:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

# connect to mlflow
mlflow.set_tracking_uri("http://10.43.101.149:5000")
mlflow.set_experiment("mlflow_tracking_examples_class1")

mlflow.autolog(log_model_signatures=True, log_input_examples=True)

params = {
  "n_estimators": [33, 66, 200],
  "max_depth": [2, 4, 6],
  "max_features": [3, 4, 5]
}

rf = RandomForestRegressor()
searcher = GridSearchCV(estimator=rf, param_grid=params)

with mlflow.start_run(run_name="autolog_with_grid_search") as run:
    searcher.fit(X_train, y_train)