# Modelo de predicción de pesos de peces a partir de medidas anatomicas

Preparar ambiente

In [6]:
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.base import clone
from sklearn.linear_model import Lasso
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import optuna
from optuna.samplers import TPESampler

In [7]:
from pathlib import Path
import joblib
import time
import sklearn
import platform
import json
import os

In [8]:
class PipelineOptimizer:
    """Clase de contexto que utiliza el patrón estrategia"""

    def __init__(self, pipeline, X, y):
        self.pipeline = pipeline
        self.X = X
        self.y = y

    def optimize(self, strategy, param_definitions):
        """
        Parametros:
        ----------
            strategy: Estrategia de optimización
            param_definitions: Para GridSearch: diccionario de parámetros
                            Para Optuna: función que toma un trial y devuelve parámetros
        """
        return strategy.optimize(
            pipeline=self.pipeline,
            X=self.X,
            y=self.y,
            param_definitions=param_definitions,
        )

class OptimizationStrategy(ABC):
    """Clase base abstracta para estrategias de optimización"""

    @abstractmethod
    def optimize(self, pipeline, X, y, param_definitions):
        pass


class OptunaSearchStrategy(OptimizationStrategy):
    """Estrategia de Optuna"""

    def __init__(self, n_trials=100, cv=5, scoring="accuracy", direction="maximize", n_jobs = -1, sampler = TPESampler()):
        self.n_trials = n_trials
        self.cv = cv
        self.scoring = scoring
        self.direction = direction
        self.sampler = sampler
        self.n_jobs = n_jobs

    def optimize(self, pipeline, X, y, param_definitions):
        def objective(trial):
            # Obtener parámetros usando la función proporcionada por el usuario
            params = param_definitions(trial)
            pipeline_clone = clone(pipeline)
            pipeline_clone.set_params(**params)
            return cross_val_score(
                pipeline_clone, X, y, cv=self.cv, scoring=self.scoring, n_jobs = self.n_jobs
            ).mean()

        sampler=TPESampler(seed = 42)
        study = optuna.create_study(sampler = self.sampler, direction=self.direction)
        study.optimize(objective, n_trials=self.n_trials, n_jobs=self.n_jobs)

        return study

In [9]:
RANDOM_STATE = int(os.getenv("MODEL_RANDOM_STATE",42))
ARTIFACTS_DIR = Path(os.getenv("ARTIFACTS_DIR", "./"))

Cargar datos y entrenar modelo

In [10]:
df = pd.read_csv('../src/Fish.csv')

In [11]:
# Separar características y variable objetivo
X = df.drop('Weight', axis=1)
y = df[['Weight']]

In [12]:
# Dividir en conjunto de entrenamiento y prueba (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE)

In [13]:
# Identificar columnas numéricas y categóricas
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train.select_dtypes(include=['category', 'object']).columns

In [14]:
# Escalar y
y_scaler = StandardScaler()
y_train_sc = y_scaler.fit_transform(y_train)
y_test_sc = y_scaler.transform(y_test)

Aplicamos preprocesamiento.
Fijamos el grado del polinomio en 3. Grado 3 tiene sentido físico ya que corresponde a una medida de volumén.

In [15]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(include_bias=False, degree=3)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combinar transformadores en un preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Lassu = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Lasso(fit_intercept=True, max_iter = 10000, random_state=RANDOM_STATE))
])


Ajustamos modelos con optimización de hiperparametros

In [16]:
def LR_params(trial):
    return {
        'model__alpha': trial.suggest_float('model__alpha', 10**-2, 10**2),
    }

In [17]:
# Definir optimizador usando la clase y splits de cross validation
sampler = TPESampler(seed=RANDOM_STATE)
optimizerL = PipelineOptimizer(Lassu, X_train, y_train_sc)
cv = KFold(n_splits = 5)
studyL = optimizerL.optimize(
      strategy=OptunaSearchStrategy(n_trials=200, cv = cv, scoring = 'r2', n_jobs=1, sampler=sampler),
      param_definitions=LR_params)

[I 2025-09-28 16:41:35,785] A new study created in memory with name: no-name-bd001e2b-7011-4f30-abd8-4227cc4bf7b2
[I 2025-09-28 16:41:36,146] Trial 0 finished with value: -0.04144236768182341 and parameters: {'model__alpha': 37.46026648354777}. Best is trial 0 with value: -0.04144236768182341.
[I 2025-09-28 16:41:36,214] Trial 1 finished with value: -0.04144236768182341 and parameters: {'model__alpha': 95.07192349792751}. Best is trial 0 with value: -0.04144236768182341.
[I 2025-09-28 16:41:36,285] Trial 2 finished with value: -0.04144236768182341 and parameters: {'model__alpha': 73.2020742417224}. Best is trial 0 with value: -0.04144236768182341.
[I 2025-09-28 16:41:36,359] Trial 3 finished with value: -0.04144236768182341 and parameters: {'model__alpha': 59.86986183486169}. Best is trial 0 with value: -0.04144236768182341.
[I 2025-09-28 16:41:36,430] Trial 4 finished with value: -0.04144236768182341 and parameters: {'model__alpha': 15.610303857839227}. Best is trial 0 with value: -0.

In [18]:
# Entrenar los mejores hiperparametros usando todos los datos
best_params = studyL.best_params
best_L = clone(Lassu)
best_L.set_params(**best_params)
preprocessor = best_L.named_steps['preprocessor']
preprocessor.set_output(transform='pandas')
best_L.fit(X_train, y_train_sc)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,3
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.011699785816691427
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,10000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


Guardamos el modelo

In [19]:
manifest = {
    "name": "Lasso-Prediccion-Peso-Fish",
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    "framework": "scikit-learn",
    "python_version": platform.python_version(),
    "pandas_version": pd.__version__,
    "sklearn_version": sklearn.__version__,
    "optuna_version": optuna.__version__,
    "random_state": RANDOM_STATE,
    "features": list(X.columns),
    "target": "Weight",
    "cv_metric": "r2",
    "cv_best_score": studyL.best_value,
    "cv_best_params": studyL.best_params,
    "test_metrics": best_L.score(X_test,y_test_sc),
}

In [20]:
ARTIFACTS_DIR.mkdir(exist_ok=True)
manifest_path = ARTIFACTS_DIR / "model_card.json"
with open(manifest_path, "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

In [21]:
model_path = ARTIFACTS_DIR / "model.pkl"
joblib.dump(best_L, model_path)

['model.pkl']