# Tarea 5

In [1]:
# Cargar librerías
import os, mlflow
from dotenv import load_dotenv
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
import math
import optuna
import pathlib
import xgboost as xgb
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature
from mlflow import MlflowClient
import mlflow

### Iniciar el experimento

In [2]:
load_dotenv(override=True)  # Load the variables defined in the .env file
# Workspace path of the MLflow experiment that receives the runs of this notebook.
EXPERIMENT_NAME = "/Users/aclarapao@gmail.com/nyc-taxi-experiments"

# Use the Databricks-hosted MLflow tracking server and make the experiment active.
mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

### Pre-Procesamiento

In [3]:
def read_dataframe(filename):
    """Load one file of green-taxi trips and prepare it for modelling.

    Reads the parquet file, derives the trip ``duration`` in minutes from the
    pickup and dropoff timestamps, keeps only trips lasting between 1 and 60
    minutes (both inclusive) and casts the location IDs to strings so they can
    be used as categorical features.

    Args:
        filename: Path (or any source accepted by ``pd.read_parquet``) of the
            trip file. It must contain the ``lpep_pickup_datetime``,
            ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``
            columns.

    Returns:
        A new ``pd.DataFrame`` holding the filtered trips plus the added
        ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the raw frame, so the column
    # assignment below does not trigger pandas' SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# The 2025-01 file is used for training and the 2025-02 file for validation.
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

In [4]:
def preprocess(df, dv):
    """Encode a trips frame with an already-fitted vectorizer.

    Builds the combined pickup/dropoff feature ``PU_DO`` and passes it,
    together with ``trip_distance``, to ``dv.transform`` as a list of
    per-row dictionaries.

    The input frame is left untouched: the features are assembled on a new
    frame instead of adding a ``PU_DO`` column to ``df``.

    Args:
        df: Trips frame with string ``PULocationID`` / ``DOLocationID``
            columns and a ``trip_distance`` column.
        dv: Fitted vectorizer exposing ``transform(list_of_dicts)``.

    Returns:
        Whatever ``dv.transform`` returns for the feature records.
    """
    categorical = ['PU_DO']
    numerical = ['trip_distance']

    # df[numerical] is a new frame and assign() returns another one, so the
    # caller's DataFrame is never modified.
    features = df[numerical].assign(
        PU_DO=df['PULocationID'] + '_' + df['DOLocationID']
    )
    records = features[categorical + numerical].to_dict(orient='records')
    return dv.transform(records)

# Combined pickup/dropoff feature: one category per (pickup, dropoff) location pair.
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

# The vectorizer is fitted on the training rows only.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Validation rows are encoded with the already-fitted vectorizer.
X_val = preprocess(df_val, dv)

In [5]:
# Regression target: the `duration` column (minutes) computed in read_dataframe.
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

### Gradient Boost

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

def objective_gb(trial: optuna.trial.Trial) -> float:
    """Optuna objective: fit one GradientBoostingRegressor and return its validation RMSE.

    Each call samples a hyperparameter set, trains on the module-level
    ``X_train`` / ``y_train``, evaluates on ``X_val`` / ``y_val`` and records
    the parameters, the ``rmse`` metric and the fitted model in a nested
    MLflow run.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE on the validation set (the value Optuna minimizes).
    """
    # Hyperparameters sampled by Optuna on every trial.
    # log=True samples the learning rate on a log-uniform scale.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "random_state": 42
    }

    # Nested run so every trial leaves its own trace in MLflow.
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "gradient_boosting")  # informational tag
        mlflow.log_params(params)

        # Training and evaluation.
        model = GradientBoostingRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Main metric of the trial.
        mlflow.log_metric("rmse", rmse)

        # Build the input example once and infer the signature from that same
        # named DataFrame, so the logged signature and example agree (and match
        # how the final model of the study is logged). Only five rows are
        # densified to keep the example small.
        input_example = pd.DataFrame(
            X_val[:5].toarray(), columns=dv.get_feature_names_out()
        )
        signature = infer_signature(input_example, y_pred[:5])

        # Store the trial's model as an MLflow artifact.
        mlflow.sklearn.log_model(
            model,
            name="model",
            input_example=input_example,
            signature=signature
        )

    # Optuna minimizes the returned value.
    return rmse

In [8]:
# Enable scikit-learn autologging; log_models=False because the models are
# logged explicitly with mlflow.sklearn.log_model below.
mlflow.sklearn.autolog(log_models=False)

# ------------------------------------------------------------
# Create the Optuna study
#    - TPE (Tree-structured Parzen Estimator) is used as the sampler.
#    - direction="minimize" because the goal is to minimize the RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study_gb = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Run the optimization (n_trials = number of attempts)
#    - Each trial runs the objective with a different hyperparameter set.
#    - A "parent" run is opened to group the whole search.
# NOTE(review): nested=True on this parent run only matters if another run is
# already active when the cell executes — confirm it is intentional.
# ------------------------------------------------------------

with mlflow.start_run(run_name="Gradient Boosting Hyperparameter Optimization (Optuna)", nested=True):
    study_gb.optimize(objective_gb, n_trials=3)

    # --------------------------------------------------------
    # Retrieve and log the best hyperparameters
    # --------------------------------------------------------
    best_params = study_gb.best_params
    best_params["random_state"] = 42

    mlflow.log_params(best_params)
    # Tags of the parent run (experiment metadata)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "gradient_boosting",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Train a FINAL model with the best hyperparameters
    #    (this would normally use train+val or CV; the original pattern is kept here)
    # --------------------------------------------------------

    model = GradientBoostingRegressor(**best_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Save additional artifacts (here: the fitted preprocessor `dv`)
    # --------------------------------------------------------

    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # The signature describes the expected structure of the model's input and
    # output. X_val is the sparse matrix produced by the DictVectorizer, so five
    # rows are densified into a DataFrame named with the vectorizer's feature names.

    input_example = pd.DataFrame(X_val[:5].toarray(), columns=dv.get_feature_names_out())
    signature = infer_signature(input_example, y_val[:5])

    # Log the final model (refit with the best hyperparameters) as an MLflow artifact.
    mlflow.sklearn.log_model(model, name="model", input_example=input_example, signature=signature)

[I 2025-10-28 18:30:54,550] A new study created in memory with name: no-name-ac77cfee-0967-4f30-8d27-fd7b670db4b5


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:31:49 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run omniscient-asp-80 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/c414eae07dbe455ebb27baa054b1c043
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000


[I 2025-10-28 18:31:52,375] Trial 0 finished with value: 5.423033869318215 and parameters: {'n_estimators': 69, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 5.423033869318215.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:32:15 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run bustling-perch-963 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/28a47a9be1e54fb988935609e17cbe3c
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000


[I 2025-10-28 18:32:24,310] Trial 1 finished with value: 6.667930901959873 and parameters: {'n_estimators': 57, 'learning_rate': 0.012184186502221764, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 0 with value: 5.423033869318215.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:32:46 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 18:32:49,857] Trial 2 finished with value: 5.418235873807679 and parameters: {'n_estimators': 51, 'learning_rate': 0.2708160864249968, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 2 with value: 5.418235873807679.


🏃 View run trusting-crow-478 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/18a1596fc21c4bf8a0e586e7bfab8246
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:33:15 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run Gradient Boosting Hyperparameter Optimization (Optuna) at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/6740b1855bde4fea81cb1d7aa6f5580e
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor

# scikit-learn autologging with model logging disabled (same call as the one made before the Gradient Boosting study).
mlflow.sklearn.autolog(log_models=False)

def objective_rf(trial: optuna.trial.Trial) -> float:
    """Optuna objective: fit one RandomForestRegressor and return its validation RMSE.

    Each call samples a hyperparameter set, trains on the module-level
    ``X_train`` / ``y_train``, evaluates on ``X_val`` / ``y_val`` and records
    the parameters, the ``rmse`` metric and the fitted model in a nested
    MLflow run.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE on the validation set (the value Optuna minimizes).
    """
    # Hyperparameters sampled by Optuna on every trial (all integer ranges).
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "random_state": 42,
        "n_jobs": -1
    }

    # Nested run so every trial leaves its own trace in MLflow.
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "random_forest")
        mlflow.log_params(params)

        # Training and evaluation.
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Main metric of the trial.
        mlflow.log_metric("rmse", rmse)

        # Build the input example once and infer the signature from that same
        # named DataFrame, so the logged signature and example agree (and match
        # how the final model of the study is logged). Only five rows are
        # densified to keep the example small.
        input_example = pd.DataFrame(
            X_val[:5].toarray(), columns=dv.get_feature_names_out()
        )
        signature = infer_signature(input_example, y_pred[:5])

        # Store the trial's model as an MLflow artifact.
        mlflow.sklearn.log_model(
            model,
            name="model",
            input_example=input_example,
            signature=signature
        )

    # Optuna minimizes the returned value.
    return rmse

In [10]:
# ------------------------------------------------------------
# Create the Optuna study
#    - TPE (Tree-structured Parzen Estimator) is used as the sampler.
#    - direction="minimize" because the goal is to minimize the RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study_rf = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Run the optimization (n_trials = number of attempts)
#    - Each trial runs the objective with a different hyperparameter set.
#    - A "parent" run is opened to group the whole search.
# ------------------------------------------------------------

with mlflow.start_run(run_name="Random Forest Hyperparameter Optimization (Optuna)", nested=True):
    study_rf.optimize(objective_rf, n_trials=5)

    # Retrieve and log the best hyperparameters. best_params only holds the
    # values suggested by Optuna, so the fixed seed is added back by hand.
    best_params = study_rf.best_params
    best_params["random_state"] = 42

    mlflow.log_params(best_params)
    # Tags of the parent run (experiment metadata)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "random_forest",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Train a FINAL model with the best hyperparameters
    #    (this would normally use train+val or CV; the original pattern is kept here)
    # --------------------------------------------------------

    # n_jobs=-1 matches the setting every trial used; without it the final
    # refit runs on a single core. With a fixed random_state the fitted forest
    # does not depend on the number of jobs.
    model = RandomForestRegressor(**best_params, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # 8) Save additional artifacts (here: the fitted preprocessor `dv`)

    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # The signature describes the expected structure of the model's input and
    # output. X_val is the sparse matrix produced by the DictVectorizer, so five
    # rows are densified into a DataFrame named with the vectorizer's feature names.

    input_example = pd.DataFrame(X_val[:5].toarray(), columns=dv.get_feature_names_out())
    signature = infer_signature(input_example, y_val[:5])

    # Log the final model (refit with the best hyperparameters) as an MLflow artifact.
    mlflow.sklearn.log_model(model, name="model", input_example=input_example, signature=signature)

[I 2025-10-28 18:37:06,919] A new study created in memory with name: no-name-5c3de02b-49b2-45cf-8c5c-bc6b4e7cfb82


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:37:34 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run skittish-hawk-214 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/c5c9abfd33ca46c495a1d8e6c7e0095f
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000


[I 2025-10-28 18:38:10,187] Trial 0 finished with value: 5.535684811605542 and parameters: {'n_estimators': 69, 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 5.535684811605542.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:38:28 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 18:38:34,076] Trial 1 finished with value: 5.591476239254621 and parameters: {'n_estimators': 57, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 9}. Best is trial 0 with value: 5.535684811605542.


🏃 View run capricious-dove-104 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/1c852020c36e4ec091481c995fa2e69e
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:38:55 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 18:39:24,336] Trial 2 finished with value: 5.600874046133536 and parameters: {'n_estimators': 80, 'max_depth': 23, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: 5.535684811605542.


🏃 View run auspicious-slug-11 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/f02e982b02b344c683a3ae07fab25e8f
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:40:06 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 18:40:21,003] Trial 3 finished with value: 5.556766672109476 and parameters: {'n_estimators': 92, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value: 5.535684811605542.


🏃 View run nimble-stag-624 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/59f45bdea2db45c7b759f0e5d66ac21b
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:40:40 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run incongruous-lynx-710 at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/8db13b998d184e23ab34a451e1ab45aa
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000


[I 2025-10-28 18:41:28,771] Trial 4 finished with value: 5.489056041015845 and parameters: {'n_estimators': 65, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 4 with value: 5.489056041015845.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/28 18:42:08 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run Random Forest Hyperparameter Optimization (Optuna) at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000/runs/370f728bcf5843eaafd7a2923df4fb02
🧪 View experiment at: https://dbc-2a5806a7-a130.cloud.databricks.com/ml/experiments/4060777923239000


### Registrar modelo en Model Registry

In [11]:
# Name under which the model is registered.
# Presumably a three-level <catalog>.<schema>.<model> name — TODO confirm.
model_name = "workspace.default.nyc-taxi-model"

In [34]:
# MLflow run IDs of the two models being compared.
# CHALLENGER_RUN is the parent run of the Random Forest study logged above.
CHAMPION_RUN = "efaf6a662ed744568149beaf015d16d9"
CHALLENGER_RUN = "370f728bcf5843eaafd7a2923df4fb02"

# Backward-compatible alias for the original misspelled name, which later cells still use.
CHAMPIO_RUN = CHAMPION_RUN

In [None]:
# Register the challenger's logged model. The run id comes from CHALLENGER_RUN
# (the same value that was previously hard-coded here a second time), so the
# registered version and the model loaded for comparison cannot drift apart.
run_id = CHALLENGER_RUN

# Creates the registered model, or a new version of it if it already exists.
result = mlflow.register_model(
    model_uri=f"runs:/{run_id}/model",
    name=model_name
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Created version '2' of model 'workspace.default.nyc-taxi-model'.


In [13]:
# Point the "Challenger" alias at the model version that was just registered.
client = MlflowClient()

model_version = result.version
new_alias = "Challenger"

client.set_registered_model_alias(name=model_name, alias=new_alias, version=model_version)

In [36]:
import mlflow.pyfunc
# Both models are addressed by their training run ("runs:/<run_id>/model"),
# not through the registered-model aliases.
champion_uri = f"runs:/{CHAMPIO_RUN}/model"
challenger_uri = f"runs:/{CHALLENGER_RUN}/model"

# Load each logged model through the generic pyfunc interface.
champion_version = mlflow.pyfunc.load_model(champion_uri)
challenger_version = mlflow.pyfunc.load_model(challenger_uri)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

In [37]:
# Evaluation data: the 2025-03 file. NOTE: this rebinds df_val, replacing the 2025-02 validation frame.
df_val = read_dataframe('../data/green_tripdata_2025-03.parquet')

In [38]:
# Combined pickup/dropoff feature for the 2025-03 evaluation data.
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']

# Reload the DictVectorizer that the training cells pickled to disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("preprocessor/preprocessor.b", "rb") as f:
    dv = pickle.load(f)

# preprocess() builds the per-row feature records itself, so no separate
# to_dict() pass over the frame is needed here.
X_val = preprocess(df_val, dv)

In [39]:
# Targets for the 2025-03 evaluation data.
target = 'duration'
y_val = df_val[target].values

# NOTE(review): this casts the training durations to integers, and the result
# is not used by any later cell visible here. Re-running a training cell after
# this one would fit on the truncated targets — confirm it is intentional.
y_train = y_train.astype(int)

In [40]:
# Wrap the 2025-03 evaluation data as an MLflow dataset.
# X_val is a scipy CSR matrix: its `.data` attribute is only the flat array of
# stored non-zero values, not the (rows x features) matrix, so it does not line
# up with y_val. Densify it so features and targets have one row per trip.
# Note: the dense array holds rows x features float64 values and can be large.
val_dataset = mlflow.data.from_numpy(X_val.toarray(), targets=y_val, name="green_tripdata_2025-03")

En este caso, mi champion fue mejor en comparación con el challenger, y mi mejor modelo fue Random Forest.

In [41]:
# NOTE(review): the recorded output of this cell is an MlflowException — schema
# enforcement fails because the model's signature expects inputs of shape
# (-1, 449) while X_val is a sparse matrix with 4159 columns. Presumably the
# failing model was trained with a different fitted vectorizer; each model needs
# features built with its own preprocessor — TODO confirm and fix before
# comparing the two models.
y_champ_pred = champion_version.predict(X_val)
y_challenger_pred = challenger_version.predict(X_val)

MlflowException: Failed to enforce schema of data '<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 93697 stored elements and shape (48336, 4159)>
  Coords	Values
  (0, 3391)	1.0
  (0, 4158)	2.2
  (1, 2497)	1.0
  (1, 4158)	1.06
  (2, 4158)	18.91
  (3, 3717)	1.0
  (3, 4158)	8.36
  (4, 3072)	1.0
  (4, 4158)	0.82
  (5, 4158)	4.94
  (6, 2150)	1.0
  (6, 4158)	0.81
  (7, 1798)	1.0
  (7, 4158)	1.56
  (8, 3412)	1.0
  (8, 4158)	1.53
  (9, 3996)	1.0
  (9, 4158)	4.56
  (10, 3582)	1.0
  (10, 4158)	0.89
  (11, 4041)	1.0
  (11, 4158)	1.17
  (12, 3953)	1.0
  (12, 4158)	2.48
  (13, 4158)	2.65
  :	:
  (48323, 1762)	1.0
  (48323, 4158)	9.1
  (48324, 2497)	1.0
  (48324, 4158)	1.54
  (48325, 642)	1.0
  (48325, 4158)	1.51
  (48326, 4158)	8.28
  (48327, 758)	1.0
  (48327, 4158)	3.3
  (48328, 3097)	1.0
  (48328, 4158)	2.25
  (48329, 799)	1.0
  (48329, 4158)	1.09
  (48330, 798)	1.0
  (48330, 4158)	1.92
  (48331, 2038)	1.0
  (48331, 4158)	1.57
  (48332, 3275)	1.0
  (48332, 4158)	2.31
  (48333, 3000)	1.0
  (48333, 4158)	3.57
  (48334, 3415)	1.0
  (48334, 4158)	13.51
  (48335, 1826)	1.0
  (48335, 4158)	4.64' with schema '[Tensor('float64', (-1, 449))]'. Error: Shape of input (48336, 4159) does not match expected shape (-1, 449).