"""
**2. Preparar el entorno y cargar el dataset

Este notebook carga y preprocesa el dataset de covertype para realizar la selección de características y preparar el ambiente de desarrollo.
"""

In [1]:
# Importar librerías necesarias
import os
import requests
import pandas as pd
from pathlib import Path
# Importar librerías para preprocesamiento y selección de características
from dataclasses import dataclass
from typing import List
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

"""
## 2.2 Cargar y explorar el dataset
"""

Se define el directorio donde se guardarán los datos y se descarga el dataset si aún no existe.
"""

In [2]:
# Define el directorio donde se guardarán los datos y la ruta del archivo CSV
_data_root = Path("./data/covertype")
_data_root.mkdir(parents=True, exist_ok=True)
_data_filepath = _data_root / "covertype_train.csv"

In [3]:
# Descargar el dataset si no existe, se descarga desde la URL
if not _data_filepath.is_file():
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = "https://docs.google.com/uc?export=download&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9"  
    r = requests.get(url, allow_redirects=True, stream=True)
    with open(_data_filepath, 'wb') as f:
        f.write(r.content)
    print("Dataset descargado.")
else:
    print("El dataset ya existe en la ruta especificada.")

El dataset ya existe en la ruta especificada.


In [4]:
# Cargar el dataset en un DataFrame
df = pd.read_csv(_data_filepath)
print("Dimensiones del dataset:", df.shape)
# Mostrar las primeras 5 filas
df.head()

Dimensiones del dataset: (116203, 13)


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2991,119,7,67,11,1015,233,234,133,1570,Commanche,C7202,1
1,2876,3,18,485,71,2495,192,202,144,1557,Commanche,C7757,1
2,3171,315,2,277,9,4374,213,237,162,1052,Rawah,C7745,0
3,3087,342,13,190,31,4774,193,221,166,752,Rawah,C7745,0
4,2835,158,10,212,41,3596,231,242,141,3280,Rawah,C4744,1


In [5]:
# Mostrar información general del dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116203 entries, 0 to 116202
Data columns (total 13 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   Elevation                           116203 non-null  int64 
 1   Aspect                              116203 non-null  int64 
 2   Slope                               116203 non-null  int64 
 3   Horizontal_Distance_To_Hydrology    116203 non-null  int64 
 4   Vertical_Distance_To_Hydrology      116203 non-null  int64 
 5   Horizontal_Distance_To_Roadways     116203 non-null  int64 
 6   Hillshade_9am                       116203 non-null  int64 
 7   Hillshade_Noon                      116203 non-null  int64 
 8   Hillshade_3pm                       116203 non-null  int64 
 9   Horizontal_Distance_To_Fire_Points  116203 non-null  int64 
 10  Wilderness_Area                     116203 non-null  object
 11  Soil_Type                           116

# 3. Selección de características

Se utilizará un dataclass para almacenar la configuración de los datos, se eliminarán las columnas no numéricas,
se realizará la selección de las mejores características, y se guardará el DataFrame preprocesado.
"""

In [6]:
# Definir la configuración de los datos usando un dataclass
@dataclass
class DataConfig:
    target_col: str
    non_numeric_cols: List[str]
    final_df_path: Path

    
# Definir data_root_prepro y crear el directorio si es necesario
data_root_prepro = Path("./data_prepro")
data_root_prepro.mkdir(parents=True, exist_ok=True)

 # Crear la instancia de configuración con los valores deseados
config = DataConfig(
    target_col="Cover_Type",
    non_numeric_cols=list(df.select_dtypes(include=['object']).columns),
    final_df_path= data_root_prepro / "covertype_preprocessed.csv"
)

"""
## 3.1 Selección de las mejores características

Se elimina las columnas no numéricas, se separan las características (X) y la etiqueta (y),
se utiliza StandardScaler para normalizar los datos y SelectKBest con la función de puntuación f_classif para
seleccionar las 8 mejores características.
"""

In [7]:
# %% [code] tags=[]
# Eliminar las columnas no numéricas
df_1 = df.drop(columns=config.non_numeric_cols)

# Separar las características y la etiqueta
X = df_1.drop(columns=[config.target_col])
y = df_1[config.target_col].astype('category')

# Normalizar los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Convertir de nuevo a DataFrame, manteniendo los nombres originales
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Seleccionar las 8 mejores características usando f_classif
selector = SelectKBest(score_func=f_classif, k=8)
selector.fit(X, y)

# Crear un DataFrame que muestre qué columnas se retuvieron
selected_columns_df = pd.DataFrame({
    'Column': X.columns,
    'Retain': selector.get_support()
})
selected_columns_df

Unnamed: 0,Column,Retain
0,Elevation,True
1,Aspect,False
2,Slope,True
3,Horizontal_Distance_To_Hydrology,True
4,Vertical_Distance_To_Hydrology,True
5,Horizontal_Distance_To_Roadways,True
6,Hillshade_9am,True
7,Hillshade_Noon,True
8,Hillshade_3pm,False
9,Horizontal_Distance_To_Fire_Points,True


In [8]:
# %% [code] tags=[]
# Seleccionar las mejores características y agregar la etiqueta de vuelta
X_selected = X.loc[:, selector.get_support()]
final_df = X_selected.copy()
final_df[config.target_col] = y.values

# Guardar el DataFrame preprocesado
final_df.to_csv(config.final_df_path, index=False)
print("DataFrame preprocesado guardado en:", config.final_df_path)

DataFrame preprocesado guardado en: data_prepro/covertype_preprocessed.csv


In [9]:
final_df.head()

Unnamed: 0,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points,Cover_Type
0,2991,7,67,11,1015,233,234,1570,1
1,2876,18,485,71,2495,192,202,1557,1
2,3171,2,277,9,4374,213,237,1052,0
3,3087,13,190,31,4774,193,221,752,0
4,2835,10,212,41,3596,231,242,3280,1


In [10]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116203 entries, 0 to 116202
Data columns (total 9 columns):
 #   Column                              Non-Null Count   Dtype   
---  ------                              --------------   -----   
 0   Elevation                           116203 non-null  int64   
 1   Slope                               116203 non-null  int64   
 2   Horizontal_Distance_To_Hydrology    116203 non-null  int64   
 3   Vertical_Distance_To_Hydrology      116203 non-null  int64   
 4   Horizontal_Distance_To_Roadways     116203 non-null  int64   
 5   Hillshade_9am                       116203 non-null  int64   
 6   Hillshade_Noon                      116203 non-null  int64   
 7   Horizontal_Distance_To_Fire_Points  116203 non-null  int64   
 8   Cover_Type                          116203 non-null  category
dtypes: category(1), int64(8)
memory usage: 7.2 MB


"""
# 4. Data Pipeline 
A continuación se muestra un ejemplo básico de cómo configurar el contexto interactivo de TFX y generar ejemplos usando CsvExampleGen.

**4.1 Configurar el contexto interactivo

In [11]:
# Instala TFX
!pip install tfx

import os
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# Define la raíz del pipeline
pipeline_root = os.path.join(os.getcwd(), "tfx_pipeline_output")

# Crea el contexto interactivo
context = InteractiveContext(pipeline_root=str(pipeline_root))

# Verifica el directorio
print("Pipeline root:", pipeline_root)

[0m

2025-03-04 19:26:15.877176: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-04 19:26:15.884127: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-04 19:26:15.906232: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-04 19:26:15.947639: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-04 19:26:15.947720: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-04 19:26:15.977240: I tensorflow/core/platform/cpu_feature_guard.cc:

Pipeline root: /home/jovyan/work/tfx_pipeline_output


**Generando ejemplos

In [12]:
 # Componente para generar ejemplos a partir de archivos CSV.
from tfx.components import CsvExampleGen
from tfx.proto import example_gen_pb2

# 2) Define la carpeta donde se encuentra tu CSV.
data_root_prepro = os.path.join(os.getcwd(), "data_prepro")
print("Archivos en data_prepro:", os.listdir(data_root_prepro))

# 3) Configura el input_config para que solo se procese 'covertype_preprocessed.csv'
input_config = example_gen_pb2.Input(
    splits=[
        example_gen_pb2.Input.Split(
            name='train',       # Nombre del split (puedes ajustarlo si lo deseas)
            pattern='covertype_preprocessed.csv'  # Solo este archivo
        )
    ]
)

# 4) Instancia CsvExampleGen usando el input_config definido
example_gen = CsvExampleGen(
    input_base=data_root_prepro,
    input_config=input_config
)

# 5) Ejecuta el componente en el contexto interactivo
context.run(example_gen)

print("CsvExampleGen ejecutado correctamente")


Archivos en data_prepro: ['covertype_preprocessed.csv']




CsvExampleGen ejecutado correctamente


**4.3 Estadísticas

In [13]:
# Obtener el objeto de ejemplo (Artifact)
artifact = example_gen.outputs['examples'].get()[0]

# Imprimir información del artifact
print(f'split names: {artifact.split_names}')
print(f'artifact uri: {artifact.uri}')

# Instanciar el componente StatisticsGen
from tfx.components import StatisticsGen
statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

#Ejecutar el componente en el contexto interactivo
context.run(statistics_gen)
print("StatisticsGen ejecutado correctamente")

split names: ["train", "eval"]
artifact uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44
StatisticsGen ejecutado correctamente


In [14]:
#Mostrar la salida de estadísticas
context.show(statistics_gen.outputs['statistics'])

Despues de revisar lo anterior se puede observar como dice el documento del proyecto que la columna ceros para Cover type está resaltada en rojo. 


**4.4 Inferir el esquema (SchemaGen)

A partir de las estadísticas calculadas, SchemaGen infiere un esquema que describe las características de tus datos (tipos, rangos, valores esperados, etc.).

In [15]:
from tfx.components import SchemaGen

# Instanciar SchemaGen utilizando las estadísticas generadas
schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],)

# Ejecutar el componente en el contexto interactivo
context.run(schema_gen)

print("SchemaGen ejecutado correctamente")

SchemaGen ejecutado correctamente


In [16]:
# Visualizar el Schema
context.show(schema_gen.outputs['schema'])

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Cover_Type',INT,required,,-
'Elevation',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,-
'Vertical_Distance_To_Hydrology',INT,required,,-


** 4.5 Curando el esquema

En este paso se revisa y, si es necesario, se ajusta (curar) el esquema inferido. Esto puede implicar:

Establecer rangos de valores para ciertas columnas (por ejemplo, Hillshade entre 0 y 255, Slope entre 0 y 90, etc.).
Declarar que la columna de la etiqueta (Cover Type) es categórica.
Este proceso se puede hacer manualmente editando el archivo de esquema (por ejemplo, un archivo YAML o JSON generado por SchemaGen) o programáticamente usando las utilidades de TensorFlow Data Validation (TFDV).

In [17]:
import tensorflow_data_validation as tfdv
from tensorflow_metadata.proto.v0 import schema_pb2

# 1. Cargar el esquema inferido
# Se obtiene el esquema inferido desde el artifact generado por SchemaGen bajo el nombre "schema.pbtxt"
schema_path = schema_gen.outputs['schema'].get()[0].uri + "/schema.pbtxt"
# 2.Cargar el esquema desde el archivo (en formato texto Protobuf)
schema = tfdv.load_schema_text(schema_path)
print("Tipo de esquema:", type(schema))

Tipo de esquema: <class 'tensorflow_metadata.proto.v0.schema_pb2.Schema'>


In [18]:
# 3.Curar el esquema: establecer rangos esperados
# Establecer rango para 'Hillshade_9am' (0 a 255)
tfdv.set_domain(schema, 'Hillshade_9am', schema_pb2.IntDomain(min=0, max=255))
# Establecer rango para 'Hillshade_Noon' (0 a 255)
tfdv.set_domain(schema, 'Hillshade_Noon', schema_pb2.IntDomain(min=0, max=255))
# Establecer rango para 'Slope' (0 a 90)
tfdv.set_domain(schema, 'Slope', schema_pb2.IntDomain(min=0, max=90))
# Para 'Cover_Type': dado que en el preprocesamiento se restó 1 y se trata como etiqueta,
# se define un dominio de tipo string con los valores de '0' a '6'
tfdv.set_domain(schema, 'Cover_Type', schema_pb2.StringDomain(value=['0', '1', '2', '3', '4', '5', '6']))

A continuación se actualiza el esquema en memoria,además escribe esos cambios en el archivo del esquema y vuelve a cargarlo en el artifact, garantizando que el pipeline utilizará el esquema curado.

In [19]:
schema.feature[0].type = schema_pb2.FeatureType.BYTES

In [20]:
schema

feature {
  name: "Cover_Type"
  type: BYTES
  string_domain {
    value: "0"
    value: "1"
    value: "2"
    value: "3"
    value: "4"
    value: "5"
    value: "6"
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Elevation"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Hillshade_9am"
  type: INT
  int_domain {
    min: 0
    max: 255
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Hillshade_Noon"
  type: INT
  int_domain {
    min: 0
    max: 255
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Horizontal_Distance_To_Fire_Points"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Horizontal_

In [21]:
# 4. Mostrar el esquema actualizado en memoria
print("\nEsquema actualizado (en memoria):")
tfdv.display_schema(schema)


Esquema actualizado (en memoria):


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Cover_Type',STRING,required,,'Cover_Type_domain'
'Elevation',INT,required,,-
'Hillshade_9am',INT,required,,min: 0; max: 255
'Hillshade_Noon',INT,required,,min: 0; max: 255
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,min: 0; max: 90
'Vertical_Distance_To_Hydrology',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Cover_Type_domain',"'0', '1', '2', '3', '4', '5', '6'"


In [22]:
# 5. Mostrar el esquema original (tal como está almacenado en el artifact) para comparar
print("\nEsquema original inferido (antes de sobrescribir):")
context.show(schema_gen.outputs['schema'])


Esquema original inferido (antes de sobrescribir):


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Cover_Type',INT,required,,-
'Elevation',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,-
'Vertical_Distance_To_Hydrology',INT,required,,-


In [23]:
# 6. Sobrescribir el archivo de esquema con el esquema curado para que los cambios sean persistentes
tfdv.write_schema_text(schema, schema_path)
print("\nEsquema actualizado guardado en:", schema_path)


Esquema actualizado guardado en: /home/jovyan/work/tfx_pipeline_output/SchemaGen/schema/46/schema.pbtxt


In [24]:
# 7. Verificar que el artifact ahora refleje el esquema actualizado
print("\nEsquema en el artifact después de la actualización:")
context.show(schema_gen.outputs['schema'])


Esquema en el artifact después de la actualización:


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Cover_Type',STRING,required,,'Cover_Type_domain'
'Elevation',INT,required,,-
'Hillshade_9am',INT,required,,min: 0; max: 255
'Hillshade_Noon',INT,required,,min: 0; max: 255
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,min: 0; max: 90
'Vertical_Distance_To_Hydrology',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Cover_Type_domain',"'0', '1', '2', '3', '4', '5', '6'"


**4.6 Entornos de esquema y validación de datos de inferencia

In [27]:
from sklearn.model_selection import train_test_split

# Definir rutas y variables
data_root = Path("./data")
serving_csv_path = data_root / "serving_data.csv"
schema_path = Path("./tfx_pipeline_output/SchemaGen/schema/46/schema.pbtxt")

# 1. Generar el CSV de SERVICIO (simula datos de inferencia sin etiqueta)
def generate_serving_csv(df: pd.DataFrame, path: Path, target: str) -> str:
    # Dividir: 70% entrena, 30% de los datos para SERVICIO.
    _, serving_df = train_test_split(df, test_size=0.3, random_state=42)
    # Eliminar la columna de la etiqueta para simular datos de inferencia
    serving_df = serving_df.drop(columns=[target])
    # Guardar el DataFrame en CSV
    serving_df.to_csv(path, index=False)
    return str(path)

# Generar el archivo CSV de datos de servicio
serving_data = generate_serving_csv(final_df, serving_csv_path, target="Cover_Type")
print("CSV de datos de SERVICIO generado en:", serving_data)

CSV de datos de SERVICIO generado en: data/serving_data.csv


In [28]:
# 2. Cargar el esquema curado
schema = tfdv.load_schema_text(str(schema_path))
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Cover_Type',STRING,required,,'Cover_Type_domain'
'Elevation',INT,required,,-
'Hillshade_9am',INT,required,,min: 0; max: 255
'Hillshade_Noon',INT,required,,min: 0; max: 255
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,min: 0; max: 90
'Vertical_Distance_To_Hydrology',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Cover_Type_domain',"'0', '1', '2', '3', '4', '5', '6'"


In [29]:
# 3. Configurar los entornos en el esquema (4.6)
# Se agregan los entornos TRAINING y SERVING para diferenciar datos de entrenamiento de datos de inferencia.
if 'TRAINING' not in schema.default_environment:
    schema.default_environment.append('TRAINING')
if 'SERVING' not in schema.default_environment:
    schema.default_environment.append('SERVING')

# Indicar que la característica 'Cover_Type' (la etiqueta) NO se espera en SERVING
cover_feature = tfdv.get_feature(schema, 'Cover_Type')
if cover_feature is not None:
    if 'SERVING' not in cover_feature.not_in_environment:
        cover_feature.not_in_environment.append('SERVING')
    print("Configurado 'Cover_Type' para no estar en SERVING.")
else:
    print("La característica 'Cover_Type' no se encontró en el esquema.")


Configurado 'Cover_Type' para no estar en SERVING.


In [30]:
# 4. Guardar y visualizar el esquema actualizado
tfdv.write_schema_text(schema, str(schema_path))
print("Esquema actualizado guardado en:", schema_path)
print("Entornos definidos en el esquema:", list(schema.default_environment))
tfdv.display_schema(schema)

Esquema actualizado guardado en: tfx_pipeline_output/SchemaGen/schema/46/schema.pbtxt
Entornos definidos en el esquema: ['TRAINING', 'SERVING']


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Cover_Type',STRING,required,,'Cover_Type_domain'
'Elevation',INT,required,,-
'Hillshade_9am',INT,required,,min: 0; max: 255
'Hillshade_Noon',INT,required,,min: 0; max: 255
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,min: 0; max: 90
'Vertical_Distance_To_Hydrology',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Cover_Type_domain',"'0', '1', '2', '3', '4', '5', '6'"


In [None]:
# 5. Generar estadísticas para el conjunto de datos de SERVICIO usando el esquema actualizado
# Configurar las opciones de estadísticas para que usen el esquema actualizado
options = tfdv.StatsOptions(schema=schema)
# Generar estadísticas a partir del CSV de servicio
serving_stats = tfdv.generate_statistics_from_csv(data_location=serving_data, stats_options=options)
print("Estadísticas generadas para SERVING DATA:")
# Visualizar las estadísticas
tfdv.visualize_statistics(serving_stats)


Estadísticas generadas para SERVING DATA:


In [33]:
# 5. Validar anomalías en los datos de SERVICIO
# Validación sin especificar entorno: se comparan las estadísticas con el esquema sin usar la información de entornos.
serving_anomalies = tfdv.validate_statistics(statistics=serving_stats, schema=schema)
print("Anomalías en SERVING DATA (sin entorno):")
tfdv.display_anomalies(serving_anomalies)

Anomalías en SERVING DATA (sin entorno):


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Cover_Type',Column dropped,Column is completely missing


In [35]:
# Validación especificando el entorno "SERVING" para que la ausencia de la etiqueta no se marque como anomalía.
serving_anomalies_with_env = tfdv.validate_statistics(statistics=serving_stats, schema=schema, environment='SERVING')
print("Anomalías en SERVING DATA (con entorno 'SERVING'):")
tfdv.display_anomalies(serving_anomalies_with_env)

Anomalías en SERVING DATA (con entorno 'SERVING'):


In [36]:
# Imprimir los entornos configurados en el esquema
print("Entornos en el esquema:", schema.default_environment)

Entornos en el esquema: ['TRAINING', 'SERVING']


In [37]:
print(schema)

feature {
  name: "Cover_Type"
  type: BYTES
  string_domain {
    value: "0"
    value: "1"
    value: "2"
    value: "3"
    value: "4"
    value: "5"
    value: "6"
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  not_in_environment: "SERVING"
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Elevation"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Hillshade_9am"
  type: INT
  int_domain {
    min: 0
    max: 255
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Hillshade_Noon"
  type: INT
  int_domain {
    min: 0
    max: 255
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Horizontal_Distance_To_Fire_Points"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }


In [38]:
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Cover_Type',STRING,required,,'Cover_Type_domain'
'Elevation',INT,required,,-
'Hillshade_9am',INT,required,,min: 0; max: 255
'Hillshade_Noon',INT,required,,min: 0; max: 255
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,min: 0; max: 90
'Vertical_Distance_To_Hydrology',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Cover_Type_domain',"'0', '1', '2', '3', '4', '5', '6'"


** 4.7 Genere nuevas estadısticas usando el esquema actualizado


In [41]:
from tfx.v1.components import ImportSchemaGen

#Creación y ejecución de un nuevo artefacto de esquema y generación de estadísticas con el esquema importado.
# 1. Crear un nuevo artefacto que apunte al esquema anterior guardado
schema_new = ImportSchemaGen(schema_file=str (schema_path))
context.run(schema_new)

0,1
.execution_id,47
.component,function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } ImportSchemaGen at 0x782a8d50d3d0.inputs{}.outputs['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47.exec_properties['schema_file']tfx_pipeline_output/SchemaGen/schema/46/schema.pbtxt
.component.inputs,{}
.component.outputs,['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.inputs,{}
.outputs,['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47
.exec_properties,['schema_file']tfx_pipeline_output/SchemaGen/schema/46/schema.pbtxt

0,1
['schema'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type_name,Schema
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type,<class 'tfx.types.standard_artifacts.Schema'>
.uri,/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
['schema_file'],tfx_pipeline_output/SchemaGen/schema/46/schema.pbtxt

0,1
['schema'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type_name,Schema
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type,<class 'tfx.types.standard_artifacts.Schema'>
.uri,/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47


In [42]:
# 2. Generación de estadísticas utilizando el esquema importado
statistics_new = StatisticsGen(
    examples = example_gen.outputs['examples'],
    schema = schema_new.outputs['schema']
)
context.run(statistics_new)

0,1
.execution_id,48
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } StatisticsGen at 0x7829fff6c550.inputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x782a61db97c0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47.outputs['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7829fff6c970.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""].exec_properties['stats_options_json']None['exclude_splits'][]"
.component.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x782a61db97c0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47"
.component.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7829fff6c970.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"

0,1
.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x782a61db97c0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0['schema'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47"
.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7829fff6c970.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"
.exec_properties,['stats_options_json']None['exclude_splits'][]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x782a61db97c0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0"
['schema'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
.type_name,Schema
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type,<class 'tfx.types.standard_artifacts.Schema'>
.uri,/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7829fff6c970.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48
.span,0
.split_names,"[""train"", ""eval""]"

0,1
['stats_options_json'],
['exclude_splits'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x782a61db97c0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0"
['schema'],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Schema' (1 artifact) at 0x7829fff6ca30.type_nameSchema._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44) at 0x782a0e43f670.type<class 'tfx.types.standard_artifacts.Examples'>.uri/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/home/jovyan/work/tfx_pipeline_output/CsvExampleGen/examples/44
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
.type_name,Schema
._artifacts,[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
[0],function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Schema' (uri: /home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47) at 0x782a630c68e0.type<class 'tfx.types.standard_artifacts.Schema'>.uri/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
.type,<class 'tfx.types.standard_artifacts.Schema'>
.uri,/home/jovyan/work/tfx_pipeline_output/ImportSchemaGen/schema/47

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7829fff6c970.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48) at 0x782a8d50d190.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/home/jovyan/work/tfx_pipeline_output/StatisticsGen/statistics/48
.span,0
.split_names,"[""train"", ""eval""]"


In [43]:
# 3. Mostrar las estadísticas generadas
context.show(statistics_new.outputs['statistics'])

** 4.8 Comprobar anomalías (ExampleValidator)

Ahora, utiliza ExampleValidator para detectar posibles anomalías en los datos comparándolos contra el esquema curado.

In [44]:
from tfx.components import ExampleValidator

# Validación de ejemplos contra un esquema utilizando TensorFlow Data Validation (TFDV).
example_validator = ExampleValidator(
    statistics = statistics_new.outputs['statistics'],
    schema = schema_new.outputs['schema']
)
context.run(example_validator)
context.show(example_validator.outputs['anomalies'])

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Cover_Type',Unexpected data type,Expected data of type: BYTES but got INT


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Cover_Type',Unexpected data type,Expected data of type: BYTES but got INT


** 4.9 Ingeniería de características y Transformación (Transform)

In [92]:
# Set the constants module filename
_constants_module_file = 'my_constants.py'

In [93]:
%%writefile {_constants_module_file}

# Features with string data types that will be converted to indices
CATEGORICAL_FEATURE_KEYS = ['Wilderness_Area']

# Numerical features that are marked as continuous
NUMERIC_FEATURE_KEYS = [
    'Elevation','Hillshade_9am', 'Hillshade_Noon', 'Horizontal_Distance_To_Fire_Points', 'Horizontal_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Slope', 'Vertical_Distance_To_Hydrology'
]

# Feature that can be grouped into buckets
#BUCKET_FEATURE_KEYS = ['age']

# Number of buckets used by tf.transform for encoding each bucket feature.
#FEATURE_BUCKET_COUNT = {'age': 4}

# Feature that the model will predict
LABEL_KEY = 'Cover_Type'

# Utility function for renaming the feature
def transformed_name(key):
    return key + '_xf'

Writing my_constants.py


In [94]:
# Set the transform module filename
_transform_module_file = 'my_transform.py'

In [95]:
%%writefile {_transform_module_file}

import tensorflow as tf
import tensorflow_transform as tft

import importlib
import my_constants
importlib.reload(my_constants)

# Unpack the contents of the constants module
_NUMERIC_FEATURE_KEYS = my_constants.NUMERIC_FEATURE_KEYS
_CATEGORICAL_FEATURE_KEYS = my_constants.CATEGORICAL_FEATURE_KEYS
#_BUCKET_FEATURE_KEYS = my_constants.BUCKET_FEATURE_KEYS
#_FEATURE_BUCKET_COUNT = my_constants.FEATURE_BUCKET_COUNT
_LABEL_KEY = my_constants.LABEL_KEY
_transformed_name = my_constants.transformed_name


# Define the transformations
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    Args:
        inputs: map from feature keys to raw not-yet-transformed features.
    Returns:
        Map from string feature key to transformed feature operations.
    """
    outputs = {}

    # Scale these features to the range [0,1]
    for key in _NUMERIC_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.scale_to_0_1(
            inputs[key])
    
    # Bucketize these features
    #for key in _BUCKET_FEATURE_KEYS:
    #    outputs[_transformed_name(key)] = tft.bucketize(
    #        inputs[key], _FEATURE_BUCKET_COUNT[key])

    # Convert strings to indices in a vocabulary
    for key in _CATEGORICAL_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(inputs[key])

    # Convert the label strings to an index
    outputs[_transformed_name(_LABEL_KEY)] = tft.compute_and_apply_vocabulary(inputs[_LABEL_KEY])

    return outputs

Writing my_transform.py


4.10 Transformar

In [96]:
import tensorflow as tf
from tfx.components import Transform
# Ignore TF warning messages
tf.get_logger().setLevel('ERROR')

# Instantiate the Transform component
transform = Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath(_transform_module_file))

# Run the component
context.run(transform)

running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying my_constants.py -> build/lib
copying my_transform.py -> build/lib
installing to /tmp/tmp2dikjp4h
running install
running install_lib
copying build/lib/my_constants.py -> /tmp/tmp2dikjp4h
copying build/lib/my_transform.py -> /tmp/tmp2dikjp4h
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Transform.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
reading manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
Copying tfx_user_code_Transform.egg-info to /tmp/tmp2dikjp4h/tfx_user_code_Transform-0.0+33a58799dfecb2c0df99f08b2c079f5b5336eef0ee2a3419b597363674a6562a-py3.9.egg-info


[0m

Processing ./tfx_pipeline_output/_wheels/tfx_user_code_Transform-0.0+33a58799dfecb2c0df99f08b2c079f5b5336eef0ee2a3419b597363674a6562a-py3-none-any.whl
Installing collected packages: tfx-user-code-Transform
Successfully installed tfx-user-code-Transform-0.0+33a58799dfecb2c0df99f08b2c079f5b5336eef0ee2a3419b597363674a6562a


[0m

Processing ./tfx_pipeline_output/_wheels/tfx_user_code_Transform-0.0+33a58799dfecb2c0df99f08b2c079f5b5336eef0ee2a3419b597363674a6562a-py3-none-any.whl
Installing collected packages: tfx-user-code-Transform
Successfully installed tfx-user-code-Transform-0.0+33a58799dfecb2c0df99f08b2c079f5b5336eef0ee2a3419b597363674a6562a


[0m

KeyError: 'Wilderness_Area'

In [None]:
# Get the uri of the transform graph
transform_graph_uri = transform.outputs['transform_graph'].get()[0].uri

# List the subdirectories under the uri
os.listdir(transform_graph_uri)

In [None]:
# Get the URI of the output artifact representing the transformed examples
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
transformed_dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

In [None]:
from google.protobuf.json_format import MessageToDict
import pprint
pp = pprint.PrettyPrinter()

# Define a helper function to get individual examples
def get_records(dataset, num_records):
    '''Extracts records from the given dataset.
    Args:
        dataset (TFRecordDataset): dataset saved by ExampleGen
        num_records (int): number of records to preview
    '''
    
    # initialize an empty list
    records = []
    
    # Use the `take()` method to specify how many records to get
    for tfrecord in dataset.take(num_records):
        
        # Get the numpy property of the tensor
        serialized_example = tfrecord.numpy()
        
        # Initialize a `tf.train.Example()` to read the serialized data
        example = tf.train.Example()
        
        # Read the example data (output is a protocol buffer message)
        example.ParseFromString(serialized_example)
        
        # convert the protocol bufffer message to a Python dictionary
        example_dict = (MessageToDict(example))
        
        # append to the records list
        records.append(example_dict)
        
    return records

# Get 3 records from the dataset
sample_records_xf = get_records(transformed_dataset, 3)

# Print the output
pp.pprint(sample_records_xf)

** 5. Metadatos de aprendizaje automático

In [None]:
from ml_metadata import metadata_store
from ml_metadata.proto import metadata_store_pb2

In [None]:
# Configura la conexión al almacén de metadatos
connection_config = metadata_store_pb2.ConnectionConfig()
connection_config.sqlite.filename_uri = "./tfx_pipeline/metadata.sqlite"
connection_config.sqlite.connection_mode = 3  # READWRITE_OPENCREATE
store = metadata_store.MetadataStore(connection_config)

Acceso a artefactos almacenados

In [None]:
#permite ver propiedades de lso shemas segun tipo y catidad de shemas que se desean ver
def inf_schema(schema_type_name, limit=3):
    # Obtener los esquemas del tipo dado
    schemas = store.get_artifacts_by_type(schema_type_name)

    # Limitar la cantidad de esquemas si se proporciona un límite
    if limit:
        schemas = schemas[:limit]

    # Iterar sobre los esquemas y mostrar sus propiedades
    for schema in schemas:
        print(f"ID del Esquema: {schema.id}")
        print("Propiedades:")
        for key, value in schema.custom_properties.items():
            if value.HasField('string_value'):
                print(f"  {key}: {value.string_value}")
            elif value.HasField('int_value'):
                print(f"  {key}: {value.int_value}")
        print("-" * 40)  # Separador visual entre esquemas

In [None]:
for artifact_type in store.get_artifact_types():
    print("////"*15)
    print(artifact_type.name)
    print("*****"*15)
    inf_schema(artifact_type.name)

Seguimiento de artefactos

In [None]:
def get_parent_artifacts(store, artifact_id):
    """
    Obtiene los artefactos de entrada que fueron utilizados para generar un artefacto en particular.

    """
    parent_artifacts_info = {}

    # Obtener eventos asociados al artefacto dado
    events = store.get_events_by_artifact_ids([artifact_id])

    # Extraer los execution_ids relacionados con estos eventos
    execution_ids = {event.execution_id for event in events}

    # Obtener eventos asociados a las ejecuciones anteriores
    parent_events = store.get_events_by_execution_ids(list(execution_ids))

    # Filtrar los artefactos de entrada (INPUT) usados en esas ejecuciones
    input_artifact_ids = {event.artifact_id for event in parent_events if event.type == event.INPUT}

    # Obtener los detalles de los artefactos de entrada
    parent_artifacts = store.get_artifacts_by_id(list(input_artifact_ids))

    for artifact in parent_artifacts:
        try:
            # Obtener el tipo de artefacto usando su type_id
            artifact_type = store.get_artifact_types_by_id([artifact.type_id])[0].name
        except IndexError:
            # Manejo de error en caso de no encontrar el tipo del artefacto
            print(f"Error: No se encontró tipo para artifact_id {artifact.id} con type_id {artifact.type_id}")
            continue
        # Guardar la información del artefacto padre en el diccionario
        parent_artifacts_info[artifact.id] = {
            'type': artifact_type,
            'uri': artifact.uri
        }
    return parent_artifacts_info

In [None]:
parent_artifacts = get_parent_artifacts(store, 6)  # Reemplaza con un ID real
for artifact_id, info in parent_artifacts.items():
    print(f"ID: {artifact_id}, Tipo: {info['type']}, URI: {info['uri']}")

In [None]:
# import os
# from pathlib import Path
# import tensorflow_data_validation as tfdv
# from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# from tfx.components import ImportSchemaGen, Transform

# schema_path = Path("./tfx_pipeline_output/SchemaGen/schema/21/schema.pbtxt")

# # Crear el contexto interactivo de TFX:
# pipeline_root = os.path.join(os.getcwd(), "tfx_pipeline_output")
# context = InteractiveContext(pipeline_root=str(pipeline_root))
# print("Pipeline root:", pipeline_root)

# # --------------------------------------------------------------------------
# # 1. Importar el esquema curado como un artefacto TFX usando ImportSchemaGen
# # Esto convierte el archivo de esquema en un artifact del pipeline.
# schema_gen_component = ImportSchemaGen(schema_file=str(schema_path))
# context.run(schema_gen_component)
# imported_schema = schema_gen_component.outputs['schema']
# print("Esquema importado al ML Metadata:")
# context.show(imported_schema)

# # ---------------------------------------------------------------------------
# # 2. Definir la ruta a la función de preprocesamiento.
# # Aquí se asume que has guardado tu función en el archivo "transform_fn.py"
# # y que la función se llama "preprocessing_fn".
# preprocessing_fn_path = "transform_fn.preprocessing_fn"  # módulo.función

# # ---------------------------------------------------------------------------
# # 3. Instanciar el componente Transform
# transform = Transform(
#     examples=example_gen.outputs['examples'],  # Salida de CsvExampleGen
#     schema=imported_schema,                            # Esquema curado previamente
#     preprocessing_fn=preprocessing_fn_path      # Función de preprocesamiento a aplicar
# )

# # Ejecutar el componente Transform usando el contexto interactivo de TFX
# context.run(transform)
# print("Componente Transform ejecutado exitosamente.")

# # ---------------------------------------------------------------------------
# # 4. Obtener la URI de los ejemplos transformados
# transformed_examples_uri = transform.outputs['transformed_examples'].get()[0].uri
# print("Los ejemplos transformados se encuentran en:", transformed_examples_uri)

# # ---------------------------------------------------------------------------
# # 5. Generar estadísticas de los datos transformados para verificar la transformación
# # Esto genera estadísticas a partir de los archivos TFRecord generados por Transform.
# transformed_stats = tfdv.generate_statistics_from_tfrecord(data_location=transformed_examples_uri)
# print("Estadísticas de los datos transformados:")
# # Visualizar las estadísticas
# tfdv.visualize_statistics(transformed_stats)