In [1]:
# 
# # TFDV Lab: Exploration & Schema Generation
# In this notebook, we will explore the dataset, fix anomalies, and **generate the Golden Schema**
# that will be used by our production pipeline.

# %%
import os
import pandas as pd
import tensorflow_data_validation as tfdv
import yaml

# %%
# Load Configuration
# The YAML file provides the `data` section (column names, dataset URL) that
# drives the rest of the notebook. The encoding is pinned to UTF-8 so the file
# is decoded the same way on every platform instead of falling back to the
# locale's default encoding.
with open('../config/lab_config.yaml', 'r', encoding='utf-8') as f:
    # safe_load is used rather than yaml.load: it only constructs plain
    # Python objects from the document.
    config = yaml.safe_load(f)

print("Loaded Config for Columns:", config['data']['column_names'])

# %%
# 1. Load Data
# Column names are supplied from the config via `names`, and whitespace that
# follows each delimiter is skipped.
df = pd.read_csv(
    config['data']['train_url'],
    names=config['data']['column_names'],
    skipinitialspace=True,
)

# Split: a seeded 80% sample is the Train set (Golden State); every row whose
# index label was not sampled forms the 20% Eval set (to test against).
train_df = df.sample(frac=0.8, random_state=42)
eval_df = df.loc[~df.index.isin(train_df.index)]

print(f"Training set: {len(train_df)} examples")

# %%
# 2. Generate Statistics
# Statistics are computed over the training split only, then rendered
# for visual inspection.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=train_df)
tfdv.visualize_statistics(lhs_statistics=train_stats)

# %%
# 3. Infer Initial Schema
# Derive a first-pass schema from the training statistics and display it.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

# %%
# 4. Refine Schema (MLOps Step)
# In production, we often relax constraints.
# For example, 'age' could be treated as optional if it is missing in <10% of cases.
# NOTE: this cell only looks up the 'age' feature in the schema to demonstrate
# access; no constraint is changed here.
# NOTE(review): applying the relaxation would presumably mean editing the
# returned feature (e.g. its `presence.min_fraction`) -- confirm against the
# TFDV schema documentation before relying on it.
age_feature = tfdv.get_feature(schema, 'age')

# %%
# 5. Save the Golden Schema (The "Artifact")
# The schema is written out as a text file; this file becomes the 'contract'
# for our pipeline, so persisting it is the key output of the notebook.
output_dir = "../artifacts"
schema_file = os.path.join(output_dir, 'schema.pbtxt')

# Make sure the destination directory exists before writing into it.
os.makedirs(output_dir, exist_ok=True)
tfdv.write_schema_text(schema, schema_file)
print(f"Schema saved to {schema_file}")

# %% [markdown]
# ### Next Steps
# Now that `schema.pbtxt` is saved, you can run the `pipelines/validate_data.py` script
# to simulate how a production system would validate new incoming data using this schema.

ModuleNotFoundError: No module named 'tensorflow_data_validation'