# Ag IQ Feature Engineering

This notebook creates engineered features from the cleaned dataset to prepare for model training.

**Goal**: Transform cleaned data into model-ready features:
- Equipment features (age, utilization, production status)
- Temporal features (seasonality, cyclical patterns)
- Macro features (normalized economic indicators)
- Density features (volume patterns, rare category handling)

**Expected Output**: ~20-30 engineered features ready for LightGBM training


In [None]:
# =============================================================================
# CELL 1: Setup
# =============================================================================
# Standard library
import sys
from pathlib import Path

# Put the parent of the working directory at the front of the import path so
# that project packages living one level above the notebook can be imported.
sys.path.insert(0, '..')

# Third-party analysis and plotting stack
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display preferences: floats rendered with two decimals, up to 50 columns shown.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)

print("Setup complete")


In [None]:
# =============================================================================
# CELL 2: Load Processed Data
# =============================================================================
print("Loading processed data...")
training_df = pd.read_parquet('../data/processed/training_data.parquet')

# Report the size of the frame that was just read.
n_records, n_columns = training_df.shape
print(f"Loaded: {n_records:,} records")
print(f"Columns: {n_columns}")

# Earliest and latest sale dates; `.date()` drops the time-of-day component.
sold_dates = training_df['sold_date']
first_sale = sold_dates.min().date()
last_sale = sold_dates.max().date()
print(f"Date range: {first_sale} to {last_sale}")


In [None]:
# =============================================================================
# CELL 3: Initialize Feature Pipeline
# =============================================================================
from src.features.pipeline import FeatureConfig, FeaturePipeline

# Custom settings for this run, gathered in one place and handed to
# FeatureConfig as keyword arguments.
feature_settings = dict(
    hours_per_year_cap=2000,
    equipment_age_cap=50,
    min_category_frequency=20,
)
config = FeatureConfig(**feature_settings)

# The pipeline is built around the config object created above.
pipeline = FeaturePipeline(config=config)

print("Feature pipeline initialized")
print(f"Config: {pipeline.config}")


In [None]:
# =============================================================================
# CELL 4: Fit and Transform Features
# =============================================================================
banner = "=" * 60
print(banner)
print("FEATURE ENGINEERING")
print(banner)

# A single fit_transform call produces the feature-engineered frame from the
# training data.
featured_df = pipeline.fit_transform(training_df)

# Compare column counts before and after to show how many columns were added.
n_input_cols = len(training_df.columns)
n_output_cols = len(featured_df.columns)

print("\nFeature engineering complete!")
print(f"Input columns: {n_input_cols}")
print(f"Output columns: {n_output_cols}")
print(f"New features created: {n_output_cols - n_input_cols}")


In [None]:
# =============================================================================
# CELL 5: Inspect Engineered Features
# =============================================================================
cat_features, num_features = pipeline.get_feature_columns()

present_columns = featured_df.columns
total_rows = len(featured_df)

# Categorical features: report cardinality. Numbering follows the position in
# the pipeline's list, so a feature missing from the frame leaves a gap.
print("\n--- CATEGORICAL FEATURES ---")
for position, name in enumerate(cat_features, start=1):
    if name not in present_columns:
        continue
    n_unique = featured_df[name].nunique()
    print(f"  {position}. {name:25} {n_unique} unique values")

# Numeric features: report the share of rows holding a non-null value.
print("\n--- NUMERIC FEATURES ---")
for position, name in enumerate(num_features, start=1):
    if name not in present_columns:
        continue
    coverage_pct = featured_df[name].notna().sum() / total_rows * 100
    print(f"  {position:2}. {name:25} {coverage_pct:5.1f}% coverage")


In [None]:
# =============================================================================
# CELL 6: Sample Feature Values
# =============================================================================
# Columns of interest per feature family.
equipment_cols = ['equipment_age', 'hours_per_year', 'utilization_bucket',
                  'is_current_production', 'years_since_discontinued']
temporal_cols = ['sale_month', 'sale_quarter', 'month_sin', 'month_cos',
                 'is_planting_season', 'is_harvest_season']
macro_cols = ['barometer_norm', 'sentiment_spread', 'investment_confidence',
              'diesel_relative', 'el_nino_phase']

# Keep only the columns that actually exist in the engineered frame.
present_columns = featured_df.columns
display_eq = [c for c in equipment_cols if c in present_columns]
display_temp = [c for c in temporal_cols if c in present_columns]
display_macro = [c for c in macro_cols if c in present_columns]

# Print a heading and the first ten rows for each family, in the same order.
sample_sections = (
    ("\n--- SAMPLE EQUIPMENT FEATURES ---", display_eq),
    ("\n--- SAMPLE TEMPORAL FEATURES ---", display_temp),
    ("\n--- SAMPLE MACRO FEATURES ---", display_macro),
)
for heading, section_cols in sample_sections:
    print(heading)
    print(featured_df[section_cols].head(10))


In [None]:
# =============================================================================
# CELL 7: Visualize Key Features
# =============================================================================
# Fixed seed so the down-sampled scatter plot is identical on every run.
SCATTER_SAMPLE_SEED = 42
# Upper bound on the number of points drawn in the scatter panel.
SCATTER_SAMPLE_SIZE = 5000


def _hist_with_median(ax, series, xlabel, title, median_format=None):
    """Plot a 50-bin histogram of the non-null values of `series` on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    series : pandas Series of numeric values; nulls are dropped before plotting.
    xlabel, title : axis label and panel title.
    median_format : optional format spec (e.g. '.1f'). When given, a dashed red
        line marks the median and the legend shows it formatted with this spec.
    """
    values = series.dropna()
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.set_title(title)
    if median_format is not None:
        median = values.median()  # computed once, reused for the line and the label
        ax.axvline(median, color='red', linestyle='--',
                   label=f"Median: {median:{median_format}}")
        ax.legend()


fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# The two price panels need the price column in addition to their feature;
# checking it up front avoids a KeyError when it is absent.
has_price = 'price' in featured_df.columns

# Equipment age
if 'equipment_age' in featured_df.columns:
    _hist_with_median(axes[0, 0], featured_df['equipment_age'],
                      'Equipment Age (years)', 'Equipment Age Distribution',
                      median_format='.1f')

# Hours per year
if 'hours_per_year' in featured_df.columns:
    _hist_with_median(axes[0, 1], featured_df['hours_per_year'],
                      'Hours per Year', 'Utilization Rate Distribution',
                      median_format='.0f')

# Utilization bucket
if 'utilization_bucket' in featured_df.columns:
    util_counts = featured_df['utilization_bucket'].value_counts()
    axes[0, 2].bar(range(len(util_counts)), util_counts.values, alpha=0.7)
    axes[0, 2].set_xticks(range(len(util_counts)))
    axes[0, 2].set_xticklabels(util_counts.index, rotation=45)
    axes[0, 2].set_ylabel('Count')
    axes[0, 2].set_title('Utilization Buckets')

# Seasonal patterns: median price per sale month
if 'sale_month' in featured_df.columns and has_price:
    monthly_median_price = featured_df.groupby('sale_month')['price'].median()
    axes[1, 0].plot(monthly_median_price.index, monthly_median_price.values, marker='o')
    axes[1, 0].set_xlabel('Month')
    axes[1, 0].set_ylabel('Median Price ($)')
    axes[1, 0].set_title('Price Seasonality')
    axes[1, 0].set_xticks(range(1, 13))
    axes[1, 0].grid(True, alpha=0.3)

# Barometer vs Price (down-sampled to keep the scatter readable and fast)
if 'barometer_norm' in featured_df.columns and has_price:
    sample = featured_df.sample(min(SCATTER_SAMPLE_SIZE, len(featured_df)),
                                random_state=SCATTER_SAMPLE_SEED)
    axes[1, 1].scatter(sample['barometer_norm'], sample['price'], alpha=0.3, s=1)
    axes[1, 1].set_xlabel('Normalized Barometer')
    axes[1, 1].set_ylabel('Price ($)')
    axes[1, 1].set_title('Economic Sentiment vs Price')

# Make volume distribution (no median marker on this panel)
if 'log_make_volume' in featured_df.columns:
    _hist_with_median(axes[1, 2], featured_df['log_make_volume'],
                      'Log(Make Volume)', 'Make Volume Distribution')

# Hide any panel that received no artists (its column was missing) instead of
# leaving a blank set of axes in the figure.
for ax in axes.flat:
    if not ax.has_data():
        ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# =============================================================================
# CELL 8: Feature Statistics
# =============================================================================
print("=" * 60)
print("FEATURE STATISTICS")
print("=" * 60)

# Restrict the pipeline's numeric feature list to columns present in the frame.
available_num_features = [f for f in num_features if f in featured_df.columns]

if available_num_features:
    print("\nNumeric features summary:")
    print(featured_df[available_num_features].describe().round(2))

print("\n--- Missing Value Summary ---")
# Per-feature null count and the same count as a percentage of all rows.
missing_summary = featured_df[available_num_features].isnull().sum()
missing_pct = (missing_summary / len(featured_df) * 100).round(1)
missing_df = pd.DataFrame({
    'missing_count': missing_summary,
    'missing_pct': missing_pct
})
# Keep only features that have at least one null, worst offenders first.
missing_df = missing_df[missing_df['missing_count'] > 0].sort_values('missing_count', ascending=False)

if not available_num_features:
    # With nothing to check, "no missing values" would be a misleading claim.
    print("No numeric features available to summarize.")
elif len(missing_df) > 0:
    print(missing_df)
else:
    print("No missing values in numeric features!")


In [None]:
# =============================================================================
# CELL 9: Save Feature-Engineered Dataset and Pipeline
# =============================================================================
# Save featured dataset (the output directory is created if it does not exist)
features_path = Path('../data/features/training_features.parquet')
features_path.parent.mkdir(parents=True, exist_ok=True)

print("Saving feature-engineered dataset...")
featured_df.to_parquet(features_path, index=False)

# Size of the written file on disk, converted from bytes to MiB.
file_size_mb = features_path.stat().st_size / (1024 * 1024)
print(f"✓ Saved to: {features_path}")
print(f"  File size: {file_size_mb:.1f} MB")
print(f"  Records: {len(featured_df):,}")
print(f"  Total columns: {len(featured_df.columns)}")

# Save fitted pipeline (passed as a string path to pipeline.save)
pipeline_path = Path('../models/feature_pipeline.joblib')
pipeline_path.parent.mkdir(parents=True, exist_ok=True)
pipeline.save(str(pipeline_path))

print(f"\n✓ Pipeline saved to: {pipeline_path}")

# Count only the features that actually exist in the saved frame, and derive
# the total from those same counts so the three summary lines always agree.
n_cat_present = len([f for f in cat_features if f in featured_df.columns])
n_num_present = len([f for f in num_features if f in featured_df.columns])

print("\n" + "=" * 60)
print("FEATURE ENGINEERING COMPLETE")
print("=" * 60)
print("\nEngineered Features Summary:")
print(f"  - Categorical features: {n_cat_present}")
print(f"  - Numeric features: {n_num_present}")
print(f"  - Total feature columns: {n_cat_present + n_num_present}")
print("\nNext step: Model training (notebook 04)")
print("  - Train LightGBM FMV model")
print("  - Time-based train/val/test splits")
print("  - Target: <10% MAPE")
