# Ag IQ FMV Model Training

This notebook trains the Fair Market Value (FMV) prediction model using LightGBM gradient boosting.

**Goal**: Build a high-accuracy valuation model achieving:
- **MAPE < 10%** (Mean Absolute Percentage Error)
- **R² > 0.85** (Variance explained)
- **RMSE < $15,000** (Root Mean Squared Error)

**Approach**:
- Time-based train/val/test splits (70/15/15)
- LightGBM with early stopping
- Feature importance analysis
- Comprehensive performance evaluation


In [4]:
# =============================================================================
# CELL 1: Setup
# =============================================================================
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.2f}'.format)

print("Setup complete")


Setup complete


In [5]:
# =============================================================================
# CELL 2: Load Processed Data
# =============================================================================
print("Loading processed data...")
training_df = pd.read_parquet('../data/processed/training_data.parquet')

print(f"Loaded: {len(training_df):,} records")
print(f"Date range: {training_df['sold_date'].min().date()} to {training_df['sold_date'].max().date()}")
print(f"Price range: ${training_df['price'].min():,.0f} to ${training_df['price'].max():,.0f}")


Loading processed data...


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/training_data.parquet'

In [None]:
# =============================================================================
# CELL 3: Initialize and Train Model
# =============================================================================
from src.models.fmv import FMVModel
from src.features.pipeline import FeaturePipeline

# Initialize model
model = FMVModel()

# Train model
# This will:
# - Create time-based splits (70% train, 15% val, 15% test)
# - Fit feature pipeline on training data
# - Train LightGBM with early stopping
# - Evaluate on validation and test sets
metrics = model.train(training_df)


In [None]:
# =============================================================================
# CELL 4: Detailed Performance Analysis
# =============================================================================
print("=" * 60)
print("PERFORMANCE SUMMARY")
print("=" * 60)

print(f"\nModel trained at: {model.metadata['trained_at']}")
print(f"Best iteration: {model.metadata['best_iteration']}")

print(f"\nDataset sizes:")
print(f"  Training:   {model.metadata['n_train']:,} records")
print(f"  Validation: {model.metadata['n_val']:,} records")
print(f"  Test:       {model.metadata['n_test']:,} records")

print(f"\n--- Validation Set Performance ---")
print(f"  RMSE:  ${metrics['val_rmse']:,.0f}")
print(f"  MAPE:  {metrics['val_mape']:.2f}%")
print(f"  R²:    {metrics['val_r2']:.4f}")

print(f"\n--- Test Set Performance ---")
print(f"  RMSE:  ${metrics['test_rmse']:,.0f}")
print(f"  MAPE:  {metrics['test_mape']:.2f}%")
print(f"  R²:    {metrics['test_r2']:.4f}")

# Check if we met our goals
print(f"\n--- Goal Achievement ---")
print(f"  MAPE < 10%:      {'✓ YES' if metrics['test_mape'] < 10 else '✗ NO'} ({metrics['test_mape']:.2f}%)")
print(f"  R² > 0.85:       {'✓ YES' if metrics['test_r2'] > 0.85 else '✗ NO'} ({metrics['test_r2']:.4f})")
print(f"  RMSE < $15,000:  {'✓ YES' if metrics['test_rmse'] < 15000 else '✗ NO'} (${metrics['test_rmse']:,.0f})")


In [None]:
# =============================================================================
# CELL 5: Feature Importance Analysis
# =============================================================================
print("\n" + "=" * 60)
print("FEATURE IMPORTANCE")
print("=" * 60)

# Get feature importance
importance_df = model.feature_importance(importance_type='gain')

print(f"\nTop 20 Most Important Features:")
print(importance_df.head(20).to_string(index=False))

# Visualize
fig, ax = plt.subplots(figsize=(10, 8))
top_features = importance_df.head(20)
ax.barh(range(len(top_features)), top_features['importance'])
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features['feature'])
ax.invert_yaxis()
ax.set_xlabel('Importance (Gain)')
ax.set_title('Top 20 Feature Importance')
plt.tight_layout()
plt.show()


In [None]:
# =============================================================================
# CELL 6: Prediction Analysis on Test Set
# =============================================================================
# Get test set predictions
test_start_idx = int(len(training_df) * 0.85)
test_df = training_df.iloc[test_start_idx:].copy()

test_predictions = model.predict(test_df)
test_df['predicted_price'] = test_predictions
test_df['error'] = test_df['price'] - test_df['predicted_price']
test_df['abs_error'] = test_df['error'].abs()
test_df['pct_error'] = (test_df['error'] / test_df['price'] * 100).abs()

print("=" * 60)
print("PREDICTION ANALYSIS (Test Set)")
print("=" * 60)

print(f"\nPrediction Statistics:")
print(f"  Mean actual price:     ${test_df['price'].mean():,.0f}")
print(f"  Mean predicted price:  ${test_df['predicted_price'].mean():,.0f}")
print(f"  Mean absolute error:   ${test_df['abs_error'].mean():,.0f}")
print(f"  Median absolute error: ${test_df['abs_error'].median():,.0f}")
print(f"  Mean % error:          {test_df['pct_error'].mean():.2f}%")
print(f"  Median % error:        {test_df['pct_error'].median():.2f}%")

print(f"\nError Distribution:")
print(f"  Within ±5%:   {(test_df['pct_error'] <= 5).sum():,} ({(test_df['pct_error'] <= 5).mean()*100:.1f}%)")
print(f"  Within ±10%:  {(test_df['pct_error'] <= 10).sum():,} ({(test_df['pct_error'] <= 10).mean()*100:.1f}%)")
print(f"  Within ±20%:  {(test_df['pct_error'] <= 20).sum():,} ({(test_df['pct_error'] <= 20).mean()*100:.1f}%)")


In [None]:
# =============================================================================
# CELL 7: Visualize Predictions
# =============================================================================
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Actual vs Predicted scatter
sample = test_df.sample(min(5000, len(test_df)))
axes[0, 0].scatter(sample['price'], sample['predicted_price'], alpha=0.3, s=1)
axes[0, 0].plot([sample['price'].min(), sample['price'].max()], 
                [sample['price'].min(), sample['price'].max()], 
                'r--', linewidth=2, label='Perfect prediction')
axes[0, 0].set_xlabel('Actual Price ($)')
axes[0, 0].set_ylabel('Predicted Price ($)')
axes[0, 0].set_title('Actual vs Predicted Prices')
axes[0, 0].legend()

# Residual plot
axes[0, 1].scatter(sample['predicted_price'], sample['error'], alpha=0.3, s=1)
axes[0, 1].axhline(y=0, color='r', linestyle='--', linewidth=2)
axes[0, 1].set_xlabel('Predicted Price ($)')
axes[0, 1].set_ylabel('Error (Actual - Predicted) ($)')
axes[0, 1].set_title('Residual Plot')

# Error distribution
axes[1, 0].hist(test_df['pct_error'], bins=100, edgecolor='black', alpha=0.7)
axes[1, 0].axvline(test_df['pct_error'].median(), color='red', linestyle='--',
                   label=f"Median: {test_df['pct_error'].median():.2f}%")
axes[1, 0].set_xlabel('Absolute Percentage Error (%)')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Error Distribution')
axes[1, 0].legend()
axes[1, 0].set_xlim(0, 50)  # Focus on 0-50% errors

# Cumulative error distribution
sorted_errors = np.sort(test_df['pct_error'])
cumulative = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors) * 100
axes[1, 1].plot(sorted_errors, cumulative)
axes[1, 1].axvline(10, color='red', linestyle='--', label='10% threshold')
axes[1, 1].set_xlabel('Absolute Percentage Error (%)')
axes[1, 1].set_ylabel('Cumulative Percentage of Predictions')
axes[1, 1].set_title('Cumulative Error Distribution')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)
axes[1, 1].set_xlim(0, 30)

plt.tight_layout()
plt.show()


In [None]:
# =============================================================================
# CELL 8: Save Trained Model
# =============================================================================
model_path = Path('../models/fmv_model')
model.save(str(model_path))

print("\n" + "=" * 60)
print("MODEL TRAINING COMPLETE")
print("=" * 60)

print(f"\n✓ Model saved to: {model_path}")
print(f"  - model.lgb (LightGBM model)")
print(f"  - pipeline.joblib (Feature pipeline)")
print(f"  - metadata.json (Training info & metrics)")

print(f"\nFinal Test Set Performance:")
print(f"  MAPE:  {metrics['test_mape']:.2f}% {'✓ GOAL MET' if metrics['test_mape'] < 10 else '✗ Below target'}")
print(f"  R²:    {metrics['test_r2']:.4f} {'✓ GOAL MET' if metrics['test_r2'] > 0.85 else '✗ Below target'}")
print(f"  RMSE:  ${metrics['test_rmse']:,.0f} {'✓ GOAL MET' if metrics['test_rmse'] < 15000 else '✗ Below target'}")

print(f"\nNext steps:")
print("  - Phase 5: Train Future FMV model (forward-looking predictions)")
print("  - Phase 6: Build Streamlit interface for interactive predictions")
