# Ag IQ Data Exploration

This notebook explores 26 years of agricultural equipment auction data plus macroeconomic indicators.

**Goal**: Understand data quality, coverage, and characteristics to inform feature engineering and model training.


In [1]:
# =============================================================================
# CELL 1: Setup
# =============================================================================
import json
import sys
from pathlib import Path

sys.path.insert(0, '..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide pandas display settings: show up to 50 columns and render
# floats with two decimal places.
display_options = {
    'display.max_columns': 50,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("Setup complete")


Setup complete


In [2]:
# =============================================================================
# CELL 2: Load all data
# =============================================================================
# Imported here rather than in the setup cell because it relies on the
# sys.path tweak made there.
from src.data.loaders import load_all_data

# load_all_data returns a mapping of table name -> loaded table.
data = load_all_data('../data/raw')

# Bind each table to its own notebook-level name for the cells below.
(
    auctions,
    barometer,
    diesel,
    el_nino,
    futures,
    makes,
    auctioneers,
) = (
    data[table_name]
    for table_name in (
        'auctions',
        'barometer',
        'diesel',
        'el_nino',
        'futures',
        'makes',
        'auctioneers',
    )
)


LOADING ALL DATA FILES

Found 3 auction result files
  Loading auction_results_part1_of_3_20251227_170743.xlsx...
    → 250,000 rows, 40 columns
  Loading auction_results_part2_of_3_20251227_170824.xlsx...
    → 250,000 rows, 40 columns
  Loading auction_results_part3_of_3_20251227_170743.xlsx...
    → 233,413 rows, 40 columns

Total: 733,413 rows
Loaded ag_barometer: 120 rows
Loaded diesel_prices: 379 rows
Loaded el_nino: 909 rows
Loaded future_prices: 15660 rows
  Symbols: N/A
Loaded makes: 1034 rows
Loaded auctioneers: 2102 rows


In [None]:
# =============================================================================
# CELL 3: Auction Results Overview
# =============================================================================
# Size of the auction table followed by an alphabetical, numbered column list.
section_rule = "=" * 60
print(section_rule)
print("AUCTION RESULTS OVERVIEW")
print(section_rule)

n_records, n_columns = len(auctions), len(auctions.columns)
print(f"\nTotal records: {n_records:,}")
print(f"Columns: {n_columns}")

print("\nColumn names:")
for position, column_name in enumerate(sorted(auctions.columns), start=1):
    print(f"  {position:2}. {column_name}")


In [None]:
# =============================================================================
# CELL 4: Data Types and Coverage
# =============================================================================
# One row per auction column: dtype, non-null / null counts, percentage of
# rows populated, and number of distinct values.
section_rule = "=" * 60
print("\n" + section_rule)
print("DATA TYPES AND COVERAGE")
print(section_rule)

row_total = len(auctions)
non_null_counts = auctions.count()

coverage_df = pd.DataFrame({
    'dtype': auctions.dtypes,
    'non_null': non_null_counts,
    'null': auctions.isna().sum(),
    'coverage_pct': non_null_counts.div(row_total).mul(100).round(1),
    'unique': auctions.nunique(),
})

# to_string() prints every row instead of pandas' truncated repr.
print(coverage_df.to_string())


In [None]:
# =============================================================================
# CELL 5: Key Fields Deep Dive
# =============================================================================
# Field-by-field profile: non-null counts and coverage for each key column,
# summary statistics for numeric fields, and top values for categorical ones.
section_rule = "=" * 60
print("\n" + section_rule)
print("KEY FIELDS ANALYSIS")
print(section_rule)

# Price: statistics are taken over strictly positive prices only.
price = auctions['price']
price_present = price.notna()
print("\n--- PRICE ---")
print(f"Non-null: {price_present.sum():,}")
print(f"Coverage: {price_present.mean()*100:.1f}%")
if price_present.any():
    valid_prices = price[price > 0]
    for stat_label, stat_value in (
        ("Min", valid_prices.min()),
        ("Max", valid_prices.max()),
        ("Mean", valid_prices.mean()),
        ("Median", valid_prices.median()),
    ):
        print(f"{stat_label}: ${stat_value:,.0f}")

# Sold Date: coerced to datetime in place (unparseable values become NaT);
# later cells use the .dt accessor on this column.
print("\n--- SOLD_DATE ---")
auctions['sold_date'] = pd.to_datetime(auctions['sold_date'], errors='coerce')
sold_date = auctions['sold_date']
print(f"Non-null: {sold_date.notna().sum():,}")
print(f"Date range: {sold_date.min()} to {sold_date.max()}")

# Year (model year)
model_year = auctions['year']
print("\n--- YEAR (Model Year) ---")
print(f"Non-null: {model_year.notna().sum():,}")
if model_year.notna().any():
    print(f"Range: {model_year.min():.0f} to {model_year.max():.0f}")

# Hours: statistics are taken over strictly positive readings only.
hours = auctions['hours']
hours_present = hours.notna()
print("\n--- HOURS ---")
print(f"Non-null: {hours_present.sum():,}")
print(f"Coverage: {hours_present.mean()*100:.1f}%")
if hours_present.any():
    valid_hours = hours[hours > 0]
    for stat_label, stat_value in (
        ("Min", valid_hours.min()),
        ("Max", valid_hours.max()),
        ("Mean", valid_hours.mean()),
        ("Median", valid_hours.median()),
    ):
        print(f"{stat_label}: {stat_value:,.0f}")

# Region
region = auctions['region']
print("\n--- REGION ---")
print(f"Non-null: {region.notna().sum():,}")
print(f"Coverage: {region.notna().mean()*100:.1f}%")
print("Distribution:")
print(region.value_counts())

# Make
make_key = auctions['make_key']
print("\n--- MAKE_KEY ---")
print(f"Non-null: {make_key.notna().sum():,}")
print(f"Unique makes: {make_key.nunique()}")
print("\nTop 15 makes:")
print(make_key.value_counts().head(15))

# Condition
raw_condition = auctions['raw_condition']
print("\n--- RAW_CONDITION ---")
print(f"Non-null: {raw_condition.notna().sum():,}")
print(f"Coverage: {raw_condition.notna().mean()*100:.1f}%")
print("Sample values:")
print(raw_condition.value_counts().head(10))

# Category
raw_category = auctions['raw_category']
print("\n--- RAW_CATEGORY ---")
print(f"Non-null: {raw_category.notna().sum():,}")
print("Distribution:")
print(raw_category.value_counts().head(20))


In [None]:
# =============================================================================
# CELL 6: Price Distribution Visualization
# =============================================================================
# Side-by-side histograms of sale price on a linear and a log10 scale.
section_rule = "=" * 60
print("\n" + section_rule)
print("PRICE DISTRIBUTION")
print(section_rule)

# Keep strictly positive prices below $2M for plotting.
all_prices = auctions['price']
valid_prices = auctions[(all_prices > 0) & (all_prices < 2_000_000)]
plot_prices = valid_prices['price']
median_price = plot_prices.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
linear_ax, log_ax = axes

# Raw distribution, with the median marked by a dashed red line.
linear_ax.hist(plot_prices, bins=100, edgecolor='black', alpha=0.7)
linear_ax.set(
    xlabel='Price ($)',
    ylabel='Count',
    title=f'Price Distribution (n={len(valid_prices):,})',
)
linear_ax.axvline(
    median_price,
    color='red',
    linestyle='--',
    label=f"Median: ${median_price:,.0f}",
)
linear_ax.legend()

# Log distribution
log_ax.hist(np.log10(plot_prices), bins=100, edgecolor='black', alpha=0.7)
log_ax.set(
    xlabel='Log10(Price)',
    ylabel='Count',
    title='Price Distribution (Log Scale)',
)

plt.tight_layout()
plt.show()


In [None]:
# =============================================================================
# CELL 7: Sales Over Time
# =============================================================================
# Monthly sales volume and median price (plotted), plus a yearly summary table.
print("\n" + "=" * 60)
print("SALES OVER TIME")
print("=" * 60)

# Coerce locally so this cell does not depend on an earlier cell having
# already converted sold_date to datetime (a no-op if it has).
sold_dates = pd.to_datetime(auctions['sold_date'], errors='coerce')

# Only count real sales: a positive price and a parseable sale date. This
# matches the price > 0 filter used by the regional and make analyses, so
# zero-priced rows no longer drag the medians down. Dropping NaT up front
# also keeps the yearly index as integers instead of floats (2020.0).
sale_mask = (auctions['price'] > 0) & sold_dates.notna()
sale_prices = auctions.loc[sale_mask, 'price']
sale_dates = sold_dates[sale_mask]

# Monthly aggregation
monthly = (
    sale_prices
    .groupby(sale_dates.dt.to_period('M'))
    .agg(['count', 'median'])
    .reset_index()
)
monthly.columns = ['month', 'count', 'median_price']
monthly['month'] = monthly['month'].dt.to_timestamp()

# Filter to reasonable date range
monthly = monthly[monthly['month'] >= '2000-01-01']

fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

# width is in days on a date axis; 20 leaves a gap between monthly bars.
axes[0].bar(monthly['month'], monthly['count'], width=20, alpha=0.7)
axes[0].set_ylabel('Number of Sales')
axes[0].set_title('Sales Volume Over Time')

axes[1].plot(monthly['month'], monthly['median_price'], marker='.', markersize=2)
axes[1].set_ylabel('Median Price ($)')
axes[1].set_title('Median Sale Price Over Time')
axes[1].set_xlabel('Date')

plt.tight_layout()
plt.show()

# Yearly summary
print("\nYearly Summary:")
yearly = (
    sale_prices
    .groupby(sale_dates.dt.year)
    .agg(['count', 'median', 'mean'])
    .round(0)
)
yearly.columns = ['count', 'median_price', 'mean_price']
print(yearly.tail(20))


In [None]:
# =============================================================================
# CELL 8: Regional Analysis
# =============================================================================
# Per-region price statistics (positive prices only) and a box plot of price
# by region.
section_rule = "=" * 60
print("\n" + section_rule)
print("REGIONAL ANALYSIS")
print(section_rule)

positive_price_rows = auctions[auctions['price'] > 0]
regional = (
    positive_price_rows
    .groupby('region')['price']
    .agg(['count', 'mean', 'median', 'std'])
    .round(0)
    .rename(columns={
        'mean': 'mean_price',
        'median': 'median_price',
        'std': 'std_price',
    })
    .sort_values('count', ascending=False)
)
print(regional)

# Regional price box plot: positive prices under $500k with a known region.
listing_price = auctions['price']
boxplot_mask = (
    (listing_price > 0)
    & (listing_price < 500000)
    & auctions['region'].notna()
)
valid_regional = auctions[boxplot_mask]

fig, ax = plt.subplots(figsize=(12, 6))
valid_regional.boxplot(column='price', by='region', ax=ax)
ax.set_title('Price Distribution by Region')
fig.suptitle('')  # Remove the automatic "grouped by" title pandas adds
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# =============================================================================
# CELL 9: Make Analysis
# =============================================================================
# Per-make sale count, mean/median price and median hours, restricted to
# records with a positive price and ordered by sale count.
section_rule = "=" * 60
print("\n" + section_rule)
print("MAKE ANALYSIS")
print(section_rule)

priced_sales = auctions.loc[auctions['price'] > 0]
make_stats = (
    priced_sales
    .groupby('make_key')
    .agg(
        count=('price', 'count'),
        mean_price=('price', 'mean'),
        median_price=('price', 'median'),
        median_hours=('hours', 'median'),
    )
    .round(0)
    .sort_values('count', ascending=False)
)
print("\nTop 20 Makes by Volume:")
print(make_stats.head(20))


In [None]:
# =============================================================================
# CELL 10: Macro Data Overview
# =============================================================================
# Record counts, date ranges and basic summaries for each macro dataset.
section_rule = "=" * 60
print("\n" + section_rule)
print("MACRO DATA OVERVIEW")
print(section_rule)

# Barometer
barometer_dates = barometer['date']
print("\n--- AG ECONOMY BAROMETER ---")
print(f"Records: {len(barometer)}")
print(f"Date range: {barometer_dates.min()} to {barometer_dates.max()}")
print(f"Columns: {barometer.columns.tolist()}")
print(barometer.describe())

# Diesel
diesel_dates = diesel['month_date']
print("\n--- DIESEL PRICES ---")
print(f"Records: {len(diesel)}")
print(f"Date range: {diesel_dates.min()} to {diesel_dates.max()}")
print(diesel.describe())

# El Nino
print("\n--- EL NINO ---")
print(f"Records: {len(el_nino)}")
print(f"Columns: {el_nino.columns.tolist()}")
print("Phase distribution:")
print(el_nino['phase'].value_counts())

# Futures
# NOTE(review): the loader output above reports "Symbols: N/A" for the futures
# file — confirm 'future_symbol' is the actual column name before relying on it.
futures_dates = futures['date']
futures_symbols = futures['future_symbol'].unique().tolist()
print("\n--- FUTURES PRICES ---")
print(f"Records: {len(futures)}")
print(f"Symbols: {futures_symbols}")
print(f"Date range: {futures_dates.min()} to {futures_dates.max()}")


In [None]:
# =============================================================================
# CELL 11: Makes Reference Data
# =============================================================================
# Size, schema and a small sample of the makes reference table.
print("\n" + "=" * 60)
print("MAKES REFERENCE DATA")
print("=" * 60)

print(f"Total makes: {len(makes)}")
print(f"Columns: {makes.columns.tolist()}")
print(f"\nStill in business: {makes['still_in_business'].sum() if 'still_in_business' in makes.columns else 'N/A'}")

# The line above already allows for 'still_in_business' being absent, so the
# sample must not select it (or any other column) unconditionally — that would
# raise KeyError in exactly the case the guard anticipates. Show whichever of
# the expected columns exist and report the rest.
sample_columns = [
    'name',
    'make_key',
    'still_in_business',
    'production_year_start',
    'production_year_end',
]
present_columns = [column for column in sample_columns if column in makes.columns]
missing_columns = [column for column in sample_columns if column not in makes.columns]

print("\nSample:")
if missing_columns:
    print(f"(Columns not present, skipped: {missing_columns})")
print(makes[present_columns].head(10))


In [None]:
# =============================================================================
# CELL 12: Data Quality Summary
# =============================================================================
# Coverage of each key field, then how many records are usable for training
# overall and within the last seven years.
section_rule = "=" * 60
print("\n" + section_rule)
print("DATA QUALITY SUMMARY")
print(section_rule)

total = len(auctions)


def _count_and_share(mask):
    """Format a boolean Series as '<true count> (<percent true>%)'."""
    return f"{mask.sum():,} ({mask.mean()*100:.1f}%)"


# Summary label -> column whose non-null coverage is reported.
presence_checks = {
    'Has Sold Date': 'sold_date',
    'Has Year': 'year',
    'Has Hours': 'hours',
    'Has Make Key': 'make_key',
    'Has Model ID': 'model_id',
    'Has Region': 'region',
    'Has Condition': 'raw_condition',
    'Has Category': 'raw_category',
}

quality_summary = {
    'Total Records': f"{total:,}",
    'Has Price (>0)': _count_and_share(auctions['price'] > 0),
}
quality_summary.update(
    (summary_label, _count_and_share(auctions[column_name].notna()))
    for summary_label, column_name in presence_checks.items()
)

for k, v in quality_summary.items():
    print(f"{k:25} {v}")

# Usable records for training: positive price, known sale date, known make.
usable_mask = (
    (auctions['price'] > 0)
    & auctions['sold_date'].notna()
    & auctions['make_key'].notna()
)
usable = auctions[usable_mask]
print(f"\n{'Usable for Training':25} {len(usable):,} ({len(usable)/total*100:.1f}%)")

# Recent data (last 7 years). The cutoff is relative to the time the cell is
# run, so this figure shifts between runs on the same data.
recent_cutoff = pd.Timestamp.now() - pd.DateOffset(years=7)
recent = usable[usable['sold_date'] >= recent_cutoff]
print(f"{'Usable (Last 7 Years)':25} {len(recent):,} ({len(recent)/total*100:.1f}%)")


In [None]:
# =============================================================================
# CELL 13: Save exploration findings
# =============================================================================
# Headline figures from the exploration. Numeric values are cast to plain
# Python types so the dict serializes to JSON without a custom encoder.
exploration_summary = {
    'total_records': len(auctions),
    'date_range': f"{auctions['sold_date'].min()} to {auctions['sold_date'].max()}",
    'price_coverage': float(auctions['price'].notna().mean()),
    'hours_coverage': float(auctions['hours'].notna().mean()),
    'region_coverage': float(auctions['region'].notna().mean()),
    'unique_makes': int(auctions['make_key'].nunique()),
    'unique_regions': int(auctions['region'].nunique()),
    'usable_records': len(usable),
    'usable_recent_7yr': len(recent),
}

# Actually persist the summary: previously the dict was built and then
# discarded even though the message below claimed it had been saved.
# Written next to the raw data directory, which is known to exist.
summary_path = Path('../data') / 'exploration_summary.json'
with summary_path.open('w', encoding='utf-8') as summary_file:
    json.dump(exploration_summary, summary_file, indent=2)

print("\n" + "=" * 60)
print("EXPLORATION COMPLETE")
print("=" * 60)
print(f"\nKey findings saved to {summary_path}. Proceed to data cleaning notebook.")
