# Data Quality Score (DQS) Pipeline

This notebook automates the complete DQS calculation workflow:
1. Download latest USDOT Motor Carrier Census data
2. Convert to Parquet format
3. Calculate DQS metrics (completeness, validity, timeliness)
4. Run LLM-powered semantic validity checks on a random sample of records
5. Store results in historical tracking CSV

## Setup and Configuration

In [2]:
import os
import json
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np
from sodapy import Socrata
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables
load_dotenv()

print("All packages imported successfully")

All packages imported successfully


In [3]:
# Configuration
DATA_DIR = Path("../data")
RESULTS_DIR = Path("../quality-scores")
HISTORY_FILE = RESULTS_DIR / "dqs_history.csv"

# Identifier for this pipeline run. It is read later when naming the LLM
# results file and when writing the history row, so it must be defined before
# those cells execute on a fresh kernel.
run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Use the newest saved Parquet file when True; otherwise fetch the latest
# dataset from US DOT FMCSA (the download takes roughly 2-3 minutes).
USE_EXISTING_DATA = True

# Socrata API settings
SOCRATA_DOMAIN = "data.transportation.gov"
DATASET_ID = "kjg3-diqy"
FETCH_LIMIT = 5_000_000  # Upper bound on rows requested; set high enough to fetch all records

# DQS Settings
KEY_FIELDS = [
    "dot_number", "legal_name", "dba_name", "carrier_operation",
    "us_mail", "authorized_for_hire", "private_only", "add_date",
    "email_address", "telephone", "oic_state", "phy_street",
    "phy_city", "phy_state", "phy_zip", "mailing_street",
    "mailing_city", "mailing_state", "mailing_zip", "driver_total",
    "nbr_power_unit", "mcs150_date", "mcs150_mileage", "mcs150_mileage_year",
    "recent_mileage", "recent_mileage_year", "vmt_source_id"
]
# Component weights are configurable; initial values are
# completeness=0.4, validity=0.4, timeliness=0.2.
WEIGHTS = {"completeness": 4/10, "validity": 4/10, "timeliness": 2/10}
# The DQS is a weighted average, so the weights have to sum to 1 for the
# result to stay on the same 0-1 scale as its components.
assert abs(sum(WEIGHTS.values()) - 1.0) < 1e-9, "WEIGHTS must sum to 1"

# LLM Settings
LLM_SAMPLE_SIZE = 10   # number of records sent to the model per run
LLM_MODEL = "o3-mini"
LLM_SEED = 27          # used both for sampling records and as the API seed

# Create directories (parents=True so a missing intermediate directory does
# not abort the run)
DATA_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Results directory: {RESULTS_DIR}")
print(f"Key fields tracked: {len(KEY_FIELDS)}")

Data directory: ../data
Results directory: ../quality-scores
Key fields tracked: 27


## Step 1: Download Latest USDOT Data

In [4]:
def save_to_parquet(df, timestamp=None):
    """Write ``df`` to a timestamped, snappy-compressed Parquet file in DATA_DIR.

    Args:
        df: DataFrame to persist (written without its index).
        timestamp: Suffix for the file name; when omitted, the current local
            time formatted as ``YYYYMMDD_HHMMSS`` is used.

    Returns:
        Path of the file that was written.
    """
    stamp = timestamp if timestamp is not None else datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"transportation_data_{stamp}.parquet"
    target = DATA_DIR / filename

    print(f"Saving to Parquet: {filename}...")
    df.to_parquet(target, index=False, compression='snappy')

    # Report the on-disk size in mebibytes alongside the row count.
    size_mb = target.stat().st_size / (1024 * 1024)
    print(f"Saved {len(df):,} records ({size_mb:.1f} MB)")
    return target

def fetch_usdot_data(use_existing_data):
    """Load the census data from a saved Parquet file, or download it from Socrata.

    Args:
        use_existing_data: When truthy and at least one
            ``transportation_data_*.parquet`` file exists in DATA_DIR, the file
            whose name sorts last (i.e. the newest timestamp suffix) is loaded
            instead of downloading.

    Returns:
        Tuple ``(df, path)``: the loaded/downloaded DataFrame and the Parquet
        file it was read from or saved to.
    """
    # Only scan the data directory when a cached file would actually be used.
    if use_existing_data:
        latest_file = max(DATA_DIR.glob("transportation_data_*.parquet"), default=None)
        if latest_file is not None:
            print(f"Using existing data file: {latest_file.name}")
            df = pd.read_parquet(latest_file)
            print(f"Loaded {len(df):,} records with {len(df.columns)} columns")
            return df, latest_file

    # Otherwise fetch the latest dataset
    print(f"Connecting to {SOCRATA_DOMAIN}...")
    # No app token is supplied (second argument is None).
    client = Socrata(SOCRATA_DOMAIN, None)
    try:
        print(f"Downloading dataset {DATASET_ID} (limit={FETCH_LIMIT:,})...")
        print("This will take 2-3 minutes...")
        results = client.get(DATASET_ID, limit=FETCH_LIMIT)
    finally:
        # Release the underlying HTTP session even when the download fails.
        client.close()

    df = pd.DataFrame.from_records(results)
    print(f"Downloaded {len(df):,} records with {len(df.columns)} columns")

    # Save to parquet; save_to_parquet stamps the file with the current time.
    parquet_path = save_to_parquet(df)
    return df, parquet_path

## Step 2: Convert and Save to Parquet

In [5]:
# Load (or download) the census data once; later cells read raw_df and
# parquet_path.
raw_df, parquet_path = fetch_usdot_data(use_existing_data=USE_EXISTING_DATA)
print(f"\nFile location: {parquet_path}")
# Show only the first ten column names to keep the output short.
print(f"\nSample columns: {raw_df.columns[:10].tolist()}...")

Using existing data file: transportation_data_20251013_135544.parquet
Loaded 2,091,643 records with 42 columns

File location: ../data/transportation_data_20251013_135544.parquet

Sample columns: ['dot_number', 'legal_name', 'dba_name', 'carrier_operation', 'hm_flag', 'pc_flag', 'phy_street', 'phy_city', 'phy_state', 'phy_zip']...


## Step 3: Calculate DQS Metrics

Calculate completeness, structural validity, and timeliness scores.

In [6]:
def calc_completeness(df, key_fields=None):
    """Average share of non-missing values across the selected columns.

    A value counts as missing when it is NaN/None, or when its string form,
    after trimming surrounding whitespace and lower-casing, is one of the
    placeholder tokens below. Normalising first means whitespace runs of any
    length, tabs, and case variants such as "N/a" or "Null" are all caught.

    Args:
        df: DataFrame to assess.
        key_fields: Column names to include; defaults to every column.

    Returns:
        Completeness in [0, 1] (mean over columns of each column's non-missing
        rate). Returns 0.0 when there are no rows or no columns to assess,
        matching the fallback used by the other metric functions.
    """
    # Placeholder strings, already stripped and lower-cased.
    missing_tokens = {"", "na", "n/a", "na.", "none", "null", "nan",
                      "missing", "-", "--"}

    # Use all columns if key_fields not specified
    key_fields = df.columns.tolist() if key_fields is None else list(key_fields)

    # Nothing to measure: avoid returning NaN from a mean over no data.
    if len(df) == 0 or len(key_fields) == 0:
        return 0.0

    def missing_rate(col):
        # Cast to str first so numeric/mixed columns can be compared safely.
        normalized = col.astype(str).str.strip().str.lower()
        return (col.isna() | normalized.isin(missing_tokens)).mean()

    # Per-column missing rate, then the unweighted average across columns.
    return 1 - df[key_fields].apply(missing_rate).mean()

def calc_structural_validity(df):
    """Average pass rate of the format checks that apply to ``df``.

    Each check is run only when its column is present. Regex checks cover
    telephone, ZIP code, e-mail, USDOT number, MCS-150 date and state
    abbreviation; the two count columns must parse as non-negative whole
    numbers. Missing values fail every check.

    Returns:
        Mean of the per-check pass rates, or 0.0 when no check applies.
    """
    # (column, pattern, replace missing values with "" before matching)
    pattern_checks = (
        # Telephone numbers
        ("telephone", r"^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$", True),
        # US ZIP codes (5-digit or 5+4 format)
        ("phy_zip", r"^\d{5}(?:-\d{4})?$", True),
        # Email
        ("email_address", r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", True),
        # USDOT number (6-8 digits); cast straight to str without filling
        ("dot_number", r"^\d{6,8}$", False),
        # Date column format (DD-MON-YY)
        ("mcs150_date", r"^\d{2}-[A-Z]{3}-\d{2}$", True),
        # State abbreviation (exactly two uppercase letters)
        ("phy_state", r"^[A-Z]{2}$", True),
    )

    pass_rates = []
    for column, pattern, blank_missing in pattern_checks:
        if column not in df.columns:
            continue
        values = df[column].fillna("") if blank_missing else df[column]
        pass_rates.append(values.astype(str).str.match(pattern).mean())

    # Count columns: must be numeric, non-negative and without a fractional part.
    for column in ("nbr_power_unit", "driver_total"):
        if column not in df.columns:
            continue
        # Anything that cannot be parsed as a number becomes NaN and fails.
        numbers = pd.to_numeric(df[column], errors='coerce')
        passes = numbers.notna() & (numbers >= 0) & (numbers == np.floor(numbers))
        pass_rates.append(passes.mean())

    if not pass_rates:
        return 0.0
    return np.mean(pass_rates)

def _date_age_score(values, reference_date, horizon_days):
    """Score a DD-MON-YY date column by its median age; None if nothing parses."""
    parsed = pd.to_datetime(values, format="%d-%b-%y", errors="coerce").dropna()
    if parsed.empty:
        return None
    # Median rather than mean so a few very old records do not dominate.
    median_age_days = (reference_date - parsed).dt.days.median()
    # 1.0 at zero age, falling linearly to 0.0 at horizon_days and beyond.
    return np.clip(1 - (median_age_days / horizon_days), 0, 1)


def _year_age_score(values, current_year, horizon_years):
    """Score a year column by its median distance from current_year; None if empty."""
    years = pd.to_numeric(values, errors="coerce")
    # "> 0" drops NaN as well as 0, which is treated as "no year recorded".
    valid_years = years[years > 0]
    if valid_years.empty:
        return None
    median_year_diff = (current_year - valid_years).median()
    return np.clip(1 - (median_year_diff / horizon_years), 0, 1)


def calc_freshness(df, reference_date=None):
    """Weighted freshness score built from up to four date/year columns.

    Sources, scoring horizon and weight (each used only if the column exists
    and has at least one usable value):
        mcs150_date          730 days   0.50
        mcs150_mileage_year  3 years    0.25
        recent_mileage_year  3 years    0.15
        add_date             3650 days  0.10

    Each source scores 1 - median_age / horizon, clipped to [0, 1]. The final
    value is the weighted average over the sources that produced a score, so
    the weights are renormalised when some sources are unavailable.

    Args:
        df: DataFrame to assess.
        reference_date: Date ages are measured against. Anything
            ``pd.Timestamp`` accepts (Timestamp, datetime, ISO string);
            defaults to the current local time.

    Returns:
        Freshness in [0, 1], or 0.0 when no source is usable.
    """
    # Coerce so callers can pass strings or datetimes, not only Timestamps.
    reference_date = pd.Timestamp(
        datetime.today() if reference_date is None else reference_date
    )
    current_year = reference_date.year

    # (column, scorer, reference point, horizon, weight), in priority order.
    sources = (
        ("mcs150_date", _date_age_score, reference_date, 730, 0.5),
        ("mcs150_mileage_year", _year_age_score, current_year, 3, 0.25),
        ("recent_mileage_year", _year_age_score, current_year, 3, 0.15),
        ("add_date", _date_age_score, reference_date, 3650, 0.1),
    )

    weighted_scores = []
    for column, scorer, reference, horizon, weight in sources:
        if column not in df.columns:
            continue
        score = scorer(df[column], reference, horizon)
        if score is not None:
            weighted_scores.append((score, weight))

    # Calculate weighted average over whichever sources contributed.
    if weighted_scores:
        total_score = sum(score * weight for score, weight in weighted_scores)
        total_weight = sum(weight for _, weight in weighted_scores)
        return total_score / total_weight

    return 0.0


def calc_dqs(completeness, validity, timeliness, weights=WEIGHTS):
    """Combine the three component scores into the overall DQS.

    Args:
        completeness, validity, timeliness: Component scores.
        weights: Mapping with "completeness", "validity" and "timeliness"
            keys giving each component's weight; defaults to WEIGHTS.

    Returns:
        The weighted sum of the three components.
    """
    completeness_part = weights["completeness"] * completeness
    validity_part = weights["validity"] * validity
    timeliness_part = weights["timeliness"] * timeliness
    return completeness_part + validity_part + timeliness_part

print("DQS calculation functions defined")

DQS calculation functions defined


In [7]:
# Calculate DQS metrics
print("Calculating DQS metrics...\n")

# Compute every component first, then report them together.
# NOTE(review): KEY_FIELDS is defined in the configuration cell but is not
# passed here, so completeness is measured over all columns - confirm that
# this is intended.
completeness_score = calc_completeness(raw_df)
validity_score = calc_structural_validity(raw_df)
freshness_score = calc_freshness(raw_df)

# Base score uses structural validity only; the LLM-based semantic score is
# blended in by a later cell.
base_dqs = calc_dqs(completeness_score, validity_score, freshness_score)

print(f"Completeness: {completeness_score:.3f}")
print(f"Structural Validity: {validity_score:.3f}")
print(f"Freshness: {freshness_score:.3f}")
print(f"\nBase DQS: {base_dqs:.3f}")

Calculating DQS metrics...

Completeness: 0.924
Structural Validity: 0.952
Freshness: 0.254

Base DQS: 0.801


## Step 4: LLM-Powered Semantic Validity Check

Use OpenAI's o3-mini model to assess semantic validity on a sample of records.

**Note:** This step costs approximately $0.20 per 100 records.

In [56]:
def _parse_llm_records(response_text):
    """Convert the model's reply into a list of per-record dicts.

    Accepts a bare JSON array, a reply wrapped in a Markdown code fence, or a
    JSON object (either a single record, or an object holding the array under
    one key). Raises ValueError (json.JSONDecodeError is a subclass) when the
    reply cannot be interpreted.
    """
    text = (response_text or "").strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()

    parsed = json.loads(text)
    if isinstance(parsed, dict):
        list_values = [value for value in parsed.values() if isinstance(value, list)]
        parsed = list_values[0] if len(list_values) == 1 else [parsed]
    if not isinstance(parsed, list):
        raise ValueError("expected a JSON array of record assessments")
    return [item for item in parsed if isinstance(item, dict)]


def llm_validity_check(df, sample_size=LLM_SAMPLE_SIZE, run_timestamp=None):
    """Run LLM-powered semantic validity check on sample.

    Draws a seeded random sample from ``df``, asks the configured OpenAI model
    to score each record, and writes the per-record assessments to
    ``RESULTS_DIR / llm_validity_<run_timestamp>.csv``.

    NOTE: every column of each sampled record is included in the prompt, so
    contact details present in the data are sent to the OpenAI API.

    Args:
        df: DataFrame to sample from.
        sample_size: Maximum number of records to send.
        run_timestamp: Suffix for the results file name; the current local
            time (``YYYYMMDD_HHMMSS``) is used when omitted.

    Returns:
        ``(average_score, results_df)``, or ``(None, None)`` when the check is
        skipped or fails (missing API key, no records, API error, unparseable
        reply). The check is best-effort: failures are printed, not raised.
    """
    print(f"Running LLM validity check on {sample_size} records...")

    # Check for API key
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Warning: OPENAI_API_KEY not found in .env file")
        print("Skipping LLM validity check")
        return None, None

    # Random sample (seeded so repeated runs assess the same records)
    sample_df = df.sample(n=min(sample_size, len(df)), random_state=LLM_SEED)
    if sample_df.empty:
        print("No records available to sample")
        print("Skipping LLM validity check")
        return None, None

    client = OpenAI(api_key=api_key)

    # System prompt
    system_prompt = """You are a data quality analyst evaluating the VALIDITY of trucking company data records.

For each record, assess the **trustworthiness** of the data using these criteria:
- Company name plausibility (typos, placeholders, fake text like "ABC Company")
- Plausibility of metrics (e.g., 150,000 miles/truck/year is reasonable; 850,000 is not)
- Consistency between related fields (e.g., 2 drivers but 50 trucks is inconsistent)
- Format issues (emails, phone numbers, or addresses that look invalid)

Return your judgment as structured JSON with the following schema:
[
  {
    "record_id": <index or dot_number>,
    "validity_score": <float between 0 and 1>,
    "issues": "<short bullet summary of detected problems or 'None'>",
    "summary_comment": "<2-sentence human-readable summary>"
  }
]

Scoring guidelines:
- 1.0 = fully valid, realistic, consistent
- 0.5 = minor issues or plausible but slightly off
- 0.0 = clearly invalid, placeholder, or implausible

Output only valid JSON.
"""

    # Construct user prompt: one record (as a Python dict repr) per line
    records = sample_df.to_dict(orient="records")
    user_prompt = "Evaluate the following records for data validity:\n\n" + "".join(
        f"{rec}\n" for rec in records
    )

    # Call OpenAI API
    print("   Sending request to OpenAI API...")
    try:
        response = client.chat.completions.create(
            model=LLM_MODEL,
            seed=LLM_SEED,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        response_text = response.choices[0].message.content
    except Exception as e:
        print(f"Error during LLM check: {e}")
        return None, None

    # Parse JSON response
    try:
        results = _parse_llm_records(response_text)
    except ValueError:
        print("LLM did not return valid JSON")
        return None, None

    if not results:
        print("LLM returned no record assessments")
        return None, None

    # Calculate average semantic validity score. A missing or non-numeric
    # score counts as 0, and values are clamped to the documented 0-1 range.
    validity_scores = pd.to_numeric(
        pd.Series([r.get("validity_score", 0) for r in results]),
        errors="coerce",
    ).fillna(0).clip(0, 1)
    avg_semantic_validity = validity_scores.mean()

    print("LLM analysis complete")
    print(f"Average semantic validity: {avg_semantic_validity:.3f}")

    # Save detailed results. This is kept outside the API error handling so a
    # naming or disk problem cannot discard a score that was already obtained.
    results_df = pd.DataFrame(results)
    if run_timestamp is None:
        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = RESULTS_DIR / f"llm_validity_{run_timestamp}.csv"
    try:
        results_df.to_csv(output_path, index=False)
        print(f"Detailed results saved to: {output_path.name}")
    except OSError as e:
        print(f"Warning: could not save detailed results: {e}")

    return avg_semantic_validity, results_df

# Run LLM check. Reuse the notebook-level run_timestamp when an earlier cell
# defined one so the file name matches the history row; otherwise the function
# stamps the file with the current time.
semantic_validity_score, llm_results = llm_validity_check(
    raw_df, run_timestamp=globals().get("run_timestamp")
)

Running LLM validity check on 10 records...
   Sending request to OpenAI API...
LLM analysis complete
Average semantic validity: 0.750
Detailed results saved to: llm_validity_20251013_115227.csv


## Step 5: Calculate Final DQS and Save Results

In [58]:
# Calculate final DQS (blend structural and semantic validity if available)
if semantic_validity_score is None:
    # The LLM check was skipped or failed: fall back to structural validity.
    combined_validity = validity_score
    print(f"\nUsing Structural Validity Only: {combined_validity:.3f}")
else:
    # Equal-weight average of the structural and semantic scores.
    combined_validity = (validity_score + semantic_validity_score) / 2
    print(f"\nCombined Validity Score: {combined_validity:.3f}")
    print(f"   (Structural: {validity_score:.3f} + Semantic: {semantic_validity_score:.3f})")

# Calculate final DQS
final_dqs = calc_dqs(completeness_score, combined_validity, freshness_score)

rule = "=" * 50
print(f"\n{rule}")
print(f"FINAL DATA QUALITY SCORE (DQS): {final_dqs:.3f}")
print(f"{rule}")
print(f"   Completeness:  {completeness_score:.3f}")
print(f"   Validity:      {combined_validity:.3f}")
print(f"   Timeliness:    {freshness_score:.3f}")


Combined Validity Score: 0.851
   (Structural: 0.952 + Semantic: 0.750)

FINAL DATA QUALITY SCORE (DQS): 0.761
   Completeness:  0.924
   Validity:      0.851
   Timeliness:    0.254


## Step 6: Update Historical Tracking

In [59]:
def append_to_history(results_dict, filepath=HISTORY_FILE):
    """Append current run results to historical tracking CSV.

    Args:
        results_dict: Mapping of column name to value for the current run.
        filepath: History CSV location (``str`` or ``Path``); defaults to
            HISTORY_FILE.

    Returns:
        DataFrame with the full history, including the row just added.
    """
    # Accept plain strings as well as Path objects.
    filepath = Path(filepath)

    # Create DataFrame for current run
    current_run = pd.DataFrame([results_dict])

    # Load existing history if it exists. A zero-byte file is treated as "no
    # history yet" because read_csv cannot parse a file without a header.
    has_history = filepath.exists() and filepath.stat().st_size > 0
    if has_history:
        history_df = pd.read_csv(filepath)
        updated_df = pd.concat([history_df, current_run], ignore_index=True)
        print(f"Appended to existing history ({len(history_df)} previous runs)")
    else:
        # Make sure the target directory exists before the first write.
        filepath.parent.mkdir(parents=True, exist_ok=True)
        updated_df = current_run
        print(f"Created new history file")

    # Save updated history (the whole file is rewritten each run)
    updated_df.to_csv(filepath, index=False)
    print(f"Saved to: {filepath}")

    return updated_df

# Prepare results dictionary

# Identifier for this run. Reuse the value from an earlier cell when one was
# defined; otherwise create it here so this cell also works on a fresh kernel.
run_timestamp = globals().get("run_timestamp") or datetime.now().strftime("%Y%m%d_%H%M%S")

# Compare against None explicitly: a semantic score of 0.0 is a real result
# and must not be recorded as "LLM check not run".
has_semantic_score = semantic_validity_score is not None

results = {
    "timestamp": run_timestamp,
    "run_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "record_count": len(raw_df),
    "completeness": round(completeness_score, 4),
    "structural_validity": round(validity_score, 4),
    "semantic_validity": round(semantic_validity_score, 4) if has_semantic_score else None,
    "combined_validity": round(combined_validity, 4),
    # The timeliness component is the freshness score computed in Step 3.
    "timeliness": round(freshness_score, 4),
    "final_dqs": round(final_dqs, 4),
    "parquet_file": parquet_path.name,
    "llm_sample_size": LLM_SAMPLE_SIZE if has_semantic_score else 0
}

# Save to history
print("\nSaving results to historical tracking...")
history_df = append_to_history(results)

print("\nHistorical DQS Trend:")
print(history_df[["run_date", "final_dqs", "completeness", "combined_validity", "timeliness"]].tail(5))


Saving results to historical tracking...
Appended to existing history (2 previous runs)
Saved to: ../quality-scores/dqs_history.csv

Historical DQS Trend:
              run_date  final_dqs  completeness  combined_validity  timeliness
0  2025-10-13 11:54:44     0.7097        1.0000             0.6416      0.2656
1  2025-10-13 13:53:09     0.7549        0.9244             0.8359      0.2539
2  2025-10-13 13:57:54     0.7609        0.9244             0.8509      0.2539


## Summary

Pipeline execution complete! Key artifacts generated:
- **Parquet file**: Latest dataset in optimized format
- **DQS history**: Cumulative tracking of all runs
- **LLM results**: Detailed semantic validity analysis (if run)

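To re-run this pipeline on fresh data, set `USE_EXISTING_DATA = False` in the configuration cell (otherwise the most recent saved Parquet file is reused), then execute all cells again.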

In [60]:
# Closing banner summarising the artefacts produced by this run.
banner = "=" * 60
print("\n" + banner)
print("DQS PIPELINE EXECUTION COMPLETE")
print(banner)
print(f"\nFinal DQS: {final_dqs:.3f}")
print(f"Data saved: {parquet_path.name}")
print(f"History file: {HISTORY_FILE.name}")
print(f"Run completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\nTo re-run for new data, execute all cells again.")


DQS PIPELINE EXECUTION COMPLETE

Final DQS: 0.761
Data saved: transportation_data_20251013_135544.parquet
History file: dqs_history.csv
Run completed: 2025-10-13 13:58:00

To re-run for new data, execute all cells again.
