# ✅ Data Quality Report

**Purpose:** Comprehensive data quality assessment using industry standards.

**Quality Dimensions:**
1. **Completeness** - Are all required fields populated?
2. **Accuracy** - Are values within expected ranges?
3. **Consistency** - Are values consistent across records?
4. **Timeliness** - Is the data current?
5. **Validity** - Do values conform to business rules?

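*This demonstrates understanding of data governance principles.*

*Scope note: the sections below implement checks for completeness, accuracy (value ranges), consistency, and shipment-ID uniqueness, then combine them into an overall score. No timeliness check appears in these sections.*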

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Read the processed shipments dataset that the quality checks below operate on.
processed_path = '../data/processed/shipments_processed.parquet'
df = pd.read_parquet(processed_path)
print(f"‚úÖ Loaded {len(df):,} records for quality assessment")

## 1️⃣ Completeness Check

In [None]:
# Completeness: share of non-null values in each column, as a percentage.
row_count = len(df)
completeness = {
    column: df[column].notna().sum() / row_count * 100
    for column in df.columns
}

# One row per source column, sorted ascending by completeness.
completeness_df = pd.DataFrame.from_dict(
    completeness, orient='index', columns=['Completeness %']
).sort_values('Completeness %')

# Horizontal bar per column, coloured on a fixed 0-100 red-to-green scale.
fig = px.bar(
    completeness_df,
    x='Completeness %',
    y=completeness_df.index,
    orientation='h',
    color='Completeness %',
    color_continuous_scale='RdYlGn',
    range_color=[0, 100],
    title='üìä Data Completeness by Column',
)

# Dashed marker at 95%, the same level the EXCELLENT rating below requires.
fig.add_vline(x=95, line_dash="dash", line_color="red", annotation_text="95% Threshold")
fig.update_layout(height=600)
fig.show()

# Overall score is the unweighted mean of the per-column percentages.
avg_completeness = completeness_df['Completeness %'].mean()
print(f"\nüìà Overall Completeness Score: {avg_completeness:.2f}%")

# Rating bands, highest first; a score that reaches no band is rated POOR.
rating_bands = (
    (95, "‚úÖ EXCELLENT - Data meets quality standards"),
    (85, "‚ö†Ô∏è GOOD - Minor improvements needed"),
)
print(next(
    (label for floor, label in rating_bands if avg_completeness >= floor),
    "‚ùå POOR - Significant data quality issues",
))

## 2️⃣ Accuracy Check (Value Ranges)

In [None]:
# Expected inclusive value ranges per column (business rules).
# The transit_days rule is only defined when the dataset has that column;
# otherwise its entry is None and is skipped by the check below.
transit_rule = None
if 'transit_days' in df.columns:
    transit_rule = {'min': 0, 'max': 365, 'type': 'numeric'}

validation_rules = {
    'cost': {'min': 0, 'max': 100000, 'type': 'numeric'},
    'weight': {'min': 0, 'max': 100000, 'type': 'numeric'},
    'transit_days': transit_rule,
}

# Count out-of-range values for every rule whose column is actually present.
violations = {
    col: {
        'below_min': (df[col] < rules['min']).sum(),
        'above_max': (df[col] > rules['max']).sum(),
    }
    for col, rules in validation_rules.items()
    if rules and col in df.columns
}

# One row per checked column, with below_min / above_max counts as columns.
violations_df = pd.DataFrame(violations).transpose()
print("üîç Range Validation Results:")
print(violations_df)

# Grand total across all checked columns and both bounds.
total_violations = violations_df.sum().sum()
print(
    "\n‚úÖ No range violations detected"
    if total_violations == 0
    else f"\n‚ö†Ô∏è Found {int(total_violations)} range violations"
)

## 3️⃣ Consistency Check

In [None]:
# Cross-field consistency rules. Each rule that finds offending rows appends
# one message to consistency_issues; a rule is skipped when the columns it
# needs are not in the dataset.
consistency_issues = []

# Rule 1: Arrival date must not be earlier than shipment date
# (same-day arrival passes, because the comparison is strict).
if 'shipment_date' in df.columns and 'arrival_date' in df.columns:
    invalid_dates = df[df['arrival_date'] < df['shipment_date']]
    if len(invalid_dates) > 0:
        consistency_issues.append(f"‚ö†Ô∏è {len(invalid_dates)} records with arrival before shipment")

# Rule 2: Cost per kg should be positive (zero is flagged as well)
if 'cost_per_kg' in df.columns:
    invalid_cost = df[df['cost_per_kg'] <= 0]
    if len(invalid_cost) > 0:
        consistency_issues.append(f"‚ö†Ô∏è {len(invalid_cost)} records with invalid cost per kg")

# Rule 3: Transit days should match date difference
if all(col in df.columns for col in ['shipment_date', 'arrival_date', 'transit_days']):
    # NOTE: this adds a 'calculated_transit' column to df itself; it is kept
    # on the frame so anything that reads it afterwards still finds it.
    df['calculated_transit'] = (df['arrival_date'] - df['shipment_date']).dt.days

    # Only compare rows where both values are present. NaN != NaN evaluates
    # to True, so without this mask every row with a missing date or a missing
    # transit_days would be reported as a mismatch even though there is
    # nothing to compare.
    comparable = df['transit_days'].notna() & df['calculated_transit'].notna()
    mismatched = df[comparable & (df['transit_days'] != df['calculated_transit'])]
    if len(mismatched) > 0:
        consistency_issues.append(f"‚ö†Ô∏è {len(mismatched)} records with mismatched transit days")

# Display results
print("üîç Consistency Check Results:")
print("=" * 60)
if consistency_issues:
    for issue in consistency_issues:
        print(issue)
else:
    print("‚úÖ No consistency issues detected")

## 4️⃣ Uniqueness Check

In [None]:
# Uniqueness of shipment_id: a value counts as a duplicate when it repeats one
# seen in an earlier row. Skipped entirely when the column is absent.
if 'shipment_id' in df.columns:
    repeated_id_mask = df['shipment_id'].duplicated()
    duplicates = repeated_id_mask.sum()
    unique_pct = 100 * (1 - duplicates / len(df))

    print(f"üîë Shipment ID Uniqueness: {unique_pct:.2f}%")

    if duplicates:
        print(f"‚ö†Ô∏è Found {duplicates} duplicate shipment IDs")
    else:
        print("‚úÖ All shipment IDs are unique")

## 5️⃣ Overall Quality Score

In [None]:
# Combine the results of the earlier checks into one score per dimension.
scores = {
    'Completeness': avg_completeness,
    # unique_pct only exists when the shipment_id column was present.
    'Uniqueness': unique_pct if 'shipment_id' in df.columns else 100,
    # total_violations is summed over every validated column and both bounds,
    # so the ratio to the row count can exceed 100%; the clamp below keeps the
    # score from going negative.
    'Accuracy': 100 - (total_violations / len(df) * 100) if total_violations > 0 else 100,
    'Consistency': 100 - (len(consistency_issues) * 5)  # -5 points per issue type
}

# Ensure scores are between 0-100.
# A NaN score (e.g. from a 0/0 on an empty dataset) must not pass as perfect:
# min(100, nan) returns 100 because every comparison with NaN is False, so NaN
# is mapped to 0 explicitly before clamping.
scores = {
    k: 0 if pd.isna(v) else max(0, min(100, v))
    for k, v in scores.items()
}

# Overall score is the unweighted mean of the dimension scores.
overall_score = sum(scores.values()) / len(scores)

# Visualize quality dimensions as a filled radar (polar) chart
fig = go.Figure(data=[
    go.Scatterpolar(
        r=list(scores.values()),
        theta=list(scores.keys()),
        fill='toself',
        name='Quality Score'
    )
])

fig.update_layout(
    polar=dict(
        # Fixed 0-100 radial axis so charts from different runs are comparable.
        radialaxis=dict(visible=True, range=[0, 100])
    ),
    title='üìä Data Quality Dashboard',
    showlegend=False,
    height=500
)

fig.show()

# Final report: one line per dimension, then the overall score and rating
print("\n" + "=" * 60)
print("üìä FINAL QUALITY REPORT")
print("=" * 60)
for dimension, score in scores.items():
    print(f"{dimension:15s}: {score:6.2f}%")
print("=" * 60)
print(f"OVERALL SCORE: {overall_score:.2f}%")
print("=" * 60)

# Rating bands: >=95 excellent, >=85 good, >=70 fair, otherwise poor.
if overall_score >= 95:
    print("\n‚úÖ EXCELLENT - Production ready")
elif overall_score >= 85:
    print("\n‚ö†Ô∏è GOOD - Minor improvements recommended")
elif overall_score >= 70:
    print("\n‚ö†Ô∏è FAIR - Improvements needed before production")
else:
    print("\n‚ùå POOR - Significant remediation required")