In [None]:
# This notebook provides a quick overview of the quality analytics dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set up plotting style: apply matplotlib's 'default' style sheet first, then
# select seaborn's "husl" palette for subsequent plots.
# NOTE(review): presumably the order matters — applying the style sheet after
# the palette would likely reset the colour cycle; confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")

print("🔍 Quality Analytics Data Overview")
print("=" * 50)

# Load dataset from Domino dataset mount
data_path = "/mnt/data/quality_compliance_data/audit_findings.csv"

# Parameters of the simulated fallback dataset. The seed is fixed so that the
# demo data (and every count, percentage and chart derived from it) is
# identical on each Restart & Run All instead of changing on every execution.
SIMULATION_SEED = 42
N_SIMULATED_FINDINGS = 500

try:
    df = pd.read_csv(data_path)
    print(f"✅ Successfully loaded data: {df.shape[0]} records, {df.shape[1]} columns")
except FileNotFoundError:
    # Only a missing file triggers the fallback; any other read error
    # (malformed CSV, permissions, ...) still surfaces to the user.
    print("📝 Using simulated data for demo purposes...")
    # A local Generator keeps the draw reproducible without touching numpy's
    # global random state, which other cells may rely on.
    rng = np.random.default_rng(SIMULATION_SEED)
    finding_numbers = range(1, N_SIMULATED_FINDINGS + 1)
    # Create sample data for demo
    df = pd.DataFrame({
        'finding_id': [f'QF-2024-{i:03d}' for i in finding_numbers],
        'finding_text': [f'Sample quality finding text {i}' for i in finding_numbers],
        'category': rng.choice(['Documentation', 'SOP Compliance', 'Training', 'Data Integrity'], N_SIMULATED_FINDINGS),
        'severity': rng.choice(['Minor', 'Major', 'Critical'], N_SIMULATED_FINDINGS),
        'area': rng.choice(['Clinical', 'Manufacturing', 'Quality', 'Regulatory'], N_SIMULATED_FINDINGS)
    })

def _print_distribution(counts, total):
    """Print one bullet per label in ``counts`` with its count and share of ``total``."""
    for label, n in counts.items():
        print(f"   • {label}: {n} ({n/total*100:.1f}%)")


# Headline numbers: overall size plus the number of distinct values per dimension
print("\n📊 Dataset Summary:")
print(f"   • Total findings: {len(df):,}")
for heading, column in (
    ("Categories", 'category'),
    ("Severity levels", 'severity'),
    ("Operational areas", 'area'),
):
    print(f"   • {heading}: {df[column].nunique()}")

# First three rows, truncated to at most 4 columns / 50 characters per cell
print("\n📋 Sample Records:")
sample_records = df.head(3).to_string(max_cols=4, max_colwidth=50)
print(sample_records)

# How findings split across categories (shares are relative to all rows in df)
print("\n🏷️ Category Distribution:")
category_counts = df['category'].value_counts()
_print_distribution(category_counts, len(df))

# How findings split across severity levels
print("\n⚠️ Severity Distribution:")
severity_counts = df['severity'].value_counts()
_print_distribution(severity_counts, len(df))

# Quick visualization: category shares (pie) next to severity counts (bar)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Category distribution pie chart
category_counts.plot(kind='pie', ax=ax1, autopct='%1.1f%%')
ax1.set_title('Findings by Category')
ax1.set_ylabel('')

# Severity distribution bar chart.
# value_counts() orders levels by frequency, so colours must be looked up by
# label rather than assigned by position; otherwise e.g. "Critical" is drawn
# green whenever it happens to be the most frequent level.
severity_colors = {'Minor': 'green', 'Major': 'orange', 'Critical': 'red'}
known_levels = [level for level in severity_colors if level in severity_counts.index]
other_levels = [level for level in severity_counts.index if level not in severity_colors]
# Show bars in escalating severity; unrecognised levels go last in a neutral colour.
severity_ordered = severity_counts.reindex(known_levels + other_levels)
bar_colors = [severity_colors.get(level, 'gray') for level in severity_ordered.index]
severity_ordered.plot(kind='bar', ax=ax2, color=bar_colors)
ax2.set_title('Findings by Severity')
ax2.set_xlabel('Severity Level')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Wrap-up banner
print("\n✅ Data exploration complete!")
print("💡 Next steps: Launch semantic search app or explore topic modeling")

# Report whether pre-computed embeddings are present on the dataset mount
embeddings_path = "/mnt/data/quality_compliance_data/embeddings/bert_embeddings.pkl"
embeddings_status = (
    "🧠 Pre-computed embeddings found - ready for semantic search!"
    if Path(embeddings_path).exists()
    else "⚠️ Embeddings not found - will use simulated similarity for demo"
)
print(embeddings_status)