# Databricks Sample Data Exploration

## Overview
This notebook explores the sample datasets available in an Azure Databricks workspace and validates them as data sources for our multi-tier architecture.

## Steps
1. **Step 1:** Check available databases
2. **Step 2:** Explore TPC-H database tables
3. **Step 3:** Test TPC-H data access and validation

---

In [None]:
# Step 1: Check Available Databases
# Lists every database returned by SHOW DATABASES for the current Spark session
# and prints a total count.
print("🔍 STEP 1: AVAILABLE DATABASES")
print("="*50)

databases = spark.sql("SHOW DATABASES").collect()
for db in databases:
    # Read the name positionally: the single output column of SHOW DATABASES is
    # named `databaseName` on some runtimes and `namespace` on others, so
    # attribute access by one fixed name raises AttributeError on the rest.
    print(f"  📁 {db[0]}")

print(f"\n✅ Found {len(databases)} database(s)")

In [None]:
# Step 2: Explore TPC-H Database
# Switches the session to the `tpch` database, lists its tables, and prints the
# schema of the four tables that the later validation step queries.
print("🎯 STEP 2: EXPLORING TPC-H DATABASE")
print("="*50)

# (heading, table name) for each schema to display, in display order.
key_table_headings = [
    ("\n👥 CUSTOMER table schema:", "customer"),
    ("\n📦 PART table schema:", "part"),
    ("\n🛒 ORDERS table schema:", "orders"),
    ("\n📋 LINEITEM table schema:", "lineitem"),
]

try:
    spark.sql("USE tpch")
except Exception as e:
    # Only a failed USE means the database itself is unavailable; later
    # failures are reported separately so they are not misattributed to it.
    print(f"❌ 'tpch' database not available: {e}")
else:
    print("✅ 'tpch' database found!")

    print("\n📊 TABLES IN TPC-H DATABASE:")
    try:
        tables = spark.sql("SHOW TABLES").collect()
        for table in tables:
            print(f"  📋 {table.tableName}")
    except Exception as e:
        print(f"  ❌ Could not list tables in 'tpch': {e}")

    # Show table schemas for key tables
    print("\n🔍 KEY TABLE SCHEMAS:")
    for heading, table_name in key_table_headings:
        print(heading)
        try:
            spark.sql(f"DESCRIBE {table_name}").show()
        except Exception as describe_error:
            # Report per table so one missing table does not hide the others.
            print(f"  ❌ {table_name}: Error describing table - {describe_error}")

In [None]:
# Step 3: Test TPC-H Data Access and Validation
# Counts the rows of every table in `tpch`, previews the main tables, and runs
# two join queries. Success messages are printed only when every check passed.
print("✅ STEP 3: TPC-H DATA ACCESS AND VALIDATION TESTING")
print("="*50)

# (heading, SQL) for each preview/join check, executed in this order.
preview_queries = [
    (
        "\n👥 TPC-H CUSTOMER DATA PREVIEW:",
        "SELECT c_custkey, c_name, c_mktsegment, c_nationkey FROM customer LIMIT 5",
    ),
    (
        "\n📦 TPC-H PART (PRODUCT) DATA PREVIEW:",
        "SELECT p_partkey, p_name, p_brand, p_type, p_retailprice FROM part LIMIT 5",
    ),
    (
        "\n🛒 TPC-H ORDERS DATA PREVIEW:",
        "SELECT o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate FROM orders LIMIT 5",
    ),
    (
        "\n📋 TPC-H LINEITEM DATA PREVIEW:",
        "SELECT l_orderkey, l_partkey, l_quantity, l_extendedprice, l_discount FROM lineitem LIMIT 5",
    ),
    (
        "\n🔗 TPC-H JOIN TEST (Customer + Orders + LineItems):",
        """
        SELECT 
            c.c_name,
            o.o_orderkey,
            o.o_totalprice,
            o.o_orderstatus,
            COUNT(l.l_orderkey) as line_items
        FROM customer c
        JOIN orders o ON c.c_custkey = o.o_custkey
        JOIN lineitem l ON o.o_orderkey = l.l_orderkey
        GROUP BY c.c_name, o.o_orderkey, o.o_totalprice, o.o_orderstatus
        LIMIT 5
    """,
    ),
    (
        "\n🌍 TPC-H NATION/REGION DATA:",
        "SELECT n_name, r_name FROM nation n JOIN region r ON n.n_regionkey = r.r_regionkey LIMIT 10",
    ),
]

# Set to True only when every count and every preview query succeeded, so the
# closing banner cannot claim readiness after a failure.
validation_ok = False

# Use the real TPC-H database
try:
    spark.sql("USE tpch")
    tables = spark.sql("SHOW TABLES").collect()

    print("📋 TPC-H tables with record counts:")
    failed_counts = []
    for table in tables:
        table_name = table.tableName
        try:
            # Backtick-quote the identifier (doubling embedded backticks) so
            # names with special characters or reserved words stay valid SQL.
            quoted_name = "`" + table_name.replace("`", "``") + "`"
            count_df = spark.sql(f"SELECT COUNT(*) as count FROM {quoted_name}")
            count = count_df.collect()[0][0]  # Get first row, first column
            print(f"  ✅ {table_name}: {count} records")
        except Exception as count_error:
            failed_counts.append(table_name)
            print(f"  ❌ {table_name}: Error getting count - {count_error}")

    for heading, query in preview_queries:
        print(heading)
        spark.sql(query).show()

    if failed_counts:
        # Count errors are tolerated per table above, but they must not be
        # reported as an overall success.
        print(f"⚠️ Data previews succeeded, but record counts failed for: {', '.join(failed_counts)}")
    else:
        validation_ok = True
        print("✅ All TPC-H data access tests successful!")
        print("\n🎯 PERFECT BRONZE TIER DATA FOR SOLUTION ACCELERATOR!")
        print("📊 This gives us realistic Customer, Product, Order data with different schemas")
        print("🔄 Ready for Bronze → Silver → Gold pipeline development")

except Exception as e:
    print(f"❌ Error during TPC-H validation: {e}")

print("\n" + "="*60)
if validation_ok:
    print("📊 TPC-H DATA EXPLORATION COMPLETE - READY FOR ARCHITECTURE!")
else:
    print("⚠️ TPC-H DATA EXPLORATION FINISHED WITH ERRORS - REVIEW OUTPUT ABOVE")
print("="*60)