# Azure Databricks Unity Catalog Exploration

This notebook demonstrates how to explore data using Azure Databricks with Unity Catalog enabled. We'll cover:
- Unity Catalog basics
- Data discovery and exploration
- Sample data analysis
- Best practices for data governance

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Plot styling: matplotlib's stock style first, then seaborn's "husl" palette
plt.style.use("default")
sns.set_palette(palette="husl")

## 1. Unity Catalog Overview

Unity Catalog provides centralized governance for data and AI assets across Azure Databricks workspaces.

In [None]:
# Report which catalog and schema this session currently resolves names against
active_catalog = spark.catalog.currentCatalog()
print("Current catalog:", active_catalog)
active_schema = spark.catalog.currentDatabase()
print("Current database/schema:", active_schema)

# Enumerate every catalog returned by SHOW CATALOGS
print("\nAvailable catalogs:")
catalogs = spark.sql("SHOW CATALOGS").collect()
for catalog_row in catalogs:
    print(f"- {catalog_row['catalog']}")

In [None]:
# Enumerate the schemas of the current catalog.
# `schemas` is kept as a list of rows because the data-discovery cell reuses it.
print("Available schemas in current catalog:")
schemas = spark.sql("SHOW SCHEMAS").collect()
for schema_row in schemas:
    print(f"- {schema_row['databaseName']}")

## 2. Data Discovery

Let's explore what data is available in our Unity Catalog.

In [None]:
# Function to explore tables in a schema
def explore_schema(catalog_name, schema_name):
    """List the tables and views of one schema and return their names.

    Runs ``SHOW TABLES IN <catalog_name>.<schema_name>`` and prints one line
    per entry, labelled with its kind.

    Args:
        catalog_name: Name of the catalog that contains the schema.
        schema_name: Name of the schema to inspect.

    Returns:
        list[str]: The table/view names in the order the query returned them,
        or an empty list when the query fails. Failures are printed rather
        than raised so that a loop over several schemas keeps going.
    """
    try:
        tables = spark.sql(f"SHOW TABLES IN {catalog_name}.{schema_name}").collect()
        print(f"\nTables in {catalog_name}.{schema_name}:")
        table_names = []
        for table in tables:
            fields = table.asDict()
            # SHOW TABLES does not return a `tableType` column (its columns are
            # database / tableName / isTemporary), so reading `table.tableType`
            # raised AttributeError for every non-empty schema. Use the column
            # if a runtime happens to provide it, otherwise fall back to the
            # temporary/persistent flag.
            if "tableType" in fields:
                kind = fields["tableType"]
            elif fields.get("isTemporary"):
                kind = "temporary"
            else:
                kind = "persistent"
            print(f"- {fields['tableName']} ({kind})")
            table_names.append(fields["tableName"])
        return table_names
    except Exception as e:
        print(f"Error exploring schema {catalog_name}.{schema_name}: {e}")
        return []

# Walk a handful of schemas in the active catalog and list what each contains
current_catalog = spark.catalog.currentCatalog()
first_schemas = schemas[:3]  # Limit to first 3 schemas
for schema in first_schemas:
    explore_schema(current_catalog, schema_name=schema.databaseName)

## 3. Sample Data Creation

Let's create a small in-memory sample sales dataset so the rest of the notebook can run even when no existing data is available.

In [None]:
# Ten hand-written sales records:
# (sale date, product, category, unit price, quantity, region)
sample_data = [
    ("2024-01-01", "Product_A", "Electronics", 1200.50, 2, "North"),
    ("2024-01-02", "Product_B", "Clothing", 450.75, 3, "South"),
    ("2024-01-03", "Product_C", "Electronics", 890.00, 1, "East"),
    ("2024-01-04", "Product_A", "Electronics", 1200.50, 1, "West"),
    ("2024-01-05", "Product_D", "Home", 320.25, 4, "North"),
    ("2024-01-06", "Product_B", "Clothing", 450.75, 2, "South"),
    ("2024-01-07", "Product_E", "Sports", 780.90, 1, "East"),
    ("2024-01-08", "Product_C", "Electronics", 890.00, 3, "West"),
    ("2024-01-09", "Product_F", "Books", 45.99, 10, "North"),
    ("2024-01-10", "Product_A", "Electronics", 1200.50, 1, "Central"),
]

# Column name / Spark type pairs, in the same order as the tuple fields above;
# every column is declared nullable.
column_spec = [
    ("sale_date", StringType()),
    ("product_name", StringType()),
    ("category", StringType()),
    ("price", DoubleType()),
    ("quantity", IntegerType()),
    ("region", StringType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in column_spec])

# Build the DataFrame, then parse the date strings into a proper date column
df_sales = spark.createDataFrame(sample_data, schema)
df_sales = df_sales.withColumn("sale_date", to_date("sale_date", "yyyy-MM-dd"))

print("Sample sales data created successfully!")
df_sales.show()

## 4. Basic Data Exploration

In [None]:
# Dimensions, column types and a first look at the rows
row_total = df_sales.count()
column_total = len(df_sales.columns)
print("Dataset Shape:", (row_total, column_total))

print("\nSchema:")
df_sales.printSchema()

print("\nSample records:")
df_sales.show(n=5)

In [None]:
# Data summary statistics
print("Summary Statistics:")
df_sales.describe().show()

# Check for null values.
# The loop variable is deliberately not named `col`: this notebook star-imports
# pyspark.sql.functions, so rebinding `col` at notebook scope would replace the
# `col()` function with a plain string for every cell that runs afterwards.
print("\nNull value counts:")
for column_name in df_sales.columns:
    null_count = df_sales.filter(df_sales[column_name].isNull()).count()
    print(f"{column_name}: {null_count}")

In [None]:
# Unique values in categorical columns
categorical_cols = ["product_name", "category", "region"]

# The loop variable is deliberately not named `col`, which would overwrite the
# star-imported pyspark `col()` function for all later cells.
for column_name in categorical_cols:
    # Build the distinct projection once and reuse it for the count and the listing
    distinct_values = df_sales.select(column_name).distinct()
    unique_count = distinct_values.count()
    print(f"\nUnique values in {column_name}: {unique_count}")
    distinct_values.orderBy(column_name).show()

## 5. Data Analysis and Insights

In [None]:
# Calculate total revenue (unit price x quantity) for every sale.
# Spark functions are referenced through the `F` namespace so this cell keeps
# working even when an earlier cell has reused `col`, `sum` or `count` as an
# ordinary variable name (the star import makes those names easy to clobber).
df_sales_with_revenue = df_sales.withColumn(
    "revenue", F.col("price") * F.col("quantity")
)

# Revenue by category: total, number of transactions and mean revenue per
# transaction, highest-grossing category first.
revenue_by_category = (
    df_sales_with_revenue.groupBy("category")
    .agg(
        F.sum("revenue").alias("total_revenue"),
        F.count("*").alias("transaction_count"),
        F.avg("revenue").alias("avg_revenue"),
    )
    .orderBy(F.desc("total_revenue"))
)

print("Revenue by Category:")
revenue_by_category.show()

In [None]:
# Aggregate revenue and transaction counts per region, largest total first
region_totals = df_sales_with_revenue.groupBy("region").agg(
    sum("revenue").alias("total_revenue"),
    count("*").alias("transaction_count"),
)
revenue_by_region = region_totals.orderBy(desc("total_revenue"))

print("Revenue by Region:")
revenue_by_region.show()

In [None]:
def _label_axes(ax, title, xlabel, ylabel):
    """Apply a title and both axis labels to one subplot."""
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)


# Collect the Spark result into a pandas frame so matplotlib can plot it
df_pandas = df_sales_with_revenue.toPandas()

# 2x2 dashboard: category totals, region totals, price histogram, daily trend
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_category, ax_region), (ax_price, ax_trend) = axes

# Top-left: total revenue per category, largest first
category_revenue = df_pandas.groupby('category')['revenue'].sum().sort_values(ascending=False)
ax_category.bar(category_revenue.index, category_revenue.values)
_label_axes(ax_category, 'Total Revenue by Category', 'Category', 'Revenue')
ax_category.tick_params(axis='x', rotation=45)

# Top-right: total revenue per region, largest first
region_revenue = df_pandas.groupby('region')['revenue'].sum().sort_values(ascending=False)
ax_region.bar(region_revenue.index, region_revenue.values, color='orange')
_label_axes(ax_region, 'Total Revenue by Region', 'Region', 'Revenue')

# Bottom-left: distribution of unit prices
ax_price.hist(df_pandas['price'], bins=10, edgecolor='black', alpha=0.7)
_label_axes(ax_price, 'Price Distribution', 'Price', 'Frequency')

# Bottom-right: revenue summed per calendar day
df_pandas['sale_date'] = pd.to_datetime(df_pandas['sale_date'])
daily_revenue = df_pandas.groupby('sale_date')['revenue'].sum()
ax_trend.plot(daily_revenue.index, daily_revenue.values, marker='o')
_label_axes(ax_trend, 'Daily Revenue Trend', 'Date', 'Revenue')
ax_trend.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 6. Working with Unity Catalog Features

In [None]:
# Persist the revenue data as a table in the `default` schema when this user is
# allowed to write there; on any failure fall back to a temporary view with the
# same name so the following cells still have something to query.
table_name = "sample_sales_data"

try:
    # Overwrite any previous copy, including its schema
    writer = df_sales_with_revenue.write.mode("overwrite").option("overwriteSchema", "true")
    writer.saveAsTable(f"default.{table_name}")

    print(f"Table {table_name} created successfully in Unity Catalog!")

    # Show the stored column definitions as confirmation
    spark.sql(f"DESCRIBE TABLE default.{table_name}").show()

except Exception as e:
    print("Note: Could not create table in Unity Catalog. This might be due to permissions.")
    print(f"Error: {e}")
    print("Creating temporary view instead...")
    df_sales_with_revenue.createOrReplaceTempView(table_name)
    print(f"Temporary view {table_name} created successfully!")

In [None]:
# Summarise revenue per (category, region) pair with plain SQL against the
# table or temporary view registered under `table_name`.
source_table = table_name
sql_query = f"""
SELECT 
    category,
    region,
    COUNT(*) as transaction_count,
    SUM(revenue) as total_revenue,
    AVG(revenue) as avg_revenue,
    MAX(revenue) as max_revenue
FROM {source_table}
GROUP BY category, region
ORDER BY total_revenue DESC
"""

# Run the statement and display the result
result = spark.sql(sql_query)
print("SQL Query Results:")
result.show()

## 7. Data Quality Checks

In [None]:
# Data quality assessment
def data_quality_report(df, table_name):
    """Print a data quality summary for a DataFrame.

    The report covers the row and column counts, the number of fully
    duplicated rows, the null count and null percentage of every column, and
    each column's data type.

    Args:
        df: DataFrame to inspect. It must provide ``count()``, ``distinct()``,
            ``filter()``, ``columns``, ``schema.fields`` and column lookup by
            name.
        table_name: Label used in the report header only.

    Returns:
        None. The report is written to standard output.
    """
    print(f"=== Data Quality Report for {table_name} ===")

    # Row count
    row_count = df.count()
    print(f"Total rows: {row_count}")

    # Column count
    col_count = len(df.columns)
    print(f"Total columns: {col_count}")

    # Check for duplicates: rows that are identical across every column
    distinct_count = df.distinct().count()
    duplicate_count = row_count - distinct_count
    print(f"Duplicate rows: {duplicate_count}")

    # Null checks per column
    print("\nNull value analysis:")
    for col_name in df.columns:
        null_count = df.filter(df[col_name].isNull()).count()
        # An empty DataFrame has no rows to divide by; report 0% instead of
        # raising ZeroDivisionError halfway through the report.
        null_percentage = (null_count / row_count) * 100 if row_count else 0.0
        print(f"  {col_name}: {null_count} nulls ({null_percentage:.2f}%)")

    # Data type validation
    print("\nData types:")
    for field in df.schema.fields:
        print(f"  {field.name}: {field.dataType}")

# Produce the quality report for the revenue-enriched sales data
data_quality_report(df=df_sales_with_revenue, table_name=table_name)

## 8. Next Steps and Best Practices

### Unity Catalog Best Practices:
1. **Data Governance**: Use proper naming conventions for catalogs, schemas, and tables
2. **Access Control**: Implement fine-grained permissions using Unity Catalog
3. **Data Lineage**: Unity Catalog automatically tracks data lineage
4. **Data Discovery**: Use Unity Catalog's search and discovery features
5. **Delta Lake Integration**: Use Delta Lake for ACID transactions and time travel

### Recommended Actions:
- Set up proper catalog structure for your organization
- Implement data quality monitoring
- Create documentation for your datasets
- Set up automated data pipelines
- Use Delta Live Tables for production workloads

In [None]:
# Closing summary: nothing is dropped or cleaned up here, the cell only prints
# suggested follow-up activities.
closing_lines = [
    "Exploration completed successfully!",
    "You can now:",
    "1. Create your own datasets in Unity Catalog",
    "2. Set up data pipelines using Delta Live Tables",
    "3. Implement proper governance and access controls",
    "4. Explore advanced analytics and ML features",
]
for closing_line in closing_lines:
    print(closing_line)