# Storage Account Discovery & SQL Endpoint Mapping

**Purpose**: Automatically discover all data folders in the configured storage account and create shortcuts/tables for SQL endpoint access.

**Storage Account**: *Configured in setup cell*  
**Data Format**: Parquet files  
**Output**: Shortcuts and external tables for Fabric SQL endpoint  

---

## 📋 Configuration & Setup

In [None]:
# Storage Account Discovery Configuration
import pandas as pd
import os
from datetime import datetime
import json

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
    # load_dotenv() returns False when nothing was loaded (e.g. the file is
    # missing), so only claim success when it actually found something.
    if load_dotenv('../.env'):  # Load from parent directory
        print("✅ Environment variables loaded from .env file")
    else:
        print("⚠️ No variables loaded from ../.env. Using existing environment and defaults.")
except ImportError:
    print("⚠️ python-dotenv not installed. Using default configuration.")
    print("💡 Run: pip install python-dotenv to use .env file")
except Exception as e:
    print(f"⚠️ Could not load .env file: {str(e)}")


def parse_env_list(name, default, lowercase=True):
    """
    Read a comma-separated environment variable as a clean list.

    Entries are stripped of surrounding whitespace and blank entries are
    dropped. A blank entry is dangerous downstream: '' is a suffix and a
    substring of every name, so it would match every file or folder.

    Args:
        name: Environment variable to read.
        default: Comma-separated fallback used when the variable is unset.
        lowercase: Lower-case each entry (discovery compares against
            lower-cased file and folder names).

    Returns:
        list[str]: The non-empty entries, in their original order.
    """
    entries = [part.strip() for part in os.getenv(name, default).split(',')]
    entries = [part for part in entries if part]
    return [part.lower() for part in entries] if lowercase else entries


# Storage account configuration (with .env fallback)
STORAGE_ACCOUNT = os.getenv('STORAGE_ACCOUNT_NAME', 'cpestaginglake')
# "{container}" is left as a literal placeholder in the URL pattern
BASE_URL = os.getenv('STORAGE_BASE_URL', f"abfss://{{container}}@{STORAGE_ACCOUNT}.dfs.core.windows.net/")
DATA_FORMAT = os.getenv('DATA_FORMAT', 'PARQUET')

# Workspace configuration for authentication compliance
WORKSPACE_ID = os.getenv('FABRIC_WORKSPACE_ID', None)

# Try to get workspace ID automatically if not in env
if not WORKSPACE_ID:
    try:
        from notebookutils import mssparkutils
        WORKSPACE_ID = mssparkutils.env.getWorkspaceId()
        print(f"🔍 Auto-detected Workspace ID: {WORKSPACE_ID}")
    except Exception as workspace_error:
        print(f"⚠️ Could not auto-detect workspace ID: {str(workspace_error)}")
        print("💡 You may need to set FABRIC_WORKSPACE_ID in .env file")
else:
    print(f"🔍 Using Workspace ID from .env: {WORKSPACE_ID}")

# Discovery settings (with .env fallback)
DISCOVERY_CONFIG = {
    "max_depth": int(os.getenv('MAX_SEARCH_DEPTH', '3')),
    "min_files": int(os.getenv('MIN_FILES_PER_FOLDER', '1')),
    "file_extensions": parse_env_list('FILE_EXTENSIONS', '.parquet,.pqt'),
    "exclude_folders": parse_env_list('EXCLUDE_FOLDERS', '_tmp,_temp,_logs,.spark'),
    "table_prefix": os.getenv('TABLE_PREFIX', 'staging_'),
    "use_workspace_id": os.getenv('USE_WORKSPACE_ID', 'true').lower() == 'true',
}

# Results tracking (shared, mutated by every later step)
discovery_results = {
    "containers": [],
    "data_folders": [],
    "shortcuts_created": [],
    "tables_created": [],
    "errors": []
}

# Environment information
FABRIC_WORKSPACE_NAME = os.getenv('FABRIC_WORKSPACE_NAME', 'Unknown')
FABRIC_ENVIRONMENT = os.getenv('FABRIC_ENVIRONMENT', 'Unknown')
PROJECT_NAME = os.getenv('PROJECT_NAME', 'Storage Discovery')

print(f"🚀 {PROJECT_NAME.upper()} Discovery Configuration Loaded")
print(f"📦 Storage Account: {STORAGE_ACCOUNT}")
print(f"🏢 Workspace: {FABRIC_WORKSPACE_NAME} ({WORKSPACE_ID or 'Not available'})")
print(f"🌐 Environment: {FABRIC_ENVIRONMENT}")
print(f"📄 Expected Format: {DATA_FORMAT}")
print(f"🔍 Max Search Depth: {DISCOVERY_CONFIG['max_depth']}")
print(f"📊 Table Prefix: {DISCOVERY_CONFIG['table_prefix']}")
print(f"🔗 Base URL Pattern: {BASE_URL}")
print(f"📁 File Extensions: {', '.join(DISCOVERY_CONFIG['file_extensions'])}")
print(f"🔐 Auth Compliance: {'Enabled' if DISCOVERY_CONFIG['use_workspace_id'] else 'Disabled'}")

# Display key environment variables for verification
env_vars = {
    'STORAGE_ACCOUNT_NAME': STORAGE_ACCOUNT,
    'FABRIC_WORKSPACE_ID': WORKSPACE_ID,
    'FABRIC_WORKSPACE_NAME': FABRIC_WORKSPACE_NAME,
    'DATA_FORMAT': DATA_FORMAT,
    'TABLE_PREFIX': DISCOVERY_CONFIG['table_prefix']
}

print(f"\n📋 Active Configuration:")
for key, value in env_vars.items():
    status = "✅" if value else "❌"
    print(f"   {status} {key}: {value or 'Not set'}")

🚀 CPE Staging Lake Discovery Configuration Loaded
📦 Storage Account: cpestaginglake
📄 Expected Format: PARQUET
🔍 Max Search Depth: 3
📊 Table Prefix: staging_


## 🔍 Step 1: Container Discovery

In [None]:
# Discover all containers in the storage account
def discover_containers(known_containers=None):
    """
    Discover all containers/folders in the configured storage account.

    First tries to list the storage-account root; if that fails, probes each
    name in ``known_containers`` individually.

    Args:
        known_containers: Optional iterable of container names to probe when
            the root listing fails. When omitted, names are read from the
            comma-separated KNOWN_CONTAINERS environment variable (empty by
            default, which matches the previous hard-coded empty list).

    Returns:
        list[dict]: One dict per container with "name", "path" and "size".
        The same list is stored in discovery_results["containers"].
        Returns [] when discovery fails.
    """
    try:
        print(f"🔍 Discovering all containers in {STORAGE_ACCOUNT}...")

        # Import on its own: if the module is missing, neither the root listing
        # nor the fallback can run, and blaming storage permissions would be
        # misleading.
        try:
            from notebookutils import mssparkutils
        except ImportError as import_error:
            error_msg = f"notebookutils is not available: {str(import_error)}"
            discovery_results["errors"].append(error_msg)
            discovery_results["containers"] = []
            print(f"❌ {error_msg}")
            print("💡 Container discovery must run inside a Fabric notebook session")
            return []

        if known_containers is None:
            known_containers = os.getenv('KNOWN_CONTAINERS', '').split(',')
        known_containers = [str(name).strip() for name in known_containers if str(name).strip()]

        try:
            # List all top-level folders/containers at the storage account level
            # NOTE(review): abfss URIs normally take the form
            # abfss://<container>@<account>.dfs.core.windows.net/ - confirm that a
            # container-less root can be listed at all; if not, only the
            # known-container fallback below can succeed.
            storage_root = f"abfss://{STORAGE_ACCOUNT}.dfs.core.windows.net/"
            print(f"📂 Scanning: {storage_root}")

            container_list = []
            for item in mssparkutils.fs.ls(storage_root):
                if not item.isDir:
                    continue
                container_name = item.name.rstrip('/')
                container_list.append({
                    "name": container_name,
                    "path": item.path,
                    "size": getattr(item, 'size', 'unknown')
                })
                print(f"   📦 Found container: {container_name}")

            discovery_results["containers"] = container_list
            print(f"✅ Found {len(container_list)} containers total")

        except Exception as mssparkutils_error:
            print(f"❌ mssparkutils failed: {str(mssparkutils_error)}")
            print("💡 This might be due to:")
            print("   - Storage account access permissions")
            print("   - Incorrect storage account name")
            print("   - Network connectivity issues")

            # Alternative: probe known containers one by one
            print("\n🔧 Attempting alternative discovery method...")
            print("💡 If you know specific container names, pass known_containers=[...] or set KNOWN_CONTAINERS")

            if known_containers:
                found_containers = []
                for container_name in known_containers:
                    try:
                        test_path = f"abfss://{container_name}@{STORAGE_ACCOUNT}.dfs.core.windows.net/"
                        test_files = mssparkutils.fs.ls(test_path)
                    except Exception as test_error:
                        print(f"   ❌ Cannot access container: {container_name} ({str(test_error)[:50]}...)")
                        continue

                    found_containers.append({
                        "name": container_name,
                        "path": test_path,
                        # Here "size" is the number of top-level entries
                        "size": len(test_files)
                    })
                    print(f"   ✅ Verified container: {container_name}")

                discovery_results["containers"] = found_containers
                print(f"✅ Found {len(found_containers)} accessible containers")
            else:
                print("⚠️ No known containers to test. Please check storage account access.")
                discovery_results["containers"] = []

        # Display results
        if discovery_results["containers"]:
            print(f"\n📦 Discovered Containers ({len(discovery_results['containers'])}):")
            for i, container in enumerate(discovery_results["containers"], 1):
                size_info = container.get('size', 'unknown')
                if isinstance(size_info, int):
                    size_display = f"{size_info} items"
                else:
                    size_display = str(size_info)
                print(f"   {i}. {container['name']} ({size_display})")
        else:
            print("❌ No containers discovered")
            print("💡 Possible solutions:")
            print(f"   - Check if '{STORAGE_ACCOUNT}' is the correct storage account name")
            print("   - Verify you have read access to the storage account")
            print("   - Ensure the lakehouse is properly connected to the storage account")

        return discovery_results["containers"]

    except Exception as e:
        error_msg = f"Container discovery failed: {str(e)}"
        discovery_results["errors"].append(error_msg)
        print(f"❌ {error_msg}")
        return []

# Kick off container discovery for the configured storage account
print(f"🚀 Starting container discovery for {STORAGE_ACCOUNT}...")
containers = discover_containers()

# Report readiness, or print diagnostics when nothing was found
if containers:
    print(f"\n✅ Ready to proceed with {len(containers)} containers")
else:
    troubleshooting_tips = (
        "\n🔍 Troubleshooting Tips:",
        f"1. Verify storage account name: '{STORAGE_ACCOUNT}'",
        "2. Check if you have the correct permissions",
        "3. Ensure the storage account exists and is accessible",
        "4. Try running this in a Fabric notebook environment",
    )
    for tip in troubleshooting_tips:
        print(tip)

🚀 Starting container discovery for cpestaginglake...
🔍 Discovering all containers in cpestaginglake...
❌ mssparkutils failed: No module named 'notebookutils'
💡 This might be due to:
   - Storage account access permissions
   - Incorrect storage account name
   - Network connectivity issues

🔧 Attempting alternative discovery method...
💡 If you know specific container names, please update the list below:
⚠️ No known containers to test. Please check storage account access.
❌ No containers discovered
💡 Possible solutions:
   - Check if 'cpestaginglake' is the correct storage account name
   - Verify you have read access to the storage account
   - Ensure the lakehouse is properly connected to the storage account

🔍 Troubleshooting Tips:
1. Verify storage account name: 'cpestaginglake'
2. Check if you have the correct permissions
3. Ensure the storage account exists and is accessible
4. Try running this in a Fabric notebook environment


## 📁 Step 2: Data Folder Discovery

In [None]:
# Discover data folders within each container
def discover_data_folders(containers):
    """
    Recursively discover folders containing Parquet files in each container.

    Args:
        containers: Iterable of container dicts; only the "name" key is read.

    Returns:
        list[dict]: One entry per data folder with "container", "folder_path",
        "relative_path", "parquet_files" (a count), "sample_files",
        "table_name" and "depth". The same list is stored in
        discovery_results["data_folders"]. Returns [] when discovery fails.
    """
    try:
        print(f"📁 Discovering data folders with {DATA_FORMAT} files...")
        from notebookutils import mssparkutils

        # Normalize the patterns once. Blank entries must be dropped: '' is a
        # suffix and a substring of every name, so it would match everything.
        extensions = tuple(
            ext.strip().lower() for ext in DISCOVERY_CONFIG["file_extensions"] if ext.strip()
        )
        excluded = [
            name.strip().lower() for name in DISCOVERY_CONFIG["exclude_folders"] if name.strip()
        ]
        max_depth = DISCOVERY_CONFIG["max_depth"]
        min_files = DISCOVERY_CONFIG["min_files"]

        def explore_folder(container_name, folder_path, current_depth=0, relative_path=""):
            """Return the data folders found at folder_path and below it."""
            if current_depth > max_depth:
                return []

            folder_results = []

            try:
                items = mssparkutils.fs.ls(folder_path)

                parquet_files = []
                subfolders = []

                for item in items:
                    if item.isFile:
                        # endswith() with an empty tuple is simply False
                        if item.name.lower().endswith(extensions):
                            parquet_files.append(item.name)
                    elif item.isDir:
                        folder_name = item.name.rstrip('/')
                        # Substring match: any folder whose name contains an
                        # excluded pattern is skipped.
                        # NOTE(review): this also skips e.g. "audit_logs" for
                        # the pattern "_logs" - confirm that is intended.
                        if not any(excl in folder_name.lower() for excl in excluded):
                            subfolders.append((item.path, folder_name))

                # If this folder has enough matching files, it's a data folder
                if len(parquet_files) >= min_files:
                    folder_results.append({
                        "container": container_name,
                        "folder_path": folder_path,
                        "relative_path": relative_path,
                        "parquet_files": len(parquet_files),
                        "sample_files": parquet_files[:3],  # First 3 files as sample
                        "table_name": generate_table_name(container_name, relative_path),
                        "depth": current_depth
                    })
                    print(f"   📄 Data folder: {relative_path or '/'} ({len(parquet_files)} files)")

                # Explore subfolders
                for subfolder_path, subfolder_name in subfolders:
                    new_relative_path = f"{relative_path}/{subfolder_name}" if relative_path else subfolder_name
                    folder_results.extend(
                        explore_folder(container_name, subfolder_path, current_depth + 1, new_relative_path)
                    )

            except Exception as folder_error:
                # Best effort: an unreadable folder must not abort the whole scan
                print(f"   ⚠️ Could not explore {relative_path or '/'}: {str(folder_error)[:50]}...")

            return folder_results

        all_data_folders = []

        for container in containers:
            print(f"\n🔍 Exploring container: {container['name']}")
            container_path = f"abfss://{container['name']}@{STORAGE_ACCOUNT}.dfs.core.windows.net/"

            # Start exploration from container root
            container_folders = explore_folder(container['name'], container_path)
            all_data_folders.extend(container_folders)

            print(f"   ✅ Found {len(container_folders)} data folders in {container['name']}")

        discovery_results["data_folders"] = all_data_folders

        print(f"\n📊 Total Data Folders Discovered: {len(all_data_folders)}")
        return all_data_folders

    except Exception as e:
        error_msg = f"Data folder discovery failed: {str(e)}"
        discovery_results["errors"].append(error_msg)
        print(f"❌ {error_msg}")
        return []

def generate_table_name(container_name, relative_path, prefix=None):
    """
    Generate a clean table name from container and folder path.

    "/", "-" and spaces become underscores, any other character that is not
    alphanumeric or "_" is dropped, the result is lower-cased, runs of
    underscores are collapsed and leading/trailing underscores are stripped.
    The container name is sanitized as well as the path, because container
    names may contain hyphens, which are not valid in an unquoted SQL
    identifier.

    Args:
        container_name: Storage container the folder lives in.
        relative_path: Folder path inside the container ("" for the root).
        prefix: Table-name prefix; defaults to DISCOVERY_CONFIG['table_prefix'].

    Returns:
        str: The generated table name.
    """
    if prefix is None:
        prefix = DISCOVERY_CONFIG['table_prefix']

    raw_name = f"{container_name}_{relative_path}" if relative_path else str(container_name)

    # One pass to turn path/word separators into underscores, then drop the rest
    separators = str.maketrans({"/": "_", "-": "_", " ": "_"})
    cleaned = ''.join(c for c in raw_name.translate(separators) if c.isalnum() or c == '_')

    # Ensure table name is valid (lowercase, no consecutive underscores)
    table_name = f"{prefix}{cleaned}".lower()
    while "__" in table_name:
        table_name = table_name.replace("__", "_")

    return table_name.strip("_")

# Scan for data folders only when at least one container was discovered
if not containers:
    print("⚠️ No containers found, skipping data folder discovery")
    data_folders = []
else:
    data_folders = discover_data_folders(containers)

## 🔗 Step 3: Create Shortcuts

In [None]:
# Create lakehouse shortcuts for discovered data folders
def create_shortcuts(data_folders):
    """
    Create lakehouse shortcuts for all discovered data folders with workspace ID for auth compliance.

    Args:
        data_folders: Iterable of folder dicts; "folder_path" and "table_name"
            are read from each.

    Returns:
        list[dict]: One entry per folder with "name", "source_path",
        "target_path", "workspace_id" and "status" ("success" or "failed"),
        plus "result" or "error". "workspace_id" is the ID actually passed to
        the shortcut call (None when none was used). The same list is stored
        in discovery_results["shortcuts_created"]. Returns [] when the whole
        step fails.
    """
    try:
        print("🔗 Creating lakehouse shortcuts with authentication compliance...")
        from notebookutils import mssparkutils

        use_workspace_id = DISCOVERY_CONFIG['use_workspace_id']

        # Ensure we have workspace ID for auth compliance
        current_workspace_id = WORKSPACE_ID
        if not current_workspace_id and use_workspace_id:
            try:
                current_workspace_id = mssparkutils.env.getWorkspaceId()
                print(f"🔍 Retrieved Workspace ID: {current_workspace_id}")
            except Exception as workspace_error:
                print(f"⚠️ Could not retrieve workspace ID: {str(workspace_error)}")
                print("💡 Proceeding without workspace ID - shortcuts may fail with auth policies")

        # The ID that is really sent with each request. When the feature is
        # disabled no ID is sent, so none may be recorded or reported as used.
        applied_workspace_id = current_workspace_id if use_workspace_id else None

        shortcuts_created = []

        for folder in data_folders:
            # Define shortcut paths
            source_path = folder["folder_path"]
            shortcut_name = folder["table_name"]
            target_path = f"/lakehouse/default/Files/shortcuts/{shortcut_name}/"

            try:
                print(f"   🔗 Creating shortcut: {shortcut_name}")
                print(f"      Source: {source_path}")
                print(f"      Target: {target_path}")

                # NOTE(review): confirm mssparkutils.lakehouse.create_shortcut exists
                # with these keyword arguments in the target runtime; shortcuts may
                # need to be created through the OneLake Shortcuts REST API instead.
                request = {
                    "source_path": source_path,
                    "target_path": target_path,
                    "shortcut_name": shortcut_name,
                }
                if applied_workspace_id:
                    print(f"      🔐 Using Workspace ID: {applied_workspace_id}")
                    request["workspace_id"] = applied_workspace_id
                else:
                    print(f"      ⚠️ Creating shortcut without workspace ID")
                result = mssparkutils.lakehouse.create_shortcut(**request)

                shortcuts_created.append({
                    "name": shortcut_name,
                    "source_path": source_path,
                    "target_path": target_path,
                    "workspace_id": applied_workspace_id,
                    "status": "success",
                    "result": str(result)
                })

                print(f"      ✅ Shortcut created successfully")

            except Exception as shortcut_error:
                error_msg = f"Failed to create shortcut {shortcut_name}: {str(shortcut_error)}"
                print(f"      ❌ {error_msg}")

                # Check if error is related to authentication
                error_text = str(shortcut_error).lower()
                if "auth" in error_text or "permission" in error_text:
                    print(f"      💡 This may be an authentication issue - ensure workspace ID is correct")
                    print(f"      🔐 Current Workspace ID: {applied_workspace_id or 'None'}")

                shortcuts_created.append({
                    "name": shortcut_name,
                    "source_path": source_path,
                    "target_path": target_path,
                    "workspace_id": applied_workspace_id,
                    "status": "failed",
                    "error": error_msg
                })
                discovery_results["errors"].append(error_msg)

        discovery_results["shortcuts_created"] = shortcuts_created
        successful_shortcuts = [s for s in shortcuts_created if s["status"] == "success"]

        print(f"\n📊 Shortcut Creation Summary:")
        print(f"   ✅ Successful: {len(successful_shortcuts)}")
        print(f"   ❌ Failed: {len(shortcuts_created) - len(successful_shortcuts)}")
        if applied_workspace_id:
            print(f"   🔐 Used Workspace ID: {applied_workspace_id}")
        else:
            print(f"   ⚠️ No Workspace ID used - may cause auth issues")

        return shortcuts_created

    except Exception as e:
        error_msg = f"Shortcut creation failed: {str(e)}"
        discovery_results["errors"].append(error_msg)
        print(f"❌ {error_msg}")
        return []

# Create shortcuts only when discovery produced at least one data folder
if not data_folders:
    print("⚠️ No data folders found, skipping shortcut creation")
    shortcuts = []
else:
    shortcuts = create_shortcuts(data_folders)

## 📊 Step 4: Create External Tables

In [None]:
# Create external tables for SQL endpoint access
def create_external_tables(data_folders):
    """
    Create external Parquet tables for direct SQL endpoint access.

    For each folder a `CREATE TABLE IF NOT EXISTS ... USING PARQUET LOCATION`
    statement is run through the notebook's `spark` session, then the table is
    smoke-tested with a row count and a DESCRIBE.

    Args:
        data_folders: Iterable of folder dicts; "table_name" and "folder_path"
            are read from each.

    Returns:
        list[dict]: One entry per folder with "name", "source_path" and
        "status" ("success", "created_but_untested" or "failed"), plus
        row/column details or "error". The same list is stored in
        discovery_results["tables_created"]. Returns [] when the whole step
        fails.
    """
    # NOTE(review): confirm that non-Delta (Parquet) tables are exposed by the
    # Fabric SQL analytics endpoint; it may only surface Delta tables.
    try:
        print("📊 Creating external Parquet tables...")

        tables_created = []

        for folder in data_folders:
            try:
                table_name = folder["table_name"]
                source_path = folder["folder_path"]

                # Names and paths are derived from storage folder names, so they
                # are quoted/escaped rather than pasted into the SQL verbatim.
                quoted_name = "`" + table_name.replace("`", "``") + "`"
                escaped_path = source_path.replace("\\", "\\\\").replace("'", "\\'")

                print(f"   📋 Creating table: {table_name}")

                # Create external Parquet table
                create_table_sql = f"""
                CREATE TABLE IF NOT EXISTS {quoted_name}
                USING PARQUET
                LOCATION '{escaped_path}'
                """

                # Execute the SQL command
                spark.sql(create_table_sql)

                # Test the table by getting row count
                try:
                    count_sql = f"SELECT COUNT(*) as row_count FROM {quoted_name}"
                    row_count = spark.sql(count_sql).collect()[0]['row_count']

                    # Get column information. DESCRIBE appends metadata rows
                    # (e.g. "# Partition Information" followed by the partition
                    # columns again), so stop at the first blank or "#" row to
                    # avoid over-counting.
                    columns_df = spark.sql(f"DESCRIBE {quoted_name}").toPandas()
                    column_names = []
                    for col_name in columns_df['col_name'].tolist():
                        if not col_name or str(col_name).startswith('#'):
                            break
                        column_names.append(col_name)
                    column_count = len(column_names)

                    table_info = {
                        "name": table_name,
                        "source_path": source_path,
                        "status": "success",
                        "row_count": row_count,
                        "column_count": column_count,
                        "columns": column_names[:5]  # First 5 columns
                    }

                    print(f"      ✅ Table created: {row_count:,} rows, {column_count} columns")

                except Exception as test_error:
                    table_info = {
                        "name": table_name,
                        "source_path": source_path,
                        "status": "created_but_untested",
                        "error": str(test_error)
                    }
                    print(f"      ⚠️ Table created but testing failed: {str(test_error)[:50]}...")

                tables_created.append(table_info)

            except Exception as table_error:
                error_msg = f"Failed to create table {folder['table_name']}: {str(table_error)}"
                print(f"      ❌ {error_msg}")

                table_info = {
                    "name": folder["table_name"],
                    "source_path": folder["folder_path"],
                    "status": "failed",
                    "error": error_msg
                }
                tables_created.append(table_info)
                discovery_results["errors"].append(error_msg)

        discovery_results["tables_created"] = tables_created

        # Tally outcomes per status for the summary
        status_counts = {"success": 0, "failed": 0, "created_but_untested": 0}
        for table in tables_created:
            status_counts[table["status"]] = status_counts.get(table["status"], 0) + 1

        print(f"\n📊 Table Creation Summary:")
        print(f"   ✅ Successful: {status_counts['success']}")
        print(f"   ❌ Failed: {status_counts['failed']}")
        print(f"   ⚠️ Created but untested: {status_counts['created_but_untested']}")

        return tables_created

    except Exception as e:
        error_msg = f"Table creation failed: {str(e)}"
        discovery_results["errors"].append(error_msg)
        print(f"❌ {error_msg}")
        return []

# Create external tables only when discovery produced at least one data folder
if not data_folders:
    print("⚠️ No data folders found, skipping table creation")
    tables = []
else:
    tables = create_external_tables(data_folders)

## 📋 Step 5: Verification & Summary Report

In [None]:
# Verify created tables and generate summary report
def generate_summary_report():
    """
    Generate a comprehensive summary report of the discovery and mapping process.

    Prints totals, per-container and per-folder details, a live check of the
    prefixed tables visible through `spark`, and the collected errors, then
    calls save_results_to_file().

    Returns:
        Whatever save_results_to_file() returns, so the caller can keep the
        result; None when report generation fails.
    """
    try:
        print("📋 Generating Summary Report...")
        print("=" * 80)

        table_prefix = DISCOVERY_CONFIG['table_prefix']

        # Overall statistics
        total_containers = len(discovery_results["containers"])
        total_data_folders = len(discovery_results["data_folders"])
        total_shortcuts = len([s for s in discovery_results["shortcuts_created"] if s["status"] == "success"])
        total_tables = len([t for t in discovery_results["tables_created"] if t["status"] == "success"])
        total_errors = len(discovery_results["errors"])

        print(f"📊 {STORAGE_ACCOUNT.upper()} DISCOVERY & MAPPING SUMMARY")
        print(f"📅 Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"📦 Storage Account: {STORAGE_ACCOUNT}")
        print(f"🏢 Workspace ID: {WORKSPACE_ID or 'Not available'}")
        print(f"📄 Data Format: {DATA_FORMAT}")
        print(f"🏷️ Table Prefix: {table_prefix}")
        print(f"🔐 Auth Compliance: {'Enabled' if DISCOVERY_CONFIG['use_workspace_id'] else 'Disabled'}")
        print()
        print(f"📈 Discovery Results:")
        print(f"   📦 Containers Found: {total_containers}")
        print(f"   📁 Data Folders: {total_data_folders}")
        print(f"   🔗 Shortcuts Created: {total_shortcuts}")
        print(f"   📊 Tables Created: {total_tables}")
        print(f"   ❌ Errors: {total_errors}")

        # Shortcut authentication summary
        if discovery_results.get("shortcuts_created"):
            shortcuts_with_workspace = [s for s in discovery_results["shortcuts_created"]
                                        if s.get("workspace_id") and s["status"] == "success"]
            print(f"   🔐 Shortcuts with Workspace ID: {len(shortcuts_with_workspace)}")
            if WORKSPACE_ID:
                print(f"   🆔 Used Workspace ID: {WORKSPACE_ID}")

        # Container details
        if discovery_results["containers"]:
            print(f"\n📦 Container Details:")
            for i, container in enumerate(discovery_results["containers"], 1):
                size_info = container.get('size', 'unknown')
                # Only a numeric size reads sensibly as "<n> items"
                size_display = f"{size_info} items" if isinstance(size_info, int) else str(size_info)
                print(f"   {i}. {container['name']} ({size_display})")

        # Data folder details
        if discovery_results["data_folders"]:
            print(f"\n📁 Data Folder Details:")
            for i, folder in enumerate(discovery_results["data_folders"], 1):
                # relative_path has no leading slash, so add the separator here
                print(f"   {i}. {folder['container']}/{folder['relative_path']} → {folder['table_name']}")
                print(f"      📄 {folder['parquet_files']} {DATA_FORMAT} files")

        # Table verification
        print(f"\n🧪 Table Verification:")
        try:
            all_tables = spark.sql("SHOW TABLES").collect()
            prefix_tables = [t for t in all_tables if t.tableName.startswith(table_prefix)]

            print(f"   📊 Total {table_prefix} tables in SQL endpoint: {len(prefix_tables)}")

            if prefix_tables:
                print(f"   📋 Available {table_prefix} tables:")
                for table in prefix_tables:
                    try:
                        # Quick row count test; backtick-quote the name so unusual
                        # identifiers cannot break the statement
                        quoted_name = "`" + table.tableName.replace("`", "``") + "`"
                        count_result = spark.sql(f"SELECT COUNT(*) as count FROM {quoted_name}").collect()[0]
                        row_count = count_result['count']
                        print(f"      ✅ {table.tableName}: {row_count:,} rows")
                    except Exception as test_error:
                        print(f"      ❌ {table.tableName}: Test failed ({str(test_error)[:30]}...)")

        except Exception as verification_error:
            print(f"   ⚠️ Table verification failed: {str(verification_error)[:50]}...")

        # Error summary
        if discovery_results["errors"]:
            print(f"\n❌ Error Summary:")
            for i, error in enumerate(discovery_results["errors"], 1):
                print(f"   {i}. {error}")

        # Success summary
        if total_tables > 0:
            print(f"\n🎉 SUCCESS! {total_tables} tables are now available in the SQL endpoint")
            print(f"📝 You can now query these tables using standard SQL:")
            print(f"   SELECT * FROM {table_prefix}your_table_name LIMIT 100")
        else:
            print(f"\n⚠️ No tables were successfully created. Check errors above.")

        print("=" * 80)

        # Save results and hand them back to the caller
        return save_results_to_file()

    except Exception as e:
        print(f"❌ Report generation failed: {str(e)}")
        return None

def save_results_to_file():
    """
    Summarize the discovered data folders as a DataFrame and print it.

    Despite the name, nothing is written to disk: the JSON export is kept
    commented out below.

    Returns:
        pandas.DataFrame with one row per discovered data folder, or None if
        building or printing the table fails.
    """
    try:
        # Full snapshot; only consumed by the commented-out JSON export below
        results_with_timestamp = dict(
            timestamp=datetime.now().isoformat(),
            storage_account=STORAGE_ACCOUNT,
            data_format=DATA_FORMAT,
            config=DISCOVERY_CONFIG,
            results=discovery_results,
        )

        # (output column header, key in each data-folder dict), in display order
        column_sources = (
            ("Container", "container"),
            ("Folder_Path", "relative_path"),
            ("Table_Name", "table_name"),
            (f"{DATA_FORMAT}_Files", "parquet_files"),
            ("Full_Path", "folder_path"),
        )
        folders = discovery_results["data_folders"]
        results_df = pd.DataFrame({
            header: [entry[key] for entry in folders]
            for header, key in column_sources
        })

        print(f"\n📊 Discovery Results DataFrame:")
        print(results_df.to_string(index=False))

        # Save as JSON (commented out to avoid file system issues in Fabric)
        # filename = f"{STORAGE_ACCOUNT}_discovery_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        # with open(filename, 'w') as f:
        #     json.dump(results_with_timestamp, f, indent=2)
        # print(f"💾 Results saved to: {filename}")

        return results_df

    except Exception as e:
        print(f"⚠️ Could not save results to file: {str(e)}")
        return None

# Generate the final report; results_df holds whatever generate_summary_report() returns
results_df = generate_summary_report()

## 🚀 Quick Usage Guide

After running this script, you can:

### Query Your Data
```sql
-- List all tables with your configured prefix (default prefix shown: staging_)
SHOW TABLES LIKE 'staging_*'

-- Query a specific table
SELECT * FROM staging_your_table_name LIMIT 100

-- Get table statistics
DESCRIBE staging_your_table_name
```

### Verify Table Access
```python
# List all tables with your configured prefix
from pyspark.sql.functions import col

prefix = DISCOVERY_CONFIG['table_prefix']
prefix_tables = spark.sql("SHOW TABLES").filter(col("tableName").startswith(prefix)).collect()
for table in prefix_tables:
    print(f"Table: {table.tableName}")
```

### Authentication & Workspace Configuration
```python
# Check current workspace ID
from notebookutils import mssparkutils
current_workspace_id = mssparkutils.env.getWorkspaceId()
print(f"Current Workspace ID: {current_workspace_id}")

# Manually set workspace ID if needed
# WORKSPACE_ID = "your-workspace-id-here"
```

### Troubleshooting
- Check the error summary above for any issues
- Verify your access permissions to the configured storage account (`STORAGE_ACCOUNT`)
- Ensure the storage account name and containers are correct
- **Authentication Issues**: If shortcuts fail, verify workspace ID is correct
- **Permission Errors**: Ensure workspace has access to the storage account
- Run individual cells to debug specific steps

### Configuration Notes
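- **Storage Account**: `STORAGE_ACCOUNT` (env `STORAGE_ACCOUNT_NAME`, default `cpestaginglake`)
- **Workspace ID**: `WORKSPACE_ID` (env `FABRIC_WORKSPACE_ID`; auto-detected when unset)
- **Data Format**: `DATA_FORMAT` (env `DATA_FORMAT`, default `PARQUET`)
- **Table Prefix**: `DISCOVERY_CONFIG['table_prefix']` (env `TABLE_PREFIX`, default `staging_`)
- **Search Depth**: `DISCOVERY_CONFIG['max_depth']` levels (env `MAX_SEARCH_DEPTH`, default `3`)
- **File Extensions**: `DISCOVERY_CONFIG['file_extensions']` (env `FILE_EXTENSIONS`, default `.parquet,.pqt`)
- **Auth Compliance**: `DISCOVERY_CONFIG['use_workspace_id']` (env `USE_WORKSPACE_ID`, enabled by default)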

### Manual Workspace ID Setup
If automatic detection fails, you can manually set the workspace ID:
```python
# Option 1: Set in configuration cell
WORKSPACE_ID = "your-workspace-id-here"

# Option 2: Get from Fabric portal URL
# The workspace ID is in the URL: https://app.fabric.microsoft.com/groups/WORKSPACE_ID/...
```

---
**Note**: This notebook automatically discovers data in the configured format (`DATA_FORMAT`) in the configured storage account (`STORAGE_ACCOUNT`) and maps it to your Fabric SQL endpoint, using the workspace ID for authentication compliance.