# Explore Databricks File System (DBFS)

## Overview
This notebook explores the DBFS structure and capabilities in your Azure Databricks workspace.

## Learning Steps
- **Step 1:** Explore DBFS root structure
- **Step 2:** Discover sample datasets (including TPC-H search)
- **Step 3:** Examine FileStore for uploads
- **Step 4:** Check mounted external storage
- **Step 5:** Test DBFS utility functions
- **Step 6:** Learn useful DBFS commands
- **Step 7:** Run DBFS health check
- **Step 8:** Search for TPC-H and benchmark datasets

## DBFS Key Locations
- `/` - Root directory
- `/databricks-datasets/` - Sample datasets provided by Databricks
- `/FileStore/` - Files uploaded via web UI
- `/mnt/` - Mounted external storage (Azure Data Lake, Blob Storage)
- `/tmp/` - Temporary files
- `/user/` - User-specific directories

---

In [None]:
# Step 1: Explore DBFS Root Structure
print("🔍 STEP 1: EXPLORING DBFS ROOT STRUCTURE")
print("=" * 50)

# Fetch the root listing once, then print one line per entry:
# directories get a folder icon, files get a page icon plus their byte size.
print("\n📁 Root Directory Contents:")
root_contents = dbutils.fs.ls("/")
for entry in root_contents:
    if entry.isDir:
        icon, size_note = "📁", ""
    else:
        icon, size_note = "📄", f"({entry.size} bytes)"
    print(f"  {icon} {entry.name} {size_note}")

print("\n✅ Step 1 Complete: Root structure explored")

In [None]:
# Step 2: Explore Databricks Sample Datasets
#
# Lists /databricks-datasets/, sorts the top-level entries into keyword
# buckets (the first matching bucket wins), peeks into a few hand-picked
# datasets, and finally prints the complete listing for reference.
print("\n🎯 STEP 2: DATABRICKS SAMPLE DATASETS")
print("="*50)

try:
    datasets = dbutils.fs.ls("/databricks-datasets/")
    print(f"\n📊 Found {len(datasets)} sample dataset categories:")

    # Show first 10 categories
    for item in datasets[:10]:
        print(f"  📁 {item.name}")

    if len(datasets) > 10:
        print(f"  ... and {len(datasets) - 10} more categories")

    # Search through every listed dataset (however many there are) for useful ones
    print("\n🔍 COMPREHENSIVE DATASET SEARCH:")
    print(f"Searching through all {len(datasets)} datasets for business-relevant data...")

    # Categorize datasets by usefulness for your solution accelerator.
    # NOTE: benchmark_datasets is collected but not reported below.
    business_datasets = []
    benchmark_datasets = []
    tpch_datasets = []
    retail_datasets = []

    for item in datasets:
        dataset_name = item.name.lower()

        # Check for TPC-H
        if 'tpch' in dataset_name or 'tpc-h' in dataset_name or 'tpc_h' in dataset_name:
            tpch_datasets.append(item.name)

        # Check for retail/sales data
        elif any(keyword in dataset_name for keyword in ['retail', 'sales', 'ecommerce', 'store', 'customer', 'product']):
            retail_datasets.append(item.name)

        # Check for benchmark data
        elif any(keyword in dataset_name for keyword in ['benchmark', 'test', 'demo', 'sample']):
            benchmark_datasets.append(item.name)

        # Check for other business datasets
        elif any(keyword in dataset_name for keyword in ['airline', 'amazon', 'financial', 'bank', 'order']):
            business_datasets.append(item.name)

    # Report findings
    if tpch_datasets:
        print(f"\n🎯 TPC-H DATASETS FOUND ({len(tpch_datasets)}):")
        for dataset in tpch_datasets:
            print(f"  📊 {dataset}")
    else:
        print("\n⚠️ No TPC-H datasets found by name")

    if retail_datasets:
        print(f"\n🛒 RETAIL/SALES DATASETS ({len(retail_datasets)}):")
        for dataset in retail_datasets[:5]:  # Show first 5
            print(f"  📊 {dataset}")
        if len(retail_datasets) > 5:
            print(f"  ... and {len(retail_datasets) - 5} more retail datasets")

    if business_datasets:
        print(f"\n💼 BUSINESS DATASETS ({len(business_datasets)}):")
        for dataset in business_datasets[:5]:  # Show first 5
            print(f"  📊 {dataset}")
        if len(business_datasets) > 5:
            print(f"  ... and {len(business_datasets) - 5} more business datasets")

    # Explore specific useful datasets
    useful_datasets = {
        'airlines/': 'Airline data (good for order/booking patterns)',
        'amazon/': 'E-commerce data (product/customer patterns)',
        'retail-org/': 'Retail organization data',
        'online_retail/': 'Online retail transactions',
        'ecommerce/': 'E-commerce transaction data'
    }

    print("\n🎯 EXPLORING PRIORITY DATASETS FOR YOUR PROJECT:")
    for dataset_path, description in useful_datasets.items():
        try:
            contents = dbutils.fs.ls(f"/databricks-datasets/{dataset_path}")
            print(f"\n📁 {dataset_path} - {description}")
            print(f"   Contents ({len(contents)} items):")
            for item in contents[:3]:  # Show first 3 items
                print(f"     📄 {item.name}")
        except Exception:
            # Best-effort probe: a missing dataset is expected, but a bare
            # `except:` would also have swallowed KeyboardInterrupt/SystemExit.
            print(f"\n📁 {dataset_path} - Not found")

    # Explore a specific dataset (like NYC taxi or retail)
    print("\n🚕 Exploring NYC Taxi Data (if available):")
    try:
        nyc_taxi = dbutils.fs.ls("/databricks-datasets/nyctaxi/")
        for item in nyc_taxi[:5]:  # Show first 5 items
            print(f"  📄 {item.name}")
    except Exception:
        print("  NYC Taxi dataset not found")

    print("\n🛒 Exploring Retail Data (if available):")
    try:
        retail = dbutils.fs.ls("/databricks-datasets/retail-org/")
        for item in retail[:5]:  # Show first 5 items
            print(f"  📄 {item.name}")
    except Exception:
        print("  Retail dataset not found")

    # Show ALL dataset names for reference
    print(f"\n📋 COMPLETE DATASET LIST (all {len(datasets)} categories):")
    for i, item in enumerate(datasets, 1):
        print(f"  {i:2d}. {item.name}")

except Exception as e:
    print(f"Error accessing datasets: {e}")

print("\n✅ Step 2 Complete: Sample datasets explored")

In [None]:
# Step 3: Explore FileStore (for uploaded files)
print("\n📂 STEP 3: FILESTORE EXPLORATION")
print("=" * 50)

try:
    filestore = dbutils.fs.ls("/FileStore/")
    print(f"\n📁 FileStore Contents ({len(filestore)} items):")

    for entry in filestore:
        if not entry.isDir:
            # Plain file: one line with its size, nothing to descend into.
            print(f"  📄 {entry.name} ({entry.size} bytes)")
            continue

        print(f"  📁 {entry.name} ")

        # Directory: list it and preview at most five children.
        try:
            children = dbutils.fs.ls(f"/FileStore/{entry.name}")
            if children:
                print(f"    📊 Contains {len(children)} items:")
                for child in children[:5]:
                    if child.isDir:
                        print(f"      📁 {child.name}")
                    else:
                        print(f"      📄 {child.name} ({child.size} bytes)")
                overflow = len(children) - 5
                if overflow > 0:
                    print(f"      ... and {overflow} more items")
        except Exception as e:
            print(f"    ❌ Error exploring {entry.name}: {e}")

except Exception as e:
    print(f"❌ FileStore not found: {e}")
    print("💡 This means no files have been uploaded yet")

print("\n✅ Step 3 Complete: FileStore examined")

In [None]:
# Step 4: Check for mounted storage
#
# Prints every mount returned by dbutils.fs.mounts() and previews the first
# five entries of the first mount point.
print("\n🔗 STEP 4: MOUNTED STORAGE EXPLORATION")
print("="*50)

try:
    mounts = dbutils.fs.mounts()
    print(f"\n📎 Found {len(mounts)} mounted storage locations:")

    for mount in mounts:
        print(f"  🔗 Mount: {mount.mountPoint}")
        print(f"      Source: {mount.source}")
        # The documented MountInfo fields are mountPoint, source and
        # encryptionType. extraConfigs is not among them, so reading it as a
        # plain attribute can raise AttributeError and abort the whole
        # listing; look both up defensively instead.
        print(f"      Encryption: {getattr(mount, 'encryptionType', '') or 'n/a'}")
        print(f"      Extras: {getattr(mount, 'extraConfigs', 'n/a')}")
        print()

    # If mounts exist, explore the first one
    if mounts:
        first_mount = mounts[0].mountPoint
        print(f"🔍 Exploring first mount: {first_mount}")
        try:
            mount_contents = dbutils.fs.ls(first_mount)
            for item in mount_contents[:5]:  # Show first 5 items
                item_type = "📁" if item.isDir else "📄"
                print(f"  {item_type} {item.name}")
        except Exception as e:
            print(f"  Error exploring mount: {e}")
    else:
        print("💡 No external storage mounted yet - this is normal for new workspaces")

except Exception as e:
    print(f"Error checking mounts: {e}")

print("\n✅ Step 4 Complete: Mounted storage checked")

In [None]:
# Step 5: DBFS Utility Functions Demo
#
# Round-trips a small text file through mkdirs / put / head / ls under a
# scratch directory, then removes the directory again.
print("\n🛠️ STEP 5: DBFS UTILITY FUNCTIONS")
print("="*50)

# Create a temporary directory and file for testing
test_dir = "/tmp/dbfs_exploration"
test_file = f"{test_dir}/test_file.txt"

print(f"\n📝 Creating test directory: {test_dir}")
try:
    dbutils.fs.mkdirs(test_dir)
    print("  ✅ Directory created successfully")

    # Create a test file
    print(f"\n📄 Creating test file: {test_file}")
    dbutils.fs.put(test_file, "Hello from DBFS exploration!\nThis is a test file.", overwrite=True)
    print("  ✅ File created successfully")

    # Read the file back
    print("\n📖 Reading test file:")
    file_content = dbutils.fs.head(test_file)
    print(f"  Content: {file_content}")

    # Check file info
    print("\n📊 File information:")
    file_info = dbutils.fs.ls(test_dir)
    for item in file_info:
        print(f"  📄 {item.name} - {item.size} bytes")

except Exception as e:
    print(f"  ❌ Error in file operations: {e}")

finally:
    # Clean up in `finally` so a failure in any step above does not leave the
    # scratch directory behind.
    print("\n🗑️ Cleaning up test files:")
    try:
        dbutils.fs.rm(test_dir, recurse=True)
        print("  ✅ Test files removed")
    except Exception as cleanup_error:
        print(f"  ❌ Error during cleanup: {cleanup_error}")

print("\n✅ Step 5 Complete: DBFS utility functions tested")

In [None]:
# Step 6: Display useful DBFS commands reference
# Banner and separator are emitted by a single print call, one per line.
print("\n📚 STEP 6: USEFUL DBFS COMMANDS REFERENCE", "=" * 50, sep="\n")

# Static cheat sheet; printed verbatim below.
commands = """
🔍 EXPLORATION COMMANDS:
  dbutils.fs.ls("/path/")              # List directory contents
  dbutils.fs.head("/path/file.txt")    # Read first part of file
  dbutils.fs.mounts()                  # Show mounted storage

📁 DIRECTORY OPERATIONS:
  dbutils.fs.mkdirs("/path/new_dir")   # Create directory
  dbutils.fs.rm("/path/", True)        # Remove directory recursively
  dbutils.fs.cp("/src", "/dest")       # Copy files/directories

📄 FILE OPERATIONS:
  dbutils.fs.put("/path/file.txt", "content")  # Create/write file
  dbutils.fs.mv("/old_path", "/new_path")      # Move/rename file
  
🔗 MOUNT OPERATIONS:
  dbutils.fs.mount(source, mount_point, extra_configs)  # Mount storage
  dbutils.fs.unmount(mount_point)                       # Unmount storage

📊 INFORMATION COMMANDS:
  dbutils.fs.ls("/path/")              # Detailed file/directory info
  %fs ls /path/                        # Magic command alternative
  
💡 TIPS:
  - Use %fs magic commands for quick operations: %fs ls /
  - DBFS paths start with /dbfs/ when accessed from driver node
  - Use display(dbutils.fs.ls("/path/")) for formatted output
"""

# Cheat sheet followed by the completion line, each terminated by a newline.
print(commands, "\n✅ Step 6 Complete: DBFS commands reference provided", sep="\n")

In [None]:
# Step 7: Quick DBFS Health Check
print("\n🏥 STEP 7: DBFS HEALTH CHECK")
print("=" * 50)

# Each entry pairs a label with a zero-argument probe. A truthy result is a
# PASS, a falsy result is a FAIL, and a raised exception is reported as ERROR.
health_checks = [
    (
        "Root access",
        lambda: len(dbutils.fs.ls("/")) > 0,
    ),
    (
        "Sample datasets",
        lambda: len(dbutils.fs.ls("/databricks-datasets/")) > 0,
    ),
    (
        "FileStore access",
        lambda: dbutils.fs.ls("/FileStore/") is not None,
    ),
    (
        "Temp directory writable",
        # The directory is only removed if creating it returned a truthy value.
        lambda: dbutils.fs.mkdirs("/tmp/health_check") and dbutils.fs.rm("/tmp/health_check", True),
    ),
]

for label, probe in health_checks:
    try:
        outcome = probe()
    except Exception as e:
        print(f"  ❌ ERROR {label}: {e}")
    else:
        if outcome:
            print(f"  ✅ PASS {label}")
        else:
            print(f"  ⚠️ FAIL {label}")

print("\n✅ Step 7 Complete: DBFS health check finished")

In [None]:
# Step 8: TPC-H Dataset Search (Based on Your Results)
print("\n🔍 STEP 8: SEARCHING FOR TPC-H DATASET")
print("=" * 50)

# Since we found databricks-datasets/, let's explore it thoroughly
try:
    datasets = dbutils.fs.ls("/databricks-datasets/")
    print(f"\n📊 Found {len(datasets)} dataset categories:")

    # Pass 1: anything whose name looks like TPC-H gets listed in detail.
    tpch_found = False
    for entry in datasets:
        lowered = entry.name.lower()
        if not any(tag in lowered for tag in ("tpch", "tpc-h", "tpc_h")):
            continue

        print(f"  🎯 FOUND TPC-H: {entry.name}")
        tpch_found = True

        # Preview up to ten children of the matching dataset.
        try:
            tpch_contents = dbutils.fs.ls(f"/databricks-datasets/{entry.name}")
            print(f"     📋 TPC-H Contents ({len(tpch_contents)} items):")
            for child in tpch_contents[:10]:
                print(f"       📄 {child.name}")
        except Exception as e:
            print(f"     ❌ Error exploring TPC-H: {e}")

    # Pass 2 (only when nothing matched): show the first 20 names, then flag
    # anything that looks like a benchmark or sample dataset.
    if not tpch_found:
        print("\n🔍 TPC-H not found by name, checking all datasets...")
        for entry in datasets[:20]:
            print(f"  📁 {entry.name}")

        benchmark_keywords = ['benchmark', 'sample', 'demo', 'test', 'retail', 'sales']
        print("\n🎯 Looking for benchmark/sample datasets:")
        for entry in datasets:
            lowered = entry.name.lower()
            if any(word in lowered for word in benchmark_keywords):
                print(f"  📊 Potential dataset: {entry.name}")

    # Pass 3 (always): flag names that hint at SQL or relational data.
    print("\n🔍 Looking for SQL/Relational datasets:")
    sql_keywords = ['sql', 'db', 'table', 'relational']
    for entry in datasets:
        lowered = entry.name.lower()
        if any(word in lowered for word in sql_keywords):
            print(f"  🗄️ SQL-related: {entry.name}")

except Exception as e:
    print(f"❌ Error exploring datasets: {e}")

print("\n✅ Step 8 Complete: TPC-H and benchmark dataset search finished")

# Closing summary for the whole notebook.
print("\n🎉 DBFS EXPLORATION COMPLETE!")
print("=" * 50)
print("📝 Summary of completed steps:")
for summary_line in (
    "  ✅ Step 1: Root structure explored",
    "  ✅ Step 2: Sample datasets discovered",
    "  ✅ Step 3: FileStore examined",
    "  ✅ Step 4: Mounted storage checked",
    "  ✅ Step 5: Utility functions tested",
    "  ✅ Step 6: Commands reference provided",
    "  ✅ Step 7: Health check completed",
    "  ✅ Step 8: TPC-H dataset search finished",
):
    print(summary_line)

print("\n📝 Next Steps:")
for next_step in (
    "  1. ✅ DBFS exploration successful",
    "  2. 🔍 Dataset discovery complete",
    "  3. 🚀 Ready for Unity Catalog and Delta Lake learning",
    "  4. 📊 Time to import the 2-hour crash course notebook!",
):
    print(next_step)
print("\nYou can now navigate and work with the Databricks File System confidently! 🚀")

In [None]:
# Step 1 Explanation: Understanding DBFS Root Folders
#
# Prints a static, hand-written description of root folders. Nothing here
# queries the workspace; the text reflects one observed root listing.
print("\n📚 STEP 1 EXPLANATION: UNDERSTANDING YOUR DBFS FOLDERS")
print("="*50)

folder_explanations = {
    "Volume/": {
        # This key is the capitalised singular form; the lowercase variants
        # have their own entries below, so it must not be labelled lowercase.
        "purpose": "Unity Catalog Volumes (singular 'Volume')",
        "description": "Legacy or alternative volume access path",
        "usage": "Less commonly used, prefer /Volumes/",
        "importance": "🟡 Medium - Legacy access"
    },
    "Volumes/": {
        "purpose": "Unity Catalog Volumes (primary)",
        "description": "Modern Databricks managed storage for Unity Catalog",
        "usage": "Primary way to access Unity Catalog volumes",
        "importance": "🟢 High - Modern Unity Catalog feature"
    },
    "databricks-datasets/": {
        "purpose": "Sample Datasets",
        "description": "Pre-loaded sample data provided by Databricks",
        "usage": "Learning, testing, demos (includes potential TPC-H data)",
        "importance": "🟢 High - Essential for learning and your solution accelerator"
    },
    "databricks-results/": {
        "purpose": "Query Results Storage",
        "description": "Temporary storage for SQL query results and downloads",
        "usage": "System-managed, stores query outputs",
        "importance": "🟡 Medium - System managed"
    },
    "volume/": {
        "purpose": "Volume Access (lowercase)",
        "description": "Alternative lowercase access to volumes",
        "usage": "Alternative path, prefer /Volumes/",
        "importance": "🟡 Medium - Alternative access"
    },
    "volumes/": {
        "purpose": "Volume Access (lowercase plural)",
        "description": "Another alternative access to volumes",
        "usage": "Alternative path, prefer /Volumes/",
        "importance": "🟡 Medium - Alternative access"
    }
}

print("\n📁 FOLDER BREAKDOWN:")
for folder, info in folder_explanations.items():
    print(f"\n📂 /{folder}")
    print(f"   🎯 Purpose: {info['purpose']}")
    print(f"   📝 Description: {info['description']}")
    print(f"   🔧 Usage: {info['usage']}")
    print(f"   {info['importance']}")

print("\n🚀 KEY INSIGHTS FOR YOUR SOLUTION ACCELERATOR:")
print("   ✅ databricks-datasets/ - Your source for sample data and potential TPC-H")
print("   ✅ Volumes/ - Modern Unity Catalog storage (enterprise feature)")
print("   ✅ Multiple volume paths - Databricks provides flexibility")
print("   ⚠️ Notice: No /FileStore/ or /mnt/ in root - this workspace config")

print("\n💡 WHAT THIS TELLS US:")
print("   🏢 Modern Databricks workspace - Has Unity Catalog features")
print("   📊 Sample data available - Perfect for learning and testing")
print("   🔧 Enterprise ready - Volume support indicates advanced features")
print("   🎯 Clean setup - Minimal clutter, ready for development")

print("\n📝 MISSING FOLDERS (normal for new workspaces):")
# (path, explanation) pairs for folders described as absent from the root.
missing_folders = [
    ("/FileStore/", "File uploads via UI - will appear when you upload files"),
    ("/mnt/", "Mounted external storage - appears when you mount Azure Data Lake"),
    ("/tmp/", "Temporary files - created as needed"),
    ("/user/", "User directories - created when users access workspace")
]

for folder, explanation in missing_folders:
    print(f"   📁 {folder} - {explanation}")

print("\n✅ Step 1 Extended: DBFS folder structure explained")

In [None]:
# Step 2 Follow-up: Deep Dive into Business Datasets (Real Stories!)
#
# Walks three sample datasets (retail-org, online_retail, amazon) one level
# deep and prints what it finds, then prints a static narrative. Each dataset
# is wrapped in its own try block so one missing dataset does not stop the rest.
print("\n🎉 STEP 2 FOLLOW-UP: BUSINESS DATASETS DEEP DIVE")
print("="*50)

print("🎯 FOCUSING ON REAL BUSINESS STORIES:")
print("   ✅ retail-org/ - Real retail business structure")
print("   ✅ online_retail/ - Transaction patterns for online channel")
print("   ✅ amazon/ - E-commerce customer/product relationships")
print("   ❌ Skipping TPC-H - Synthetic data doesn't tell stories")

# Explore retail-org/ in detail
try:
    print("\n🛒 EXPLORING RETAIL-ORG DATASET:")
    retail_contents = dbutils.fs.ls("/databricks-datasets/retail-org/")
    print(f"   Found {len(retail_contents)} items in retail-org:")

    for item in retail_contents:
        if item.isDir:
            print(f"     📁 {item.name}")
            # Detailed peek at business-relevant folders
            try:
                folder_contents = dbutils.fs.ls(f"/databricks-datasets/retail-org/{item.name}")

                # Classify by the first file's name for business context.
                # The label is computed before printing so the line is
                # always emitted whole, never as a dangling prefix.
                if len(folder_contents) > 0:
                    sample_file = folder_contents[0].name
                    if 'csv' in sample_file.lower():
                        folder_kind = "CSV data files"
                    elif 'parquet' in sample_file.lower():
                        folder_kind = "Parquet data files"
                    else:
                        folder_kind = "Data files"
                else:
                    folder_kind = "Empty folder"
                print(f"        📊 {len(folder_contents)} files - {folder_kind}")

            except Exception as e:
                print(f"        ❌ Error: {e}")
        else:
            print(f"     📄 {item.name}")

    # Focus on key business folders
    business_folders = ['customers', 'active_promotions', 'loyalty_segments', 'company_employees']
    print("\n🎯 KEY BUSINESS FOLDERS:")
    for folder in business_folders:
        try:
            folder_path = f"/databricks-datasets/retail-org/{folder}/"
            folder_contents = dbutils.fs.ls(folder_path)
            print(f"     📊 {folder}/ - {len(folder_contents)} files")

            # Show sample file for business context
            if len(folder_contents) > 0:
                sample_file = folder_contents[0].name
                print(f"        Sample: {sample_file}")
        except Exception:
            # Best-effort probe; `except Exception` (not a bare except) so
            # KeyboardInterrupt/SystemExit still propagate.
            print(f"     ⚠️ {folder}/ - Not found or empty")

except Exception as e:
    print(f"❌ Error exploring retail-org: {e}")

# Explore online_retail/ dataset
try:
    print("\n🛍️ EXPLORING ONLINE_RETAIL DATASET:")
    online_retail_contents = dbutils.fs.ls("/databricks-datasets/online_retail/")
    print(f"   Found {len(online_retail_contents)} items in online_retail:")

    for item in online_retail_contents:
        item_type = "📁" if item.isDir else "📄"
        size_info = f" ({item.size} bytes)" if not item.isDir else ""
        print(f"     {item_type} {item.name}{size_info}")

        # If it's a directory, explore it
        if item.isDir:
            try:
                sub_contents = dbutils.fs.ls(f"/databricks-datasets/online_retail/{item.name}")
                print(f"        📊 Contains {len(sub_contents)} files")
                if len(sub_contents) > 0:
                    print(f"        Sample: {sub_contents[0].name}")
            except Exception:
                print("        ⚠️ Couldn't explore contents")

except Exception as e:
    print(f"❌ Error exploring online_retail: {e}")

# Explore amazon/ dataset
try:
    print("\n📦 EXPLORING AMAZON E-COMMERCE DATASET:")
    amazon_contents = dbutils.fs.ls("/databricks-datasets/amazon/")
    print(f"   Found {len(amazon_contents)} items in amazon:")

    for item in amazon_contents:
        item_type = "📁" if item.isDir else "📄"
        size_info = f" ({item.size} bytes)" if not item.isDir else ""
        print(f"     {item_type} {item.name}{size_info}")

        # If it's a directory, explore it for e-commerce patterns
        if item.isDir:
            try:
                sub_contents = dbutils.fs.ls(f"/databricks-datasets/amazon/{item.name}")
                print(f"        📊 Contains {len(sub_contents)} files")

                # Look for data patterns
                data_files = [f for f in sub_contents if not f.isDir]
                if data_files:
                    print(f"        Data files: {len(data_files)}")
                    print(f"        Sample: {data_files[0].name}")
            except Exception:
                print("        ⚠️ Couldn't explore contents")

except Exception as e:
    print(f"❌ Error exploring amazon: {e}")

print("\n🎯 PERFECT BUSINESS STORY DATASETS:")
print("   ✅ retail-org/ - Complete retail business (customers, loyalty, promotions)")
print("   ✅ online_retail/ - Transaction patterns and customer behavior")
print("   ✅ amazon/ - E-commerce product/customer relationships")

print("\n📊 YOUR SOLUTION ACCELERATOR STORY:")
story = """
🏪 RETAIL BUSINESS SCENARIO:
├── 🛒 Physical Stores (retail-org data)
│   ├── Customer demographics and loyalty programs
│   ├── Employee management and store operations
│   ├── Promotional campaigns and effectiveness
│   └── Multi-location retail organization
├── 🛍️ Online Channel (online_retail data)
│   ├── E-commerce transaction patterns
│   ├── Digital customer behavior
│   ├── Product performance online
│   └── Cross-channel customer journey
└── 🎯 Market Intelligence (amazon patterns)
    ├── Competitive product analysis
    ├── Customer review patterns
    ├── E-commerce best practices
    └── Market trend insights
"""
print(story)

print("\n🚀 WHY THESE DATASETS TELL GREAT STORIES:")
print("   📈 Real business challenges - Multi-channel retail operations")
print("   👥 Relatable scenarios - Customer loyalty, promotions, online shopping")
print("   💰 Clear ROI - Revenue optimization, customer retention, channel performance")
print("   🎯 Executive appeal - Practical business insights, not synthetic benchmarks")

print("\n✅ Step 2 Follow-up Complete: Business datasets analyzed")
print("🎊 READY TO BUILD A COMPELLING SOLUTION ACCELERATOR!")

In [None]:
# Step 3 Follow-up: Create FileStore Structure for Your Project
#
# Creates a fixed set of folders under /FileStore/, writes a timestamped
# README into the project folder, lists the result, and prints upload
# instructions.

# datetime is needed for the README timestamp below; nothing else visible in
# this notebook imports it, so the cell previously failed with NameError
# before the README could be written.
from datetime import datetime

print("\n🏗️ STEP 3 FOLLOW-UP: CREATING FILESTORE STRUCTURE")
print("="*50)

print("🎯 CREATING ORGANIZED FILESTORE FOR SOLUTION ACCELERATOR:")

# Define the folder structure for your project
project_folders = [
    "/FileStore/",
    "/FileStore/solution_accelerator/",
    "/FileStore/solution_accelerator/sample_data/",
    "/FileStore/solution_accelerator/sample_data/products/",
    "/FileStore/solution_accelerator/sample_data/customers/",
    "/FileStore/solution_accelerator/sample_data/orders/",
    "/FileStore/solution_accelerator/retail_data/",
    "/FileStore/tables/",
    "/FileStore/shared_uploads/"
]

# Create the folder structure; a failure on one folder is reported and the
# loop moves on to the next.
print("\n📁 Creating project folder structure:")
for folder in project_folders:
    try:
        dbutils.fs.mkdirs(folder)
        print(f"   ✅ Created: {folder}")
    except Exception as e:
        print(f"   ⚠️ {folder}: {str(e)}")

# Create a README file in the main project folder.
# {date} is the only format field in the template.
readme_content = """
# Solution Accelerator Data Upload Area

## Folder Structure:
- sample_data/products/   - Upload Product_samples.csv here
- sample_data/customers/  - Upload customer data when generated
- sample_data/orders/     - Upload order data when generated
- retail_data/           - Upload any real retail dataset files

## Upload Instructions:
1. Via Databricks UI: Data > Create > Upload File
2. Via drag-and-drop: Drag files into any notebook
3. Files uploaded via UI go to /FileStore/tables/
4. Files uploaded via drag-drop go to /FileStore/shared_uploads/

Created: {date}
""".format(date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

readme_path = "/FileStore/solution_accelerator/README.txt"
try:
    dbutils.fs.put(readme_path, readme_content, overwrite=True)
    print(f"\n📄 Created project README: {readme_path}")
except Exception as e:
    print(f"\n❌ Could not create README: {e}")

# Verify the structure was created
print("\n🔍 Verifying FileStore structure:")
try:
    filestore_contents = dbutils.fs.ls("/FileStore/")
    for item in filestore_contents:
        item_type = "📁" if item.isDir else "📄"
        print(f"   {item_type} {item.name}")

        # Show contents of solution_accelerator folder
        if item.name == "solution_accelerator/" and item.isDir:
            try:
                sa_contents = dbutils.fs.ls("/FileStore/solution_accelerator/")
                for sa_item in sa_contents:
                    sa_type = "📁" if sa_item.isDir else "📄"
                    print(f"     {sa_type} {sa_item.name}")

                    # Show sample_data contents
                    if sa_item.name == "sample_data/" and sa_item.isDir:
                        sample_contents = dbutils.fs.ls("/FileStore/solution_accelerator/sample_data/")
                        for sample_item in sample_contents:
                            sample_type = "📁" if sample_item.isDir else "📄"
                            print(f"       {sample_type} {sample_item.name}")
            except Exception as e:
                print(f"     ❌ Error exploring solution_accelerator: {e}")

except Exception as e:
    print(f"❌ Error verifying structure: {e}")

print("\n🎯 HOW TO UPLOAD YOUR PRODUCT_SAMPLES.CSV:")
upload_instructions = """
OPTION 1 - Databricks UI Upload:
1. Click 'Data' in left sidebar
2. Click 'Create' button
3. Select 'Upload File'
4. Browse to C:\\temp\\samples\\Product_samples.csv
5. Upload (will go to /FileStore/tables/)
6. Move to organized folder if needed

OPTION 2 - Drag and Drop:
1. Open this notebook in Databricks
2. Open Windows Explorer to C:\\temp\\samples\\
3. Drag Product_samples.csv into this notebook
4. File will appear in /FileStore/shared_uploads/
5. Move to /FileStore/solution_accelerator/sample_data/products/

OPTION 3 - Programmatic Upload (Advanced):
Use dbutils.fs.cp() to copy from local path (if accessible)
"""

print(upload_instructions)

print("\n🎯 UPLOAD TARGETS FOR YOUR FILES:")
print("   📄 Product_samples.csv → /FileStore/solution_accelerator/sample_data/products/")
print("   📊 Future customer data → /FileStore/solution_accelerator/sample_data/customers/")
print("   🛒 Future order data → /FileStore/solution_accelerator/sample_data/orders/")
print("   📈 Retail datasets → /FileStore/solution_accelerator/retail_data/")

print("\n✅ FileStore structure created and ready for uploads!")

In [None]:
# Step 3 Update: Check for Your Uploaded File
#
# Lists /FileStore/, descends into any directory whose name contains
# "source_data", previews product-related files found there, then checks the
# two common upload locations and prints usage instructions.
print("\n📤 STEP 3 UPDATE: CHECKING FOR YOUR UPLOADED FILE")
print("="*50)

print("🎯 LOOKING FOR YOUR UPLOADED FILES:")

# Check if FileStore now exists after your upload
try:
    filestore_contents = dbutils.fs.ls("/FileStore/")
    print(f"\n📁 FileStore Contents ({len(filestore_contents)} items):")

    for item in filestore_contents:
        item_type = "📁" if item.isDir else "📄"
        size = f"({item.size} bytes)" if not item.isDir else ""
        print(f"   {item_type} {item.name} {size}")

        # Look specifically for your source_data folder. Only directories
        # qualify: a plain file whose name merely contains "source_data" is
        # not a folder and would yield a nonsensical child path below.
        if item.isDir and "source_data" in item.name:
            print("   🎯 Found your source_data folder!")

            # Explore the source_data folder
            try:
                source_data_contents = dbutils.fs.ls(f"/FileStore/{item.name}")
                print(f"      📊 Contents of {item.name} ({len(source_data_contents)} items):")

                for source_item in source_data_contents:
                    source_type = "📁" if source_item.isDir else "📄"
                    source_size = f" ({source_item.size} bytes)" if not source_item.isDir else ""
                    print(f"        {source_type} {source_item.name}{source_size}")

                    # Any file with "product" in its name counts (this also
                    # covers product_tents.csv); directories are skipped
                    # because they cannot be previewed with head().
                    if not source_item.isDir and "product" in source_item.name.lower():
                        print("        🎯 Found your product CSV file!")

                        # Join with exactly one slash whether or not the
                        # folder name carries a trailing "/".
                        file_path = f"/FileStore/{item.name.rstrip('/')}/{source_item.name}"
                        print(f"        📋 File path: {file_path}")

                        try:
                            # Read the first 500 bytes to verify structure.
                            # The byte limit is passed positionally so the
                            # call does not depend on the keyword's spelling.
                            file_content = dbutils.fs.head(file_path, 500)
                            print("        📖 First few lines:")
                            lines = file_content.split('\n')[:5]  # Show first 5 lines
                            for i, line in enumerate(lines):
                                print(f"           {i+1}: {line}")
                        except Exception as read_error:
                            print(f"        ❌ Could not read file: {read_error}")

            except Exception as folder_error:
                print(f"      ❌ Could not explore source_data folder: {folder_error}")

    # Also check common upload locations
    common_locations = ["/FileStore/tables/", "/FileStore/shared_uploads/"]
    for location in common_locations:
        try:
            location_contents = dbutils.fs.ls(location)
            if location_contents:
                print(f"\n📁 {location} ({len(location_contents)} items):")
                for upload in location_contents[:3]:  # Show first 3
                    upload_type = "📁" if upload.isDir else "📄"
                    upload_size = f" ({upload.size} bytes)" if not upload.isDir else ""
                    print(f"     {upload_type} {upload.name}{upload_size}")

                    if "product" in upload.name.lower():
                        print(f"     🎯 Found product-related file: {upload.name}")
        except Exception:
            # Best-effort probe; `except Exception` (not a bare except) so
            # KeyboardInterrupt/SystemExit still propagate.
            print(f"\n📁 {location}: Empty or doesn't exist")

except Exception as e:
    print(f"❌ Error checking FileStore: {e}")

print("\n🎯 WORKING WITH YOUR UPLOADED FILE:")
working_instructions = """
Now that you've uploaded your file, you can:

1. 📊 Read the CSV file into a DataFrame:
   df = spark.read.csv("/FileStore/source_data/product_tents.csv", header=True, inferSchema=True)
   df.show()

2. 🔍 Explore the data structure:
   df.printSchema()
   df.count()
   df.columns

3. 📈 Create Delta tables from your data:
   df.write.format("delta").mode("overwrite").saveAsTable("products_sample")

4. 🛒 Use this as basis for retail business scenarios:
   # Filter by brand, analyze date patterns, etc.
   df.filter(df.BrandName == "Fabrikam").show()
"""

print(working_instructions)

print("\n🚀 NEXT STEPS WITH YOUR DATA:")
print("   1. ✅ File uploaded successfully to FileStore")
print("   2. 📊 Ready to create Delta tables for Unity Catalog learning")
print("   3. 🎯 Perfect foundation for your retail business scenarios")
print("   4. 📈 Use this data in the 2-hour crash course notebook")

print("\n✅ Upload verification complete!")