In [None]:
# Notebook: List New Tables Added to Lakehouse
# This notebook identifies tables added beyond the original 49 tables

from pyspark.sql import functions as F

# Configuration: schema name written into the generated CREATE TABLE statements
TARGET_SCHEMA_NAME = "retail"  # Change this to your desired schema name

# Baseline of 49 table names; tables found in the lakehouse that are not
# listed here are the ones reported as new.
original_tables = [
    # Customer*
    "CustomerAccount",
    "CustomerAccountEmail",
    "CustomerAccountLocation",
    "CustomerAccountTelephoneNumber",
    "CustomerGroup",
    "CustomerLocation",
    "CustomerName",
    "CustomerRelationshipType",
    "CustomerStatusType",
    "CustomerTelephoneNumber",
    "CustomerTradeName",
    "CustomerType",
    # Household / individual / invoice / location
    "HouseholdLocation",
    "IndividualCustomer",
    "Invoice",
    "InvoiceLine",
    "Location",
    # Order*
    "Order",
    "OrderActivityType",
    "OrderAdjustment",
    "OrderCharge",
    "OrderChargeType",
    "OrderClassification",
    "OrderCondition",
    "OrderDeliveryTerm",
    "OrderFinanceTerm",
    "OrderHold",
    "OrderLanguageUsage",
    "OrderLine",
    "OrderLineAdjustment",
    "OrderLineAdjustmentReason",
    "OrderLineCharge",
    "OrderLineHold",
    "OrderLineStatus",
    "OrderPartyRelationshipType",
    "OrderPayment",
    "OrderProcessingStatus",
    "OrderRelatedParty",
    "OrderSalesTerm",
    "OrderStatus",
    "OrderStatusType",
    "OrderType",
    # Party*
    "Party",
    "PartyLocation",
    "PartyTelephoneNumber",
    # Remaining tables
    "Retailer",
    "SalesOrderCondition",
    "UsaLocation",
    "UsLocation",
]

# Banner summarising the run configuration (one line per item, then a rule).
print(
    "🔍 Analyzing tables in lakehouse",
    f"📊 Original table count: {len(original_tables)}",
    f"🎯 Target schema for new tables: {TARGET_SCHEMA_NAME}",
    "-" * 50,
    sep="\n",
)

In [None]:
# Get all current tables in the lakehouse and work out which ones are new.
#
# Results published for later cells:
#   current_tables - names returned by SHOW TABLES (temporary views excluded)
#   new_tables     - entries of current_tables that are not in original_tables

# Bind both names before the try block: if the lookup fails, later cells then
# see empty lists instead of a NameError or a stale value from an earlier run.
current_tables = []
new_tables = []

try:
    # Use SHOW TABLES without specifying schema for lakehouse
    current_tables_df = spark.sql("SHOW TABLES")
    # SHOW TABLES also lists session-scoped temporary views; they are not
    # lakehouse tables, so they are skipped via the isTemporary column.
    current_tables = [
        row['tableName']
        for row in current_tables_df.collect()
        if not row['isTemporary']
    ]

    print(f"📋 Current total tables in lakehouse: {len(current_tables)}")

    # Find new tables (case-insensitive comparison); a set keeps each
    # membership test constant-time.
    original_tables_lower = {table.lower() for table in original_tables}
    new_tables = [table for table in current_tables
                  if table.lower() not in original_tables_lower]

    if new_tables:
        print(f"\n🆕 NEW TABLES ADDED ({len(new_tables)}):")
        print("=" * 40)
        for i, table in enumerate(new_tables, 1):
            print(f"{i:2d}. {table}")

        print("\n📈 Summary:")
        print(f"   • Original tables: {len(original_tables)}")
        print(f"   • Current tables:  {len(current_tables)}")
        print(f"   • New tables:      {len(new_tables)}")

    else:
        print(f"\n✅ No new tables found beyond the original {len(original_tables)} tables")

except Exception as e:
    # Deliberately best-effort: report the problem and leave the (empty)
    # results in place so the rest of the notebook can still run.
    print(f"❌ Error accessing lakehouse tables: {str(e)}")
    print("💡 Make sure you have access permissions to the lakehouse")

In [None]:
# Generate CREATE TABLE statements for new tables only

# Primitive Spark type names that only need upper-casing in the generated DDL.
_SIMPLE_SPARK_TYPES = frozenset({
    "tinyint", "smallint", "int", "bigint", "float", "double",
    "string", "boolean", "binary", "date", "timestamp", "timestamp_ntz",
})


def quote_identifier(name):
    """Return ``name`` wrapped in backticks for use as a Spark SQL identifier.

    Backticks inside the name are doubled so the quoted form stays valid.
    """
    return "`" + name.replace("`", "``") + "`"


def normalize_spark_type(data_type):
    """Return the DDL spelling of a type string reported by DESCRIBE.

    Primitive types and ``decimal(p,s)`` are upper-cased. Anything else
    (``array<...>``, ``map<...>``, ``struct<...>``, ``varchar(n)``, ...) is
    returned unchanged: matching on substrings would collapse e.g.
    ``array<int>`` to ``INT``, and upper-casing a struct would alter its
    field names.
    """
    cleaned = data_type.strip()
    lowered = cleaned.lower()
    if lowered in _SIMPLE_SPARK_TYPES or lowered.startswith("decimal"):
        return cleaned.upper()
    return cleaned


def extract_column_rows(describe_rows):
    """Return the leading rows of a DESCRIBE result that describe real columns.

    The column list comes first; a blank row or a row whose name starts with
    '#' introduces trailing metadata (for example partition information).
    Reading stops there so partition columns are not emitted twice and
    metadata rows are not mistaken for columns.
    """
    column_rows = []
    for row in describe_rows:
        name = (row['col_name'] or '').strip()
        if not name or name.startswith('#'):
            break
        column_rows.append(row)
    return column_rows


def build_create_table_script(index, table_name, column_rows):
    """Build the lines of the generated snippet that recreates one table.

    Args:
        index: 1-based position of the table, used in the heading comment.
        table_name: Name of the table to recreate.
        column_rows: Rows (indexable by 'col_name' and 'data_type') for the
            table's columns, in order.

    Returns:
        List of output lines, without trailing newlines.

    Raises:
        ValueError: If ``column_rows`` is empty, since ``CREATE TABLE x ()``
            would not be valid SQL.
    """
    if not column_rows:
        raise ValueError("DESCRIBE returned no column rows")

    last = len(column_rows) - 1
    column_lines = []
    for position, row in enumerate(column_rows):
        # Add comma for all but last column
        comma = "," if position < last else ""
        column_lines.append(
            f"    {quote_identifier(row['col_name'])} "
            f"{normalize_spark_type(row['data_type'])}{comma}"
        )

    return [
        f"\n# {index}. Create {table_name} table",
        'create_table_sql = f"""',
        f'CREATE TABLE IF NOT EXISTS {{SCHEMA_NAME}}.{quote_identifier(table_name)} (',
        *column_lines,
        ')',
        'USING DELTA',
        '"""',
        'spark.sql(create_table_sql)',
        f'print(f"✅ {{SCHEMA_NAME}}.{table_name} table created!")',
        "-" * 50,
    ]


# `new_tables` is produced by the table discovery cell; detect the case where
# that cell has not been run instead of failing with a bare NameError.
try:
    tables_to_script = list(new_tables)
except NameError:
    tables_to_script = None

if tables_to_script is None:
    print("\n⚠️ `new_tables` is not defined - run the table discovery cell first")
elif tables_to_script:
    print("\n🔧 GENERATING CREATE TABLE STATEMENTS FOR NEW TABLES:")
    print("=" * 60)

    # First, generate schema creation statement
    print(f"\n# Create {TARGET_SCHEMA_NAME} schema")
    print(f'SCHEMA_NAME = "{TARGET_SCHEMA_NAME}"')
    print('spark.sql(f"CREATE DATABASE IF NOT EXISTS {SCHEMA_NAME}")')
    print('print(f"✅ {SCHEMA_NAME} schema ready!")')
    print("-" * 50)

    for i, table_name in enumerate(tables_to_script, 1):
        try:
            # Get table schema information - no schema prefix needed.
            # The name is quoted so reserved words and special characters parse.
            describe_rows = spark.sql(
                f"DESCRIBE TABLE {quote_identifier(table_name)}"
            ).collect()
            script_lines = build_create_table_script(
                i, table_name, extract_column_rows(describe_rows)
            )
        except Exception as e:
            # Best-effort per table: report and continue with the next one.
            print(f"❌ Error describing table {table_name}: {str(e)}")
        else:
            # Printed only once the whole snippet was built, so a failure
            # never leaves a half-written statement in the output.
            print("\n".join(script_lines))

else:
    print("\n💡 No new tables to generate CREATE statements for")