Generate CREATE TABLE statements for all tables in the lakehouse and (not working yet) save them as a text file in the Lakehouse Files folder.

In [None]:
# Step 1
# Import required libraries and setup
from pyspark.sql import functions as F
import os
from datetime import datetime

# Configuration
# Schema that the generated CREATE TABLE statements will target;
# edit this value to generate statements for a different schema.
TARGET_SCHEMA_NAME = "retail"

# Startup banner: what is being generated, for which schema, and when the run began.
print("üîç Generating CREATE TABLE statements for all lakehouse tables")
print("üéØ Target schema name: " + TARGET_SCHEMA_NAME)
print(f"üïê Started at: {datetime.now():%Y-%m-%d %H:%M:%S}")
print(70 * "-")

In [None]:
# Step 2
# Setup output path using a different folder
import os

# Destination of the generated text: the Files/create_tables_text folder
# (chosen as an alternative location), with a fixed file name.
output_folder = 'Files/create_tables_text/'
output_filename = 'CreateTablesText.txt'

# Full relative path = folder + file name.
output_path = os.path.join(output_folder, output_filename)

print("üìÅ Output file path: " + output_path)
print(50 * "-")

In [None]:
# Step 3
# List every table reported by SHOW TABLES.
# Result: `all_tables`, a list of table names (empty if the listing fails).
try:
    tables_df = spark.sql("SHOW TABLES")
    all_tables = []
    for listing_row in tables_df.collect():
        all_tables.append(listing_row['tableName'])

    # Numbered overview of what was found.
    print(f"üìã Found {len(all_tables)} tables in lakehouse:")
    for position, found_name in enumerate(all_tables, start=1):
        print(f"  {position:2d}. {found_name}")
    print(70 * "-")

except Exception as e:
    # Best effort: report the problem and continue with an empty table list
    # so that the following cells still run.
    print(f"‚ùå Error getting tables: {e}")
    all_tables = []

In [None]:
# Step 4
# Generate CREATE TABLE statements
#
# For every table name in `all_tables` (Step 3) this cell runs DESCRIBE and
# turns the column listing into a block of Python source that wraps a
# CREATE TABLE statement in a `spark.sql(...)` call. The generated source is
# accumulated in `output_content`, one string per output line.
#
# Relies on names set up outside this cell: `all_tables` (Step 3),
# `TARGET_SCHEMA_NAME` and `datetime` (Step 1), and `spark`, which is expected
# to be provided by the notebook session.

# Primitive type names that are emitted upper-cased. Anything else (other
# than decimal) is emitted exactly as DESCRIBE reported it.
_PRIMITIVE_TYPE_NAMES = frozenset({
    'tinyint', 'smallint', 'int', 'bigint', 'float', 'double',
    'string', 'boolean', 'timestamp', 'date', 'binary',
})


def _standardize_data_type(data_type):
    """Return the type text to emit for a DESCRIBE ``data_type`` value.

    Primitive names and ``decimal(p,s)`` are upper-cased. The comparison is an
    exact match on the whole type name rather than a substring test, so that
    composite or merely similar types (``array<int>``, ``struct<...>``,
    ``map<string,int>``, ``timestamp_ntz``, ``interval ...``) are not collapsed
    into an unrelated primitive type; those are returned unchanged.
    """
    normalized = data_type.strip().lower()
    if normalized in _PRIMITIVE_TYPE_NAMES or normalized.startswith('decimal'):
        return normalized.upper()
    return data_type


def _leading_column_rows(describe_rows):
    """Return only the column rows at the start of a DESCRIBE result.

    DESCRIBE lists the table's columns first; any following section (for
    example partition information, which repeats the partition columns) is
    introduced by a blank row or a row whose name starts with '#'. Reading
    stops at the first such row so those extra rows are never emitted as
    (duplicate) column definitions.
    """
    column_rows = []
    for row in describe_rows:
        name = row['col_name']
        if not name or name.strip() == '' or name.startswith('#'):
            break
        column_rows.append(row)
    return column_rows


def _create_table_lines(index, table_name, column_rows):
    """Return the generated source lines that create one table.

    The lines are built in a separate list so the caller can add them to the
    output only once the whole block was produced without error.
    """
    lines = [
        f"# {index}. Create {table_name} table",
        'create_table_sql = f"""',
        f'CREATE TABLE IF NOT EXISTS {{SCHEMA_NAME}}.{table_name} (',
    ]

    last_position = len(column_rows) - 1
    for position, row in enumerate(column_rows):
        col_name = row['col_name']
        data_type = _standardize_data_type(row['data_type'])
        # Comma after every column definition except the last one.
        comma = "," if position < last_position else ""
        lines.append(f'    {col_name} {data_type}{comma}')

    lines.extend([
        ')',
        'USING DELTA',
        '"""',
        'spark.sql(create_table_sql)',
        f'print(f"‚úÖ {{SCHEMA_NAME}}.{table_name} table created!")',
        "",
    ])
    return lines


if all_tables:
    # Header of the generated script, including the schema bootstrap code.
    output_content = [
        "# Generated CREATE TABLE Statements",
        f"# Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"# Total tables: {len(all_tables)}",
        f"# Target schema: {TARGET_SCHEMA_NAME}",
        "",
        "# Configuration",
        f'SCHEMA_NAME = "{TARGET_SCHEMA_NAME}"',
        'spark.sql(f"CREATE DATABASE IF NOT EXISTS {SCHEMA_NAME}")',
        'print(f"‚úÖ {SCHEMA_NAME} schema ready!")',
        "",
    ]

    successful_tables = 0
    failed_tables = []

    for i, table_name in enumerate(all_tables, 1):
        try:
            print(f"üîÑ Processing table {i}/{len(all_tables)}: {table_name}")

            # Get table schema
            describe_df = spark.sql(f"DESCRIBE {table_name}")
            valid_columns = _leading_column_rows(describe_df.collect())

            if valid_columns:
                # Extend in one step: a failure while building the block must
                # not leave a half-written statement in the generated script.
                output_content.extend(
                    _create_table_lines(i, table_name, valid_columns)
                )
                successful_tables += 1
            else:
                print(f"‚ö†Ô∏è  No valid columns found for table: {table_name}")
                failed_tables.append(table_name)

        except Exception as e:
            # Best effort: record the failure and continue with the next table.
            print(f"‚ùå Error processing table {table_name}: {str(e)}")
            failed_tables.append(table_name)

    print(f"\nüìä Processing Summary:")
    print(f"   ‚Ä¢ Total tables: {len(all_tables)}")
    print(f"   ‚Ä¢ Successful: {successful_tables}")
    print(f"   ‚Ä¢ Failed: {len(failed_tables)}")

    if failed_tables:
        print(f"   ‚Ä¢ Failed tables: {', '.join(failed_tables)}")

else:
    output_content = ["# No tables found in lakehouse"]
    print("‚ö†Ô∏è  No tables found to process")

In [None]:
# Step 5
# Print the complete generated script so it can be copied by hand
print("üîç FULL GENERATED CONTENT:")
print(80 * "=")
print("üìã Copy the content below and paste into your GeneratedCreateTablesStmts.ipynb:")
print(80 * "=")

try:
    # Nothing to show unless Step 4 has run and produced at least one line.
    if not ('output_content' in locals() and output_content):
        print("‚ùå No content generated yet - run previous cells first")

    else:
        # One string per generated line -> a single printable script.
        full_content = "\n".join(output_content)

        # Short statistics about what is about to be printed.
        print("üìä Content Statistics:")
        print(f"   ‚Ä¢ Total lines: {len(output_content)}")
        print(f"   ‚Ä¢ Total characters: {len(full_content)}")
        print(f"   ‚Ä¢ First table: {all_tables[0] if all_tables else 'None'}")
        print(f"   ‚Ä¢ Last table: {all_tables[-1] if all_tables else 'None'}")
        print(f"   ‚Ä¢ Target schema: {TARGET_SCHEMA_NAME}")
        print()

        # The generated script itself, framed by separator lines.
        print("üìÑ GENERATED CREATE TABLE STATEMENTS:")
        print(80 * "-")
        print(full_content)
        print(80 * "-")

        # Instructions for using the printed script.
        print("\n‚úÖ Content displayed successfully!")
        print("üéØ Next Steps:")
        print("   1. Copy the above content")
        print("   2. Paste into GeneratedCreateTablesStmts.ipynb")
        print(f"   3. Execute to create all {len(all_tables)} tables in {TARGET_SCHEMA_NAME} schema")

except Exception as e:
    print(f"‚ùå Error displaying content: {e}")

print("\n" + 80 * "=")

In [None]:
# Step 6
# Save the output to a file in the same lakehouse, using the folder and file name defined in Step 2 (not implemented yet).
