# TCGA Project Setup

This notebook initializes the TCGA project by:
- Loading and validating configuration
- Creating Unity Catalog resources (schema and volume; catalog creation is optional and commented out by default)
- Setting up Databricks widgets for runtime configuration
- Exporting variables for use in other notebooks

In [None]:
# Import configuration manager
import sys
import os

# Put the project root at the front of sys.path so the local `config`
# package can be imported below.
# NOTE(review): the current working directory is taken to be the project
# root; confirm this holds for however the notebook is launched.
project_root = os.path.abspath(os.curdir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from config.config_manager import load_config

# The target environment (dev, staging, production) comes from the
# TCGA_ENVIRONMENT variable and falls back to 'production' when it is unset.
environment = os.environ.get('TCGA_ENVIRONMENT', 'production')
print(f"Loading configuration for environment: {environment}")

# Load the configuration for the selected environment and echo it.
config = load_config(environment=environment)
print(f"Configuration loaded successfully: {config}")

In [None]:
# Hand the notebook's `dbutils` to the config object so it can set up the
# Databricks widgets used for runtime configuration.
config.get_databricks_widgets(dbutils)

# Lakehouse settings, exported as plain notebook-level variables.
catalog = config.lakehouse.catalog
schema = config.lakehouse.schema
volume = config.lakehouse.volume
volume_path = config.lakehouse.volume_path
database_name = config.lakehouse.database_name

# API endpoint settings, exported the same way.
cases_endpt = config.api.cases_endpt
files_endpt = config.api.files_endpt
data_endpt = config.api.data_endpt

# Print the configuration summary: one argument per output line, joined with
# newlines, so the text matches a sequence of individual print() calls.
print(
    "=" * 60,
    "TCGA Project Configuration",
    "=" * 60,
    f"Catalog: {catalog}",
    f"Schema: {schema}",
    f"Volume: {volume}",
    f"Volume Path: {volume_path}",
    f"Database: {database_name}",
    "",
    "API Endpoints:",
    f"  Cases: {cases_endpt}",
    f"  Files: {files_endpt}",
    f"  Data: {data_endpt}",
    "=" * 60,
    sep="\n",
)

## Create Unity Catalog Resources

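Create the schema and volume if they don't already exist. Catalog creation is commented out by default because it requires appropriate permissions; uncomment that line if the catalog does not exist yet.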

In [None]:
# Create the Unity Catalog schema and volume named in the configuration.
# Both statements use IF NOT EXISTS, so re-running this cell is harmless.
#
# Identifiers are backtick-quoted (embedded backticks doubled) so names that
# contain hyphens or other special characters still parse as SQL identifiers.
# NOTE(review): assumes the configured names are raw (not already quoted).
schema_fqn = ".".join(
    "`" + part.replace("`", "``") + "`" for part in (catalog, schema)
)
volume_fqn = schema_fqn + ".`" + volume.replace("`", "``") + "`"

# Note: Uncomment the line below to create catalog (requires appropriate permissions)
# spark.sql("CREATE CATALOG IF NOT EXISTS `" + catalog.replace("`", "``") + "`")

# Create schema if not exists
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_fqn}")
print(f"✓ Schema {catalog}.{schema} created or already exists")

# Create volume if not exists
spark.sql(f"CREATE VOLUME IF NOT EXISTS {volume_fqn}")
print(f"✓ Volume {catalog}.{schema}.{volume} created or already exists")

print("\nSetup complete! Unity Catalog resources are ready.")

## Verify Setup

Verify that the resources were created successfully.

In [None]:
# List the schemas in the catalog and the volumes in the schema so the
# resources created above can be checked by eye.
#
# Identifiers are backtick-quoted (embedded backticks doubled) so names that
# contain hyphens or other special characters still parse as SQL identifiers.
# NOTE(review): assumes the configured names are raw (not already quoted).
catalog_ident = "`" + catalog.replace("`", "``") + "`"
schema_ident = "`" + schema.replace("`", "``") + "`"

# Verify schema exists
schemas_df = spark.sql(f"SHOW SCHEMAS IN {catalog_ident}")
print(f"Schemas in {catalog}:")
display(schemas_df)

# Verify volume exists
volumes_df = spark.sql(f"SHOW VOLUMES IN {catalog_ident}.{schema_ident}")
print(f"\nVolumes in {catalog}.{schema}:")
display(volumes_df)

## Next Steps

1. Run `01-data-download` to download TCGA data from the GDC API
2. Run the DLT pipeline to create managed tables
3. Run `02-tcga-expression-clustering` for analysis