# Atlas Transformer for Microsoft Purview

This notebook transforms extracted metadata into Apache Atlas format for Purview ingestion.

**Parameters:**
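- `metadata_path`: Path to the extracted metadata JSON file (required; defaults to an empty string)
- `source_type`: Type of data source: `sql_server` (default) or `file_system`
- `collection_name`: Purview collection name (defaults to `default`)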

In [None]:
# Import required libraries
from notebookutils import mssparkutils
from datetime import datetime
import json
import logging

# Configure logging so that INFO-level (and higher) messages are emitted.
logging.basicConfig(level=logging.INFO)
# Logger for this notebook; the code below reports progress through it.
logger = logging.getLogger(__name__)

In [None]:
# Get parameters
# Each argument falls back to the second value when the caller supplies none.
# NOTE(review): confirm `mssparkutils.notebook.getArgument` is available in the
# target notebook runtime — TODO verify against the runtime's utilities reference.
metadata_path = mssparkutils.notebook.getArgument("metadata_path", "")
source_type = mssparkutils.notebook.getArgument("source_type", "sql_server")
collection_name = mssparkutils.notebook.getArgument("collection_name", "default")

# Log the resolved values so the run records what it was invoked with.
logger.info(f"Metadata Path: {metadata_path}")
logger.info(f"Source Type: {source_type}")
logger.info(f"Collection: {collection_name}")

In [None]:
# Load metadata
# Fail fast with a clear message: `metadata_path` falls back to "" when no
# argument is supplied, and open("") produces a far less helpful error.
if not metadata_path:
    raise ValueError("metadata_path parameter is required but was not provided")

logger.info(f"Loading metadata from {metadata_path}...")

# Read as UTF-8 explicitly instead of relying on the platform's locale default.
with open(metadata_path, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

logger.info("Metadata loaded successfully")

In [None]:
# Make the connector SDK importable from the lakehouse libs folder.
import sys
sys.path.append("/lakehouse/default/Files/libs")

from purview_connector_sdk import (
    DatabaseConnector,
    FileSystemConnector,
    PurviewClient,
)

# This notebook only transforms metadata, so the client is built with a
# placeholder ("mock") account name.
purview_client = PurviewClient(account_name="mock", use_managed_identity=True)

In [None]:
# Build the connector that matches the requested source type.
# Reject unknown types up front so the branches below only see supported ones.
if source_type not in ("sql_server", "file_system"):
    raise ValueError(f"Unsupported source type: {source_type}")

if source_type == "sql_server":
    # The connection string is a placeholder ("mock") value.
    connector = DatabaseConnector(
        purview_client=purview_client,
        source_type=source_type,
        connection_string="mock",
        collection_name=collection_name,
    )
else:
    # Only "file_system" can reach this branch; the root path is a placeholder.
    connector = FileSystemConnector(
        purview_client=purview_client,
        root_path="mock",
        collection_name=collection_name,
    )

In [None]:
# Transform to Atlas format
logger.info("Transforming metadata to Atlas format...")

# Delegate the conversion to the connector selected for this source type.
# NOTE(review): the rest of the notebook calls len() on and slices the result,
# so it is assumed to be a list-like sequence — confirm against the SDK.
atlas_entities = connector.transform_to_atlas(metadata)

logger.info(f"Transformed {len(atlas_entities)} entities")

In [None]:
# Display sample entities
# Preview at most the first three entities; slicing copes with shorter lists.
sample_entities = atlas_entities[:3]

print("=" * 60)
print("Sample Atlas Entities")
print("=" * 60)

for i, entity in enumerate(sample_entities, start=1):
    print(f"\nEntity {i}:")
    print(f"  Type: {entity['typeName']}")
    print(f"  Name: {entity['attributes'].get('name')}")
    print(f"  QualifiedName: {entity['attributes'].get('qualifiedName')}")

# Count what was actually left out. Subtracting a fixed 3 reported a negative
# number for fewer than three entities and "0 more" for exactly three.
remaining = len(atlas_entities) - len(sample_entities)
if remaining > 0:
    print(f"\n... and {remaining} more entities")
print("=" * 60)

In [None]:
# Check the transformed entities before anything is written out.
logger.info("Validating Atlas entities...")

try:
    connector.validate_entities(atlas_entities)
except Exception as e:
    # Record the failure, then re-raise so the notebook run stops here.
    logger.error(f"✗ Validation failed: {e}")
    raise
else:
    logger.info("✓ All entities are valid")

In [None]:
# Assemble the payload: the transformed entities plus details about this run.
atlas_json = {"entities": atlas_entities, "referredEntities": {}}
atlas_json["metadata"] = {
    "source_type": source_type,
    "collection": collection_name,
    "timestamp": datetime.now().isoformat(),
    "entity_count": len(atlas_entities),
}

logger.info(f"Prepared Atlas JSON with {len(atlas_entities)} entities")

In [None]:
# Save Atlas JSON to processed folder
import os

# The timestamp keeps file names unique per run; later cells reuse it.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"/lakehouse/default/Files/processed/atlas_{source_type}_{timestamp}.json"

logger.info(f"Saving Atlas JSON to: {output_path}")

# open(..., 'w') does not create missing parent folders, so create the
# processed folder first if it is absent.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write UTF-8 explicitly rather than the platform's locale default.
# default=str turns any value json cannot encode natively into its string form.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(atlas_json, f, indent=2, default=str)

logger.info("Atlas JSON saved successfully")

In [None]:
# Save transformation log
import os

# Reuses the timestamp of the Atlas JSON file so both artifacts of a run pair up.
log_path = f"/lakehouse/default/Files/logs/transformation_{timestamp}.log"

# Summary of this run, written as a small JSON document.
log_entry = {
    "timestamp": timestamp,
    "source_type": source_type,
    "metadata_path": metadata_path,
    "atlas_json_path": output_path,
    "entity_count": len(atlas_entities),
    "status": "success"
}

# open(..., 'w') does not create missing parent folders, so create the
# logs folder first if it is absent.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Write UTF-8 explicitly rather than the platform's locale default.
with open(log_path, 'w', encoding='utf-8') as f:
    json.dump(log_entry, f, indent=2)

logger.info(f"Log saved to: {log_path}")

In [None]:
# Summarise the run: where the artifacts were written and how many entities
# were produced.
output = {
    "status": "success",
    "atlas_json_path": output_path,
    "log_path": log_path,
    "entity_count": len(atlas_entities),
    "timestamp": timestamp,
}

logger.info("Atlas transformation complete")
logger.info(f"Output: {json.dumps(output, indent=2)}")

# Pass the summary to notebook.exit as a compact JSON string.
exit_value = json.dumps(output)
mssparkutils.notebook.exit(exit_value)