In [0]:
"""
This script automates the onboarding and deletion of institutions within a Databricks workspace,
including setting up storage, Unity Catalog schemas and volumes, and copying/deleting ML models.
"""

from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog

import mlflow
from mlflow import MlflowClient

from google.cloud import storage
import random
import toml
import os
import shutil

# Initialize clients for Databricks, Google Cloud Storage, and MLflow.
# All three are constructed without arguments, so they presumably pick up
# credentials/configuration from the notebook environment — TODO confirm.
w = WorkspaceClient()
storage_client = storage.Client()
mlflow_client = MlflowClient()

# Configuration variables
catalog_name = "dev_sst_02"  # Name of the Unity Catalog; also used as the prefix of the GCS bucket name
medallion_levels = ["silver", "gold", "bronze"]  # Medallion levels; one schema per level is created on onboarding and removed on deletion


In [0]:
# Helper functions
def onboard_institution(
    institution_id,
    service_account="serviceAccount:pedro-pdp-inference-pipeline@dev-sst-02.iam.gserviceaccount.com",
    toml_template_path='/Workspace/Users/pedro.melendez@datakind.org/pedro-clone-student-success-tool-develop/tests/pedro_test_folder/template_toml_file.toml',
):
    """
    Onboards a new institution by creating the resources it needs.

    Steps, in order:
      1. Create the GCS bucket ``{catalog_name}_{institution_id}_sst_application``
         and grant ``roles/storage.objectAdmin`` on it to ``service_account``.
      2. Create one Unity Catalog schema per entry in ``medallion_levels``.
      3. Create the managed volume ``pdp_pipeline_internal`` in the bronze schema.
      4. Register version 1 of the template model under the institution's gold schema.
      5. Write ``{institution_id}.toml`` (built from the template) to the volume.

    The steps are not wrapped in error handling, so a failure part-way through
    leaves the resources created by earlier steps in place.

    Args:
        institution_id: The ID of the institution to onboard. Used verbatim in
            bucket, schema, model and file names.
        service_account: IAM member string (``serviceAccount:<email>``) that is
            granted object-admin access to the new bucket. Defaults to the
            account that was previously hard-coded.
        toml_template_path: Path of the TOML template the institution
            configuration is copied from. Defaults to the path that was
            previously hard-coded.

    Raises:
        ValueError: If ``institution_id`` is None or blank.
    """
    # Fail before touching any cloud resource: a blank ID would otherwise create
    # resources with malformed names such as "<catalog>__sst_application".
    if institution_id is None or not str(institution_id).strip():
        raise ValueError("institution_id must be a non-empty value")

    bronze_schema = f"{institution_id}_bronze"
    volume_name = "pdp_pipeline_internal"

    # Create a GCS bucket for SST application data.
    sst_app_bucket = storage_client.create_bucket(f"{catalog_name}_{institution_id}_sst_application", location="us-east4")

    # TODO CREATE A SERVICE ACCOUNT PER WORKSPACE

    # Grant object admin access to the specified service account.
    policy = sst_app_bucket.get_iam_policy(requested_policy_version=3)
    policy.bindings.append({"role": "roles/storage.objectAdmin",
                            "members": {service_account}})
    sst_app_bucket.set_iam_policy(policy)

    # Create Unity Catalog schemas for each medallion level.
    for medallion in medallion_levels:
        w.schemas.create(name=f'{institution_id}_{medallion}', catalog_name=catalog_name)

    # Create a managed volume in the bronze schema for internal pipeline data.
    w.volumes.create(catalog_name=catalog_name,
                     schema_name=bronze_schema,
                     name=volume_name,
                     volume_type=catalog.VolumeType.MANAGED)

    # Copy an existing model (version 1 from the template institution's bronze schema)
    # into the new institution's gold schema. This is a placeholder.
    # In a real scenario, models would likely be trained specifically for the institution.
    # NOTE(review): assumes the MLflow registry URI already points at Unity Catalog — confirm.
    model_name = "latest_enrollment_model"
    existing_model_uri = f"models:/{catalog_name}.institution_x_bronze.{model_name}/1"
    new_institution_model_uri = f"{catalog_name}.{institution_id}_gold.{model_name}"

    mlflow.register_model(existing_model_uri, new_institution_model_uri)

    # Creating an institution configuration file copying from a template. This is a placeholder.
    # In a real scenario, toml file is specifically built for the institution.
    with open(toml_template_path, 'r') as f:
        toml_template = toml.load(f)

    # The institution ID doubles as its display name in the generated configuration.
    toml_template['institution_id'] = institution_id
    toml_template['institution_name'] = institution_id

    # Create directory on the volume, then write the institution's configuration into it.
    config_dir = f"/Volumes/{catalog_name}/{bronze_schema}/{volume_name}/configuration_files/"
    os.makedirs(config_dir, exist_ok=True)

    with open(os.path.join(config_dir, f"{institution_id}.toml"), 'w') as f:
        toml.dump(toml_template, f)

    


def delete_institution(institution_id):
    """
    Deletes all resources associated with a given institution.

    Removes, in order: the institution's GCS bucket, the managed volume
    ``pdp_pipeline_internal`` in its bronze schema, the registered model
    ``latest_enrollment_model`` in its gold schema, and finally the tables and
    schema of every medallion level.

    Deletion is best-effort: each step catches its own errors and prints them,
    so one missing or failing resource does not stop the remaining clean-up.

    Args:
        institution_id: The ID of the institution to delete.

    Raises:
        ValueError: If ``institution_id`` is None or blank.
    """
    # Refuse blank IDs up front: every call below is destructive and would
    # otherwise target malformed names such as "<catalog>._silver".
    if institution_id is None or not str(institution_id).strip():
        raise ValueError("institution_id must be a non-empty value")

    try:
        # Delete the GCS bucket.  Force=True handles non-empty buckets.
        # NOTE(review): google-cloud-storage documents an object-count limit for
        # forced deletes; very full buckets may need to be emptied first — confirm.
        bucket = storage_client.get_bucket(f"{catalog_name}_{institution_id}_sst_application")
        bucket.delete(force=True)
    except Exception as e:
        print(f"Error deleting bucket: {e}")

    try:
        # Delete the managed volume.
        w.volumes.delete(name=f'{catalog_name}.{institution_id}_bronze.pdp_pipeline_internal')
    except Exception as e:
        print(f"Error deleting volume: {e}")

    try:
        # Delete the MLflow model.
        model_name = "latest_enrollment_model"
        new_institution_model_uri = f"{catalog_name}.{institution_id}_gold.{model_name}"
        mlflow_client.delete_registered_model(name=new_institution_model_uri)
    except Exception as e:
        print(f"Error deleting model: {e}")

    # Delete tables and schemas for each medallion level.
    for medallion in medallion_levels:
        schema_full_name = f'{catalog_name}.{institution_id}_{medallion}'
        try:
            # Collect the names first so the listing is complete before anything is deleted.
            table_names = [
                table.name
                for table in w.tables.list(catalog_name=catalog_name, schema_name=f"{institution_id}_{medallion}")
            ]
            for table_name in table_names:
                w.tables.delete(full_name=f'{schema_full_name}.{table_name}')
            # The schema is only dropped once all of its tables were removed successfully.
            w.schemas.delete(full_name=schema_full_name)
        except Exception as e:
            print(f"Error deleting schema or tables for {medallion}: {e}")

In [0]:
# Sample institution ID used by the cells below.
# Should be parameterized in real use cases.
institution_id = "standard_pdp_institution_15"
print(institution_id)

In [0]:
# Onboard the sample institution: creates its bucket, schemas, volume,
# model copy and configuration file.
onboard_institution(institution_id)

In [0]:
# Running an inference job with the new institution

job_name = "PDP_inference_pipeline_on_personal_cluster"  # Alternative job: "PDP_inference_pipeline"

# Look the job up by name and fail with a clear message when it does not exist,
# instead of surfacing a bare StopIteration from next().
matching_job = next(iter(w.jobs.list(name=job_name)), None)
if matching_job is None:
    raise ValueError(f"No Databricks job named {job_name!r} was found")
job_id = matching_job.job_id

# Random suffix (1-1000) to tell runs for the same institution apart; collisions are possible.
sst_job_id = f"{institution_id}_inference_job_id_{random.randint(1, 1000)}"

run_job = w.jobs.run_now(
    job_id,
    job_parameters={
        'synthetic_needed': "True",
        'institution_id': institution_id,
        'sst_job_id': sst_job_id,
    },
)
print(run_job.response)


In [0]:
# Fetch the result of the triggered run and print it.
run_result = run_job.result()
print(run_result)

In [0]:
# Clean up: delete every resource created for the sample institution.
# This is destructive — skip this cell to keep the onboarded resources.
delete_institution(institution_id)