# Update Workflows and Clusters ♻️

## Requirements
### Databricks
* A Databricks Workspace & Workspace Access Token
* At least one runnable cluster within the workspace



#### Please note: "job" and "workflow" are used interchangeably throughout



In [None]:
import json

import requests
from typing import Optional, Callable

## Steps 📊


### 1. Fetch workflow/cluster Configurations 📬

We fetch all the workflows/clusters present in your workspace. Each fetched workflow config also contains the config of every task in the workflow and their respective job cluster configs.

### 2. Filter and Parse Information 🧩

We perform the needed parsing on the obtained info so as to add/remove necessary fields.

### 3. Update existing workflow / config 👶🏽

Using the parsed info we update existing workflows/clusters in the current workspace.


### Set up workspace urls and access tokens


In [None]:
# Remove every existing widget first so the two text widgets below are
# created fresh.
dbutils.widgets.removeAll()

# Base URL of the Databricks workspace. Surrounding whitespace and trailing
# slashes are dropped because every endpoint below is appended as "/api/...";
# a pasted "https://host/" would otherwise produce "https://host//api/...".
dbutils.widgets.text("workspace_url", "")
workspace_url: str = getArgument("workspace_url").strip().rstrip("/")

# Access token sent as the Bearer credential on every request in this
# notebook. Stripped so a stray space or newline from copy/paste does not end
# up inside the Authorization header.
dbutils.widgets.text("workspace_token", "")
workspace_token: str = getArgument("workspace_token").strip()


# Query-string values used when building the jobs list URL.
query_params = {
    "LIST_JOBS_LIMIT": 100,  # max limit
    "EXPAND_TASKS": "true",  # provides the complete config info for each job
}

In [None]:
def paginate(
    can_paginate: bool,
    next_page_token: Optional[str],
    url: str,
    workspace_token: str,
    function_to_call: Callable,
) -> None:
    """
    Requests the next page of a listing, if there is one.
    input:
        can_paginate [bool]: Boolean info about whether there is additional info.
        next_page_token [str]: Token needed in url query param to paginate to next page; may be None.
        url [str]: Url used to list the needed info.
        workspace_token [str]: Passed through unchanged to `function_to_call`.
        function_to_call [Callable]: Called as `function_to_call(next_url, workspace_token)`
            with the paginated url to paginate further.
    output:
        None
    """

    # Nothing to do unless the caller both reports more data and has a token.
    if not (next_page_token and can_paginate):
        return

    # Rebuild the query string instead of blindly appending "&page_token=":
    #   * a url without any "?" (e.g. ".../clusters/list") needs "?" as the
    #     separator, otherwise the token becomes part of the path;
    #   * a page_token from a previous page is replaced, wherever it sits,
    #     so tokens never accumulate across pages.
    base, _, query = url.partition("?")
    params = [
        param
        for param in query.split("&")
        if param and not param.startswith("page_token=")
    ]
    params.append(f"page_token={next_page_token}")

    function_to_call(f"{base}?{'&'.join(params)}", workspace_token)

## List Clusters 
#### Fetches all clusters in the current workspace and their respective configs
<a href="https://docs.databricks.com/api/workspace/clusters/list">API Docs</a>


In [None]:
def getAllClusters(list_clusters_url: str, workspace_token: str) -> None:
    """
    Fetches one page of clusters, adds every entry to the module-level
    `clusters` list, then hands off to `paginate` for any following page
    (which calls this function again with the next-page url).
    input:
        list_clusters_url [str]: Databricks API used to fetch all the clusters.
        workspace_token [str]: Databricks workspace access token.
    output:
        None
    raises:
        AssertionError: if the API does not answer with HTTP 200.
    """

    response = requests.get(
        list_clusters_url,
        headers={"Authorization": f"Bearer {workspace_token}"},
    )
    # Raised explicitly rather than via `assert` so the check is not stripped
    # under `python -O` and the failure shows what the API answered. The
    # exception type is kept as AssertionError for backward compatibility.
    if response.status_code != 200:
        raise AssertionError(
            f"Listing clusters failed: HTTP {response.status_code} - {response.text}"
        )

    response_data = response.json()

    # A response without a "clusters" key contributes nothing.
    clusters.extend(response_data.get("clusters", []))

    paginate(
        response_data.get("has_more", False),
        response_data.get("next_page_token"),
        list_clusters_url,
        workspace_token,
        getAllClusters,
    )


# Accumulator that getAllClusters appends every fetched cluster config to.
clusters = []
List_clusters_url = f"{workspace_url}/api/2.0/clusters/list"
getAllClusters(List_clusters_url, workspace_token)

## Filter and Parse info

In [None]:
def filterClusters(cluster_info: dict) -> bool:
    """Decides whether a cluster is kept for the update step.
    input:
        cluster_info [dict]: Dict containing all the config info about the cluster.
    output:
        bool : True to keep the cluster; the default keeps every cluster."""

    # Placeholder: swap in custom selection logic here.
    return True


def parseClusters(cluster_info: dict) -> dict:
    """Hook for adjusting a cluster config before it is used as a request payload.
    input:
        cluster_info [dict]: Dict containing all the config info about the cluster.
    output:
        dict : the config to send; by default the very same dict, untouched."""

    # Placeholder: add/remove fields here if custom parsing is needed.
    return cluster_info


# Pass 1: keep only the clusters that filterClusters accepts.
filtered_clusters = [entry for entry in clusters if filterClusters(entry)]

# Pass 2: run every kept cluster through parseClusters, preserving order.
filtered_clusters = [parseClusters(entry) for entry in filtered_clusters]

# From here on `clusters` holds only the filtered, parsed configs.
clusters = filtered_clusters

## Update existing cluster
#### Use the parsed info as payload to update clusters in the current workspace
<a href="https://docs.databricks.com/api/workspace/clusters/edit">API Docs</a>



In [None]:
# Send every (filtered, parsed) cluster config to the clusters/edit endpoint.
# NOTE(review): the clusters/edit API docs linked above state that editing a
# RUNNING cluster restarts it - confirm before running against busy clusters.
for cluster_info in clusters:
    response = requests.post(
        f"{workspace_url}/api/2.0/clusters/edit",
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {workspace_token}",
        },
        data=json.dumps(cluster_info),
    )
    # Raised explicitly rather than via `assert` so the check is not stripped
    # under `python -O` and the failure names the cluster and the API answer.
    # The exception type is kept as AssertionError for backward compatibility.
    if response.status_code not in {200, 201}:
        raise AssertionError(
            f"Updating cluster {cluster_info.get('cluster_id')} failed: "
            f"HTTP {response.status_code} - {response.text}"
        )

## List Workflows 
#### Fetches all workflows in the current workspace and their respective configs
<a href="https://docs.databricks.com/api/workspace/jobs/list">API Docs</a>


In [None]:
def getAllJobs(list_jobs_url: str, workspace_token: str) -> None:
    """
    Fetches one page of jobs, stores each job's settings in the module-level
    `jobs` dict keyed by job id, then hands off to `paginate` for any
    following page (which calls this function again with the next-page url).
    input:
        list_jobs_url [str]: Databricks API used to fetch all the jobs.
        workspace_token [str]: Databricks workspace access token.
    output:
        None
    raises:
        AssertionError: if the API does not answer with HTTP 200.
    """

    response = requests.get(
        list_jobs_url,
        headers={"Authorization": f"Bearer {workspace_token}"},
    )
    # Raised explicitly rather than via `assert` so the check is not stripped
    # under `python -O` and the failure shows what the API answered. The
    # exception type is kept as AssertionError for backward compatibility.
    if response.status_code != 200:
        raise AssertionError(
            f"Listing jobs failed: HTTP {response.status_code} - {response.text}"
        )

    response_data = response.json()

    # A response without a "jobs" key contributes nothing.
    for job in response_data.get("jobs", []):
        jobs[job.get("job_id")] = job.get("settings")

    paginate(
        response_data.get("has_more", False),
        response_data.get("next_page_token"),
        list_jobs_url,
        workspace_token,
        getAllJobs,
    )


# Mapping of job id -> job settings that getAllJobs fills in.
jobs = {}
List_jobs_url = (
    f"{workspace_url}/api/2.1/jobs/list?"
    f"limit={query_params.get('LIST_JOBS_LIMIT')}"
    f"&expand_tasks={query_params.get('EXPAND_TASKS')}"
)
getAllJobs(List_jobs_url, workspace_token)

## Filter and Parse info

In [None]:
def filterWorkflows(workflow_info: dict) -> bool:
    """Decides whether a workflow is kept for the update step.
    input:
        workflow_info [dict]: Dict containing all the config info about the workflow.
    output:
        bool : True to keep the workflow; the default keeps every workflow."""

    # Placeholder: swap in custom selection logic here.
    return True


def parseWorkflows(workflow_info: dict) -> dict:
    """Hook for adjusting a workflow config before it is used as a request payload.
    input:
        workflow_info [dict]: Dict containing all the config info about the workflow.
    output:
        dict : the config to send; by default the very same dict, untouched."""

    # Placeholder: add/remove fields here if custom parsing is needed.
    return workflow_info


# Pass 1: keep only the workflows that filterWorkflows accepts.
filtered_jobs = {
    job_key: settings
    for job_key, settings in jobs.items()
    if filterWorkflows(settings)
}

# Pass 2: run every kept workflow through parseWorkflows, same keys and order.
filtered_jobs = {
    job_key: parseWorkflows(settings)
    for job_key, settings in filtered_jobs.items()
}

# From here on `jobs` holds only the filtered, parsed configs.
jobs = filtered_jobs

## Update Workflow
#### Use the parsed info to update workflow in existing workspace
<a href="https://docs.databricks.com/api/workspace/jobs/reset">API Docs</a>


In [None]:
# Overwrite the settings of every (filtered, parsed) job via jobs/reset.
# `jobs.items()` - the original `jobs.item()` raised AttributeError before a
# single request was sent.
for job_id, workflow_info in jobs.items():
    response = requests.post(
        url=f"{workspace_url}/api/2.1/jobs/reset",
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {workspace_token}",
        },
        data=json.dumps({"job_id": job_id, "new_settings": workflow_info}),
    )
    # Raised explicitly rather than via `assert` so the check is not stripped
    # under `python -O` and the failure names the job and the API answer.
    # The exception type is kept as AssertionError for backward compatibility.
    if response.status_code not in {200, 201}:
        raise AssertionError(
            f"Updating job {job_id} failed: "
            f"HTTP {response.status_code} - {response.text}"
        )