In [0]:
%sh 
# Install the requests-cache Python package.
# NOTE(review): requests_cache is not imported by the Python cell below — confirm it is still needed.
pip install requests-cache
# Download the TruffleHog install script and pipe it to sh; "-b /tmp" is presumably the
# install directory, since the scanner is later invoked as /tmp/trufflehog.
curl -sSfL https://raw.githubusercontent.com/trufflesecurity/trufflehog/main/scripts/install.sh | sh -s -- -b /tmp
# Write a custom-detector config to /tmp/tuf.conf (passed to TruffleHog via --config).
# It declares three detectors, each matching a 4-letter prefix (dkea / dapi / dose)
# followed by 32 characters, case-insensitively.
# NOTE(review): the character class is [a-h0-9], which is wider than hexadecimal [a-f0-9] — confirm this is intended.
echo "detectors:
    - name: DkeaToken
      keywords:
        - dkea
      regex:
        id: (?i)\b(dkea[a-h0-9]{32})
    - name: DapiToken
      keywords:
        - dapi
      regex:
        id: (?i)\b(dapi[a-h0-9]{32})
    - name: DoseToken
      keywords:
        - dose
      regex:
        id: (?i)\b(dose[a-h0-9]{32}) " > /tmp/tuf.conf


In [0]:
import os, requests, time, json, base64, subprocess, hashlib, re
from datetime import timedelta, datetime
from urllib.parse import quote

# Dummy value shaped like the DkeaToken pattern ("dkea" + 32 characters).
# It is not referenced anywhere else in this cell.
# NOTE(review): presumably a canary so the scanner has a known string to find — confirm.
faketoken = "dkea12345678901234567890123456789012"

# Extract token and API URL from Databricks notebook context.
# Both fall back to None (via getOrElse) when the context does not provide them;
# every REST helper below reads these two module-level names.
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None)
base_url = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None)


def convert_time_to_databricks_format(env_time):
    """Coerce a timestamp into the integer form used by the search filter.

    Args:
        env_time: Timestamp, assumed to already be in epoch milliseconds.
            Anything accepted by ``int()`` works (int, float, numeric string).

    Returns:
        The timestamp as an ``int``. No unit conversion is performed.
    """
    millis = int(env_time)
    return millis

def get_yesterday_utc_midnight():
    """Return yesterday's 00:00 UTC as epoch milliseconds.

    The calculation uses a timezone-aware UTC datetime. A naive datetime
    (as produced by ``datetime.utcnow()``) would be interpreted by
    ``.timestamp()`` as *local* time, shifting the result by the host's UTC
    offset on any machine that is not running in UTC.

    Returns:
        int: Milliseconds since the Unix epoch for the start of the previous
        UTC day.
    """
    # Local import keeps this function self-contained; the file-level import
    # only brings in ``datetime`` and ``timedelta``.
    from datetime import timezone

    today_utc_midnight = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    yesterday_utc_midnight = today_utc_midnight - timedelta(days=1)
    return int(yesterday_utc_midnight.timestamp() * 1000)

def make_request(url, headers, data, timeout=60):
    """Issue a GET request with a JSON body and return the parsed response.

    Args:
        url: Full endpoint URL.
        headers: HTTP headers to send (including authorization).
        data: Object serialized as the JSON request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection; a timeout surfaces as a ``RequestException`` and is
            handled like any other request failure.

    Returns:
        The decoded JSON payload when the server answers 200, otherwise
        ``None`` (non-200 status or any ``requests`` exception). The failure
        reason is printed.
    """
    try:
        response = requests.get(url, headers=headers, json=data, timeout=timeout)
        if response.status_code != 200:
            print(f"URL: {url}, gave response code: {response.status_code}")
            # Explicit, so every failure path visibly returns the same value.
            return None
        json_response = response.json()
        print(json_response)
        return json_response
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None


def generate_sha(secret):
    """Return the SHA-256 digest of ``secret`` as a lowercase hex string.

    Args:
        secret: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        str: 64-character hexadecimal digest.
    """
    return hashlib.sha256(secret.encode("utf-8")).hexdigest()


def get_notebook_permissions(object_id):
    """Fetch the permissions document for a notebook.

    Calls ``/api/2.0/permissions/notebooks/{object_id}`` with the module-level
    ``token`` and ``base_url``.

    Args:
        object_id: Identifier of the notebook, interpolated into the URL.

    Returns:
        The decoded JSON body on success, or ``None`` when the request fails
        or the server answers with a 4xx/5xx status. A 403 additionally
        prints a dedicated "Permission Denied" line before the generic error.
    """
    url = f"{base_url}/api/2.0/permissions/notebooks/{object_id}"
    auth_headers = {"Authorization": f"Bearer {token}"}

    try:
        reply = requests.get(url, headers=auth_headers)
        denied = reply.status_code == 403
        if denied:
            print(f"Permission Denied for notebook {object_id}: {url}")
        # Turns any 4xx/5xx answer (403 included) into an HTTPError.
        reply.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error getting notebook for {object_id}: {str(e)}")
        return None
    else:
        return reply.json()


def export_notebook_content(notebook_path):
    """Export a notebook through the workspace export endpoint.

    Args:
        notebook_path: Workspace path placed verbatim into the ``path`` query
            parameter; the caller in this file passes it already URL-quoted.

    Returns:
        The decoded JSON body on success, or ``None`` if the request fails or
        the server answers with a 4xx/5xx status (the error is printed).
    """
    auth_headers = {"Authorization": f"Bearer {token}"}
    export_url = f"{base_url}/api/2.0/workspace/export?path={notebook_path}"

    try:
        reply = requests.get(export_url, headers=auth_headers)
        # Turns any 4xx/5xx answer into an HTTPError handled below.
        reply.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error exporting notebook content for {notebook_path}: {str(e)}")
        return None
    else:
        return reply.json()


def decode_and_write_content(content, output_path):
    """Decode base64 notebook content and write it to ``output_path`` as text.

    Args:
        content: Base64-encoded payload whose bytes are UTF-8 text.
        output_path: Destination file; it is created or truncated.

    Raises:
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
        OSError: If the file cannot be opened or written.
    """
    decoded_content = base64.b64decode(content).decode("utf-8")
    # Write with the same encoding the payload was decoded with. The platform
    # default encoding may be unable to represent every character, which would
    # raise or corrupt the file on non-UTF-8 locales.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(decoded_content)


def scan_for_secrets(file_path):
    """Run TruffleHog's filesystem scan on ``file_path`` and return its stdout.

    The command is passed as an argument list rather than an interpolated
    shell string, so characters in ``file_path`` (spaces, ``;``, ``$`` ...)
    are never interpreted by a shell.

    NOTE: a function with this same name is defined again further down in
    this cell; being later, that definition is the one bound at call time.

    Args:
        file_path: Path of the file to scan.

    Returns:
        str | None: TruffleHog's standard output (JSON lines, because of
        ``-j``), or ``None`` when the process exits non-zero or cannot be
        started.
    """
    trufflehog_command = [
        "/tmp/trufflehog",
        "filesystem",
        file_path,
        "--exclude-detectors",
        "DatabricksToken",
        "--no-update",
        "--config",
        "/tmp/tuf.conf",
        "-j",
    ]
    try:
        result = subprocess.run(trufflehog_command, check=True, capture_output=True, text=True)
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"Error running Trufflehog: {e}")
        print(e.stderr)
        return None
    except OSError as e:
        # Without a shell, a missing or non-executable binary raises OSError
        # instead of a non-zero exit status; report it the same way.
        print(f"Error running Trufflehog: {e}")
        return None


# Check if notebook is deleted or not.
def check_notebook_status(notebook_path):
    """Query the workspace get-status endpoint for ``notebook_path``.

    Args:
        notebook_path: Workspace path placed verbatim into the ``path`` query
            parameter.

    Returns:
        int | str: The HTTP status code when the request succeeds, or when it
        fails with 404 or 403. For any other HTTP error, or a transport-level
        failure, a string starting with ``"Error:"`` is returned instead.
    """
    check_url = f"{base_url}/api/2.0/workspace/get-status?path={notebook_path}"
    headers = {"Authorization": f"Bearer {token}"}

    try:
        response = requests.get(check_url, headers=headers)
        # Raises HTTPError for 4xx/5xx answers.
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        code = http_err.response.status_code
        if code not in (404, 403):
            return f"Error: Unexpected status code - {code}"
        # 404 and 403 are passed back to the caller as plain integers.
        return code
    except requests.exceptions.RequestException as req_err:
        return f"Error: {req_err}"
    else:
        return response.status_code

# Export a notebook, scan the exported copy with TruffleHog, and print the hashed findings.
def check_secret_presence(notebook_path, object_id):
    """Export one notebook, scan the exported copy, and print the findings.

    The notebook is processed only when its status check returns 200. A 403
    is reported with a message; every other status is ignored silently.
    The exported content is written under the module-level ``output_path``
    as ``notebook_content_{object_id}.txt``.

    Args:
        notebook_path: Workspace path of the notebook, as expected by the
            status and export helpers.
        object_id: Notebook identifier, used in the local file name.

    Returns:
        None. Any exception raised along the way is caught and printed.
    """
    try:
        status = check_notebook_status(notebook_path)

        if status == 403:
            print(f"Getting a Not authorized error for {notebook_path} : {status} ")
            return
        if status != 200:
            return

        exported = export_notebook_content(notebook_path)
        if exported is None:
            return

        content = exported["content"]
        local_copy = output_path + f"/notebook_content_{object_id}.txt"
        decode_and_write_content(content, local_copy)
        print(f"Notebook content successfully exported to {local_copy}")

        # Scan the exported file; None means the scanner run failed.
        scan_stdout = scan_for_secrets(local_copy)
        if scan_stdout is None:
            return

        results = process_trufflehog_output(scan_stdout)
        print("RESULTS\n" + json.dumps(results, indent=4))

    except Exception as e:
        print(f"Error getting notebook content for {notebook_path}: {str(e)}")

def scan_for_secrets(file_path):
    """Run TruffleHog's filesystem scan on ``file_path`` and return its stdout.

    The command is passed as an argument list rather than an interpolated
    shell string, so characters in ``file_path`` (spaces, ``;``, ``$`` ...)
    are never interpreted by a shell.

    NOTE: a function with this same name is also defined earlier in this
    cell; this later definition is the one in effect at call time.

    Args:
        file_path: Path of the file to scan.

    Returns:
        str | None: TruffleHog's standard output (JSON lines, because of
        ``-j``), or ``None`` when the process exits non-zero or cannot be
        started.
    """
    trufflehog_command = [
        "/tmp/trufflehog",
        "filesystem",
        file_path,
        "--exclude-detectors",
        "DatabricksToken",
        "--no-update",
        "--config",
        "/tmp/tuf.conf",
        "-j",
    ]
    try:
        result = subprocess.run(trufflehog_command, check=True, capture_output=True, text=True)
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"Error running Trufflehog: {e}")
        print(e.stderr)
        return None
    except OSError as e:
        # Without a shell, a missing or non-executable binary raises OSError
        # instead of a non-zero exit status; report it the same way.
        print(f"Error running Trufflehog: {e}")
        return None

def process_trufflehog_output(trufflehog_output):
    """Convert TruffleHog JSON-lines output into a list of hashed findings.

    Each non-blank line is parsed as one JSON object. Only the detector name
    and a SHA-256 digest of the raw match are kept, so the secret itself is
    never included in the result.

    Args:
        trufflehog_output: Text captured from the scanner's standard output,
            one JSON object per line.

    Returns:
        list[dict]: One ``{"DetectorName": ..., "Raw_SHA": ...}`` entry per
        finding, in input order. Empty when there are no findings.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object lacks ``DetectorName`` or ``Raw``.
    """
    results = []
    for line in trufflehog_output.splitlines():
        # Blank or whitespace-only lines carry no finding; parsing them would
        # raise and discard every other result.
        if not line.strip():
            continue
        data = json.loads(line)
        results.append(
            {"DetectorName": data["DetectorName"], "Raw_SHA": generate_sha(data["Raw"])}
        )
    return results

def process_response(response, results_list, output_filename):
    """Handle one page of search results and return the next page token.

    For every notebook in ``response["results"]`` the workspace path is built
    from ``workspace_path`` and ``name``, URL-quoted, and passed to
    ``check_secret_presence``. When ``output_filename`` is truthy the
    ``{"object_id", "path"}`` record is also appended to ``results_list`` and
    written to that file as one JSON line.

    Args:
        response: Decoded search response (dict), or a falsy value if the
            request failed.
        results_list: List that receives the processed records (mutated).
        output_filename: Path of the JSON-lines log; a falsy value disables
            both the log and the ``results_list`` bookkeeping.

    Returns:
        The ``next_page_token`` value of the response, or ``None`` when the
        key is missing or the response is falsy.
    """
    # Nothing to process for an empty or failed response.
    if not response:
        return None

    for notebook in response.get("results", []):
        notebook_id = notebook.get("id", "")
        notebook_name = notebook.get("name", "")
        parent_path = notebook.get("workspace_path", "")

        temp_path = f"{parent_path}/{notebook_name}"
        path = quote(temp_path)
        print(notebook_id, notebook_name, temp_path)

        if output_filename:
            record = {"object_id": notebook_id, "path": path}
            results_list.append(record)

            # Append immediately so the log survives a later failure.
            with open(output_filename, mode="a") as output_file:
                json.dump(record, output_file)
                output_file.write("\n")

        check_secret_presence(path, notebook_id)

    # A missing key yields None.
    return response.get("next_page_token")

# Lower bound for the search: the TIME environment variable (epoch milliseconds)
# if set, otherwise yesterday's 00:00 UTC.
env_time = int(os.environ.get("TIME", get_yesterday_utc_midnight()))

# Convert time to Databricks format
last_edited_after = convert_time_to_databricks_format(env_time)

# JSON-lines log of the notebooks that were found.
output_filename = "/tmp/test_file.json" # process_response skips logging when this is falsy

# Where to write temp notebooks to
output_path = "/tmp/notebooks"
if output_path and not os.path.exists(output_path):
  os.mkdir(output_path)

# Initialize a list to store the results
results_list = []

# Search request (sent with GET and a JSON body): notebooks edited after the cutoff, 50 per page.
url = f"{base_url}/api/2.0/search-midtier/unified-search"
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
data = {
    "query": {"query": ""},
    "filters": {"result_types": ["NOTEBOOK"], "last_edited_after": last_edited_after},
    "page_size": 50,
}

# The first request is sent with an empty page token. The loop stops when
# process_response returns None, i.e. the response has no next_page_token
# or the request itself failed.
next_page_key = ""  # Initial value is an empty string
while next_page_key is not None:
    # Send the current page token with this request
    data["page_token"] = next_page_key
    print(datetime.now())
    # Make request
    response = make_request(url, headers, data)
    time.sleep(
        10
    )  # Wait 10 seconds between search calls; requirement from the global search team to prevent rate limiting.
    next_page_key = process_response(response, results_list, output_filename)

In [0]:
%sh ls -l /tmp/notebooks