Initialize connection

In [None]:
import asyncio
import json
import os
import shutil
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.data.aio import KustoClient
from azure.kusto.data.helpers import dataframe_from_result_table
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter

# NOTE(review): the clustering cell further down also uses `fuzz` and
# `AgglomerativeClustering`, which are not imported anywhere in the visible
# cells (presumably rapidfuzz and scikit-learn) — add those imports here so the
# notebook survives Restart & Run All.

In [26]:
# Kusto cluster endpoint used to build the connection string below.
cluster = "https://icmclusterlb.kustomfa.windows.net"
# Connection string using Azure CLI authentication; get_records() opens its
# KustoClient from this.
kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(cluster)
# Database that get_record_set() executes its query against.
db = "IcmDataWarehouse"
# Module-level accumulator: get_records() appends one Kusto response per date
# window, and format_records() reads the list back.
results = []


In [28]:
async def get_record_set(client: KustoClient, current_top: datetime, current_floor: datetime):
    """Fetch the non-noise incidents of tenant 34793 created in one date window.

    The window is half-open, ``[current_floor, current_top)``, and is evaluated
    at day granularity: only the date part of each bound is sent to Kusto, so
    any time-of-day component of the arguments is ignored.

    Args:
        client: Async Kusto client used to run the query against ``db``.
        current_top: Exclusive upper bound on ``CreateDate``.
        current_floor: Inclusive lower bound on ``CreateDate``.

    Returns:
        Whatever ``client.execute`` returns for the query, unchanged.
    """
    # Zero-padded ISO 8601 dates (2025-02-03 rather than 2025-2-3) are the
    # literal form documented for Kusto's datetime().
    floor_date = current_floor.strftime("%Y-%m-%d")
    top_date = current_top.strftime("%Y-%m-%d")

    query = f"""
            Incidents
        | where OwningTenantId  == 34793
        | where IsNoise == false
        | where CreateDate >= datetime({floor_date}) and CreateDate < datetime({top_date})
        | sort by IncidentId desc
        | project IncidentId, SourceType, CreateDate, RoutingId, OwningTeamName, OwningContactAlias, Severity, Status, Title, ReproSteps, Mitigation, RootCauseId, ImpactStartDate, HowFixed, Summary, ModifiedDate
    """
    return await client.execute(db, query)

async def get_records():
    """Download incidents in 4-week windows, newest first, into ``results``.

    The first window is ``[today, today + 4 weeks)`` so that incidents created
    today are included; each following window ends where the previous one
    started. The final window is clamped to ``floor`` (2023-01-01) so the range
    between ``floor`` and the last full window is still queried.

    Each window's response is appended to the module-level ``results`` list,
    which is emptied first so that re-running the cell does not accumulate
    duplicate responses.

    Raises:
        Exception: the last error from ``get_record_set`` when one window fails
            ``max_attempts`` times in a row.
    """
    increment = timedelta(weeks=4)
    floor = datetime(2023, 1, 1)
    rest = 10          # seconds to wait between attempts on the same window
    max_attempts = 5   # give up on a window after this many failures

    results.clear()

    current_floor = datetime.now()
    current_top = current_floor + increment

    async with KustoClient(kcsb) as client:
        while current_top > floor:
            floor_label = current_floor.strftime("%Y-%m-%d")
            top_label = current_top.strftime("%Y-%m-%d")
            print(f"getting incidents for {floor_label} through {top_label}")

            for attempt in range(1, max_attempts + 1):
                try:
                    response = await get_record_set(client, current_top, current_floor)
                    break
                except Exception as e:
                    # A persistent failure (bad auth, wrong cluster, ...) must
                    # surface instead of retrying forever.
                    if attempt == max_attempts:
                        raise
                    print(f"attempt {attempt} failed ({e!r}); sleeping for {rest} seconds")
                    # asyncio.sleep yields to the event loop; time.sleep would
                    # block it for the whole pause.
                    await asyncio.sleep(rest)

            results.append(response)  # Store KustoResponseDataSet

            # Step one window back, never going below the overall floor.
            current_top = current_floor
            current_floor = max(current_floor - increment, floor)

# Top-level `await` relies on the notebook kernel running the cell inside an
# event loop; in a plain script this call would need asyncio.run(...).
await get_records()

getting incidents for 2025-02-28 through 2025-03-28
getting incidents for 2025-01-31 through 2025-02-28
getting incidents for 2025-01-03 through 2025-01-31
getting incidents for 2024-12-06 through 2025-01-03
getting incidents for 2024-11-08 through 2024-12-06
getting incidents for 2024-10-11 through 2024-11-08
getting incidents for 2024-09-13 through 2024-10-11
getting incidents for 2024-08-16 through 2024-09-13
getting incidents for 2024-07-19 through 2024-08-16
getting incidents for 2024-06-21 through 2024-07-19
getting incidents for 2024-05-24 through 2024-06-21
getting incidents for 2024-04-26 through 2024-05-24
getting incidents for 2024-03-29 through 2024-04-26
getting incidents for 2024-03-01 through 2024-03-29
getting incidents for 2024-02-02 through 2024-03-01
getting incidents for 2024-01-05 through 2024-02-02
getting incidents for 2023-12-08 through 2024-01-05
getting incidents for 2023-11-10 through 2023-12-08
getting incidents for 2023-10-13 through 2023-11-10
getting inci

In [30]:
def format_html(text):
    """Convert an HTML fragment to Markdown, dropping images and non-breaking spaces.

    Args:
        text: HTML markup. ``None`` and NaN (a missing value in a pandas
            column) are treated as "no content".

    Returns:
        The Markdown produced by ``MarkdownConverter`` with every U+00A0
        replaced by a regular space, or ``""`` for missing input.
    """
    # Missing values are not markup; return an empty string rather than
    # handing them to the parser. (NaN is the only float unequal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    soup = BeautifulSoup(text, "html.parser")

    # Kept from the original: covers a literal <nbsp> element, should one appear.
    for nbsp in soup.find_all('nbsp'):
        nbsp.replace_with(" ")

    # remove images for now
    for img in soup.find_all('img'):
        img.decompose()

    markdown = MarkdownConverter().convert_soup(soup)

    # "&nbsp;" is a character entity, not a tag, so the find_all('nbsp') loop
    # above never matches it; the parser yields U+00A0 characters instead.
    # Normalise them on the converted text.
    return markdown.replace("\u00a0", " ")

def format_records():
    """Write one JSON file per incident from the responses stored in ``results``.

    Steps:
      1. Turn the first primary result table of every response into a
         DataFrame and concatenate them.
      2. Convert the HTML columns (Summary, Mitigation, HowFixed) with
         ``format_html``.
      3. Build ``VectorText`` from the title and the converted summary.
      4. For each ``IncidentId`` keep only the row with the latest
         ``ModifiedDate``.
      5. Recreate ``../output/incidents`` and write ``<IncidentId>.json`` for
         every remaining row.

    Raises:
        ValueError: if ``results`` is empty.
    """
    output_dir = "../output/incidents"

    # Fail early with a clear message; pd.concat raises a less helpful
    # ValueError on an empty list.
    if not results:
        raise ValueError("No query results to format; run get_records() first")

    frames = [dataframe_from_result_table(result.primary_results[0]) for result in results]
    df_raw = pd.concat(frames)
    for column in ("Summary", "Mitigation", "HowFixed"):
        df_raw[column] = df_raw[column].apply(format_html)
    df_raw['VectorText'] = '#Title\n' + df_raw['Title'] + '\n#Summary\n' + df_raw['Summary']

    # Sorted newest-first, so drop_duplicates (which keeps the first
    # occurrence) retains the most recently modified row per incident.
    df = (
        df_raw
        .sort_values(by="ModifiedDate", ascending=False)
        .drop_duplicates(subset="IncidentId")
    )

    # Touch the disk only after the transformation has succeeded, so a failure
    # above does not destroy the previous export.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    for _, row in df.iterrows():
        file_path = os.path.join(output_dir, f"{row['IncidentId']}.json")
        with open(file_path, "w", encoding='utf-8') as json_file:
            json_file.write(row.to_json(indent=4))

# Exports whatever the module-level `results` list currently holds, replacing
# the contents of ../output/incidents.
format_records()

In [None]:
def load_json_files(folder_path):
    """Load every ``*.json`` file directly inside ``folder_path``.

    Files are read in sorted file-name order: ``os.listdir`` makes no ordering
    guarantee, and a stable order keeps everything derived from the returned
    list reproducible between runs and machines.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with the parsed content of each JSON file, in file-name order.
    """
    json_entries = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            json_entries.append(json.load(file))

    return json_entries

# Pairwise VectorText similarity, scored with RapidFuzz
def compute_similarity_matrix(entries):
    """Return a symmetric matrix of ``fuzz.ratio`` scores between entries.

    Cell ``[a, b]`` holds the score of ``entries[a]``'s ``VectorText`` against
    ``entries[b]``'s. The diagonal is set to 100 without calling the scorer,
    and each unordered pair is scored once and mirrored.
    """
    count = len(entries)
    scores = np.zeros((count, count))
    np.fill_diagonal(scores, 100)

    # Walk the strict upper triangle only; the lower one is its mirror image.
    for row in range(count):
        for col in range(row + 1, count):
            score = fuzz.ratio(entries[row].get('VectorText'), entries[col].get('VectorText'))
            scores[row, col] = score
            scores[col, row] = score

    return scores

# Perform Agglomerative Clustering
def cluster_entries(entries, threshold=50):
    """Assign a cluster label to every entry based on VectorText similarity.

    Args:
        entries: Sequence of dicts as accepted by ``compute_similarity_matrix``.
        threshold: Value passed as ``distance_threshold``, expressed on the
            0-100 distance scale used here (``100 - similarity``).

    Returns:
        An integer array with one cluster label per entry, in input order.
    """
    # AgglomerativeClustering needs at least two samples to fit (scikit-learn's
    # estimator, presumably — it is not imported in the visible cells). Zero or
    # one entry trivially forms at most one cluster, so label it directly.
    if len(entries) < 2:
        return np.zeros(len(entries), dtype=int)

    similarity_matrix = compute_similarity_matrix(entries)

    # Convert similarity to distance: scores are on a 0-100 scale, so the
    # distance is 100 - similarity.
    distance_matrix = 100 - similarity_matrix

    clustering_model = AgglomerativeClustering(
        n_clusters=None,
        metric="precomputed",
        linkage="average",
        distance_threshold=threshold
    )

    return clustering_model.fit_predict(distance_matrix)

folder_path = "../output/incidents"  # Update this
json_entries = load_json_files(folder_path)
clusters = cluster_entries(json_entries)

# Group JSON entries by cluster label; dict insertion order keeps the clusters
# in the order their labels first appear.
clustered_entries = {}
for entry, label in zip(json_entries, clusters):
    clustered_entries.setdefault(label, []).append(entry)

# Convert to an array of arrays
result = list(clustered_entries.values())

output_dir = "../output/incidentClusters"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

os.makedirs(output_dir)

# The loop variable is `cluster_members`, not `cluster`, so it does not rebind
# the module-level `cluster` that holds the Kusto endpoint URL.
for idx, cluster_members in enumerate(result):
    file_path = os.path.join(output_dir, f"cluster_{idx + 1}.json")

    incident_ids = [item["IncidentId"] for item in cluster_members]
    for item in cluster_members:
        # Every other incident in the same cluster. The comprehension already
        # builds a fresh list per item, so no explicit copy is needed.
        item["SimilarIncidents"] = [
            incident_id for incident_id in incident_ids if incident_id != item["IncidentId"]
        ]

    data_output = json.dumps({"entries": cluster_members}, indent=4)

    with open(file_path, "w", encoding='utf-8', errors="replace") as cluster_file:
        cluster_file.write(data_output)

In [68]:
def upload_files_to_blob(
    blob_url="https://stvzac3zroquyd4.blob.core.windows.net/",
    container_name="incidents",
    local_folder="../output/incidents",
):
    """Upload every file directly inside ``local_folder`` to a blob container.

    The container is created when it does not exist yet, and existing blobs
    with the same name are overwritten. Sub-directories are skipped. Files are
    uploaded in sorted file-name order so the progress log is deterministic.

    Args:
        blob_url: Account URL of the storage account.
        container_name: Target container.
        local_folder: Local directory whose files are uploaded; each blob is
            named after its file.
    """
    credential = DefaultAzureCredential()
    blob_service_client = BlobServiceClient(account_url=blob_url, credential=credential)
    container_client = blob_service_client.get_container_client(container_name)

    # Ensure the container exists (create if necessary)
    if not container_client.exists():
        container_client.create_container()
        print(f"Created container: {container_name}")

    for file_name in sorted(os.listdir(local_folder)):
        file_path = os.path.join(local_folder, file_name)

        # Only upload files (skip directories)
        if not os.path.isfile(file_path):
            continue

        blob_client = container_client.get_blob_client(file_name)
        with open(file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

        print(f"Uploaded: {file_name} -> {container_name}/{file_name}")

# Pushes the per-incident JSON files from ../output/incidents to blob storage;
# prints one "Uploaded: ..." line per file.
upload_files_to_blob()

Uploaded: 31000000287166.json -> incidents/31000000287166.json
Uploaded: 31000000287737.json -> incidents/31000000287737.json
Uploaded: 467741615.json -> incidents/467741615.json
Uploaded: 468157282.json -> incidents/468157282.json
Uploaded: 468157290.json -> incidents/468157290.json
Uploaded: 468168273.json -> incidents/468168273.json
Uploaded: 468168286.json -> incidents/468168286.json
Uploaded: 468168837.json -> incidents/468168837.json
Uploaded: 468170351.json -> incidents/468170351.json
Uploaded: 468170572.json -> incidents/468170572.json
Uploaded: 468171227.json -> incidents/468171227.json
Uploaded: 468171254.json -> incidents/468171254.json
Uploaded: 468171314.json -> incidents/468171314.json
Uploaded: 468171339.json -> incidents/468171339.json
Uploaded: 468174473.json -> incidents/468174473.json
Uploaded: 468174495.json -> incidents/468174495.json
Uploaded: 468181086.json -> incidents/468181086.json
Uploaded: 468181096.json -> incidents/468181096.json
Uploaded: 468182023.json -