In [1]:
import requests
import os
import re
from html import unescape
import subprocess
import json
import re
import time
import pandas as pd

In [42]:
# Remove ANSI escape sequences (like \x1b[1m or \033[0;31m)
def strip_ansi_codes(text):
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    return ansi_escape.sub('', text)

In [44]:
# Downloads Diagrams and Saves to Directory
def download_diagrams_from_gh(json_file, output_dir, must_contain=None):
    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, "r") as f:
        data = json.load(f)

    saved = 0
    for i, item in enumerate(data):
        repo = item["repository"]["nameWithOwner"]
        path = item["path"]
        raw_url = f"https://raw.githubusercontent.com/{repo}/HEAD/{path}"  # HEAD handles main/master

        try:
            response = requests.get(raw_url)
            content = response.text

            if must_contain:
                if not any(keyword in content for keyword in must_contain):
                    continue

            filename = os.path.join(output_dir, f"diagram_{i+1}.txt")
            with open(filename, "w") as f_out:
                f_out.write(content)
            saved += 1
        except Exception as e:
            print(f"Failed to download {raw_url}: {e}")

    print(f"\nDownloaded and saved {saved} diagram files to '{output_dir}'")



In [46]:
# Helper function to run gh search
def run_gh_search(query, output_file="results.json", total_limit=1000, batch_size=100):
    all_results = []

    for batch_start in range(0, total_limit, batch_size):
        cmd = [
            "gh", "search", "code",
            query,
            "--limit", str(batch_size),
            "--json", "repository,path,url"
        ]

        print(f"Running batch starting at {batch_start}...")

        result = subprocess.run(cmd, capture_output=True, text=True, env={**os.environ, "NO_COLOR": "1"})
        if result.stderr:
            print("Error:", result.stderr)

        clean_output = strip_ansi_codes(result.stdout).strip()

        if not clean_output:
            print("GitHub search returned no results for this batch")
            continue

        batch_data = json.loads(clean_output)
        if not batch_data:
            print("No more data returned by GitHub.")
            break

        all_results.extend(batch_data)

        print(f"Fetched {len(batch_data)} items in batch {batch_start}. Waiting to avoid rate limits...")
        time.sleep(65)

    if not all_results:
        print("No results found at all.")
    else:
        with open(output_file, "w") as f:
            json.dump(all_results, f, indent=2)
        print(f"Saved {len(all_results)} GitHub search results to {output_file}")

# Wrapper for Mermaid diagrams
def run_gh_search_mermaid(output_file="mermaid_results.json", total_limit=1000, batch_size=100):
    run_gh_search("erDiagram", output_file=output_file, total_limit=total_limit, batch_size=batch_size)

# Wrapper for PlantUML diagrams
def run_gh_search_plantuml(output_file="plantuml_results.json", total_limit=1000, batch_size=100):
    run_gh_search("@startuml", output_file=output_file, total_limit=total_limit, batch_size=batch_size)

In [52]:
# Run search for mermaid and plantUML diagrams
run_gh_search_mermaid(total_limit=1500, batch_size=100)
run_gh_search_plantuml(total_limit=1500, batch_size=100)

download_diagrams_from_gh(
    json_file="mermaid_results.json",
    output_dir="gh_mermaid_data_models",
    must_contain=["erDiagram", "classDiagram"]
)

download_diagrams_from_gh(
    json_file="plantuml_results.json",
    output_dir="gh_plantuml_data_models",
    must_contain=["entity", "class", "database"]
)

Running batch starting at 0...
Fetched 100 items in batch 0. Waiting to avoid rate limits...
Running batch starting at 100...
Fetched 100 items in batch 100. Waiting to avoid rate limits...
Running batch starting at 200...
Fetched 100 items in batch 200. Waiting to avoid rate limits...
Running batch starting at 300...
Fetched 100 items in batch 300. Waiting to avoid rate limits...
Running batch starting at 400...
Fetched 100 items in batch 400. Waiting to avoid rate limits...
Running batch starting at 500...
Fetched 100 items in batch 500. Waiting to avoid rate limits...
Running batch starting at 600...
Fetched 100 items in batch 600. Waiting to avoid rate limits...
Running batch starting at 700...
Fetched 100 items in batch 700. Waiting to avoid rate limits...
Running batch starting at 800...
Fetched 100 items in batch 800. Waiting to avoid rate limits...
Running batch starting at 900...
Fetched 100 items in batch 900. Waiting to avoid rate limits...
Running batch starting at 1000...


In [3]:
# Extract code blocks from mermaid diagram files
def extract_mermaid_blocks(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # For fenced blocks first
    matches = re.findall(r"```mermaid\s*(.*?)```", text, re.DOTALL)
    if matches:
        return matches

    # For raw mermaid code
    if "erDiagram" in text or "classDiagram" in text:
        return [text]

    return []

# Extract code blocks from plantUML diagram files
def extract_plantuml_blocks(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Extract all plantUML blocks
    all_blocks = re.findall(r"@startuml\s*(.*?)\s*@enduml", text, re.DOTALL)

    # Keep only data models 
    data_model_blocks = []
    for block in all_blocks:
        lower_block = block.lower()

        if any(token in lower_block for token in [
            '||--', '|o--', 'o--o{', 'class ', 'entity', '{', '}'
        ]):
            # Must have no method definitions (brackets)
            if not re.search(r'\w+\s*\(.*?\)', block):
                data_model_blocks.append(block.strip())

    return data_model_blocks

diagrams = []
mermaid_path = 'gh_mermaid_data_models'
plantuml_path = 'gh_plantuml_data_models'

# Process mermaid files
for filename in os.listdir(mermaid_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(mermaid_path, filename)
        blocks = extract_mermaid_blocks(file_path)
        for block in blocks:
            lines = [line.strip().lower() for line in block.strip().splitlines() if line.strip()]
            if lines and (lines[0].startswith("erdiagram") or lines[0].startswith("classdiagram")):
                diagrams.append({
                    'filename': f"mermaid_{filename}",
                    'file_path': file_path,
                    'diagram_code': block,
                    'diagram_type': 'mermaid'
                })

# Process plantUML files
for filename in os.listdir(plantuml_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(plantuml_path, filename)
        blocks = extract_plantuml_blocks(file_path)
        for block in blocks:
            diagrams.append({
                'filename': f"plantuml_{filename}",
                'file_path': file_path,
                'diagram_code': block,
                'diagram_type': 'plantuml'
            })

mermaid_dir = "diagrams/mermaid_1"
plantuml_dir = "diagrams/plantuml_1"
os.makedirs(mermaid_dir, exist_ok=True)
os.makedirs(plantuml_dir, exist_ok=True)

# Save diagrams as mermaid/plantUML files
for i, diagram in enumerate(diagrams):
    ext = ".mmd" if diagram["diagram_type"] == "mermaid" else ".puml"
    safe_name = diagram["filename"].replace(".txt", "").replace("/", "_")
    dir_path = mermaid_dir if diagram["diagram_type"] == "mermaid" else plantuml_dir
    file_name = f"{i:04d}_{safe_name}{ext}"
    file_path = os.path.join(dir_path, file_name)

    with open(file_path, "w", encoding="utf-8") as f:
        if diagram["diagram_type"] == "plantuml":
            f.write(f"@startuml\n{diagram['diagram_code'].strip()}\n@enduml")
        else:
            f.write(diagram["diagram_code"].strip())

# Save meta data to dataframe
df = pd.DataFrame(diagrams)
df.to_csv("diagrams/diagram_metadata.csv", index=False)