In [66]:
import requests
import os
import re
from html import unescape
import subprocess
import json
import re

In [None]:
# Collects Mermaid Diagrams from Stack Overflow
def search_stackoverflow_mermaid_diagrams(query="mermaid", pages=5):
    base_url = "https://api.stackexchange.com/2.3/search/advanced"
    diagrams = []
    headers = {"User-Agent": "mermaid-diagram-collector"}

    for page in range(1, pages + 1):
        print(f" Searching page {page}...")
        params = {
            "order": "desc",
            "sort": "relevance",
            "q": query,
            "site": "stackoverflow",
            "filter": "withbody",
            "pagesize": 20,
            "page": page,
        }

        response = requests.get(base_url, params=params, headers=headers)
        data = response.json()

        for item in data.get("items", []):
            body = unescape(item["body"])
            matches = re.findall(r"```mermaid(.*?)```", body, re.DOTALL)
            for match in matches:
                diagrams.append(match.strip())

    return diagrams

def save_diagrams(diagrams, output_dir="so_mermaid_diagrams"):
    os.makedirs(output_dir, exist_ok=True)
    for i, d in enumerate(diagrams):
        with open(os.path.join(output_dir, f"diagram_{i+1}.mmd"), "w") as f:
            f.write("```mermaid\n" + d + "\n```")
    print(f"\n Saved {len(diagrams)} Mermaid diagrams to '{output_dir}/'")

# Run the pipeline
if __name__ == "__main__":
    diagrams = search_stackoverflow_mermaid_diagrams(query="mermaid", pages=5)
    save_diagrams(diagrams)


In [6]:
# Collects Mermaid ER and Class Diagrams from Stack Overflow
def search_stackoverflow_mermaid_diagrams(query="mermaid", pages=100):
    base_url = "https://api.stackexchange.com/2.3/search/advanced"
    diagrams = []
    headers = {"User-Agent": "mermaid-diagram-collector"}

    data_model_keywords = ["erDiagram", "classDiagram"]

    for page in range(1, pages + 1):
        print(f" Searching page {page}...")
        params = {
            "order": "desc",
            "sort": "relevance",
            "q": query,
            "site": "stackoverflow",
            "filter": "withbody",
            "pagesize": 20,
            "page": page,
        }

        response = requests.get(base_url, params=params, headers=headers)
        data = response.json()

        for item in data.get("items", []):
            body = unescape(item["body"])
            matches = re.findall(r"```mermaid(.*?)```", body, re.DOTALL)
            for match in matches:
                stripped = match.strip()
                if any(kw in stripped for kw in data_model_keywords):
                    diagrams.append(stripped)

    return diagrams

def save_diagrams(diagrams, output_dir="so_mermaid_data_models"):
    os.makedirs(output_dir, exist_ok=True)
    for i, d in enumerate(diagrams):
        with open(os.path.join(output_dir, f"diagram_{i+1}.mmd"), "w") as f:
            f.write("```mermaid\n" + d + "\n```")
    print(f"\n Saved {len(diagrams)} data model Mermaid diagrams to '{output_dir}/'")

if __name__ == "__main__":
    diagrams = search_stackoverflow_mermaid_diagrams(query="mermaid", pages=100)
    save_diagrams(diagrams)


 Searching page 1...
 Searching page 2...
 Searching page 3...
 Searching page 4...
 Searching page 5...
 Searching page 6...
 Searching page 7...
 Searching page 8...
 Searching page 9...
 Searching page 10...
 Searching page 11...
 Searching page 12...
 Searching page 13...
 Searching page 14...
 Searching page 15...
 Searching page 16...
 Searching page 17...
 Searching page 18...
 Searching page 19...
 Searching page 20...
 Searching page 21...
 Searching page 22...
 Searching page 23...
 Searching page 24...
 Searching page 25...
 Searching page 26...
 Searching page 27...
 Searching page 28...
 Searching page 29...
 Searching page 30...
 Searching page 31...
 Searching page 32...
 Searching page 33...
 Searching page 34...
 Searching page 35...
 Searching page 36...
 Searching page 37...
 Searching page 38...
 Searching page 39...
 Searching page 40...
 Searching page 41...
 Searching page 42...
 Searching page 43...
 Searching page 44...
 Searching page 45...
 Searching page 46.

In [8]:
# Collects PlantUML Diagrams from Stack Overflow
def search_stackoverflow_plantuml_diagrams(query="plantuml", pages=50):
    base_url = "https://api.stackexchange.com/2.3/search/advanced"
    diagrams = []
    headers = {"User-Agent": "plantuml-diagram-collector"}

    data_model_keywords = ["entity", "class", "database", "table"]

    for page in range(1, pages + 1):
        print(f" Searching page {page}...")
        params = {
            "order": "desc",
            "sort": "relevance",
            "q": query,
            "site": "stackoverflow",
            "filter": "withbody",
            "pagesize": 20,
            "page": page,
        }

        response = requests.get(base_url, params=params, headers=headers)
        data = response.json()

        for item in data.get("items", []):
            body = unescape(item["body"])
            matches = re.findall(r"```(?:plantuml)?(.*?)```", body, re.DOTALL | re.IGNORECASE)
            for match in matches:
                stripped = match.strip()
                if "@startuml" in stripped.lower() and any(kw in stripped.lower() for kw in data_model_keywords):
                    diagrams.append(stripped)

    return diagrams

def save_diagrams(diagrams, output_dir="so_plantuml_data_models"):
    os.makedirs(output_dir, exist_ok=True)
    for i, d in enumerate(diagrams):
        with open(os.path.join(output_dir, f"diagram_{i+1}.puml"), "w") as f:
            f.write(d)
    print(f"\n Saved {len(diagrams)} PlantUML data model diagrams to '{output_dir}/'")

# Run the pipeline
if __name__ == "__main__":
    diagrams = search_stackoverflow_plantuml_diagrams(query="plantuml", pages=50)
    save_diagrams(diagrams)


 Searching page 1...
 Searching page 2...
 Searching page 3...
 Searching page 4...
 Searching page 5...
 Searching page 6...
 Searching page 7...
 Searching page 8...
 Searching page 9...
 Searching page 10...
 Searching page 11...
 Searching page 12...
 Searching page 13...
 Searching page 14...
 Searching page 15...
 Searching page 16...
 Searching page 17...
 Searching page 18...
 Searching page 19...
 Searching page 20...
 Searching page 21...
 Searching page 22...
 Searching page 23...
 Searching page 24...
 Searching page 25...
 Searching page 26...
 Searching page 27...
 Searching page 28...
 Searching page 29...
 Searching page 30...
 Searching page 31...
 Searching page 32...
 Searching page 33...
 Searching page 34...
 Searching page 35...
 Searching page 36...
 Searching page 37...
 Searching page 38...
 Searching page 39...
 Searching page 40...
 Searching page 41...
 Searching page 42...
 Searching page 43...
 Searching page 44...
 Searching page 45...
 Searching page 46.

In [75]:
# Remove ANSI escape sequences (like \x1b[1m or \033[0;31m)
def strip_ansi_codes(text):
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    return ansi_escape.sub('', text)

# Searches GitHub for Mermaid Data Model Diagrams with 'erDiagram'
def run_gh_search_mermaid(output_file="mermaid_results.json", limit=50):
    cmd = [
        "gh", "search", "code",
        "erDiagram",
        "--limit", str(limit),
        "--json", "repository,path,url"
        
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, env={**os.environ, "NO_COLOR": "1"})
    if result.stderr:
        print("Error:", result.stderr)
    
    clean_output = strip_ansi_codes(result.stdout).strip()
    if not clean_output:
        print("GitHub search returned no results")
    else:
        with open(output_file, "w") as f:
            f.write(clean_output)
        print(f"Saved Mermaid GitHub search results to {output_file}")



# Searches Github for PlantUML Diagrams 
def run_gh_search_plantuml(output_file="plantuml_results.json", limit=50):
    cmd = [
        "gh", "search", "code",
        "@startuml",
        "--limit", str(limit),
        "--json", "repository,path,url"
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, env={**os.environ, "NO_COLOR": "1"})
    if result.stderr:
        print("Error:", result.stderr)
    clean_output = strip_ansi_codes(result.stdout).strip()
    if not clean_output:
        print("GitHub search returned no results")
    else:
        with open(output_file, "w") as f:
            f.write(clean_output)
        print(f"PlantUML GitHub search results to {output_file}")



In [77]:
# Downloads Diagrams and Saves to Directory
def download_diagrams_from_gh(json_file, output_dir, must_contain=None):
    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, "r") as f:
        data = json.load(f)

    saved = 0
    for i, item in enumerate(data):
        repo = item["repository"]["nameWithOwner"]
        path = item["path"]
        raw_url = f"https://raw.githubusercontent.com/{repo}/HEAD/{path}"  # HEAD handles main/master

        try:
            response = requests.get(raw_url)
            content = response.text

            if must_contain:
                if not any(keyword in content for keyword in must_contain):
                    continue

            filename = os.path.join(output_dir, f"diagram_{i+1}.txt")
            with open(filename, "w") as f_out:
                f_out.write(content)
            saved += 1
        except Exception as e:
            print(f"Failed to download {raw_url}: {e}")

    print(f"\nDownloaded and saved {saved} diagram files to '{output_dir}'")



In [85]:
# Run GitHub searches
run_gh_search_mermaid(limit=100)
run_gh_search_plantuml(limit=100)

# Download and filter data model diagrams
download_diagrams_from_gh(
    json_file="mermaid_results.json",
    output_dir="gh_mermaid_data_models",
    must_contain=["erDiagram", "classDiagram"]
)

download_diagrams_from_gh(
    json_file="plantuml_results.json",
    output_dir="gh_plantuml_data_models",
    must_contain=["entity", "class", "database"]
)

Saved Mermaid GitHub search results to mermaid_results.json
PlantUML GitHub search results to plantuml_results.json

Downloaded and saved 88 diagram files to 'gh_mermaid_data_models'

Downloaded and saved 69 diagram files to 'gh_plantuml_data_models'
