In [69]:
import re
import os
import pandas as pd
import networkx as nx

In [51]:
# Extract code blocks from mermaid diagram files
def extract_mermaid_blocks(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    matches = re.findall(r"```mermaid\s+(.*?)```", text, re.DOTALL)
    return matches

# Extract code blocks from plantUML diagram files
def extract_plantuml_blocks(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    matches = re.findall(r"@startuml\s+(.*?)@enduml", text, re.DOTALL)
    return matches

diagrams = []
mermaid_path = 'gh_mermaid_data_models'
plantuml_path = 'gh_plantuml_data_models'

# Process mermaid files
for filename in os.listdir(mermaid_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(mermaid_path, filename)
        blocks = extract_mermaid_blocks(file_path)
        for block in blocks:
            diagrams.append({
                'filename': f"mermaid_{filename}",
                'diagram_code': block,
                'diagram_type': 'mermaid'
            })

# Process plantUML files
for filename in os.listdir(plantuml_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(plantuml_path, filename)
        blocks = extract_plantuml_blocks(file_path)
        for block in blocks:
            diagrams.append({
                'filename': f"plantuml_{filename}",
                'diagram_code': block,
                'diagram_type': 'plantuml'
            })


df = pd.DataFrame(diagrams)
df.head()

Unnamed: 0,filename,diagram_code,diagram_type
0,mermaid_diagram_65.txt,erDiagram\n\tSong {\n\t\tint id PK\n\t\tstring...,mermaid
1,mermaid_diagram_59.txt,erDiagram\n CUSTOMER ||--o{ ORDER : places\...,mermaid
2,mermaid_diagram_4.txt,erDiagram\n USER_OAUTH_PROVIDERS {\n ...,mermaid
3,mermaid_diagram_4.txt,erDiagram\n PROFILES }|--|| USERS : user_id...,mermaid
4,mermaid_diagram_5.txt,"erDiagram\n users ||--o{ messages : ""writes...",mermaid


In [44]:
# Count of each diagram type
df['diagram_type'].value_counts()

diagram_type
plantuml    216
mermaid     148
Name: count, dtype: int64

In [57]:
# Count of diagrams per file
df['filename'].value_counts().head(20)

filename
plantuml_diagram_74.txt    21
plantuml_diagram_87.txt    14
mermaid_diagram_51.txt     13
plantuml_diagram_5.txt     13
plantuml_diagram_91.txt    11
mermaid_diagram_2.txt      10
mermaid_diagram_1.txt      10
mermaid_diagram_30.txt      9
plantuml_diagram_13.txt     9
plantuml_diagram_36.txt     8
mermaid_diagram_46.txt      7
mermaid_diagram_64.txt      7
plantuml_diagram_24.txt     7
plantuml_diagram_1.txt      7
mermaid_diagram_96.txt      6
plantuml_diagram_16.txt     5
mermaid_diagram_69.txt      5
plantuml_diagram_89.txt     5
plantuml_diagram_80.txt     5
plantuml_diagram_29.txt     5
Name: count, dtype: int64

In [67]:
# Most common start line
df['start_line'] = df['diagram_code'].str.split('\n').str[0]
df['start_line'].value_counts().head(10)

start_line
erDiagram          83
!theme spacelab    21
---                12
!theme plain       11
sequenceDiagram     9
classDiagram        6
graph TD            6
graph LR            6
scale 720 width     5
actor User          5
Name: count, dtype: int64

In [65]:
# Average diagram length by type
# Which type of diagram is typically more complex
df['char_count'] = df['diagram_code'].str.len()
df['line_count'] = df['diagram_code'].str.count('\n') + 1

df.groupby('diagram_type')['line_count'].mean()

diagram_type
mermaid     37.783784
plantuml    30.861111
Name: line_count, dtype: float64

In [79]:
# Extract edges ***** Need to edit *****
def extract_edges(code, diagram_type):
    if diagram_type == 'mermaid':
        return re.findall(r'(\w+)\s*||--o\s*(\w+)', code) 
    elif diagram_type == 'plantuml':
        return re.findall(r'(\w+)\s*[-<]+[->]+\s*(\w+)', code)
    else:
        return []

# Build graph
def build_graph(code, diagram_type):
    edges = extract_edges(code, diagram_type)
    G = nx.DiGraph()
    G.add_edges_from(edges)
    return G

# Store each graph in a new column
df['graph'] = df.apply(lambda row: build_graph(row['diagram_code'], row['diagram_type']), axis=1)