## Chargement du dataset

In [1]:
from datasets import load_dataset
import pandas as pd

# Charger le dataset
ds = load_dataset("colinfrisch/diagrams_with_mermaid_codes")

# Convertir en DataFrame pour faciliter la manipulation
df = pd.DataFrame(ds['train'])

print(f"Dataset charg√©: {len(df)} exemples")
print(f"\nColonnes disponibles: {df.columns.tolist()}")
print(f"\nAper√ßu:")
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Dataset charg√©: 3524 exemples

Colonnes disponibles: ['code', 'caption']

Aper√ßu:


Unnamed: 0,code,caption
0,```mermaid\nmindmap\n root((Multiple Intellig...,"Multiple Intelligences, Circular Diagram, elem..."
1,```mermaid\nflowchart TB\n\n%% Big Five Person...,"Big Five Personality Traits, Grid Diagram, ele..."
2,```mermaid\nflowchart TB\n %% Constructivism ...,"Constructivism in Learning, Infographic, eleme..."
3,```mermaid\nflowchart TB\n %% Steps in the Sc...,"Steps in the Scrum Iterative Process, Circular..."
4,```mermaid\nflowchart LR\n %% Iterative Produ...,"Iterative Product Development, Circular Flow D..."


# Parser Mermaid ‚Üí Graphes NetworkX

Objectif: Transformer chaque code Mermaid en un graphe exploitable
- N≈ìuds: entit√©s (bo√Ætes, cercles, concepts)
- Ar√™tes: relations (fl√®ches, liens)

In [None]:
import networkx as nx
import re
from typing import Tuple, List, Dict

def clean_label(text: str) -> str:
    """
    Nettoie les labels pour visualisation.
    - Enl√®ve HTML (<br/>, <br>)
    - Enl√®ve Markdown (```)
    - Enl√®ve les descriptions entre parenth√®ses
    - Garde seulement le concept principal
    - Limite la longueur
    """
    # Enlever markdown
    text = re.sub(r'```\w*', '', text)
    
    # Enlever HTML
    text = re.sub(r'<br\s*/?>', ' ', text)
    
    # Enlever parenth√®ses avec descriptions
    text = re.sub(r'\([^)]+\)', '', text)
    
    # Enlever les brackets/chevrons
    text = re.sub(r'[\[\]\{\}<>]', '', text)
    
    # Garder seulement le premier concept (avant virgule ou deux-points)
    text = text.split(',')[0].split(':')[0]
    
    # Nettoyer les espaces multiples
    text = ' '.join(text.split())
    
    # Limiter longueur √† 30 caract√®res
    if len(text) > 30:
        text = text[:27] + '...'
    
    return text.strip()

def detect_diagram_type(code: str) -> str:
    """
    D√©tecte le type de diagramme Mermaid.
    Returns: 'mindmap', 'flowchart', 'graph', ou 'unknown'
    """
    code_lower = code.lower()
    
    if 'mindmap' in code_lower:
        return 'mindmap'
    elif 'flowchart' in code_lower:
        return 'flowchart'
    elif re.search(r'\bgraph\s+(TB|TD|BT|RL|LR)', code_lower):
        return 'graph'
    
    return 'unknown'

In [4]:
def clean_mermaid(code: str) -> str:
    """Nettoie le code Mermaid en supprimant les marqueurs markdown."""
    return code.replace("```mermaid", "").replace("```", "").strip()

## Extraction des n≈ìuds et ar√™tes (regex)

In [None]:
def parse_mindmap(mermaid_code: str) -> Tuple[Dict[str, str], List[Tuple]]:
    """
    Parse sp√©cifique pour les mindmaps Mermaid.
    Structure bas√©e sur l'indentation.
    """
    nodes = {}
    edges = []
    
    lines = mermaid_code.split('\n')
    parent_stack = []  # Stack: [(indent_level, node_id)]
    node_counter = 0
    
    for line in lines:
        # Skip empty lines and header
        if not line.strip() or 'mindmap' in line.lower():
            continue
        
        # Calculer le niveau d'indentation
        indent = len(line) - len(line.lstrip())
        content = line.strip()
        
        # Extraire le label (enlever parenth√®ses/brackets)
        label_match = re.search(r'[(\[]+(.*?)[)\]]+|(\S+.*?)$', content)
        if not label_match:
            continue
        
        raw_label = label_match.group(1) or label_match.group(2)
        if not raw_label:
            continue
        
        # Nettoyer le label
        clean_label_text = clean_label(raw_label)
        
        # Cr√©er un ID unique
        node_id = f"n{node_counter}"
        node_counter += 1
        
        nodes[node_id] = clean_label_text
        
        # G√©rer la hi√©rarchie via l'indentation
        # Retirer les parents de niveau sup√©rieur ou √©gal
        while parent_stack and parent_stack[-1][0] >= indent:
            parent_stack.pop()
        
        # Si on a un parent, cr√©er l'ar√™te
        if parent_stack:
            parent_id = parent_stack[-1][1]
            edges.append((parent_id, node_id, 'hierarchy', ''))
        
        # Ajouter ce n≈ìud comme parent potentiel
        parent_stack.append((indent, node_id))
    
    return nodes, edges


def extract_nodes_and_edges(mermaid_code: str) -> Tuple[Dict[str, str], List[Tuple]]:
    """
    Extrait les n≈ìuds et ar√™tes d'un code Mermaid.
    
    Returns:
        (nodes_dict, edges_list)
        - nodes_dict: {node_id: label}
        - edges_list: [(src, dst, edge_type, label)]
    """
    # D√©tecter le type de diagramme
    diagram_type = detect_diagram_type(mermaid_code)
    
    # Si c'est un mindmap, utiliser le parser sp√©cialis√©
    if diagram_type == 'mindmap':
        return parse_mindmap(mermaid_code)
    
    # Sinon, utiliser le parser flowchart/graph standard
    nodes = {}
    edges = []
    
    # Patterns pour les n≈ìuds: A["Label"], A[Label], B((Circle)), C{Decision}, etc.
    node_patterns = [
        r'(\w+)\["([^"]+)"\]',    # A["Label"]
        r'(\w+)\[([^\]]+)\]',      # A[Label]
        r'(\w+)\(\(([^)]+)\)\)',   # A((Circle))
        r'(\w+)\{([^}]+)\}',       # A{Decision}
        r'(\w+)\[/([^/]+)/\]',     # A[/Parallelogram/]
        r'(\w+)\[\\([^\\]+)\\\]',  # A[\Trapezoid\]
    ]
    
    # Patterns pour les ar√™tes: A --> B, A -->|label| B, A --- B, etc.
    edge_patterns = [
        (r'(\w+)\s*-->\s*\|([^|]+)\|\s*(\w+)', 'arrow_labeled'),
        (r'(\w+)\s*-->\s*(\w+)', 'arrow'),
        (r'(\w+)\s*---\s*\|([^|]+)\|\s*(\w+)', 'line_labeled'),
        (r'(\w+)\s*---\s*(\w+)', 'line'),
        (r'(\w+)\s*-\.->(\w+)', 'dotted_arrow'),
        (r'(\w+)\s*==>\s*(\w+)', 'thick_arrow'),
    ]
    
    lines = mermaid_code.split('\n')
    
    for line in lines:
        line = line.strip()
        if not line or line.startswith('graph') or line.startswith('flowchart') or line.startswith('%%'):
            continue
        
        # Extraire les n≈ìuds
        for pattern in node_patterns:
            for node_id, label in re.findall(pattern, line):
                # Nettoyer le label
                cleaned = clean_label(label)
                nodes[node_id] = cleaned
        
        # Extraire les ar√™tes
        for pattern, edge_type in edge_patterns:
            matches = re.findall(pattern, line)
            for match in matches:
                if 'labeled' in edge_type:
                    src, label, dst = match
                    edges.append((src, dst, edge_type, clean_label(label)))
                else:
                    src, dst = match
                    edges.append((src, dst, edge_type, ''))
                
                # Ajouter les n≈ìuds s'ils n'ont pas de label
                if src not in nodes:
                    nodes[src] = clean_label(src)
                if dst not in nodes:
                    nodes[dst] = clean_label(dst)
    
    return nodes, edges

## Construction du graphe NetworkX

In [6]:
def mermaid_to_graph(mermaid_code: str) -> nx.DiGraph:
    """
    Convertit un code Mermaid en graphe NetworkX.
    
    Returns:
        Graphe NetworkX (DiGraph)
    """
    # 1. Nettoyer le code
    cleaned = clean_mermaid(mermaid_code)
    
    # 2. Extraire n≈ìuds et ar√™tes
    nodes, edges = extract_nodes_and_edges(cleaned)
    
    # 3. Construire le graphe
    G = nx.DiGraph()
    
    # Ajouter les n≈ìuds avec leurs labels
    for node_id, label in nodes.items():
        G.add_node(node_id, label=label)
    
    # Ajouter les ar√™tes avec leur type
    for src, dst, edge_type, edge_label in edges:
        G.add_edge(src, dst, type=edge_type, label=edge_label)
    
    return G

## Application sur le dataset complet

Chaque ligne du DataFrame ‚Üí 1 graphe

In [8]:
# Identifier la colonne contenant les codes Mermaid
print("Colonnes:", df.columns.tolist())
print("\nExemple de la premi√®re ligne:")
print(df.iloc[0])

# Adapter le nom de la colonne selon ton dataset
# Supposons que la colonne s'appelle 'code' ou 'mermaid' ou 'diagram'
mermaid_col = None
for col in ['code', 'mermaid', 'diagram', 'mermaid_code', 'text']:
    if col in df.columns:
        mermaid_col = col
        break

if mermaid_col:
    print(f"\n‚úì Colonne Mermaid d√©tect√©e: '{mermaid_col}'")
else:
    print("\n‚ö† Colonne non d√©tect√©e. Affiche les premi√®res lignes pour identifier la bonne colonne.")
    print(df.head())

Colonnes: ['code', 'caption']

Exemple de la premi√®re ligne:
code       ```mermaid\nmindmap\n  root((Multiple Intellig...
caption    Multiple Intelligences, Circular Diagram, elem...
Name: 0, dtype: object

‚úì Colonne Mermaid d√©tect√©e: 'code'


In [9]:
# Transformer chaque code Mermaid en graphe
# REMPLACE 'mermaid_col' par le bon nom de colonne apr√®s avoir ex√©cut√© la cellule pr√©c√©dente
# Par exemple: df['graph'] = df['code'].apply(mermaid_to_graph)

df['graph'] = df[mermaid_col].apply(mermaid_to_graph)

# Ajouter des statistiques
df['num_nodes'] = df['graph'].apply(lambda g: g.number_of_nodes())
df['num_edges'] = df['graph'].apply(lambda g: g.number_of_edges())

print(f"‚úì {len(df)} graphes cr√©√©s")
print(f"\nStatistiques:")
print(df[['num_nodes', 'num_edges']].describe())

‚úì 3524 graphes cr√©√©s

Statistiques:
         num_nodes    num_edges
count  3524.000000  3524.000000
mean      9.103292     4.645006
std       8.702879     5.165029
min       0.000000     0.000000
25%       5.000000     1.000000
50%       7.000000     3.000000
75%      11.000000     7.000000
max     188.000000    77.000000


# Visualisation 

In [None]:
import matplotlib.pyplot as plt

def visualize_graph(G: nx.DiGraph, title: str = "Graphe", diagram_type: str = "unknown"):
    """
    Visualise un graphe NetworkX avec adaptation selon la taille et le type.
    """
    if G.number_of_nodes() == 0:
        print(f"‚ö† Graphe vide: {title}")
        return
    
    # Adapter la taille de la figure selon le nombre de n≈ìuds
    num_nodes = G.number_of_nodes()
    
    if num_nodes > 50:
        figsize = (20, 16)
    elif num_nodes > 20:
        figsize = (16, 12)
    else:
        figsize = (12, 8)
    
    plt.figure(figsize=figsize)
    
    # Choisir le layout selon la taille et le type
    if diagram_type == 'mindmap' and num_nodes < 100:
        # Pour mindmaps: layout radial/hi√©rarchique
        try:
            pos = nx.nx_agraph.graphviz_layout(G, prog='twopi')
        except:
            # Fallback si pygraphviz n'est pas install√©
            pos = nx.spring_layout(G, k=3, iterations=200, seed=42)
    elif num_nodes > 50:
        # Pour gros graphes: Kamada-Kawai (plus stable)
        pos = nx.kamada_kawai_layout(G)
    else:
        # Pour petits graphes: spring layout am√©lior√©
        pos = nx.spring_layout(G, k=3, iterations=200, seed=42)
    
    # Adapter la taille des n≈ìuds inversement au nombre
    node_size = max(500, min(3000, 10000 // num_nodes))
    
    # Adapter la taille de police
    if num_nodes > 50:
        font_size = 6
    elif num_nodes > 20:
        font_size = 8
    else:
        font_size = 10
    
    # Dessiner les n≈ìuds
    nx.draw_networkx_nodes(G, pos, 
                          node_color='lightblue', 
                          node_size=node_size, 
                          alpha=0.9,
                          edgecolors='navy',
                          linewidths=1.5)
    
    # Dessiner les ar√™tes
    nx.draw_networkx_edges(G, pos, 
                          edge_color='gray', 
                          arrows=True, 
                          arrowsize=15, 
                          width=1.5,
                          alpha=0.6,
                          arrowstyle='->')
    
    # Labels des n≈ìuds (d√©j√† nettoy√©s)
    labels = nx.get_node_attributes(G, 'label')
    if not labels:
        # Si pas de labels, utiliser les IDs
        labels = {n: str(n) for n in G.nodes()}
    
    nx.draw_networkx_labels(G, pos, labels, 
                           font_size=font_size,
                           font_weight='bold')
    
    # Labels des ar√™tes (seulement si pas trop de n≈ìuds)
    if num_nodes < 30:
        edge_labels = {k: v for k, v in nx.get_edge_attributes(G, 'label').items() if v}
        if edge_labels:
            nx.draw_networkx_edge_labels(G, pos, edge_labels, 
                                        font_size=max(6, font_size-2))
    
    plt.title(f"{title}\n({num_nodes} n≈ìuds, {G.number_of_edges()} ar√™tes, type: {diagram_type})", 
             fontsize=14, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

# Visualiser les 5 premiers graphes avec le type d√©tect√©
print("üé® Visualisation des graphes am√©lior√©s\n")

for idx in range(min(5, len(df))):
    mermaid_code = df.iloc[idx][mermaid_col]
    diagram_type = detect_diagram_type(clean_mermaid(mermaid_code))
    G = df.iloc[idx]['graph']
    
    print(f"Graphe {idx}: {G.number_of_nodes()} n≈ìuds, {G.number_of_edges()} ar√™tes (type: {diagram_type})")
    visualize_graph(G, f"Graphe {idx}", diagram_type)

# Visualisation interactive (BONUS)

Pour les graphes tr√®s complexes (>50 n≈ìuds), utilise pyvis pour une visualisation interactive HTML.

```bash
pip install pyvis
```

In [None]:
try:
    from pyvis.network import Network
    
    def visualize_interactive(G: nx.DiGraph, output='graph.html', title='Graphe interactif'):
        """
        Cr√©e une visualisation interactive HTML avec pyvis.
        Id√©al pour les graphes >50 n≈ìuds.
        """
        net = Network(height='800px', width='100%', 
                     directed=True, 
                     notebook=True,
                     cdn_resources='in_line')
        
        # Options de physique pour un meilleur layout
        net.set_options("""
        {
          "physics": {
            "forceAtlas2Based": {
              "gravitationalConstant": -50,
              "centralGravity": 0.01,
              "springLength": 200,
              "springConstant": 0.08
            },
            "maxVelocity": 50,
            "solver": "forceAtlas2Based",
            "timestep": 0.35,
            "stabilization": {"iterations": 150}
          }
        }
        """)
        
        # Copier le graphe dans pyvis
        net.from_nx(G)
        
        # Personnaliser l'apparence
        for node in net.nodes:
            node['title'] = node.get('label', node['id'])  # Tooltip
            node['color'] = '#97C2FC'
        
        # Sauvegarder et afficher
        net.show(output)
        print(f"‚úì Graphe interactif sauvegard√©: {output}")
        return net
    
    # Exemple: visualiser le graphe le plus complexe
    max_nodes_idx = df['num_nodes'].idxmax()
    G_complex = df.iloc[max_nodes_idx]['graph']
    
    print(f"üìä Graphe le plus complexe: {G_complex.number_of_nodes()} n≈ìuds, {G_complex.number_of_edges()} ar√™tes")
    visualize_interactive(G_complex, 'graphe_complexe.html')
    
except ImportError:
    print("‚ö† pyvis non install√©. Pour installer: pip install pyvis")
    print("Visualisation interactive non disponible.")

## üìù R√©sum√© des am√©liorations

### ‚úÖ Probl√®mes r√©solus

1. **Parsing Mermaid am√©lior√©**
   - ‚úÖ Support des **mindmaps** avec parsing par indentation
   - ‚úÖ D√©tection automatique du type de diagramme (mindmap/flowchart/graph)
   - ‚úÖ Nettoyage des labels (HTML, Markdown, descriptions)

2. **Labels nettoy√©s**
   - ‚úÖ Suppression de `<br/>`, `<br>`
   - ‚úÖ Suppression de Markdown (```)
   - ‚úÖ Suppression des descriptions entre parenth√®ses
   - ‚úÖ Limitation √† 30 caract√®res max
   - ‚úÖ Extraction du concept principal uniquement

3. **Visualisation adaptative**
   - ‚úÖ Layout adapt√© selon le type (mindmap ‚Üí radial, graphe ‚Üí spring/kamada-kawai)
   - ‚úÖ Taille des n≈ìuds adapt√©e au nombre total
   - ‚úÖ Taille de police adapt√©e (6-10pt selon densit√©)
   - ‚úÖ Taille de figure adapt√©e (12x8 ‚Üí 20x16)
   - ‚úÖ Param√®tres spring_layout am√©lior√©s (k=3, iterations=200)

4. **Gestion des types de diagrammes**
   - ‚úÖ Mindmap: structure hi√©rarchique avec indentation
   - ‚úÖ Flowchart: graphe dirig√© standard
   - ‚úÖ Graph: graphe non dirig√©

5. **Visualisation interactive (BONUS)**
   - ‚úÖ Support pyvis pour graphes complexes (>50 n≈ìuds)
   - ‚úÖ Export HTML interactif
   - ‚úÖ Navigation zoom/pan

### üéØ Avant/Apr√®s

**AVANT:**
```python
# Regex simpliste: r'(\w+)(?:\[(.+?)\])?(?:\s*-->|---)\s*(\w+)'
# Labels: "Interactive Sites<br/>(Google Museum, WebQuests...)"
# Layout: spring_layout(k=1, iterations=50)
```

**APR√àS:**
```python
# Parser sp√©cialis√© par type de diagramme
# Labels: "Interactive Sites"
# Layout adaptatif: radial pour mindmaps, kamada-kawai pour gros graphes
# Taille/police adapt√©es au nombre de n≈ìuds
```

In [None]:
import pickle

# Sauvegarder le DataFrame complet avec les graphes
with open('mermaid_graphs.pkl', 'wb') as f:
    pickle.dump(df, f)

print(f"‚úì Dataset sauvegard√©: {len(df)} graphes")
print(f"  - Fichier: mermaid_graphs.pkl")
print(f"  - Colonnes: {df.columns.tolist()}")

# Sauvegarder les r√©sultats