In [1]:
import os, sys, math, logging, random
from pathlib import Path

import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Route log records at INFO level and above to stdout instead of logging's
# default stderr stream.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

import pandas as pd
# Show up to 200 characters per column value before pandas truncates it.
pd.set_option('display.max_colwidth', 200)

import yamlu

from pybpmn.parser import BpmnParser, InvalidBpmnException
from yamlu.img import AnnotatedImage, Annotation

In [2]:
# Collect the BPMN XML files stored under ~/ws/sapsam/xml; the trailing
# expression displays how many were found.
sapsam_xml_dir = Path.home().joinpath("ws", "sapsam", "xml")
bpmn_paths = yamlu.glob(sapsam_xml_dir, "*.xml")
len(bpmn_paths)

5989

In [8]:
from tqdm import tqdm

# Parse every file in `bpmn_paths` into annotations and keep a record of each
# file that pybpmn rejects with InvalidBpmnException, so the failures can be
# tabulated afterwards. `logging`, `BpmnParser` and `InvalidBpmnException` are
# already imported by the notebook's first cell.

# Only let WARNING and above through from the pybpmn logger during the bulk run.
logging.getLogger("pybpmn").setLevel("WARNING")
parser = BpmnParser()

exception_records = []  # one dict per file rejected with InvalidBpmnException
anns = []               # annotations accumulated across all files that parsed
for p in tqdm(bpmn_paths):
    try:
        anns.extend(parser.parse_bpmn_anns(p))
    except InvalidBpmnException as e:
        # Expected failure mode: note it and move on to the next file.
        exception_records.append(
            {
                "bpmn_path": str(p),
                "error_type": e.error_type,
                "error_details": e.details,
            }
        )
    except Exception as e:
        # Anything else is unexpected: show which file triggered it, then
        # re-raise the active exception unchanged (bare `raise`) so the
        # traceback is propagated as-is.
        print(p, type(e))
        raise

# Number of rejected files, followed by the total number of annotations.
print(len(exception_records), len(anns))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5989/5989 [00:14<00:00, 421.21it/s]

559 341876





In [5]:
# Tabulate the recorded parse failures and count how often each error type
# occurred. The columns are named explicitly so that `error_type` exists even
# when `exception_records` is empty; without this, an empty list produces a
# frame with no columns and the lookup below raises instead of showing an
# empty count.
df = pd.DataFrame(
    exception_records,
    columns=["bpmn_path", "error_type", "error_details"],
)
df["error_type"].value_counts()

sequenceFlow has no targetRef attrib                              158
sequenceFlow has no sourceRef attrib                              157
association has no targetRef attrib                                73
messageFlow has no sourceRef attrib                                56
association has no sourceRef attrib                                51
messageFlow has no targetRef attrib                                43
Duplicate model element id                                          9
Invalid endEvent with multiple event definitions                    6
Invalid intermediateThrowEvent with multiple event definitions      3
Invalid boundaryEvent with multiple event definitions               2
BPMN Choreography diagrams are not implemented.                     1
Name: error_type, dtype: int64

In [16]:
from collections import Counter

# Raise the row display limit to 100 so the full per-category table is shown.
pd.set_option('display.max_rows', 100)

# Tally annotations by their `category` attribute, most frequent first.
category_tally = Counter(a.category for a in anns)
s = pd.Series(category_tally).sort_values(ascending=False)

# Number of distinct categories, followed by the counts themselves.
print(len(s))
s

71


label                                     107943
sequenceFlow                               88521
task                                       33350
exclusiveGateway                           13486
messageFlow                                12803
lane                                       10455
endEvent                                    9280
association                                 8476
pool                                        8248
dataAssociation                             6283
textAnnotation                              5600
startEvent                                  4492
messageIntermediateCatchEvent               4436
dataObject                                  4119
parallelGateway                             3314
sendTask                                    2670
subProcessCollapsed                         2131
timerIntermediateEvent                      1840
messageStartEvent                           1658
messageIntermediateThrowEvent               1507
eventBasedGateway   