In [1]:
import re
import spacy
import pandas as pd

from utils import patterns

# Name of the spaCy pipeline package used throughout this notebook.
SPACY_MODEL = "en_core_web_sm"

# Request GPU execution before the pipeline is loaded; spaCy falls back to
# the CPU when no GPU is available.
spacy.prefer_gpu()  # type: ignore
nlp = spacy.load(SPACY_MODEL)

In [2]:
# Load the input table and attach one parsed spaCy Doc per row of "text".
# nlp.pipe streams the texts through the pipeline; list() materialises the
# results so they can be stored as a column.
df = pd.read_feather("tmp.arrow")
df = df.assign(doc=list(nlp.pipe(df["text"])))

In [3]:
def _law_spans(doc):
    """Return the entity spans of ``doc`` whose label is "LAW"."""
    return [span for span in doc.ents if span.label_ == "LAW"]


def _span_text(span):
    """Return the surface text of an entity span."""
    return span.text


# Build a frame with one row per LAW entity:
#   1. collect the LAW spans of every parsed document,
#   2. explode them so each span gets its own row (documents without any
#      LAW span yield a single missing value, which dropna removes),
#   3. add the plain text of each span as "ent_text".
law_ents = (
    df.assign(ent=lambda frame: frame["doc"].map(_law_spans))
    .explode("ent")
    .dropna(subset=["ent"])
    .assign(ent_text=lambda frame: frame["ent"].map(_span_text))
)
law_ents.head(1)

Unnamed: 0,sentence_index,id,text,language,country,user_type,organization,surname,feedback,status,...,tr_number,scope,governance_level,full_name,source,language_detected,tokenized,doc,ent,ent_text
33,33,2665651,Article 7 of the current Proposal defines adve...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,...,,,,,attachment,en,"[article, 7, of, the, current, proposal, defin...","(Article, 7, of, the, current, Proposal, defin...","(Article, 7, of, the)",Article 7 of the


In [4]:
# Manually curated rejections: anchored regex fragments for LAW entities that
# are deliberately not extracted. The numbers in parentheses appear to be the
# mention counts observed when each fragment was added, as the original notes
# suggest.
rejection_fragments = (
    r"^Article$",  # (21) - generally unspecific (the Article, etc.), would require resolution
    r"^the Data Governance Act",  # (9) - not AI Act
    r"^Recommendation \d+",  # (8+6+6+x) - not AI Act
    r"^Recital \d+",  # (7+4+4+x) - too few mentions for each
    r"^section",  # (6+x) - not AI Act
    r"^the (?:European|EU) Charter of Fundamental Rights$|^Charter$",  # (5+3) - not AI Act
    r"^(?:the )?Coordinated Plan$",  # (9+5) - too few mentions
    r"^Act$",  # (5) - spaCy false positive (part of something larger)
    r"^a European Act$",  # (4) - all four also mention "AI Act" as abbreviation
    r"^Chapter 8$",  # (3) - not AI Act (also covered by the broader "^Chapter" below)
    r"^Paragraph 1$",  # (3) - can refer to part of many different articles
    r"^the Data Protection Law Enforcement Directive",  # (3) - not AI Act
    r"^the Data Act",  # (3) - not AI Act
    r"^the Cybersecurity Act",  # (3) - not AI Act
    r"^Article III",  # (3) - weird, not used often enough
    r"^the “Act",  # introduces an abbreviation (in a sentence that already uses "AI Act")
    r"^Chapter",  # (100+) - not a unique reference within AI Act
    r"^Article (?:114|88)",  # (6+4) - not AI Act
)

# Join the fragments into one alternation so a single regex pass can test
# an entity against every rejection.
rejected = "|".join(rejection_fragments)

# Keep only the entities that are still unaccounted for: their text matches
# neither the extraction pattern (patterns.PATTERN) nor any manual rejection.
# Both checks are case-insensitive.
ent_texts = law_ents["ent_text"]
extraction_hits = ent_texts.str.count(patterns.PATTERN, flags=re.IGNORECASE)
rejection_hits = ent_texts.str.count(rejected, flags=re.IGNORECASE)
filtered = law_ents[(extraction_hits == 0) & (rejection_hits == 0)]

In [5]:
# Report how many entities remain unresolved, then show the ten most frequent
# entity texts among them (value_counts sorts by descending frequency).
print(len(filtered))
unresolved_counts = filtered["ent_text"].value_counts()
unresolved_counts.head(10)

150


Article 35 GDPR                             8
Article 29 WP                               6
Article 29 Working Party                    5
Article 29 Data Protection Working Party    4
the Article 29 Working Party                3
the Geneva Convention                       2
Article 22 GDPR                             2
IT Security Act                             2
Article 9 GDPR                              2
Article 35(7)(a) GDPR                       2
Name: ent_text, dtype: int64