# Named Entity Patterns

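Named entity patterns are dictionaries with two keys: "label", specifying the label to assign to the entity if the pattern is matched, and "pattern", the match pattern. The entity ruler accepts two types of patterns: phrase patterns (a string, matched exactly) and token patterns (a list of dictionaries, each describing one token). The examples below are token patterns: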

```json
{"label": "CON-BIM-CATG", "pattern": [{"LOWER": "air"}, {"LOWER": "terminals"}]}
{"label": "CON-BIM-CATG", "pattern": [{"LOWER": "air"}, {"LEMMA": "terminal"}]}
{"label": "CON-BIM-CATG", "pattern": [{"LOWER": "electrical"}, {"LOWER": "equipment"}]}
{"label": "CON-BIM-CATG", "pattern": [{"LOWER": "electrical"}, {"LEMMA": "equipment"}]}
```

## Objectives
 - Generate a pattern file (in `.jsonl` format) that can be loaded into a spaCy EntityRuler component, making it easy to combine rule-based and statistical named entity recognition in a single pipeline.
 - Generate sample training data from the phrase templates


### Imports

In [4]:
import copy
import inflect
import random
import spacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.training import (
    offsets_to_biluo_tags,
    biluo_to_iob
)
from tqdm import tqdm
from utilities import (
    csv_to_list,
    jsonl_to_list,
    write_jsonl
)

### Define Utility Functions

In [5]:
# Load the natural language pipeline ("en_core_web_trf"); this single `nlp`
# object is shared by the helper functions below for tokenizing and tagging.
nlp = spacy.load("en_core_web_trf")

# Init the inflection engine used by make_plural().
# NOTE(review): the name `infection` looks like a typo for `inflection`; it is
# kept unchanged because other functions in this file reference it by this name.
infection = inflect.engine()

def is_plural(word):
    """
    Tests if the last token of a word or phrase is tagged as a plural noun.

    Args:
        word (str): The word or phrase to test.

    Returns:
        bool: True when the final token's fine-grained tag is 'NNS' or
        'NNPS'; False otherwise, including for input with no tokens.
    """
    tokens = nlp(word)

    # A doc without tokens has no last token to inspect.
    if len(tokens) == 0:
        return False

    # 'NN' (a singular or mass noun) is intentionally not counted as plural.
    return tokens[-1].tag_ in ('NNS', 'NNPS')

def make_plural(word):
    """
    Returns the plural form of a string.

    Args:
        word (str): The string to pluralize.

    Returns:
        The value produced by the inflect engine's ``plural_noun`` for ``word``.
    """
    pluralized = infection.plural_noun(word)
    return pluralized

def make_patterns(label, text):
    """
    Makes the list of label patterns for the given text.

    Two token patterns are produced: one that matches every token by its
    lower-cased text, and one that is identical except that the final token
    is matched by its lower-cased lemma.

    Args:
        label (str): The label to apply to the text pattern.
        text (str): The text pattern.

    Returns:
        list: Two ``{"label": ..., "pattern": [...]}`` dictionaries, or an
        empty list when the text contains no tokens.
    """
    tokens = list(nlp(text))

    # Without tokens there is nothing to match on, so emit no patterns rather
    # than two patterns with empty token lists.
    if not tokens:
        return []

    # Make the lower case patterns
    lower_case_patterns = [{"LOWER": token.text.lower()} for token in tokens]

    # Lemmatize the text: only the last token is matched on its lemma, every
    # preceding token is still matched on its lower-cased text. Fresh
    # dictionaries are built so the two patterns share no objects.
    lemma_patterns = [{"LOWER": token.text.lower()} for token in tokens[:-1]]
    lemma_patterns.append({"LEMMA": tokens[-1].lemma_.lower()})

    return [
        {"label": label, "pattern": lower_case_patterns},
        {"label": label, "pattern": lemma_patterns},
    ]

def make_bim_label_patterns(named_entities):
    """
    Makes a list of BIM label patterns to apply for use in training an ML model.

    Patterns are generated, in this order, for every category, then per
    category for its families followed by the types of each family, and
    finally for the word 'Level'.

    Args:
        named_entities (list): Named entity definitions, each a dictionary
            with a 'label' key. Items 0, 1, 2 and 4 supply the labels used
            for categories, families, types and levels respectively.

    Returns:
        list: The pattern dictionaries produced by make_patterns().
    """
    csv_file = './data/revit_categories_families_types.csv'
    cat_fam_types = csv_to_list(csv_file)

    category_label = named_entities[0]['label']
    family_label = named_entities[1]['label']
    type_label = named_entities[2]['label']
    # NOTE(review): index 3 is skipped here; confirm against named_entities.jsonl.
    level_label = named_entities[4]['label']

    data = []

    # Create a unique, sorted list of all the categories
    categories = sorted({str(item['category']) for item in cat_fam_types})
    for category in categories:
        data.extend(make_patterns(category_label, category))

    for category in categories:

        # Create a unique set of the families in this category. It is sorted
        # so the generated file has the same order on every run; iterating a
        # set of strings directly does not guarantee that.
        families = sorted(
            {str(d['family']) for d in cat_fam_types if d['category'] == category}
        )
        for family in families:
            data.extend(make_patterns(family_label, family))

        for family in families:

            # Create a unique, sorted set of types in this family.
            # NOTE(review): rows are matched on the family name only, so a
            # family name used in several categories pools their types.
            types = sorted(
                {str(d['type']) for d in cat_fam_types if d['family'] == family}
            )
            for type_name in types:
                data.extend(make_patterns(type_label, type_name))

    # Make Levels labels
    data.extend(make_patterns(level_label, 'Level'))

    return data


def make_rule_patterns_file():
    """
    Builds the list of label patterns to apply for use in training an ML model.

    The named entity definitions are read from './data/named_entities.jsonl'
    and turned into BIM label patterns. Despite its name, this function only
    returns the patterns; it does not write a file itself.
    """
    named_entities = jsonl_to_list('./data/named_entities.jsonl')

    # Make the BIM labels and hand them back as a new list
    return list(make_bim_label_patterns(named_entities))

def _fill_examples(examples, phrase):
    """Returns a deep copy of every example with `phrase` formatted into its text."""
    rows = []
    for example in examples:
        row = copy.deepcopy(example)
        row['text'] = example['text'].format(phrase)
        rows.append(row)
    return rows


def _family_phrase(family, category):
    """Returns the phrase used for a family in a category, or None if the family has no tokens."""
    tokens = nlp(family)
    if len(tokens) == 0:
        return None

    first_tag = tokens[0].tag_
    last_token = tokens[-1]
    last_tag = last_token.tag_
    ends_with_noun = last_tag.startswith('N')
    ends_with_plural = last_tag in ('NNPS', 'NNS')

    # The family isn't a noun so concat the family + category
    if not ends_with_noun:
        return family + ' ' + category

    # The family is a modified noun so concat the family + category, unless
    # its (pluralized) last token already equals the category.
    if first_tag == 'JJ':
        # An already-plural token is compared as-is; pluralizing it a second
        # time would never match the category.
        if ends_with_plural:
            head = last_token.text
        else:
            head = make_plural(last_token.text)
        if head.strip().lower() != category.lower():
            return family + ' ' + category

    # Otherwise refer to the family itself, in its plural form
    return family if ends_with_plural else make_plural(family)


def make_training_data():
    """
    Makes a list of training data.

    Every phrase example from './data/phrase_examples.jsonl' is filled in once
    per category and once per family of that category, using the lower-cased
    category or family phrase. The navigation examples from
    './data/navigation_examples.jsonl' are then appended repeatedly to balance
    the data set.

    Returns:
        list: The generated example dictionaries followed by the repeated
        navigation examples.
    """
    jsonl_file = './data/phrase_examples.jsonl'
    examples = jsonl_to_list(jsonl_file)

    csv_file = './data/revit_categories_families_types.csv'
    cat_fam_types = csv_to_list(csv_file)

    # Create a unique, sorted list of all the categories
    categories = sorted({str(item['category']) for item in cat_fam_types})

    data = []

    for category in categories:
        data.extend(_fill_examples(examples, category.lower()))

        # Create a unique set of the families in this category; sorted so the
        # generated rows come out in the same order on every run.
        families = sorted(
            {str(d['family']) for d in cat_fam_types if d['category'] == category}
        )

        for family in families:
            phrase = _family_phrase(family, category)
            # Skip families with no tokens (e.g. an empty string)
            if phrase is None:
                continue
            data.extend(_fill_examples(examples, phrase.lower()))

    navigation_examples = jsonl_to_list('./data/navigation_examples.jsonl')

    # Balance the data set by simply repeating the number of navigation commands
    # NOTE(review): the divisors 3 and 6 are not explained in this file.
    count = round((len(data) / 3) / 6)

    # Make Navigation Data
    for _ in range(count):
        data.extend(navigation_examples)
    return data

### Generate Patterns File

In [7]:
PATTERNS_OUTPUT_FILE = './data/patterns.jsonl'

# Generate the rule patterns data
data = make_rule_patterns_file()
# Write the dataset out to a jsonl file
write_jsonl(data, PATTERNS_OUTPUT_FILE)

# Test applying the patterns to an entity ruler. The ruler is only added (and
# the count printed) when the pipeline does not have an "entity_ruler" yet.
if not nlp.has_pipe("entity_ruler"):
    config = {"overwrite_ents": True}
    ruler = nlp.add_pipe("entity_ruler", config=config).from_disk(PATTERNS_OUTPUT_FILE)
    print("✅ Generated custom patterns:", len(ruler.patterns))

✅ Generated custom patterns: 640


### Generate Training Data

In [6]:
TRAINING_DATA_OUTPUT_FILE = './data/training.jsonl'
# Build the training rows (filled-in phrase examples plus navigation examples)
data = make_training_data()
# Shuffling is currently disabled, so rows keep the order they were generated in
#random.shuffle(data)
# Write the dataset out to a jsonl file
write_jsonl(data, TRAINING_DATA_OUTPUT_FILE)
print("✅ Generated training data:", len(data))

✅ Generated training data: 20121
