# Pattern Config Notebook
## XYZ Hotline: spaCy Narrative Extraction Scripts
Use this notebook to add and update patterns in the spaCy Entity Ruler.
**Note:** This is not a "complete" representation of the config notebook, but enough to give you an idea. 

### 1. Add Patterns
1. The content of the patterns comes from configPatterns, a configuration file where patterns of interest are recorded in dictionary format. 
2. The dictionary is loaded into this notebook (by running the code cells below), with all of the variables necessary to populate the **Pattern Dictionary**.
3. The Pattern Dictionary is used to populate the **Pattern File** for the **spaCy Entity Ruler**.
4. To add new patterns, find the appropriate category, and enter your pattern using the Entity Ruler Pattern Syntax (supports regular expressions and Python methods).
5. For guidance on the pattern syntax see: https://spacy.io/api/entityruler
### 2. Update Ruler
1. When you are satisfied with your output, follow the prompts in the "Update Ruler" section at the bottom of this notebook. 
2. This will change your ruler, so use caution, or provide an alternative filename for the updated ruler to preserve your base ruler.

In [None]:
import warnings 
warnings.filterwarnings('ignore')
from datetime import datetime

from configPatterns import *
import spacy

# Load the transformer pipeline and attach an entity ruler that overwrites
# overlapping entities and validates every pattern it is given.
nlp = spacy.load("en_core_web_trf")
config = dict(overwrite_ents=True, validate=True)
ruler = nlp.add_pipe("entity_ruler", config=config)

# PERSON followed by a suffix: optional leading token, a token already
# tagged PERSON, optional comma, optional filler token, the suffix itself
# (compared lower-cased) and one optional trailing token.
# `suffixes` is not defined in this cell -- presumably supplied by the
# configPatterns star import; verify.
personSuffix = [
    {
        "label": "PERSON",
        "pattern": [
            {"OP": "?"},
            {"ENT_TYPE": "PERSON"},
            {"ORTH": ",", "OP": "?"},
            {"OP": "?"},
            {"LOWER": sfx},
            {"OP": "?"},
        ],
    }
    for sfx in suffixes
]

# PERSON preceded by a prefix: the prefix (compared lower-cased), one
# optional filler token, then a token already tagged PERSON.
# `prefixes` is likewise expected to come from configPatterns -- verify.
personPrefix = [
    {
        "label": "PERSON",
        "pattern": [
            {"LOWER": pfx},
            {"OP": "?"},
            {"ENT_TYPE": "PERSON"},
        ],
    }
    for pfx in prefixes
]
                
# Attributes
# Phone numbers contained in a single token: one pattern per regular
# expression in `rgxPhone` (presumably from configPatterns -- verify).
attrPhone = [
    {"label": "attrPHONE", "pattern": [{"TEXT": {"REGEX": rgx}}]}
    for rgx in rgxPhone
]

# Phone numbers spread over several tokens: optional "(", three digits,
# optional ")", optional "-", three digits, a required "-", four digits.
attrPhone.append(
    {
        "label": "attrPHONE",
        "pattern": [
            {"ORTH": "(", "OP": "?"},
            {"SHAPE": "ddd"},
            {"ORTH": ")", "OP": "?"},
            {"ORTH": "-", "OP": "?"},
            {"SHAPE": "ddd"},
            {"ORTH": "-"},
            {"SHAPE": "dddd"},
        ],
    }
)

# Social media names: one single-token pattern per entry in `socialMedia`.
attrOnline = [
    {"label": "attrSOCIAL", "pattern": [{"lower": site}]}
    for site in socialMedia
]

# URLs and e-mail addresses use spaCy's built-in lexical flags; IP addresses
# are matched by token shape, and only the two shapes listed here are covered.
attrOnline.extend(
    [
        {"label": "attrURL", "pattern": [{"LIKE_URL": True}]},
        {"label": "attrEMAIL", "pattern": [{"LIKE_EMAIL": True}]},
        {"label": "attrIP", "pattern": [{"SHAPE": "ddd.d.d.d"}]},
        {"label": "attrIP", "pattern": [{"SHAPE": "ddd.ddd.d.d"}]},
    ]
)
                
# NOTE: attrTIN, attrSSN and attrEIN were each defined twice in a row with
# identical content; only one definition of attrTIN and attrSSN is kept here.
# attrEIN is defined once, directly after this section.

# Taxpayer Identification Number: the keyword "tin", up to two arbitrary
# tokens, a number-like token, an optional hyphen and a second number-like
# token.
attrTIN = [
    {"label": "attrTIN",
     "pattern": [{"LOWER": "tin"}, {"OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True},
                 {"ORTH": "-", "OP": "?"}, {"LIKE_NUM": True}]},
]

# Social Security Number.
# The first three patterns share one tail: up to two arbitrary tokens, a
# required number-like token, then up to two more (separator, number) pairs
# in which every token is optional.
attrSSN = [
    # Keyword "ssn".
    {"label": "attrSSN",
     "pattern": [{"LOWER": "ssn"}, {"OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True},
                 {"OP": "?"}, {"LIKE_NUM": True, "OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True, "OP": "?"}]},

    # Keyword "ss#".
    # NOTE(review): spaCy's default tokenizer may split a trailing "#" into
    # its own token, in which case "ss#" never occurs as a single token and
    # this pattern cannot fire -- confirm against real narratives.
    {"label": "attrSSN",
     "pattern": [{"LOWER": "ss#"}, {"OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True},
                 {"OP": "?"}, {"LIKE_NUM": True, "OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True, "OP": "?"}]},

    # Keyword "social security". A token never contains spaces, so the
    # previous single-token anchor {"LOWER": "social security #"} could not
    # match; the phrase is now anchored one token at a time. A following "#"
    # is absorbed by the optional wildcard tokens.
    {"label": "attrSSN",
     "pattern": [{"LOWER": "social"}, {"LOWER": "security"},
                 {"OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True},
                 {"OP": "?"}, {"LIKE_NUM": True, "OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True, "OP": "?"}]},

    # Shape-based: optional "ssn", then ddd / dd / dddd with optional hyphens
    # between the groups.
    {"label": "attrSSN",
     "pattern": [{"LOWER": "ssn", "OP": "?"},
                 {"SHAPE": "ddd"},
                 {"ORTH": "-", "OP": "?"},
                 {"SHAPE": "dd"},
                 {"ORTH": "-", "OP": "?"},
                 {"SHAPE": "dddd"}]},

    # Single token matching the regular expression `rgxSSN`
    # (presumably supplied by configPatterns -- verify).
    {"label": "attrSSN",
     "pattern": [{"TEXT": {"REGEX": rgxSSN}}]},
]

# Employer Identification Number: three variants anchored on the keyword "ein".
attrEIN = [
    # "ein", up to two arbitrary tokens, then a number-like token.
    {"label": "attrEIN",
     "pattern": [{"LOWER": "ein"}, {"OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True}]},
    # "ein", one optional token, optional "#", a number-like token, one
    # optional token and a second number-like token.
    {"label": "attrEIN",
     "pattern": [{"LOWER": "ein"}, {"OP": "?"},
                 {"ORTH": "#", "OP": "?"}, {"LIKE_NUM": True},
                 {"OP": "?"}, {"LIKE_NUM": True}]},
    # "ein" followed directly by three number-like tokens with optional
    # hyphens between them.
    {"label": "attrEIN",
     "pattern": [{"LOWER": "ein"}, {"LIKE_NUM": True},
                 {"ORTH": "-", "OP": "?"}, {"LIKE_NUM": True},
                 {"ORTH": "-", "OP": "?"}, {"LIKE_NUM": True}]},
]

# Street line: a number-like token, up to two arbitrary tokens, then a street
# word from `streets` (compared lower-cased).
# This assignment previously started with a stray leading space, which is an
# IndentationError at module level; it now starts at column 0.
addrLine1 = [
    {"label": "addrLine1",
     "pattern": [{"LIKE_NUM": True},
                 {"OP": "?"}, {"OP": "?"},
                 {"LOWER": street}]}
    for street in streets
]

# Secondary line: a designator from `designators`, one optional token, a
# number-like token, one optional token and an optional trailing comma.
addrLine2 = [
    {"label": "addrLine2",
     "pattern": [{"LOWER": des}, {"OP": "?"},
                 {"LIKE_NUM": True}, {"OP": "?"},
                 {"ORTH": ",", "OP": "?"}]}
    for des in designators
]

# Locality line: two GPE-tagged tokens with optional tokens between and after
# them, followed by an optional number / hyphen / number group
# (presumably city, state and ZIP or ZIP+4 -- confirm).
addrLine3 = [
    {"label": "addrLine3",
     "pattern": [{"ENT_TYPE": "GPE"}, {"OP": "?"},
                 {"ENT_TYPE": "GPE"}, {"OP": "?"},
                 {"OP": "?"}, {"LIKE_NUM": True, "OP": "?"},
                 {"ORTH": "-", "OP": "?"}, {"LIKE_NUM": True, "OP": "?"}]},
]

# The word "address", up to two arbitrary tokens, then a token whose entity
# type is already "addrLine1".
# NOTE(review): this needs addrLine1 entities to exist on the doc before this
# pattern is evaluated -- confirm that holds when all patterns live in the
# same ruler.
addrLike = [
    {"label": "addrLike",
     "pattern": [{"LOWER": "address"}, {"OP": "?"},
                 {"OP": "?"}, {"ENT_TYPE": "addrLine1"}]},
]
                         
# Government Entities
# Every label is assembled from three source collections (presumably imported
# from configPatterns -- verify): two that yield one lower-cased token per
# entry, and a double-underscore one whose entries are iterated to produce
# one lower-cased token per element.

# Government programs
govtProgs = [
    {"label": "govProg", "pattern": [{"LOWER": name}]} for name in govtP
]
govtProgs.extend(
    [{"label": "govProg", "pattern": [{"LOWER": name}]} for name in govtProgs_]
)
govtProgs.extend(
    [
        {"label": "govProg", "pattern": [{"LOWER": tok} for tok in words]}
        for words in govtProgs__
    ]
)

# Government organizations
govtOrgs = [
    {"label": "govORG", "pattern": [{"LOWER": name}]} for name in govtO
]
govtOrgs.extend(
    [{"label": "govORG", "pattern": [{"LOWER": name}]} for name in govtOrgs_]
)
govtOrgs.extend(
    [
        {"label": "govORG", "pattern": [{"LOWER": tok} for tok in words]}
        for words in govtOrgs__
    ]
)

# Government investigative organizations
govtInvOrgs = [
    {"label": "govInvORG", "pattern": [{"LOWER": name}]} for name in govtI
]
govtInvOrgs.extend(
    [{"label": "govInvORG", "pattern": [{"LOWER": name}]} for name in govtInvOrgs_]
)
govtInvOrgs.extend(
    [
        {"label": "govInvORG", "pattern": [{"LOWER": tok} for tok in words]}
        for words in govtInvOrgs__
    ]
)

# Financial Monitoring Services, such as LexisNexis
# Each name below is read as the source collection and then rebound to the
# resulting list of single-token patterns.
finSvc = [
    {"label": "finSvc", "pattern": [{"LOWER": entry}]} for entry in finSvc
]
finApp = [
    {"label": "finApp", "pattern": [{"LOWER": entry}]} for entry in finApp
]
finCrypto = [
    {"label": "finCrypto", "pattern": [{"LOWER": entry}]} for entry in finCrypto
]

# Third Party Lenders, Banks, and Financial Institutions
# Known lenders: `finLend_` yields one lower-cased token per entry, while
# each `finLend__` entry is iterated to produce one token per element
# (both presumably come from configPatterns -- verify).
finLender = [
    {"label": "finORG", "pattern": [{"LOWER": lend}]}
    for lend in finLend_
]
finLender += [
    {"label": "finORG", "pattern": [{"LOWER": tok} for tok in lend]}
    for lend in finLend__
]

# Generic institution names (label finORG1): optional leading token(s)
# followed by a token that matches an institution keyword.
finLender += [
    {"label": "finORG1",
     "pattern": [{"OP": "?"}, {"TEXT": {"REGEX": r"[Bb]ank$"}}]},
    # REGEX is applied to one token at a time and a token never contains a
    # space, so the former single-token expression "[Cc]redit [Uu]nion$"
    # could not match. It is split into two consecutive token expressions
    # that accept the same text.
    {"label": "finORG1",
     "pattern": [{"OP": "?"},
                 {"TEXT": {"REGEX": r"[Cc]redit$"}},
                 {"TEXT": {"REGEX": r"^[Uu]nion$"}}]},
    {"label": "finORG1",
     "pattern": [{"OP": "?"}, {"OP": "?"},
                 {"TEXT": {"REGEX": r"[Ff]inancial.*$"}}]},
    {"label": "finORG1",
     "pattern": [{"OP": "?"}, {"OP": "?"},
                 {"TEXT": {"REGEX": r"[Ii]nvestments.*$"}}]},
    {"label": "finORG1",
     "pattern": [{"OP": "?"}, {"OP": "?"},
                 {"TEXT": {"REGEX": r"[Ff]unding.*$"}}]},
]

# Financial events: a keyword from `finEvent_` with up to two optional
# tokens of context on either side.
finEvents = [
    {"label": "finEVENT",
     "pattern": [{"OP": "?"},
                 {"OP": "?"}, {"LOWER": event},
                 {"OP": "?"}, {"OP": "?"}]}
    for event in finEvent_
]

# Multi-token events: each `finEvent__` entry is iterated to produce one
# lower-cased token per element.
finEvents += [
    {"label": "finEVENT", "pattern": [{"LOWER": tok} for tok in event]}
    for event in finEvent__
]

# The single token "cafs".
finEvents += [{"label": "finEVENT", "pattern": [{"LOWER": "cafs"}]}]
                   
# Financial Details, like account numbers
# acct pattern, followed by #, and then a number
#
# Three variants per account keyword in `accts` (presumably supplied by
# configPatterns -- verify). They differ only in how the trailing number is
# recognised: number-like token, CARDINAL entity, or a four-digit shape.
# The optional leading token uses the entity type "govProg", which is the
# label the government-program patterns in this notebook assign; the
# previous spelling "govtProg" matched no label defined here.
loanNum = [
    {"label": "acctNum",
     "pattern": [{"ENT_TYPE": "govProg", "OP": "?"},
                 {"LOWER": acct}, {"OP": "?"},
                 {"ORTH": "#", "OP": "?"},
                 {"LIKE_NUM": True}]}
    for acct in accts
]

loanNum += [
    {"label": "acctNum",
     "pattern": [{"ENT_TYPE": "govProg", "OP": "?"},
                 {"LOWER": acct}, {"OP": "?"},
                 {"ORTH": "#", "OP": "?"},
                 {"ENT_TYPE": "CARDINAL"}]}
    for acct in accts
]

loanNum += [
    {"label": "acctNum",
     "pattern": [{"ENT_TYPE": "govProg", "OP": "?"},
                 {"LOWER": acct}, {"OP": "?"},
                 {"ORTH": "#", "OP": "?"},
                 {"SHAPE": "dddd"}]}
    for acct in accts
]

# Program-specific loan numbers. This is now `+=`: the previous plain `=`
# silently discarded every acctNum pattern built above. The "sba" entry,
# which was listed twice verbatim, appears once.
loanNum += [
    {"label": "loanNum",
     "pattern": [{"LOWER": "ppp"}, {"ORTH": "#", "OP": "?"}, {"LIKE_NUM": True}]},
    {"label": "loanNum",
     "pattern": [{"LOWER": "eidl"}, {"ORTH": "#", "OP": "?"}, {"LIKE_NUM": True}]},
    {"label": "loanNum",
     "pattern": [{"LOWER": "sba"}, {"ORTH": "#", "OP": "?"}, {"LIKE_NUM": True}]},
    # NOTE(review): REGEX is evaluated against one token at a time, and every
    # expression below contains spaces, so these presumably never match under
    # whitespace-splitting tokenization. They are left unchanged; confirm and
    # convert to multi-token patterns if loanINFO entities are wanted.
    {"label": "loanINFO", "pattern": [{"TEXT": {"REGEX": r"ppp #[0-9]+"}}]},
    {"label": "loanINFO", "pattern": [{"TEXT": {"REGEX": r"eidl #[0-9]+"}}]},
    {"label": "loanINFO", "pattern": [{"TEXT": {"REGEX": r"sba loan #[0-9]+"}}]},
    {"label": "loanINFO", "pattern": [{"TEXT": {"REGEX": r"application #[0-9]+"}}]},
    {"label": "loanINFO", "pattern": [{"TEXT": {"REGEX": r"ppp #[0-9]+ in the amout of \$.*"}}]},
    {"label": "loanINFO", "pattern": [{"TEXT": {"REGEX": r"eidl #[0-9]+ in the amout of \$.*"}}]},
    {"label": "loanINFO", "pattern": [{"TEXT": {"REGEX": r"sba loan #[0-9]+ in the amout of \$.*"}}]},
]

# Complainant reference such as "agent #123".
# NOTE(review): same single-token REGEX caveat as the loanINFO patterns above.
complaintant = [
    {"label": "personCMPLNT", "pattern": [{"TEXT": {"REGEX": r"agent #[0-9]+"}}]},
]

# Strings matching this pattern -- r"^LN#.*"
loanNum += [
    {"label": "loanNum", "pattern": [{"TEXT": {"REGEX": r"^LN#.*"}}]},
]

# Cardinal numbers of length 9, 10 or 11 that were not previously captured,
# each preceded by two arbitrary tokens. The entity type is spelled
# "CARDINAL"; the previous "CARDIAL" matched no entity type.
loanNum += [
    {"label": "loanNum", "pattern": [{}, {}, {"ENT_TYPE": "CARDINAL", "LENGTH": 9}]},
    {"label": "loanNum", "pattern": [{}, {}, {"ENT_TYPE": "CARDINAL", "LENGTH": 10}]},
    {"label": "loanNum", "pattern": [{}, {}, {"ENT_TYPE": "CARDINAL", "LENGTH": 11}]},
]
            
# Currency amounts, labelled MONEY. Two patterns:
#   1. A multi-token pattern that must begin with "<", "/", an optional "li"
#      and ">", followed by up to three arbitrary tokens, a "$" and a
#      number-like token. Everything after that is optional: comma/number
#      groups interleaved with runs of an optional "<", up to three arbitrary
#      tokens (four in the run that also allows "/") and an optional ">".
#      NOTE(review): presumably written to capture dollar amounts embedded in
#      HTML list markup -- confirm against the source narratives.
#   2. A single token whose text starts with "$" followed by digits (with an
#      optional decimal part) and any trailing characters.
currency = [{"label":"MONEY",
             "pattern": [{"ORTH": "<"}, {"ORTH": "/"}, {"ORTH": "li", "OP": "?"},
                         {"ORTH": ">"}, {"OP": "?"}, {"OP": "?"}, {"OP": "?"}, 
                         {"ORTH": "$"}, {"LIKE_NUM": True},
                         {"ORTH": ",", "OP": "?"}, {"LIKE_NUM": True, "OP": "?"},
                         {"ORTH": "<", "OP": "?"}, {"OP": "?"}, {"OP": "?"},
                         {"OP": "?"}, {"ORTH": ">", "OP": "?"},
                         {"ORTH": ",", "OP": "?"}, {"LIKE_NUM": True, "OP": "?"},
                         {"ORTH": "<", "OP": "?"}, {"OP": "?"}, {"OP": "?"},
                         {"OP": "?"}, {"ORTH": ">", "OP": "?"},
                         {"ORTH": "<", "OP": "?"}, {"ORTH": "/", "OP": "?"},
                         {"OP": "?"}, {"OP": "?"},
                         {"OP": "?"}, {"ORTH": ">", "OP": "?"},
                         {"ORTH": ",", "OP": "?"}, {"LIKE_NUM": True, "OP": "?"},
                         {"ORTH": "<", "OP": "?"}, {"OP": "?"}, {"OP": "?"},
                         {"OP": "?"}, {"ORTH": ">", "OP": "?"}]},
    
 # Single-token dollar amount.
 {"label":"MONEY",
   "pattern": [{"TEXT": {"REGEX": r'^\$(\d*(\d\.?|\.\d{1,2})).*$'}}]}
 ]

# Bundle the individual pattern lists by category. Each name below is bound
# to a tuple holding the concatenated patterns in the order listed.
# NOTE: `govtOrgs_` is also read as a source collection earlier in this cell,
# so from this line on it refers to the combined pattern tuple instead.
persons_ = tuple(personSuffix + personPrefix)
attrs_ = tuple(attrTIN + attrSSN + attrEIN + attrPhone + attrOnline)
addrs_ = tuple(addrLine1 + addrLine2 + addrLine3 + addrLike)
govtOrgs_ = tuple(govtOrgs + govtProgs + govtInvOrgs)
finOrgs_ = tuple(finLender + finEvents + finSvc + finCrypto + finApp + currency)

In [None]:
def configPatterns(patterns: list, model: str = "../../src/bootstrapModel") -> str:
    """Build an entity ruler from ``patterns`` and save it to disk.

    A fresh ``en_core_web_trf`` pipeline is loaded and an ``entity_ruler``
    component (``overwrite_ents`` and ``validate`` both enabled) is added and
    populated with ``patterns``. The whole pipeline is then written to
    ``model``, and the ruler is additionally written to
    ``<model>/patterns.jsonl`` and ``<model>/ruler``.

    Args:
        patterns: Entity Ruler pattern dictionaries. Any iterable is
            accepted (the category bundles in this notebook are tuples); it
            is copied into a list before being handed to the ruler.
        model: Output directory. Defaults to the location that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        The path of the written pattern file, ``<model>/patterns.jsonl``.
    """
    nlp = spacy.load("en_core_web_trf")
    config = {"overwrite_ents": True, "validate": True}
    ruler = nlp.add_pipe("entity_ruler", config=config)
    ruler.add_patterns(list(patterns))

    patternPath = f"{model}/patterns.jsonl"
    rulerPath = f"{model}/ruler"

    # Save the pipeline first, then the two ruler exports that live inside
    # the same directory.
    nlp.to_disk(model)
    ruler.to_disk(patternPath)
    ruler.to_disk(rulerPath)
    return patternPath