In [None]:
# Python RegEx example
import re

text = "Call me at 832-123-5555 or 425-123-4567 for more information."
pattern = r"\d{3}-\d{3}-\d{4}"  # 3 digits, dash, 3 digits, dash, 4 digits

# re.finditer returns a one-shot iterator; materialise it so `matches`
# is still usable (not silently empty) after the loop below has run.
matches = list(re.finditer(pattern, text))

# For every hit, show its character span followed by the matched text.
for match in matches:
    print(match.start())
    print(match.end())
    print(match.group())
    print()

# Expected output (kept as comments: a bare string literal at the end of a
# cell would be echoed back by the notebook as the cell's result):
#
#     11
#     23
#     832-123-5555
#
#     27
#     39
#     425-123-4567

11
23
832-123-5555

27
39
425-123-4567



In [None]:
import spacy 

# Sample inputs: `text` contains only phone numbers, while `complex_text`
# mixes an e-mail address, a URL and a phone number.
text = "Call me at 832-123-5555 or 425-123-4567 for more information."
complex_text = "Contact us at info@company.com or visit https://www.company.com. Call 555-123-4567."

# Create a blank English pipeline and attach an entity ruler to it; the
# returned component handle is kept so patterns can be registered on it.
nlp = spacy.blank("en")
entity_ruler = nlp.add_pipe("entity_ruler")

# Number Pattern: token-level equivalent of the regex \d{3}-\d{3}-\d{4}
number_pattern = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"SHAPE": "ddd"},       # 3 digits
            {"ORTH": "-"},          # exact dash
            {"SHAPE": "ddd"},       # 3 digits
            {"ORTH": "-"},          # exact dash
            # spaCy truncates runs of the same shape character after length 4,
            # so "dddd" alone also matches 5+ digit tokens; LENGTH pins it
            # to exactly 4 digits.
            {"SHAPE": "dddd", "LENGTH": 4}
        ]
    }
]

# Email pattern example: one token that spaCy's LIKE_EMAIL attribute flags
email_patterns = [
    dict(label="EMAIL", pattern=[dict(LIKE_EMAIL=True)]),
]

# URL pattern example: one token that spaCy's LIKE_URL attribute flags
url_patterns = [
    dict(label="URL", pattern=[dict(LIKE_URL=True)]),
]

# Register all three pattern groups on the ruler in a single call.
entity_ruler.add_patterns(number_pattern + email_patterns + url_patterns)

# Run both sample sentences through the pipeline.
doc = nlp(text)
complex_doc = nlp(complex_text)

# Print the text of every entity found, simple sentence first.
for parsed in (doc, complex_doc):
    for ent in parsed.ents:
        print(ent.text)

# Expected output (kept as comments: a bare string literal at the end of a
# cell would be echoed back by the notebook as the cell's result):
#
#     832-123-5555
#     425-123-4567
#     info@company.com
#     https://www.company.com
#     555-123-4567



832-123-5555
425-123-4567
info@company.com
https://www.company.com
555-123-4567


In [None]:



# spaCy EntityRuler with RegEx-like patterns
# Build a fresh blank English pipeline. add_pipe hands back the component it
# just created, so the EntityRuler handle is taken straight from that call.
nlp = spacy.blank("en")
entity_ruler = nlp.add_pipe("entity_ruler")

# Define spaCy pattern for phone numbers (3 digits - 3 digits - 4 digits)
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"SHAPE": "ddd"},  # 3 digits
            {"ORTH": "-"},     # exact dash
            {"SHAPE": "ddd"},  # 3 digits
            {"ORTH": "-"},     # exact dash
            # spaCy truncates runs of the same shape character after length 4,
            # so "dddd" alone also matches 5+ digit tokens; LENGTH pins it
            # to exactly 4 digits.
            {"SHAPE": "dddd", "LENGTH": 4}
        ]
    }
]

# Register the phone-number patterns, then run the sample sentence.
entity_ruler.add_patterns(patterns)
doc = nlp(text)

# Report each entity on three lines: text, label, character span.
print("\nspaCy EntityRuler matches:")
for ent in doc.ents:
    print(
        f"Entity: {ent.text}",
        f"Label: {ent.label_}",
        f"Start: {ent.start_char}, End: {ent.end_char}",
        sep="\n",
    )

# Extend the same ruler with the e-mail and URL patterns.
entity_ruler.add_patterns(email_patterns + url_patterns)

# A richer sentence that exercises all three entity types.
complex_text = "Contact us at info@company.com or visit https://www.company.com. Call 555-123-4567."
complex_doc = nlp(complex_text)

print("\nComplex pattern matching:")
for ent in complex_doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")
