In [None]:
import re
import spacy

# Load the small English spaCy pipeline once; later steps reuse `nlp` and `doc`.
nlp = spacy.load("en_core_web_sm")

# Sample passage to analyse. It is hard-wrapped, so the string contains
# newline characters between some words.
text = """The Washington Monument is the most prominent structure in Washington,
D.C. and one of the city’s early attractions. It was built in honor of George
Washington, who led the country to independence and then became its first
President."""

# Run the pipeline over the passage to obtain tokens and named entities.
doc = nlp(text)

# Report every entity found by the model, one per line, as "text (LABEL)".
print("Named Entities (spaCy):")
for entity in doc.ents:
    print(entity.text, f"({entity.label_})")

# Heuristic 1: Titles
# An honorific token immediately followed by another token is reported as
# "<title> <next token>". A title that is the very last token has nothing
# after it and is therefore skipped.
titles = ["Dr.", "Mr.", "Mrs.", "Ms.", "Professor"]
doc_tokens = list(doc)
entities_with_titles = [
    f"{current.text} {following.text}"
    for current, following in zip(doc_tokens, doc_tokens[1:])
    if current.text in titles
]

print("\nHeuristic 1 - Titles:", entities_with_titles, sep="\n")

# Heuristic 2: Emails
# Scan the raw text (not the spaCy tokens) for address-shaped substrings:
# local part, "@", domain, and a final label of at least two letters.
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
emails = email_pattern.findall(text)
print("\nHeuristic 2 - Emails:", emails, sep="\n")

# Heuristic 3: Phone Numbers
# Optional "+", a 1-4 digit prefix, a 2-4 digit group (optionally in
# parentheses), then 3 and 4 digit groups; "-", "." or whitespace may
# separate the groups.
phone_pattern = re.compile(r'\+?\d{1,4}[-.\s]?\(?\d{2,4}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
phone_numbers = phone_pattern.findall(text)
print("\nHeuristic 3 - Phone Numbers:", phone_numbers, sep="\n")

# Heuristic 4: Organizations
# For every ORG entity, look at the two tokens that follow it; if one of them
# is a known corporate suffix, append the first such suffix to the name.
org_suffixes = ["Inc.", "Ltd.", "Corp.", "Company", "LLC"]
organizations = []
for org_ent in (candidate for candidate in doc.ents if candidate.label_ == "ORG"):
    lookahead = doc[org_ent.end : org_ent.end + 2]
    # First suffix inside the look-ahead window, or None when there is none.
    suffix = next((tok.text for tok in lookahead if tok.text in org_suffixes), None)
    if suffix is None:
        organizations.append(org_ent.text)
    else:
        organizations.append(org_ent.text + " " + suffix)

print("\nOrganizations (Heuristic):", organizations, sep="\n")

# Heuristic 5: Dates
# Matches "<Month> <day>, <year>" (e.g. "July 4, 1776" or "Jan. 5, 2020").
#  * The leading word must be a month name or common abbreviation; a bare
#    `\w+` would also accept things like "Suite 12, 2020".
#  * `\s+` is used between the parts because the input text is hard-wrapped,
#    so a date may be split across a line break.
month_names = (
    r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|"
    r"Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
)
date_pattern = re.compile(
    rf"\b{month_names}\.?\s+\d{{1,2}},\s+\d{{4}}\b",
    re.IGNORECASE,
)
# Collapse any internal whitespace (including newlines) to single spaces so
# a wrapped date is reported as e.g. "July 4, 1776".
dates = [" ".join(found.split()) for found in date_pattern.findall(text)]
print("\nHeuristic 5 - Dates:")
print(dates)

# Combine results
def _unique_clean(values):
    """Return *values* whitespace-normalised and de-duplicated, in first-seen order.

    The input text is hard-wrapped, so an entity that spans a line break
    (e.g. "George\\nWashington") carries an embedded newline. Collapsing runs
    of whitespace to a single space makes such strings printable and lets
    them compare equal to their unwrapped form. `dict.fromkeys` removes
    duplicates while keeping a deterministic order, unlike `set`.
    """
    return list(dict.fromkeys(" ".join(value.split()) for value in values))


all_entities = {
    # spaCy PERSON entities plus the "<title> <name>" pairs from heuristic 1.
    "PERSON": _unique_clean(
        [ent.text for ent in doc.ents if ent.label_ == "PERSON"] + entities_with_titles
    ),
    "EMAIL": _unique_clean(emails),
    "PHONE": _unique_clean(phone_numbers),
    # `organizations` already holds one entry per spaCy ORG entity (extended
    # with a corporate suffix where one follows), so the raw entity texts are
    # not merged in again; doing so would list both "Acme" and "Acme Inc.".
    "ORG": _unique_clean(organizations),
    # spaCy DATE entities plus the regex matches from heuristic 5.
    "DATE": _unique_clean(
        [ent.text for ent in doc.ents if ent.label_ == "DATE"] + dates
    ),
}

print("\nCombined Results:")
for entity_type, entities in all_entities.items():
    print(f"{entity_type}: {entities}")


Named Entities (spaCy):
The Washington Monument (ORG)
Washington (GPE)
D.C. (GPE)
George
Washington (PERSON)
first (ORDINAL)

Heuristic 1 - Titles:
[]

Heuristic 2 - Emails:
[]

Heuristic 3 - Phone Numbers:
[]

Organizations (Heuristic):
['The Washington Monument']

Heuristic 5 - Dates:
[]

Combined Results:
PERSON: ['George\nWashington']
EMAIL: []
PHONE: []
ORG: ['The Washington Monument']
DATE: []


: 