In [2]:
import json

# Load the domain vocabulary that drives the dictionary-based extractor.
# The encoding is passed explicitly: JSON text is UTF-8, and relying on the
# platform's locale default can mis-decode non-ASCII terms on some systems.
with open('knowledge_base.json', 'r', encoding='utf-8') as f:
    domain_knowledge = json.load(f)

# One term list per entity category. dictionary_lookup iterates each list and
# calls .lower() on its items, so every entry must be a string.
# A missing key raises KeyError here, at load time, rather than later.
competitors = domain_knowledge["competitors"]
features = domain_knowledge["features"]
pricing_keywords = domain_knowledge["pricing_keywords"]
security_concerns = domain_knowledge["security_concerns"]

# Function to extract entities using dictionary lookup
def dictionary_lookup(text):
    """Find known domain terms in ``text`` by case-insensitive substring search.

    The four module-level term lists (``competitors``, ``features``,
    ``pricing_keywords``, ``security_concerns``) are each scanned in their
    stored order. A term is reported when its lowercased form occurs anywhere
    in the lowercased text.

    Args:
        text: The string to search.

    Returns:
        A dict with the keys ``"competitors"``, ``"features"``,
        ``"pricing_keywords"`` and ``"security_concerns"`` (in that order),
        each mapping to the list of matched terms in their original casing.
        A category with no match maps to an empty list.
    """
    # Lowercase the text once instead of once per term.
    haystack = text.lower()

    # Category name -> term list. The insertion order here fixes the order of
    # the keys in the returned dict.
    vocabularies = {
        "competitors": competitors,
        "features": features,
        "pricing_keywords": pricing_keywords,
        "security_concerns": security_concerns,
    }

    # Plain substring test: a term also matches when it sits inside a longer
    # word, and a term listed twice in the vocabulary is reported twice.
    return {
        category: [term for term in terms if term.lower() in haystack]
        for category, terms in vocabularies.items()
    }

In [3]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm" once and keep it in a global;
# spacy_ner_extraction reads this `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def spacy_ner_extraction(text):
    """Run the global ``nlp`` pipeline on ``text`` and group entities by label.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each entity label found (``ent.label_``) to the list of
        entity texts carrying that label, in document order. The dict is empty
        when the pipeline reports no entities.
    """
    grouped = {}
    for entity in nlp(text).ents:
        # setdefault creates the label's list the first time the label is
        # seen, so keys appear in order of first occurrence.
        grouped.setdefault(entity.label_, []).append(entity.text)
    return grouped


In [4]:
def combined_entity_extraction(text):
    """Run both extractors on ``text`` and return their results side by side.

    Args:
        text: The string to analyse.

    Returns:
        A dict with two keys: ``"dict_"`` holds the result of
        ``dictionary_lookup(text)`` and ``"ner_"`` holds the result of
        ``spacy_ner_extraction(text)``.
    """
    # Dict-literal values are evaluated in order: dictionary lookup runs
    # first, then the spaCy pass.
    return {
        "dict_": dictionary_lookup(text),
        "ner_": spacy_ner_extraction(text),
    }

In [5]:
# Sample sentences used to exercise the combined extractor.
test_snippets = [
    "CompetitorX has a better pricing model, but we are more secure.",
    "Our service is cheaper than CompetitorY, and it comes with an advanced AI engine.",
    "Are you SOC2 certified? We need to ensure data handling complies with security standards.",
]

# Print a three-line report per snippet: the input text, the extracted
# entities, and a 50-dash separator.
for snippet in test_snippets:
    entities = combined_entity_extraction(snippet)
    print(
        f"Text Snippet: {snippet}",
        f"Extracted Entities: {entities}",
        "-" * 50,
        sep="\n",
    )

Text Snippet: CompetitorX has a better pricing model, but we are more secure.
Extracted Entities: {'dict_': {'competitors': ['CompetitorX'], 'features': [], 'pricing_keywords': ['pricing model'], 'security_concerns': []}, 'ner_': {}}
--------------------------------------------------
Text Snippet: Our service is cheaper than CompetitorY, and it comes with an advanced AI engine.
Extracted Entities: {'dict_': {'competitors': ['CompetitorY'], 'features': ['AI engine'], 'pricing_keywords': [], 'security_concerns': []}, 'ner_': {'GPE': ['AI']}}
--------------------------------------------------
Text Snippet: Are you SOC2 certified? We need to ensure data handling complies with security standards.
Extracted Entities: {'dict_': {'competitors': [], 'features': [], 'pricing_keywords': [], 'security_concerns': ['SOC2 certified', 'data handling']}, 'ner_': {}}
--------------------------------------------------


In [6]:
# Serialize the pipeline held in `nlp` to the path "ner_model_spacy".
# NOTE(review): no visible cell trains or modifies `nlp` after spacy.load, so
# this appears to save an unchanged copy of "en_core_web_sm" — confirm that is
# the intent.
nlp.to_disk("ner_model_spacy")