In [73]:
import spacy
from keybert import KeyBERT
from parsedatetime import Calendar
import datetime
from collections import defaultdict
from spacy.matcher import Matcher
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



In [74]:
# Shared, module-level NLP resources. They are created once here and reused by
# the cells below, so this cell must run before any extraction cell.
nlp = spacy.load("en_core_web_lg")  # spaCy pipeline; the model package must already be installed
kw_model = KeyBERT()  # keyword extractor, constructed with its default settings
cal = Calendar()  # parsedatetime calendar used to resolve date phrases
sentiment_analyzer = SentimentIntensityAnalyzer()  # VADER polarity scorer

In [75]:
# Rule-based matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Pattern for event-like noun phrases: ONE optional leading token that is a
# determiner (DET), an adposition (ADP) or a proper noun (PROPN), followed by a
# common noun (NOUN). Matches are therefore one or two tokens long.
# E.g. "meeting", "the meeting", "a discussion". A three-token phrase such as
# "in the call" is NOT matched as a whole; only "the call" and "call" are.
# NOTE: the matcher reports every match, so a noun with a leading token yields
# both the bare noun and the two-token span (e.g. "guy" and "the guy").
event_pattern = [{"POS": {"IN": ["DET", "ADP", "PROPN"]}, "OP": "?"}, 
                 {"POS": "NOUN"}]
matcher.add("GENERAL_EVENT", [event_pattern])

In [76]:
def _sentiment_label(compound_score):
    """Map a VADER compound score to "positive", "negative" or "neutral"."""
    if compound_score >= 0.05:
        return "positive"
    if compound_score <= -0.05:
        return "negative"
    return "neutral"


def _resolve_date(phrase, context):
    """Build the `dates` entry for one date phrase, resolving it when possible."""
    time_struct, parse_status = cal.parse(phrase)
    if not parse_status:
        # parsedatetime could not interpret the phrase: keep the context only.
        return {"context": context}
    resolved = datetime.datetime(*time_struct[:6])
    return {
        "actual_date": resolved.strftime('%Y-%m-%d'),
        "context": context
    }


def _find_events(doc, matches):
    """Collect EVENT entities plus GENERAL_EVENT matches that are not entities."""
    entity_texts = {ent.text for ent in doc.ents}
    events = {ent.text for ent in doc.ents if ent.label_ == "EVENT"}

    for match_id, start, end in matches:
        if nlp.vocab.strings[match_id] != "GENERAL_EVENT":
            continue
        span = doc[start:end]
        # Keep short spans only, and skip text already recognised as a named
        # entity (e.g. a date word tagged as a noun) so it is not also an event.
        if len(span) <= 2 and span.text not in entity_texts:
            events.add(span.text)
    return events


def _find_action_items(doc):
    """Map each verb phrase that has a grammatical subject to that subject."""
    action_items = {}
    for token in doc:
        if token.pos_ != "VERB":
            continue

        subject = None
        task_tokens = [token.text]
        for child in token.children:
            if child.dep_ in ("nsubj", "nsubjpass"):
                # If a verb has several subject children, the last one wins.
                subject = child.text
            elif child.dep_ in ("dobj", "compound", "prt"):
                task_tokens.append(child.text)

        if subject:
            # With no object/particle children this is just the verb itself.
            action_items[" ".join(task_tokens)] = subject
    return action_items


def extract_structured_info(text):
    """
    Extract persons, locations, dates, events, action items and sentiment from text.

    Named entities come from the spaCy pipeline, event-like noun phrases from the
    rule-based matcher, date phrases are resolved with parsedatetime, action items
    are read off the dependency parse, and sentiment is scored with VADER.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys:
          - "persons", "locations", "events": sorted lists of strings.
          - "dates": maps each date phrase to {"context": text}, plus an
            "actual_date" ('%Y-%m-%d') when the phrase could be resolved.
          - "action_items": maps a verb phrase to its subject.
          - "sentiment": {"label": ..., "score": ...}.
        When none of persons/locations/dates/events/action_items were found, the
        dict additionally carries "status" and "keywords" (KeyBERT keyphrases).
    """
    doc = nlp(text)
    matches = matcher(doc)

    persons = set()
    locations = set()
    dates = {}
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            persons.add(ent.text)
        elif ent.label_ in ("GPE", "LOC"):
            locations.add(ent.text)
        elif ent.label_ == "DATE":
            dates[ent.text] = _resolve_date(ent.text, text)

    events = _find_events(doc, matches)
    action_items = _find_action_items(doc)

    compound_score = sentiment_analyzer.polarity_scores(text)['compound']

    # Sets are sorted so that repeated runs print the same output.
    structured = {
        "persons": sorted(persons),
        "locations": sorted(locations),
        "dates": dates,
        "events": sorted(events),
        "action_items": action_items,
        "sentiment": {
            "label": _sentiment_label(compound_score),
            "score": compound_score
        }
    }

    # Sentiment is deliberately not part of this check: a label is always
    # produced, so including it would make the keyword fallback unreachable.
    if persons or locations or dates or events or action_items:
        return structured

    # Fallback: nothing structured was found, so add general keywords.
    # The structured keys are kept (empty) so existing callers can still index them.
    if text.strip():
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
            top_n=5
        )
    else:
        # Nothing to extract keywords from.
        keywords = []

    return {
        "status": "No specific structured data found.",
        "keywords": [keyword[0] for keyword in keywords],
        **structured
    }

In [78]:
# Try the extractor on a sample query that names no person, place or date,
# then print the resulting dictionary.
expan_query = "Tell me about the guy who said he would help me with my project."
extracted_data = extract_structured_info(expan_query)
print(extracted_data)

{'persons': [], 'locations': [], 'dates': {}, 'events': ['project', 'guy', 'the guy'], 'action_items': {'said': 'who', 'help me': 'he'}, 'sentiment': {'label': 'positive', 'score': 0.4019}}
