# Lab 6: Information Extraction

A simple example of information extraction with Mellea using generative slots.

In [None]:
# Import necessary libraries
from mellea import generative, start_session
from mellea.backends import model_ids

# Display utilities
from IPython.display import display, Markdown

# Format code cells with black
import jupyter_black

jupyter_black.load()

In [None]:
# Start a Mellea session. No arguments are passed, so backend and model
# selection are left to whatever `start_session` uses by default.
# `m` is later handed to the generative function as its first argument.
m = start_session()


# Define a generative function to extract person names.
#
# The function deliberately has no implementation: it consists only of a typed
# signature and a docstring, wrapped by Mellea's `@generative` decorator.
# NOTE(review): the signature and docstring are presumably what the model is
# prompted with, so treat the docstring text as behaviour-affecting rather than
# as ordinary documentation — confirm against the Mellea docs before rewording.
#
# The decorated function is called with the session as its first positional
# argument, e.g. `extract_all_person_names(m, doc=...)`.
# NOTE(review): the `>>>` example inside the docstring omits that session
# argument; it illustrates the expected output shape, not the exact call.
@generative
def extract_all_person_names(doc: str) -> list[str]:
    """
    Extract all person names mentioned in the given document text.

    This function analyzes the input text and identifies names of people,
    including full names, titles, and honorifics. It returns a list of unique
    person names found in the document.

    Args:
        doc (str): The input document text to analyze for person names.
              Can contain paragraphs, quotes, or any text format.

    Returns:
        list[str]: A list of strings where each string is a person's name.
                  Names may include titles (e.g., "President Obama", "Dr. Smith").
                  Returns an empty list if no names are found.

    Examples:
        >>> text = "President Biden met with Chancellor Merkel."
        >>> extract_all_person_names(text)
        ['President Biden', 'Chancellor Merkel']
    """


# Sample input: an excerpt from a New York Times article (URL dated 2012/05/20).
# ref: https://www.nytimes.com/2012/05/20/world/world-leaders-at-us-meeting-urge-growth-not-austerity.html
NYTimes_text = """CAMP DAVID, Md. — Leaders of the world's richest countries banded together on Saturday 
to press Germany to back more pro-growth policies to halt the deepening debt crisis in Europe, 
as President Obama for the first time gained widespread support for his argument that Europe, and the 
United States by extension, cannot afford Chancellor Angela Merkel's one-size-fits-all approach emphasizing austerity."""

# Run the extractor: the session goes first, the document is passed by keyword.
person_names = extract_all_person_names(m, doc=NYTimes_text)

print(f"person_names = {person_names}")
# Output recorded from one run (presumably model-generated, so it may differ
# between runs or models — note the title was dropped for "Angela Merkel"):
# out: person_names = ['President Obama', 'Angela Merkel']

# Exercises

1. Extend the extractor to include entity types (person, organization, location) and return structured records.
2. Add a confidence field or provenance (text span) for each extracted name.
3. Compose the extractor with a deduplication/normalization step (e.g., convert "B. Obama" → "Barack Obama").