<a href="https://colab.research.google.com/github/deepak590/ml_data/blob/main/_langextract_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install langextract

In [None]:
import os
from google.colab import userdata

# Read the key from the Colab secret named 'GOOGLE_AI_STUDIO' instead of hardcoding it,
# then expose it through the LANGEXTRACT_API_KEY environment variable.
# The lookup must run before the assignment below, otherwise the name is undefined.
LANGEXTRACT_API_KEY = userdata.get('GOOGLE_AI_STUDIO')
os.environ["LANGEXTRACT_API_KEY"] = LANGEXTRACT_API_KEY


In [None]:
def visualize_extraction_results(result, output_name="extraction_results.jsonl", output_dir="."):
    """
    Saves extraction results to a JSONL file and builds a visualization from that file.

    The visualization is returned rather than printed: printing it only emits
    the object's repr (e.g. ``<IPython.core.display.HTML object>``), whereas
    leaving the return value as the last expression of a cell lets the
    notebook render it.

    Args:
        result: The AnnotatedDocument object containing the extraction results.
        output_name (str): The name for the output JSONL file.
        output_dir (str): The directory to save the output file.

    Returns:
        The value produced by ``lx.visualize`` for the saved file (an
        IPython HTML object when run in this notebook).
    """
    # Persist first: the visualization is generated from the saved JSONL file.
    lx.io.save_annotated_documents([result], output_name=output_name, output_dir=output_dir)
    return lx.visualize(os.path.join(output_dir, output_name))

def display_extractions(input_text, result):
    """
    Print the input text followed by one bullet line per extraction.

    Each bullet shows the capitalized extraction class and the extracted text.
    When an extraction has a truthy ``char_interval``, its ``start_pos`` and
    ``end_pos`` are appended as ``(pos: start-end)``.

    Args:
        input_text (str): The original text that was processed.
        result: Object exposing an ``extractions`` iterable, such as the
            value returned by lx.extract.
    """
    print(f"Input: {input_text}\n")
    print("Extracted entities:")
    for extraction in result.extractions:
        span = extraction.char_interval
        # Extractions without a character interval get no position suffix.
        suffix = f" (pos: {span.start_pos}-{span.end_pos})" if span else ""
        label = extraction.extraction_class.capitalize()
        print(f"• {label}: {extraction.extraction_text}{suffix}")

# How to use the function:
# Assuming 'input_text' and 'result' are defined from a previous extraction
# display_extractions(input_text, result)

In [None]:
# NOTE(review): `input_text` and `result` are first assigned in the extraction cell further down,
# so on a fresh kernel this cell only works when run after that cell.
display_extractions(input_text, result)

In [None]:
 # NOTE(review): `result` is first assigned by an lx.extract cell further down; run on a fresh
 # kernel before that cell, this raises the NameError recorded in the output below.
 visualize_extraction_results(result)

NameError: name 'result' is not defined

In [None]:
## Customer Complaints Data

In [None]:
import textwrap
import langextract as lx
import datetime

# 1. Define a more comprehensive prompt for financial complaint data
prompt = textwrap.dedent("""\
You are a senior analyst for a financial institution. Your task is to extract highly structured and detailed
entities from customer complaints. Extract the complaint type, specific financial product, any financial impact,
the customer's request, the core issue summary, and new entities like customer contacts and other involved parties.
Use exact text for the extractions. Do not paraphrase or overlap entities.
Provide meaningful and specific attributes for each entity to add context and categorization.
""")

# 2. Provide a high-quality, diverse set of examples with new attributes and entities
examples = [
    # Example 1: Account Closed for Fraud (Bank Name changed to Apex Bank)
    lx.data.ExampleData(
        text=(
            "On XX/XX/XXXX Apex Bank randomly closed my account claiming fraud. They have never contacted me or gave any info. "
            "They claim its related to a check... I loaned someone money and they made a payment to me. "
            "That check returned and now Apex Bank is holding my money XXXX. Its been nearly 3 months."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="complaint_type",
                extraction_text="closed my account claiming fraud",
                attributes={"category": "Account Closure", "reason": "fraud claim"},
            ),
            lx.data.Extraction(
                extraction_class="product",
                extraction_text="my account",
                attributes={"type": "Checking/Savings Account", "bank": "Apex Bank"},
            ),
            lx.data.Extraction(
                extraction_class="involved_party",
                extraction_text="Apex Bank",
                attributes={"role": "institution"},
            ),
            lx.data.Extraction(
                extraction_class="customer_request",
                extraction_text="holding my money",
                attributes={"action": "release funds", "urgency": "high"},
            ),
            lx.data.Extraction(
                extraction_class="core_issue_summary",
                extraction_text="closed my account claiming fraud",
                attributes={"summary": "unjustified account closure due to a third-party returned check"},
            ),
        ],
    ),
    # Example 2: Old Overdraft Reporting Issue (Bank Name changed)
    lx.data.ExampleData(
        text=(
            "Apex Bank, formerly known as First National Bank, closed my account in 2021 due to overdraft that I dont recognize. "
            "Now I cant bank with anyone due to them reporting on XXXX and XXXX XXXX XXXX for {$180.00}. "
            "I really need a bank account I havent been able to get one in years. Theyre past statute of limitations."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="complaint_type",
                extraction_text="reporting on XXXX and XXXX XXXX XXXX for {$180.00}",
                attributes={"category": "Credit Reporting", "status": "disputed"},
            ),
            lx.data.Extraction(
                extraction_class="product",
                extraction_text="bank account",
                attributes={"type": "Checking/Savings Account", "bank": "Apex Bank"},
            ),
            lx.data.Extraction(
                extraction_class="involved_party",
                extraction_text="First National Bank",
                attributes={"role": "former bank"},
            ),
            lx.data.Extraction(
                extraction_class="financial_impact_amount",
                extraction_text="{$180.00}",
                attributes={"value": "180.00", "currency": "USD", "reason": "overdraft"},
            ),
            lx.data.Extraction(
                extraction_class="customer_request",
                extraction_text="cant bank with anyone",
                attributes={"action": "correct credit report", "goal": "open new account"},
            ),
            lx.data.Extraction(
                extraction_class="core_issue_summary",
                extraction_text="closed my account in 2021 due to overdraft that I dont recognize",
                attributes={"summary": "inaccurate reporting of old overdraft debt from a merged bank"},
            ),
        ],
    ),
    # Example 3: Check Deposit Hold (Bank Name changed)
    lx.data.ExampleData(
        text=(
            "Update: As of Monday, XX/XX/XXXX, deposited funds of {$400.00} into Apex Bank checking account are STILL NOT AVAILABLE. "
            "Today is the 10th day a hold was put on my deposit, and I still do not know why a hold was placed. "
            "The deposit of {$400.00} was made by Check # XXXX. I will be filing a new complaint with the Federal Trade Commission "
            "for your unlawful withholding of my deposit funds and your lack of customer support."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="complaint_type",
                extraction_text="a hold was put on my deposit",
                attributes={"category": "Funds Availability", "duration_days": "10"},
            ),
            lx.data.Extraction(
                extraction_class="product",
                extraction_text="Apex Bank checking account",
                attributes={"type": "Checking Account", "bank": "Apex Bank"},
            ),
            lx.data.Extraction(
                extraction_class="involved_party",
                extraction_text="Federal Trade Commission",
                attributes={"role": "regulatory body", "type": "threatened complaint"},
            ),
            lx.data.Extraction(
                extraction_class="customer_contact",
                # Anchor the example to a verbatim span of the complaint text: the prompt asks for
                # exact text, and a None here would show the model an extraction with no text.
                extraction_text="lack of customer support",
                attributes={"method": "unspecified", "outcome": "lack of customer support"},
            ),
            lx.data.Extraction(
                extraction_class="financial_impact_amount",
                extraction_text="{$400.00}",
                attributes={"value": "400.00", "currency": "USD", "reason": "deposit hold"},
            ),
            lx.data.Extraction(
                extraction_class="customer_request",
                extraction_text="unlawful withholding of my deposit funds",
                attributes={"action": "release funds"},
            ),
            lx.data.Extraction(
                extraction_class="core_issue_summary",
                extraction_text="10th day a hold was put on my deposit",
                attributes={"summary": "unexplained hold on a deposited check"},
            ),
        ],
    ),
]

In [None]:
# 3. Run the extraction on a new input text
# Alternative sample complaint (mortgage rate increase). It has its own name because the
# `input_text` assignment that follows would otherwise overwrite it before it is ever used.
mortgage_input_text = (
    "My mortgage servicer incorrectly increased my interest rate. I have a fixed-rate loan and need this corrected immediately. The increase is adding $150 to my monthly payment."
)

input_text = ("""I deposited {$25000.00} with Truist on XX/XX/year>.
They suspected fraud. It took 2 months and a cfsb complain to get some information. They told me that I needed to tell the person that wrote me the check to have their bank get it back. That person talked with their bank, XXXX XXXX, who did a fraud investigation and said they found no evidence of fraud.

I have tried to contact Truist for a week with no response. I have sent emails and left voice mail.

They will not release the money to me and are doing nothing to get the money back to the person who wrote me the check.

Truist has had my {$25000.00} for 100 days.

At this point, I am considering filing a criminal complaint. Truist stole my money because they are not lifting a finger to get the money back to any rightful owner.""")

# Echo the exact text about to be submitted so its content can be checked
print("Input text being sent to lx.extract:", input_text, sep="\n")

# Submit the complaint text together with the prompt and few-shot examples
result = lx.extract(
    model_id="gemini-2.5-pro",  # Or another suitable model
    examples=examples,
    prompt_description=prompt,
    text_or_documents=input_text,
)

# Show the raw result object
print(result)

# This produces a list of Extraction objects similar to the examples,
# but based on the input_text about the Truist deposit complaint.

Input text being sent to lx.extract:
I deposited {$25000.00} with Truist on XX/XX/year>.
They suspected fraud. It took 2 months and a cfsb complain to get some information. They told me that I needed to tell the person that wrote me the check to have their bank get it back. That person talked with their bank, XXXX XXXX, who did a fraud investigation and said they found no evidence of fraud.

I have tried to contact Truist for a week with no response. I have sent emails and left voice mail.

They will not release the money to me and are doing nothing to get the money back to the person who wrote me the check.

Truist has had my {$25000.00} for 100 days.

At this point, I am considering filing a criminal complaint. Truist stole my money because they are not lifting a finger to get the money back to any rightful owner.


[94m[1mLangExtract[0m: model=[92mgemini-2.5-pro[0m, current=[92m790[0m chars, processed=[92m790[0m chars:  [00:29]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m9[0m entities ([1m7[0m unique types)
  [96m•[0m Time: [1m29.09s[0m
  [96m•[0m Speed: [1m27[0m chars/sec
  [96m•[0m Chunks: [1m1[0m
AnnotatedDocument(extractions=[Extraction(extraction_class='complaint_type', extraction_text='suspected fraud', char_interval=CharInterval(start_pos=57, end_pos=72), alignment_status=<AlignmentStatus.MATCH_FUZZY: 'match_fuzzy'>, extraction_index=1, group_index=0, description=None, attributes={'category': 'Funds Availability', 'reason': 'fraud suspicion', 'duration_days': '100'}), Extraction(extraction_class='product', extraction_text='account with Truist', char_interval=None, alignment_status=None, extraction_index=2, group_index=1, description=None, attributes={'type': 'Bank Account', 'bank': 'Truist'}), Extraction(extraction_class='involved_party', extraction_text='Truist', char_interval=CharInterval(start_pos=29, end_pos=35), alignment_status=<AlignmentStatus.MATCH_FUZZY: '




In [None]:
# Pass the text that was actually extracted from; the bare name `input` is the
# builtin function, not the complaint text.
display_extractions(input_text, result)

Input: <bound method Kernel.raw_input of <google.colab._kernel.Kernel object at 0x7aca11a409d0>>

Extracted entities:
• Complaint_type: suspected fraud (pos: 57-72)
• Product: account with Truist
• Involved_party: Truist (pos: 29-35)
• Involved_party: cfsb (pos: 97-101)
• Involved_party: XXXX XXXX (pos: 274-283)
• Financial_impact_amount: {$25000.00} (pos: 12-23)
• Customer_request: release the money to me (pos: 473-496)
• Core_issue_summary: Truist has had my {$25000.00} for 100 days (pos: 29-35)
• Customer_contact: tried to contact Truist for a week with no response. I have sent emails and left voice mail (pos: 365-456)


In [None]:
# Save the complaint extractions and build their visualization
visualize_extraction_results(result, "extraction_results.jsonl")


[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 611.86 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 3.60k/3.60k [00:00<00:00, 10.3MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
<IPython.core.display.HTML object>





In [None]:


### **Simple Earnings Transcripts with Metadata**

import textwrap
import langextract as lx
import datetime

# 1. Define a concise prompt that includes metadata
prompt = textwrap.dedent("""\
Extract key financial results, forward-looking guidance, and product updates from the provided earnings transcript.
Also, extract the company name and the fiscal period.
The extraction classes should be kept simple and direct. Use the exact text for each extraction.
""")

# 2. Provide a simple, high-quality set of examples
examples = [
    # Example 1: TechCorp's Q2 performance
    lx.data.ExampleData(
        text=(
            "Good morning. This is TechCorp's second quarter earnings call. Our Q2 revenue was up 15% year-over-year to $5.2 billion. This was driven by the successful launch "
            "of our new 'Fusion' platform. We are confident in our growth trajectory and expect to "
            "exceed our previous annual guidance."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="company_name",
                extraction_text="TechCorp"
            ),
            lx.data.Extraction(
                extraction_class="fiscal_period",
                extraction_text="second quarter"
            ),
            lx.data.Extraction(
                extraction_class="financials",
                extraction_text="Q2 revenue was up 15% year-over-year to $5.2 billion"
            ),
            lx.data.Extraction(
                extraction_class="product_update",
                extraction_text="successful launch of our new 'Fusion' platform"
            ),
            lx.data.Extraction(
                extraction_class="guidance",
                extraction_text="expect to exceed our previous annual guidance"
            ),
        ],
    ),
    # Example 2: Global Dynamics' Q3 outlook
    lx.data.ExampleData(
        text=(
            "Welcome to Global Dynamics' Q3 earnings call. We reported a profit of $250 million this quarter, a slight decrease from last year. We are "
            "facing some macroeconomic headwinds, particularly in our international markets. "
            "Our team is focused on cost-saving measures, and we project a flat growth for the next quarter."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="company_name",
                extraction_text="Global Dynamics"
            ),
            lx.data.Extraction(
                extraction_class="fiscal_period",
                extraction_text="Q3"
            ),
            lx.data.Extraction(
                extraction_class="financials",
                extraction_text="profit of $250 million this quarter"
            ),
            lx.data.Extraction(
                extraction_class="sentiment",
                extraction_text="some macroeconomic headwinds"
            ),
            lx.data.Extraction(
                extraction_class="guidance",
                extraction_text="project a flat growth for the next quarter"
            ),
        ],
    ),
]


In [None]:
input_text = """
Good morning, everyone, and welcome to TechCorp's third quarter 2025 earnings call.
 We delivered a solid Q3 with revenue growing 10% year-over-year to $7.8 billion, showcasing the strength of our core business. However, our profitability saw some pressure, with operating margins slightly contracting due to ongoing investments in our new Quantum Cloud service. While this is a headwind, we are extremely confident in the long-term potential of Quantum Cloud. Our legacy software division, on the other hand, experienced a 2% decline, a trend we are actively working to reverse. Looking ahead to the fourth quarter and beyond, we remain cautious about the broader economic climate but are projecting full-year revenue to be within our previously issued guidance of $30 to $31 billion. Our primary focus is on driving efficiency and expanding our market share against competitors like Global Dynamics.
"""

# Submit the earnings-call text together with the prompt and few-shot examples
result = lx.extract(
    model_id="gemini-2.5-pro",
    examples=examples,
    prompt_description=prompt,
    text_or_documents=input_text,
)

# Show the raw result object
print(result)

[94m[1mLangExtract[0m: model=[92mgemini-2.5-pro[0m, current=[92m900[0m chars, processed=[92m900[0m chars:  [00:09]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m9[0m entities ([1m6[0m unique types)
  [96m•[0m Time: [1m9.19s[0m
  [96m•[0m Speed: [1m98[0m chars/sec
  [96m•[0m Chunks: [1m1[0m
AnnotatedDocument(extractions=[Extraction(extraction_class='company_name', extraction_text='TechCorp', char_interval=CharInterval(start_pos=40, end_pos=48), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={}), Extraction(extraction_class='fiscal_period', extraction_text='third quarter 2025', char_interval=CharInterval(start_pos=51, end_pos=69), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={}), Extraction(extraction_class='financials', extraction_text='revenue growing 10% year-over-year to $7.8 billion', char_interval=CharInterval(start_pos=114, end_pos=164), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>,




In [None]:
# Pass the transcript that was actually extracted from; the bare name `input` is the
# builtin function, not the transcript text.
display_extractions(input_text, result)

Input: <bound method Kernel.raw_input of <google.colab._kernel.Kernel object at 0x7aca11a409d0>>

Extracted entities:
• Company_name: TechCorp (pos: 40-48)
• Fiscal_period: third quarter 2025 (pos: 51-69)
• Financials: revenue growing 10% year-over-year to $7.8 billion (pos: 114-164)
• Financials: operating margins slightly contracting due to ongoing investments in our new Quantum Cloud service (pos: 263-361)
• Financials: legacy software division, on the other hand, experienced a 2% decline (pos: 465-534)
• Product_update: investments in our new Quantum Cloud service (pos: 317-361)
• Sentiment: extremely confident in the long-term potential of Quantum Cloud (pos: 396-459)
• Sentiment: cautious about the broader economic climate (pos: 638-681)
• Guidance: projecting full-year revenue to be within our previously issued guidance of $30 to $31 billion (pos: 690-784)


In [None]:
# Save the earnings-call extractions and build their visualization
visualize_extraction_results(result, "extraction_results.jsonl")


[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 628.36 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 3.50k/3.50k [00:00<00:00, 9.25MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
<IPython.core.display.HTML object>





/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `[Alt text](!https://drive.google.com/file/d/1yBOyfBJGEuN9sUs8DRn0NRmUISXCan9B/view?usp=drive_link)'


![Alt text](https://drive.google.com/file/d/1yBOyfBJGEuN9sUs8DRn0NRmUISXCan9B/view?usp=drive_link)


[Alt text](https://drive.google.com/file/d/1yBOyfBJGEuN9sUs8DRn0NRmUISXCan9B/view?usp=drive_link)



In [None]:
import textwrap
import langextract as lx
import datetime

# 1. Define a concise prompt for a compliance officer
prompt = textwrap.dedent("""\
You are a compliance officer reviewing a regulatory notice. Your task is to extract critical, structured
information about new or updated requirements. Focus on the regulation's name, effective date,
the specific action required, and the enforcement body.
Use exact text for extractions. Do not paraphrase or overlap entities.
Provide meaningful attributes for each entity to add context and help with automated tracking.
""")

# 2. Provide a high-quality example to guide the model
examples = [
    lx.data.ExampleData(
        text=(
            "FEDERAL REGISTER NOTICE: The Federal Reserve Board, acting under the authority of "
            "Regulation B (Fair Lending Act), announces a new amendment to reporting requirements. "
            "Effective October 1, 2025, all state-member banks are required to submit quarterly "
            "disaggregated loan data. The purpose is to enhance fair lending oversight."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="enforcement_body",
                extraction_text="The Federal Reserve Board",
                attributes={"jurisdiction": "federal"},
            ),
            lx.data.Extraction(
                extraction_class="regulation",
                extraction_text="Regulation B (Fair Lending Act)",
                attributes={"category": "compliance", "id": "Reg-B"},
            ),
            lx.data.Extraction(
                extraction_class="effective_date",
                extraction_text="October 1, 2025",
                attributes={"date": "2025-10-01", "type": "implementation"},
            ),
            lx.data.Extraction(
                extraction_class="affected_parties",
                extraction_text="state-member banks",
                attributes={"scope": "banks", "type": "state-member"},
            ),
            lx.data.Extraction(
                extraction_class="required_action",
                extraction_text="submit quarterly disaggregated loan data",
                attributes={"action_type": "reporting", "frequency": "quarterly"},
            ),
        ],
    )
]

# 3. Run the extraction on your new regulatory notice text
input_text = (
    "NOTICE: The SEC has issued new guidelines under the 'Financial Transparency and Reporting Act' "
    "which mandate that all public companies must disclose cybersecurity incident impacts. "
    "This rule is effective starting January 1, 2026. This applies to all entities "
    "that are registered with the Commission. Failure to comply may result in civil penalties."
)

# Submit the regulatory notice together with the prompt and few-shot example
result = lx.extract(
    model_id="gemini-2.5-pro",  # Or another suitable model
    examples=examples,
    prompt_description=prompt,
    text_or_documents=input_text,
)

# Show the raw result object to inspect the extracted data
print(result)

[94m[1mLangExtract[0m: model=[92mgemini-2.5-pro[0m, current=[92m348[0m chars, processed=[92m348[0m chars:  [00:08]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m5[0m entities ([1m5[0m unique types)
  [96m•[0m Time: [1m8.20s[0m
  [96m•[0m Speed: [1m42[0m chars/sec
  [96m•[0m Chunks: [1m1[0m
AnnotatedDocument(extractions=[Extraction(extraction_class='enforcement_body', extraction_text='The SEC', char_interval=CharInterval(start_pos=8, end_pos=15), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'jurisdiction': 'federal'}), Extraction(extraction_class='regulation', extraction_text="'Financial Transparency and Reporting Act'", char_interval=CharInterval(start_pos=52, end_pos=94), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'category': 'reporting'}), Extraction(extraction_class='effective_date', extraction_text='January 1, 2026', char_interval=CharInterval(start_pos=213, end_pos=228), alignment_status=<Alig




In [None]:

# Dump every field stored on each extraction as "NAME: value", with a blank line before each extraction
for item in result.extractions:
    print()
    for field_name, field_value in vars(item).items():
        heading = field_name.upper()
        print(f"{heading}: {field_value}")


EXTRACTION_CLASS: enforcement_body
EXTRACTION_TEXT: The SEC
CHAR_INTERVAL: CharInterval(start_pos=8, end_pos=15)
_TOKEN_INTERVAL: TokenInterval(start_index=2, end_index=4)
ALIGNMENT_STATUS: AlignmentStatus.MATCH_EXACT
EXTRACTION_INDEX: 1
GROUP_INDEX: 0
DESCRIPTION: None
ATTRIBUTES: {'jurisdiction': 'federal'}

EXTRACTION_CLASS: regulation
EXTRACTION_TEXT: 'Financial Transparency and Reporting Act'
CHAR_INTERVAL: CharInterval(start_pos=52, end_pos=94)
_TOKEN_INTERVAL: TokenInterval(start_index=10, end_index=17)
ALIGNMENT_STATUS: AlignmentStatus.MATCH_EXACT
EXTRACTION_INDEX: 2
GROUP_INDEX: 1
DESCRIPTION: None
ATTRIBUTES: {'category': 'reporting'}

EXTRACTION_CLASS: effective_date
EXTRACTION_TEXT: January 1, 2026
CHAR_INTERVAL: CharInterval(start_pos=213, end_pos=228)
_TOKEN_INTERVAL: TokenInterval(start_index=34, end_index=38)
ALIGNMENT_STATUS: AlignmentStatus.MATCH_FUZZY
EXTRACTION_INDEX: 3
GROUP_INDEX: 2
DESCRIPTION: None
ATTRIBUTES: {'date': '2026-01-01', 'type': 'implementation'}

E

## Custom example

In [None]:
TC_ARTICLE = """Shortly after Hunter Lightman joined OpenAI as a researcher in 2022, he watched his colleagues launch ChatGPT, one of the fastest-growing products ever. Meanwhile, Lightman quietly worked on a team teaching OpenAI’s models to solve high school math competitions.

Today that team, known as MathGen, is considered instrumental to OpenAI’s industry-leading effort to create AI reasoning models: the core technology behind AI agents that can do tasks on a computer like a human would.

“We were trying to make the models better at mathematical reasoning, which at the time they weren’t very good at,” Lightman told TechCrunch, describing MathGen’s early work.

OpenAI’s models are far from perfect today — the company’s latest AI systems still hallucinate and its agents struggle with complex tasks.

But its state-of-the-art models have improved significantly on mathematical reasoning. One of OpenAI’s models recently won a gold medal at the International Math Olympiad, a math competition for the world’s brightest high school students. OpenAI believes these reasoning capabilities will translate to other subjects, and ultimately power general-purpose agents that the company has always dreamed of building.

ChatGPT was a happy accident — a lowkey research preview turned viral consumer business — but OpenAI’s agents are the product of a years-long, deliberate effort within the company.

“Eventually, you’ll just ask the computer for what you need and it’ll do all of these tasks for you,” said OpenAI CEO Sam Altman at the company’s first developer conference in 2023. “These capabilities are often talked about in the AI field as agents. The upsides of this are going to be tremendous.”

Whether agents will meet Altman’s vision remains to be seen, but OpenAI shocked the world with the release of its first AI reasoning model, o1, in the fall of 2024. Less than a year later, the 21 foundational researchers behind that breakthrough are the most highly sought-after talent in Silicon Valley.

Mark Zuckerberg recruited five of the o1 researchers to work on Meta’s new superintelligence-focused unit, offering some compensation packages north of $100 million. One of them, Shengjia Zhao, was recently named chief scientist of Meta Superintelligence Labs.

The reinforcement learning renaissance
The rise of OpenAI’s reasoning models and agents are tied to a machine learning training technique known as reinforcement learning (RL). RL provides feedback to an AI model on whether its choices were correct or not in simulated environments.

RL has been used for decades. For instance, in 2016, about a year after OpenAI was founded in 2015, an AI system created by Google DeepMind using RL, AlphaGo, gained global attention after beating a world champion in the board game, Go.

By 2018, OpenAI pioneered its first large language model in the GPT series, pretrained on massive amounts of internet data and a large clusters of GPUs. GPT models excelled at text processing, eventually leading to ChatGPT, but struggled with basic math.

It took until 2023 for OpenAI to achieve a breakthrough, initially dubbed “Q*” and then “Strawberry,” by combining LLMs, RL, and a technique called test-time computation. The latter gave the models extra time and computing power to plan and work through problems, verifying its steps, before providing an answer.

This allowed OpenAI to introduce a new approach called “chain-of-thought” (CoT), which improved AI’s performance on math questions the models hadn’t seen before.

“I could see the model starting to reason,” said El Kishky. “It would notice mistakes and backtrack, it would get frustrated. It really felt like reading the thoughts of a person.”

Though individually these techniques weren’t novel, OpenAI uniquely combined them to create Strawberry, which directly led to the development of o1. OpenAI quickly identified that the planning and fact checking abilities of AI reasoning models could be useful to power AI agents.

“We had solved a problem that I had been banging my head against for a couple of years,” said Lightman. “It was one of the most exciting moments of my research career.”

Scaling reasoning
With AI reasoning models, OpenAI determined it had two new axes that would allow it to improve AI models: using more computational power during the post-training of AI models, and giving AI models more time and processing power while answering a question.

“OpenAI, as a company, thinks a lot about not just the way things are, but the way things are going to scale,” said Lightman.

Shortly after the 2023 Strawberry breakthrough, OpenAI spun up an “Agents” team led by OpenAI researcher Daniel Selsam to make further progress on this new paradigm, two sources told TechCrunch. Although the team was called “Agents,”  OpenAI didn’t initially differentiate between reasoning models and agents as we think of them today. The company just wanted to make AI systems capable of completing complex tasks.

Eventually, the work of Selsam’s Agents team became part of a larger project to develop the o1 reasoning model, with leaders including OpenAI co-founder Ilya Sutskever, chief research officer Mark Chen, and chief scientist Jakub Pachocki.

OpenAI would have to divert precious resources — mainly talent and GPUs — to create o1. Throughout OpenAI’s history, researchers have had to negotiate with company leaders to obtain resources; demonstrating breakthroughs was a surefire way to secure them.

“One of the core components of OpenAI is that everything in research is bottom up,” said Lightman. “When we showed the evidence [for o1], the company was like, ‘This makes sense, let’s push on it.’”

Some former employees say that the startup’s mission to develop AGI was the key factor in achieving breakthroughs around AI reasoning models. By focusing on developing the smartest-possible AI models, rather than products, OpenAI was able to prioritize o1 above other efforts. That type of large investment in ideas wasn’t always possible at competing AI labs.

The decision to try new training methods proved prescient. By late 2024, several leading AI labs started seeing diminishing returns on models created through traditional pretraining scaling. Today, much of the AI field’s momentum comes from advances in reasoning models.

What does it mean for an AI to “reason?”
In many ways, the goal of AI research is to recreate human intelligence with computers. Since the launch of o1, ChatGPT’s UX has been filled with more human-sounding features such as “thinking” and “reasoning.”

When asked whether OpenAI’s models were truly reasoning, El Kishky hedged, saying he thinks about the concept in terms of computer science.

“We’re teaching the model how to efficiently expend compute to get an answer. So if you define it that way, yes, it is reasoning,” said El Kishky.

Lightman takes the approach of focusing on the model’s results and not as much on the means or their relation to human brains.

“If the model is doing hard things, then it is doing whatever necessary approximation of reasoning it needs in order to do that,” said Lightman. “We can call it reasoning, because it looks like these reasoning traces, but it’s all just a proxy for trying to make AI tools that are really powerful and useful to a lot of people.”

OpenAI’s researchers note people may disagree with their nomenclature or definitions of reasoning — and surely, critics have emerged — but they argue it’s less important than the capabilities of their models. Other AI researchers tend to agree.

Nathan Lambert, an AI researcher with the non-profit AI2, compares AI reasoning modes to airplanes in a blog post. Both, he says, are manmade systems inspired by nature — human reasoning and bird flight, respectively — but they operate through entirely different mechanisms. That doesn’t make them any less useful, or any less capable of achieving similar outcomes.

A group of AI researchers from OpenAI, Anthropic, and Google DeepMind agreed in a recent position paper that AI reasoning models are not well understood today, and more research is needed. It may be too early to confidently claim what exactly is going on inside them.

The next frontier: AI agents for subjective tasks
The AI agents on the market today work best for well-defined, verifiable domains such as coding. OpenAI’s Codex agent aims to help software engineers offload simple coding tasks. Meanwhile, Anthropic’s models have become particularly popular in AI coding tools like Cursor and Claude Code — these are some of the first AI agents that people are willing to pay up for.

However, general purpose AI agents like OpenAI’s ChatGPT Agent and Perplexity’s Comet struggle with many of the complex, subjective tasks people want to automate. When trying to use these tools for online shopping or finding a long-term parking spot, I’ve found the agents take longer than I’d like and make silly mistakes.

Agents are, of course, early systems that will undoubtedly improve. But researchers must first figure out how to better train the underlying models to complete tasks that are more subjective.

“Like many problems in machine learning, it’s a data problem,” said Lightman, when asked about the limitations of agents on subjective tasks. “Some of the research I’m really excited about right now is figuring out how to train on less verifiable tasks. We have some leads on how to do these things.”

Noam Brown, an OpenAI researcher who helped create the IMO model and o1, told TechCrunch that OpenAI has new general-purpose RL techniques which allow them to teach AI models skills that aren’t easily verified. This was how the company built the model which achieved a gold medal at IMO, he said.

OpenAI’s IMO model was a newer AI system that spawns multiple agents, which then simultaneously explore several ideas, and then choose the best possible answer. These types of AI models are becoming more popular; Google and xAI have recently released state-of-the-art models using this technique.

“I think these models will become more capable at math, and I think they’ll get more capable in other reasoning areas as well,” said Brown. “The progress has been incredibly fast. I don’t see any reason to think it will slow down.”

These techniques may help OpenAI’s models become more performant, gains that could show up in the company’s upcoming GPT-5 model. OpenAI hopes to assert its dominance over competitors with the launch of GPT-5, ideally offering the best AI model to power agents for developers and consumers.

But the company also wants to make its products simpler to use. El Kishky says OpenAI wants to develop AI agents that intuitively understand what users want, without requiring them to select specific settings. He says OpenAI aims to build AI systems that understand when to call up certain tools, and how long to reason for.

These ideas paint a picture of an ultimate version of ChatGPT: an agent that can do anything on the internet for you, and understand how you want it to be done. That’s a much different product than what ChatGPT is today, but the company’s research is squarely headed in this direction.

While OpenAI undoubtedly led the AI industry a few years ago, the company now faces a tranche of worthy opponents. The question is no longer just whether OpenAI can deliver its agentic future, but can the company do so before Google, Anthropic, xAI, or Meta beat them to it?"""



In [None]:
# 1. Define a concise prompt
# The prompt lists the entity kinds to extract and asks for verbatim,
# non-overlapping spans; the attributes carry the "related entities" context.
prompt = textwrap.dedent("""\
Extract people's name, ai models, products and company names in order of appearance.
Use exact text for extractions. Do not paraphrase or overlap entities.
Provide meaningful related entities for each entity to add context.""")

# 2. Provide a high-quality example to guide the model
# The extraction_class labels used here ("person_name", "company_name",
# "ai_model", "product") are the ones the reporting cells below filter on.
# Every extraction_text is a verbatim substring of the example text.
examples = [
    lx.data.ExampleData(
        text=(
            "David Ha from Sakana AI labs has trained many models"
            " including the early 'WM1' and his company makes a product called 'AI Scientist' ."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="person_name",
                extraction_text="David Ha",
                attributes={"company": "Sakana AI"},
            ),
            lx.data.Extraction(
                extraction_class="company_name",
                extraction_text="Sakana AI",
                attributes={"employee": "David Ha"},
            ),
            lx.data.Extraction(
                extraction_class="ai_model",
                extraction_text="WM1",
                attributes={"company": "Sakana AI"},
            ),
            lx.data.Extraction(
                extraction_class="product",
                # Span excludes the surrounding quotes, consistent with how
                # 'WM1' is annotated above, so the example shows one convention.
                extraction_text="AI Scientist",
                attributes={"company": "Sakana AI"},
            ),
        ],
    )
]

# 3. Run the extraction on your input text
# TC_ARTICLE is the article string defined in an earlier cell.
input_text = TC_ARTICLE
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)

[94m[1mLangExtract[0m: model=[92mgemini-2.5-flash[0m, current=[92m2,307[0m chars, processed=[92m11,412[0m chars:  [00:25]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m74[0m entities ([1m4[0m unique types)
  [96m•[0m Time: [1m25.86s[0m
  [96m•[0m Speed: [1m442[0m chars/sec
  [96m•[0m Chunks: [1m12[0m





In [None]:
# Bare last expression: the notebook displays the repr of the object
# assigned to `result` by the lx.extract call above.
result

AnnotatedDocument(extractions=[Extraction(extraction_class='character', extraction_text='Lady Juliet', char_interval=CharInterval(start_pos=0, end_pos=11), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'emotional_state': 'longing'}), Extraction(extraction_class='emotion', extraction_text='longingly', char_interval=CharInterval(start_pos=18, end_pos=27), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'feeling': 'yearning'}), Extraction(extraction_class='emotion', extraction_text='her heart aching', char_interval=CharInterval(start_pos=42, end_pos=58), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=2, description=None, attributes={'feeling': 'sorrow'}), Extraction(extraction_class='character', extraction_text='Romeo', char_interval=CharInterval(start_pos=63, end_pos=68), alignment

### Print out the people mentioned

In [None]:
# Walk the extractions and report only those labelled "person_name":
# class, matched text, attributes and character interval, then a separator.
people = (item for item in result.extractions if item.extraction_class == "person_name")
for person in people:
    print(person.extraction_class)
    print(person.extraction_text)
    print(person.attributes)
    print(person.char_interval)
    print("====================")

person_name
Hunter Lightman
{'company': 'OpenAI'}
CharInterval(start_pos=14, end_pos=29)
person_name
Sam Altman
{'company': 'OpenAI'}
CharInterval(start_pos=1512, end_pos=1522)
person_name
Mark Zuckerberg
{'company': 'Meta'}
CharInterval(start_pos=2002, end_pos=2017)
person_name
Shengjia Zhao
{'company': 'Meta Superintelligence Labs'}
CharInterval(start_pos=2181, end_pos=2194)
person_name
El Kishky
None
CharInterval(start_pos=3568, end_pos=3577)
person_name
Lightman
{'company': 'OpenAI'}
CharInterval(start_pos=4077, end_pos=4085)
person_name
Daniel Selsam
{'company': 'OpenAI'}
CharInterval(start_pos=4660, end_pos=4673)
person_name
Ilya Sutskever
{'company': 'OpenAI'}
CharInterval(start_pos=5125, end_pos=5139)
person_name
Mark Chen
{'company': 'OpenAI'}
CharInterval(start_pos=5164, end_pos=5173)
person_name
Jakub Pachocki
{'company': 'OpenAI'}
CharInterval(start_pos=5195, end_pos=5209)
person_name
Lightman
{'company': 'OpenAI'}
CharInterval(start_pos=5558, end_pos=5566)
person_name
El K

## Companies mentioned

In [None]:
# Gather every "company_name" extraction once, in document order.
company_hits = [
    item for item in result.extractions if item.extraction_class == "company_name"
]

# Distinct company strings across all mentions.
unique_companies = {hit.extraction_text for hit in company_hits}

# Per-mention listing: matched text and its character interval.
for hit in company_hits:
    print(hit.extraction_text)
    print(hit.char_interval)
    print("====================")

print(unique_companies)

OpenAI
CharInterval(start_pos=37, end_pos=43)
TechCrunch
CharInterval(start_pos=613, end_pos=623)
OpenAI
CharInterval(start_pos=893, end_pos=899)
Meta
CharInterval(start_pos=2066, end_pos=2070)
Meta Superintelligence Labs
CharInterval(start_pos=2234, end_pos=2261)
OpenAI
CharInterval(start_pos=2315, end_pos=2321)
Google DeepMind
CharInterval(start_pos=2671, end_pos=2686)
OpenAI
CharInterval(start_pos=2794, end_pos=2800)
OpenAI
CharInterval(start_pos=3754, end_pos=3760)
OpenAI
CharInterval(start_pos=4603, end_pos=4609)
TechCrunch
CharInterval(start_pos=4738, end_pos=4748)
OpenAI
CharInterval(start_pos=5500, end_pos=5506)
OpenAI
CharInterval(start_pos=6575, end_pos=6581)
OpenAI
CharInterval(start_pos=7303, end_pos=7309)
AI2
CharInterval(start_pos=7602, end_pos=7605)
Anthropic
CharInterval(start_pos=7955, end_pos=7964)
Google DeepMind
CharInterval(start_pos=7970, end_pos=7985)
OpenAI
CharInterval(start_pos=8332, end_pos=8338)
Anthropic
CharInterval(start_pos=8425, end_pos=8434)
Perplexity

In [None]:
# Report each extraction labelled as an AI model or a product:
# its class, then its matched text, then a separator line.
wanted_classes = ("ai_model", "product")
for found in result.extractions:
    if found.extraction_class not in wanted_classes:
        continue
    print(found.extraction_class)
    print(found.extraction_text)
    print("====================")


product
ChatGPT
product
general-purpose agents
ai_model
ChatGPT
product
OpenAI’s agents
product
agents
ai_model
o1
ai_model
AlphaGo
ai_model
GPT series
ai_model
ChatGPT
ai_model
Q*
ai_model
Strawberry
ai_model
Strawberry
ai_model
o1
ai_model
o1
ai_model
o1
product
o1
ai_model
ChatGPT
ai_model
Codex
product
Cursor
product
Claude Code
product
ChatGPT Agent
product
Comet
ai_model
IMO model
ai_model
o1
ai_model
GPT-5
product
ChatGPT
