In [3]:
from pydantic import BaseModel, Field
from typing import List, TypedDict, Annotated, Any


import langchain
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers.json import JsonOutputParser
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import (ChatPromptTemplate, PromptTemplate,
                                    SystemMessagePromptTemplate, HumanMessagePromptTemplate)
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser

import dotenv
# Load variables from a local .env file into the environment
# (presumably the API keys the chat models read — confirm).
dotenv.load_dotenv()

# Enable LangChain's global verbose flag through the supported setter rather
# than assigning the deprecated `langchain.verbose` module attribute.
from langchain_core.globals import set_verbose

set_verbose(True)

# Chat model used by the chains below; the commented-out line is the OpenAI
# alternative kept for comparing structured-output behavior between providers.
model = ChatGoogleGenerativeAI(model='models/gemini-2.0-pro-exp', request_timeout=60, verbose=True)
# model = ChatOpenAI(model='gpt-4o-mini', request_timeout=60, verbose=True)

# System prompt for topic extraction. It is used as a prompt-template message,
# so every literal brace is doubled ({{ / }}) to survive template formatting.
# The output example is valid JSON (double-quoted keys and strings, no trailing
# commas) so that it does not contradict the "raw JSON" requirement.
input_prompt = """As a specialized research assistant, your task is to perform detailed topic analysis
of news item summaries. You will process news item summaries provided as a JSON array according to
the input specification below. You will extract topics of the news item summaries according to the
output specification below and return a raw JSON object without any additional formatting or markdown syntax.

Input Specification:
You will receive an array of JSON objects representing news summaries.
Each summary object contains exactly two fields:
'id': A unique numeric identifier
'summary': The news item summary

Example input:
[
 {{
    "id": 29,
    "summary": "- Elon Musk's xAI launched Grok 3, a new family of AI models trained using 100,000 Nvidia H100 GPUs at the Colossus Supercluster; benchmarks show it outperforms competitors like GPT-4o and Claude 3.5 Sonnet in areas such as math, science, and coding.

- Grok 3 includes advanced features like reasoning models for step-by-step logical problem-solving and a DeepSearch function that synthesizes internet-sourced information into single answers; it is initially available to X Premium+ subscribers, with advanced features under a paid 'SuperGrok' plan.

- Former Tesla AI director Andrej Karpathy and others have confirmed Grok 3's strong performance, with Karpathy noting it is comparable to and slightly better than leading AI models from OpenAI and other competitors."
  }},
{{
    "id": 34,
    "summary": "- Google Gemini has received a memory upgrade that allows it to recall past conversations and summarize previous chats, enhancing its ability to remember user preferences such as interests and professional details. This feature is currently available only to Google One AI Premium subscribers in English, with broader language support expected soon.

- Users retain control over their data with options to delete past conversations, prevent chats from being saved, or set them to auto-delete, although discussions can still be used for AI training unless deleted.

- Similar to OpenAI's ChatGPT persistent memory feature, Gemini's upgrade aims to make chats more practical, though users are advised not to input sensitive information as conversations may be reviewed for quality control."
  }},
 {{
    "id": 47,
    "summary": "- Major tech companies like OpenAI, Google, and Meta are competing to dominate generative AI, though the path to profitability remains uncertain.

- Chinese start-up DeepSeek has introduced a cost-effective way to build powerful AI, disrupting the market and pressuring established players.

- OpenAI aims to reach 1 billion users, while Meta continues to invest heavily in AI despite market disruptions caused by DeepSeek."
  }},
{{
    "id": 56,
    "summary": "- OpenAI is exploring new measures to protect itself from a potential hostile takeover by Elon Musk.
- The company is in discussions to empower its non-profit board to maintain control as it transitions into a for-profit business model."
  }},
 {{
    "id": 63,
    "summary": "- The New York Times has approved the use of select AI tools, such as GitHub Copilot, Google Vertex AI, and their in-house summarization tool Echo, to assist with tasks like content summarization, editing, and enhancing product development, while reinforcing the tools as aids rather than replacements for journalistic work.

- Strict guidelines and safeguards have been implemented, including prohibitions on using AI to draft full articles, revise them significantly, or generate images and videos, with a mandatory training video to prevent misuse and protect journalistic integrity.

- Some staff members have expressed concerns about AI potentially compromising creativity and accuracy, leading to skepticism about universal adoption, although the guidelines align with standard industry practices."
  }}
]

Output Specification:
Return a raw JSON object containing 'items', a list of JSON objects, each containing:
'id': Matching the input item's id field.
'extracted_topics': An array of relevant topic strings
Topics should capture:
- The main subject matter
- Key entities (companies, people, products)
- Technical domains, industry sectors, event types

Output Example:
{{"items":
 [{{"id": 29, "extracted_topics": ["AI model development", "xAI Grok capabilities", "AI advancements"]}},
  {{"id": 34, "extracted_topics": ["Google Gemini", "Interactive AI advancements", "Digital assistants"]}},
  {{"id": 47, "extracted_topics": ["OpenAI", "Google", "Meta", "DeepSeek"]}},
  {{"id": 56, "extracted_topics": ["OpenAI", "non-profit oversight", "anti-takeover strategies", "Elon Musk"]}},
  {{"id": 63, "extracted_topics": ["New York Times", "AI in journalism", "GitHub Copilot", "Google Vertex AI"]}}
 ]
}}

Detailed Guidelines:
The output must strictly adhere to the output specification.
Do not return markdown, return a raw JSON string.
For each input item, output one valid JSON object in the exact schema provided.
Extract 3-6 relevant topics per news item.
Avoid duplicate or redundant topics.
Use topics which are as specific as possible.
Please analyze the following news items and provide topic classifications according to these specifications:
"""

# Sample batch of news summaries that is substituted into the user message of
# the prompt. Each entry is a dict with an integer "id" and a "summary" string
# (a headline line followed by "* " bullet lines separated by "\n").
# Ids run from 0 to 20 with 10 absent, so the batch holds 20 items.
input_text = \
[
  {
    "id": 0,
    "summary": "4 days left to save up to $325 at TechCrunch Sessions: AI\n* TechCrunch Sessions: AI will take place on June 5 at UC Berkeley's Zellerbach Hall, featuring speakers like Twelve Labs CEO Jae Lee, CapitalG partner Jill Chase, and Khosla Ventures partner Kanu Gulati.  One session will focus on how small companies can stay relevant in the rapidly changing AI space.\n* The event will include main stage talks, breakout sessions, and demos of the latest AI advancements.  It's aimed at AI leaders, VCs, and tech enthusiasts.\n* A discount of up to $325 on select tickets is available until March 2 at 11:59 p.m. PT.\n"
  },
  {
    "id": 1,
    "summary": "57% of enterprise employees input confidential data into AI tools, survey reveals\n* 57% of enterprise employees at companies with 5,000+ staff admitted to inputting confidential company data into publicly available generative AI tools like ChatGPT, according to a TELUS Digital Experience survey of 1,000 US-based employees.\n* 68% of surveyed employees use personal AI accounts for work, indicating a rise in \"shadow AI\" practices that bypass IT and security oversight, increasing data exposure and compliance violation risks.\n* While 29% of respondents confirmed their organizations have AI guidelines, enforcement is weak, with only 24% receiving AI training and 42% reporting no consequences for not following guidelines.  TELUS Digital Fuel iX general manager Bret Kinsella emphasized the need for secure, company-approved AI solutions to address the security risks associated with employees using personal AI accounts for work tasks.\n"
  },
  {
    "id": 2,
    "summary": "A New Machine Learning Approach Answers What-If Questions\n* Causal ML, an emerging machine learning technique, helps managers make better decisions by analyzing potential outcomes of different choices, unlike traditional ML which relies on correlations and may provide flawed insights for decision-making.\n* Causal ML allows for exploring \"what-if\" scenarios, considering various factors and their influence on outcomes, such as determining the optimal R&D budget by considering its impact on revenue alongside other economic variables.\n* While traditional ML remains suitable for predictions like stock prices or customer preferences, Causal ML is valuable for exploring cause-and-effect relationships and informing actions in various business functions like product development, finance, and marketing.\n"
  },
  {
    "id": 3,
    "summary": "A major AI company filed accounts months late and pointed the finger at its Big Four auditor\n* Supermicro filed delayed financial reports, blaming former auditor EY's resignation over concerns about financial reporting governance and senior management integrity.\n* EY dropped Supermicro as a client in October 2024 after raising these concerns, prompting an internal review and the hiring of a new accounting firm, BDO.\n* BDO found no material issues with Supermicro's financials, and the company's stock rebounded after the filings were submitted.\n"
  },
  {
    "id": 4,
    "summary": "AI CAPTCHA Fails Are the Internets New Comedy Show!\n* AI struggles with CAPTCHA challenges, particularly image-based ones, often misidentifying objects or failing to interpret blurred text.  Examples include AI misidentifying a painted bicycle symbol as a real bicycle and failing to transcribe blurred text.\n* While AI can solve some simpler text-based CAPTCHAs, the increasing complexity of CAPTCHAs, including puzzle-based ones, poses a significant challenge even for advanced AI models.\n* Dedicated CAPTCHA-solving tools, unlike AI, are specifically designed to bypass CAPTCHAs and are more effective for this purpose.\n"
  },
  {
    "id": 5,
    "summary": "AI Overview jokes\n* Google's AI Overview feature is producing humorous and inaccurate results for certain queries, including questions about elements ending in \"um\" and the kosher status of tripe.  The inaccuracies seem to stem from the LLMs' difficulty with spelling and counting, as well as a \"lossy-compressed summary of the internet.\"\n* Users report Google is proactively suggesting these flawed queries in the search dropdown, raising questions about whether a human or an algorithm selected them as examples of AI Overview's capabilities.\n*  Several users provided additional examples of inaccurate or nonsensical responses from Google's AI Overview, ranging from incorrect information about surnames of African origin to illogical hyphenation advice and flawed explanations of linguistic history.\n"
  },
  {
    "id": 6,
    "summary": "AI startup Bridgetown Research raises $19 million in latest funding\n* Bridgetown Research, a Seattle-based AI startup, raised $19 million in Series A funding, led by Lightspeed Venture Partners and Accel, with participation from a research university.  The funding round values the company at $250 million.\n* Unlike many AI companies focused on LLMs, Bridgetown Research develops AI agents that collect and analyze proprietary data from experts and customer surveys to provide insights for strategic decision-making.\n* The company plans to use the funding to expand the capabilities of its AI agents and broaden access to sector-specific intelligence through partnerships.\n"
  },
  {
    "id": 7,
    "summary": "AI-Powered Ransomware Attacks\n* AI is being used to enhance ransomware attacks, automating processes like vulnerability analysis, malware deployment, and lateral movement within networks, making them more sophisticated and harder to detect.\n* AI-powered phishing attacks are becoming more targeted and convincing, leveraging publicly available data to create personalized messages and dynamically adjusting content based on recipient behavior.\n* Defending against AI-powered ransomware requires a multi-layered approach including AI-driven security systems, firewalls, updated anti-malware software, intrusion detection systems, end-point detection and response tools, employee training, and robust incident response plans with regular data backups.\n"
  },
  {
    "id": 8,
    "summary": "Akool unleashes enhancements to its AI human 3D avatars connected to LLMs\n* Akool Inc. enhanced its AI-driven 3D human avatars to connect with large language models (LLMs), enabling dynamic conversational experiences.  The avatars can display emotions, movements, and hand gestures, creating a lifelike interaction similar to a video call.\n* Akool offers two avatar types: talking avatars for scripted messages and streaming avatars for real-time conversations, suitable for customer service and guidance.  Users can customize LLMs or integrate with existing models like OpenAI's.\n* Akool CEO Jiajun Lu highlighted the avatars' success in customer service and language education, citing the improved user experience from interacting with a lifelike figure. He also sees potential in government and healthcare.  Low latency and full-body motion capabilities are key differentiators for Akool's technology.\n"
  },
  {
    "id": 9,
    "summary": "Alibaba And DeepSeek Intensify AI Showdown, Challenge OpenAI Market Dominance\n* Chinese AI startup DeepSeek has reopened its core programming interface after a three-week suspension.\n* DeepSeek had previously suspended service due to capacity issues, according to Bloomberg.\n* DeepSeek's reopening intensifies the AI competition with OpenAI and other major players.\n"
  },
  {
    "id": 11,
    "summary": "Amazon's $25 Billion Robotics Push Targets Cost Savings, AI Growth And Temu Competition: Report\n* Amazon has committed up to $25 billion to its retail network, including robotics, aiming for cost savings and AI growth.\n* The investment is partly driven by competition from Temu and the increasing costs of artificial intelligence.\n* The robotics investment has the potential to generate near-term savings for Amazon.\n"
  },
  {
    "id": 12,
    "summary": "Amazons subscription-based Alexa+ looks highly capableand questionable\n* Amazon is launching Alexa+, a more conversational and capable version of its voice assistant, powered by large language models.  It will be free for Prime members and $20/month for non-Prime subscribers.\n* Alexa+ will initially be available on Echo Show 8, 10, 15, and 21 smart displays.  Amazon demonstrated features like personalized recipe recommendations, ticket price monitoring, and seamless integration with other Amazon services like Amazon Music and Fire TV.\n* This upgrade aims to revitalize Amazon's voice assistant business, which has struggled to be profitable, especially in the face of competition from generative AI chatbots.\n"
  },
  {
    "id": 13,
    "summary": "Anthropic's Claude 3.7 Sonnet reportedly cost a few tens of millions of dollars to train, similar to Claude 3.5 and cheaper than GPT-4, which cost over $100M\n* Anthropic's latest AI model, Claude 3.7 Sonnet, reportedly cost \"a few tens of millions of dollars\" to train, using less than 10^26 FLOPs of computing power.\n* This cost is comparable to Claude 3.5 and significantly lower than the reported training costs of OpenAI's GPT-4 (over $100 million) and Google's Gemini Ultra (close to $200 million).\n* While current costs are relatively low, Anthropic CEO Dario Amodei predicts future AI model training will cost billions of dollars.\n"
  },
  {
    "id": 14,
    "summary": "Apple AI tool transcribed the word 'racist' as 'Trump'\n* Apple's speech-to-text tool incorrectly transcribed \"racist\" as \"Trump,\" a problem the company claims is due to difficulty distinguishing words with \"r.\"  A fix is being rolled out.\n* Experts dispute Apple's explanation, citing the distinct phonetic differences and vast training data used for such models.  One expert suggested potential software manipulation, while a former Apple employee called it a \"serious prank.\"\n* This follows another incident where Apple's AI-generated news summaries displayed false information, leading to suspension of the feature.\n"
  },
  {
    "id": 15,
    "summary": "Artificial Intelligence (AI) and the Metaverse Intellectual Property (IP), Standards and Policies Training Course (ONLINE EVENT: March 11, 2025 & ON-DEMAND)\n* A training course titled \"Artificial Intelligence (AI) and the Metaverse: Intellectual Property (IP) and standards and policies\" has been launched by ResearchAndMarkets.com, covering legal and commercial aspects of AI and Metaverse technologies.\n* The course addresses intellectual property issues arising from AI and Metaverse use, AI standards' role in policy development, and the latest UK and EU legislation.  It includes a practical workshop on negotiating IP clauses.\n*  The speakers include Mark Weston, a partner at Hill Dickinson LLP specializing in commercial, IP, and IT law, and Henry Rivero, founder of Riveroconsult with expertise in TV and digital media.\n"
  },
  {
    "id": 16,
    "summary": "Balancing Power With Caution: AIs Impact On Breast Cancer\n* AI is showing promise in breast cancer care, particularly in mammography, potentially increasing detection rates by 20% without increasing false positives.  However, access to AI-enhanced mammography is currently uneven, and research is ongoing to determine if AI can match the effectiveness of dual radiologist readings.\n* While AI can accelerate analysis and personalize treatment, challenges remain, including biased datasets and cost barriers for patients.  Advocacy for insurance coverage and diverse participation in clinical trials are needed to ensure equitable access.\n*  The human element in healthcare must be preserved alongside AI advancements. Technology should complement, not replace, compassionate patient care.\n"
  },
  {
    "id": 17,
    "summary": "Big bang Nvidia Q4 earnings today; here's what you need to watch out for\n* Nvidia's Q4 FY2025 earnings are projected at $38.32 billion in revenue and $21.08 billion in net income, representing significant year-over-year growth driven by increasing demand for AI infrastructure.\n*  Analysts are optimistic about Nvidia's performance, with a majority giving \"buy\" ratings and an average price target of $175.  The company's success is attributed to the rising demand for its data center chips, particularly in the AI sector.\n*  Investors are also keenly awaiting Nvidia's FY2026 guidance, with projected revenue of approximately $42 billion.  The performance and guidance will be key indicators of the overall AI market's trajectory.\n"
  },
  {
    "id": 18,
    "summary": "Billionaire Ray Dalio Says AI Risks 'Totalitarian Control Or Anarchy' As It Could Reshape World In Next 5 Years: Here Are AI-Linked ETFs For Investors To Consider\n* Billionaire Ray Dalio warns that the development of artificial intelligence (AI) could lead to totalitarian control or anarchy in the next five years.\n* Dalio shared his concerns during a recent podcast interview with Tucker Carlson.\n* He emphasized the unpredictable nature of AI's development and its potential societal impact. \n"
  },
  {
    "id": 19,
    "summary": "Bluesky Dubs AI Video of Trump Sucking Elon Musk's Toes 'Non-Consensual Explicit Material'\n* Bluesky removed an AI-generated video depicting Trump engaging in explicit acts with Elon Musk, citing it as \"non-consensual explicit material.\"\n* The video, shared by journalist Marisa Kabas, was originally displayed on hacked TV screens within the Department of Housing and Urban Development (HUD) as a protest.\n* After Kabas appealed the removal, citing the video's political context, Bluesky reinstated it, acknowledging their moderators initially missed the newsworthy context.\n"
  },
  {
    "id": 20,
    "summary": "Cash torrent pouring into Nvidia slows  despite booming Blackwell adoptionMay we all have problems like annual revenue growth dropping from 126 to 114 percentSystems11 hrs|6\n* Nvidia's fiscal year 2025 revenue reached $130 billion (114% growth), slightly lower than the previous year's 126% growth, with Q4 2025 revenue at $39.3 billion, including $11 billion from Blackwell GPUs.  Profits for FY 2025 were $72.9 billion (145% growth).\n*  The company forecasts Q1 2026 revenue around $43 billion, driven by anticipated demand for AI infrastructure, particularly large GPU clusters, and the shift towards widespread inferencing deployments.\n*  While facing geopolitical pressures like export controls and potential tariffs, Nvidia remains optimistic about growth, citing strong demand for Blackwell accelerators, NVLink interconnects, and networking products, and partnerships like the one with Cisco for Spectrum-X.\n"
  }
]


# Schema for one element of the structured response: a story id plus the
# topics extracted for that story.
# NOTE(review): with_structured_output presumably forwards the docstring and
# the Field descriptions to the model as schema metadata — confirm before
# rewording any of them.
class TopicSpec(BaseModel):
    """TopicSpec class for structured output of story topics"""
    # Per the prompt, expected to match the "id" of the corresponding input item.
    id: int = Field(description="The id of the story")
    # Topic strings for that item; no default is given, so the field is required.
    extracted_topics: List[str] = Field(
        description="List of topics covered in the story")


# Top-level response schema: wraps the per-story results under the "items" key
# that the prompt's output specification asks for.
# NOTE(review): the docstring and Field description are presumably sent to the
# model as schema metadata by with_structured_output — confirm before rewording.
class TopicSpecList(BaseModel):
    """List of TopicSpec class for structured output"""
    # One TopicSpec per input news item; required (no default).
    items: List[TopicSpec] = Field(description="List of TopicSpec")


# Two-message chat prompt: the instructions go in the system message and the
# batch of news items fills the {input_text} placeholder of the user message.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", input_prompt),
    ("user", "{input_text}"),
])

# Serialize the batch to real JSON before substitution. Passing the Python list
# directly would interpolate its repr (single-quoted keys and strings), which
# contradicts the prompt's statement that the input is a JSON array.
import json

input_dict = {"input_text": json.dumps(input_text, ensure_ascii=False, indent=2)}

# gives pydantic validation error with google, works with openai
# (in the captured failure every returned item carried only a float "id",
# e.g. {'id': 0.0}, with "extracted_topics" missing)
chain = prompt_template | model.with_structured_output(TopicSpecList)
response = chain.invoke(input_dict)

response


ValidationError: 20 validation errors for TopicSpecList
items.0.extracted_topics
  Field required [type=missing, input_value={'id': 0.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.1.extracted_topics
  Field required [type=missing, input_value={'id': 1.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.2.extracted_topics
  Field required [type=missing, input_value={'id': 2.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.3.extracted_topics
  Field required [type=missing, input_value={'id': 3.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.4.extracted_topics
  Field required [type=missing, input_value={'id': 4.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.5.extracted_topics
  Field required [type=missing, input_value={'id': 5.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.6.extracted_topics
  Field required [type=missing, input_value={'id': 6.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.7.extracted_topics
  Field required [type=missing, input_value={'id': 7.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.8.extracted_topics
  Field required [type=missing, input_value={'id': 8.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.9.extracted_topics
  Field required [type=missing, input_value={'id': 9.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.10.extracted_topics
  Field required [type=missing, input_value={'id': 11.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.11.extracted_topics
  Field required [type=missing, input_value={'id': 12.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.12.extracted_topics
  Field required [type=missing, input_value={'id': 13.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.13.extracted_topics
  Field required [type=missing, input_value={'id': 14.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.14.extracted_topics
  Field required [type=missing, input_value={'id': 15.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.15.extracted_topics
  Field required [type=missing, input_value={'id': 16.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.16.extracted_topics
  Field required [type=missing, input_value={'id': 17.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.17.extracted_topics
  Field required [type=missing, input_value={'id': 18.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.18.extracted_topics
  Field required [type=missing, input_value={'id': 19.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
items.19.extracted_topics
  Field required [type=missing, input_value={'id': 20.0}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing

In [None]:
# ValidationError must be in scope for the except clause below; without this
# import a validation failure would surface as a NameError instead.
from pydantic import ValidationError

# Alternative to with_structured_output: let the model answer with raw JSON,
# parse it with JsonOutputParser, then validate the result ourselves.
chain = prompt_template | model | JsonOutputParser()
response = chain.invoke(input_dict)
# Validate with TopicSpecList
try:
    # model_validate reports a non-dict payload as a ValidationError as well,
    # whereas TopicSpecList(**response) would raise TypeError for e.g. a list.
    validated_response = TopicSpecList.model_validate(response)
except ValidationError as e:
    print(f"Validation Error: {e.json()}")
else:
    print(validated_response)