## **Guardian Agent Test**

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load it; the return value is deliberately discarded.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)
# Plain indexing (not .get) so a missing key raises KeyError immediately.
cohere_api_key = os.environ["COHERE_API_KEY"]
guardian_api_key = os.environ["GUARDIAN_API_KEY"]

In [2]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from datetime import datetime

USER_AGENT environment variable not set, consider setting it to identify your requests.


### **1) Get data** -> Guardian API

In [3]:
# Accumulator for the raw article records fetched in the cells below.
results = list()

In [4]:
import requests, json

#1) us-news:
# Search the Guardian content API for the "us-news" section, asking for the
# headline, trailText and body fields of each hit.
# NOTE(review): the `-tone/minutebyminute` tag filter presumably excludes
# live blogs — confirm against the Guardian API docs.
response = requests.get(
    "https://content.guardianapis.com/search?tag=-tone/minutebyminute&section=us-news&show-fields=headline,trailText,body",
    headers={"api-key": guardian_api_key},
    timeout=10,
)
# Fail loudly on HTTP errors (bad key, rate limit, ...) instead of silently
# carrying an empty result list into the rest of the notebook.
response.raise_for_status()

# Plain assignment: this cell (re)starts the accumulated result list.
results = response.json().get("response", {}).get("results", [])
print(len(results))

10


In [5]:
#2) sport:
# Same search as the us-news cell, for the "sport" section.
response = requests.get(
    "https://content.guardianapis.com/search?tag=-tone/minutebyminute&section=sport&show-fields=headline,trailText,body",
    headers={"api-key": guardian_api_key},
    timeout=10,
)
# Fail loudly on HTTP errors instead of silently appending nothing.
response.raise_for_status()

# NOTE: `+=` extends the existing list, so re-running this cell on its own
# appends the section a second time; re-run from the us-news cell instead.
results += response.json().get("response", {}).get("results", [])
print(len(results))

20


In [6]:
#3) football:
# Same search as the us-news cell, for the "football" section.
response = requests.get(
    "https://content.guardianapis.com/search?tag=-tone/minutebyminute&section=football&show-fields=headline,trailText,body",
    headers={"api-key": guardian_api_key},
    timeout=10,
)
# Fail loudly on HTTP errors instead of silently appending nothing.
response.raise_for_status()

# NOTE: `+=` extends the existing list, so re-running this cell on its own
# appends the section a second time; re-run from the us-news cell instead.
results += response.json().get("response", {}).get("results", [])
print(len(results))

30


In [7]:
import html2text
from cleantext import clean

#HTML tags cleaner:
text_maker = html2text.HTML2Text()
text_maker.ignore_links = True

#useful material:
# One plain-text string per article: headline + trail text + cleaned body.
latest_articles = []

#STORE & CLEAN:
# NOTE: the cleaned body is written back into `results`, so re-running this
# cell without re-fetching cleans already-cleaned text.
for res in results:
    # setdefault (rather than .get) so the write-back below cannot raise
    # KeyError for a record that arrived without a "fields" entry.
    fields = res.setdefault("fields", {})
    #HEADLINE:
    headline = fields.get("headline", "").lower()
    if headline:
        headline = f"{headline}. "
    #TRAILTEXT:
    trailtext = fields.get("trailText", "").lower()
    if trailtext:
        trailtext = f"{trailtext}. "
    #BODY:
    body = fields.get("body", "")
    body = text_maker.handle(body)
    body = clean(body, no_emoji=True)
    # Drop ">" characters, then rebuild line breaks: blank lines and
    # sentence-ending newlines are parked as "###", every remaining newline
    # becomes a space, and the parked markers are restored as single newlines.
    body = (
        body.replace(">", "")
        .replace("\n\n", "###")
        .replace(".\n", ".###")
        .replace("\n", " ")
        .replace("###", "\n")
    )
    fields["body"] = body
    #CONCATENATE:
    latest_articles.append(f"{headline}{trailtext}{body}")


In [8]:
# Snapshot the current `results` list to a timestamped JSON file.
# NOTE(review): the timestamp contains ':' which is not a valid filename
# character on Windows — consider a different separator if portability matters.
snapshot_name = f"{datetime.now().strftime('%Y-%m-%d_%H:%M')}test.json"
# Context manager so the handle is flushed and closed even if dumping fails.
with open(snapshot_name, "w") as snapshot_file:
    json.dump(results, snapshot_file)

In [9]:
# Quick sanity check: how many articles were assembled, and what the first looks like.
article_count = len(latest_articles)
print(article_count)
first_article = latest_articles[0]
print(first_article)

30
former pennsylvania fire chief identified as victim killed at trump rally. corey comperatore, 50, was a ‘hero’ who ‘had so much life left to experience’, his sister says. corey comperatore, a former fire chief of the buffalo township volunteer fire company in pennsylvania, has been identified as the victim who was shot and killed amidst an assassination attempt on former us president donald trump on saturday.
"he was a hero that shielded his daughters. his wife and girls just lived through the unthinkable and unimaginable," comperatore's sister, dawn comperatore schafer, said in a post on facebook.
"my baby brother just turned 50 and had so much life left to experience.
hatred has no limits and love has no bounds. pray for my sister-in-law, nieces, my mother, sister, me and his nieces and nephews as this feels like a terrible nightmare but we know it is our painful reality." two other rally attendants were wounded.
comperatore's wife and daughter described the scene at the rally, st

### **2) Vectorize & prepare retriever**

In [10]:
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for chunks of at most 1000 characters with 200
# characters of overlap; add_start_index=True stores a "start_index" entry
# in each chunk's metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

# create_documents() both splits the article strings and wraps every chunk in
# a Document, so its output is already the final list of chunks.
documents = text_splitter.create_documents(latest_articles)
print(documents[0])
print(documents[1])
# Reuse the chunks as-is. Passing them through split_documents() again would
# only re-split already-sized chunks and overwrite each chunk's start_index
# with an offset relative to the chunk itself.
all_splits = documents


page_content='former pennsylvania fire chief identified as victim killed at trump rally. corey comperatore, 50, was a ‘hero’ who ‘had so much life left to experience’, his sister says. corey comperatore, a former fire chief of the buffalo township volunteer fire company in pennsylvania, has been identified as the victim who was shot and killed amidst an assassination attempt on former us president donald trump on saturday.
"he was a hero that shielded his daughters. his wife and girls just lived through the unthinkable and unimaginable," comperatore's sister, dawn comperatore schafer, said in a post on facebook.
"my baby brother just turned 50 and had so much life left to experience.
hatred has no limits and love has no bounds. pray for my sister-in-law, nieces, my mother, sister, me and his nieces and nephews as this feels like a terrible nightmare but we know it is our painful reality." two other rally attendants were wounded.' metadata={'start_index': 0}
page_content='comperatore's 

In [11]:
from langchain_cohere import CohereEmbeddings
from langchain_community.vectorstores import Chroma

#Create Vector DB:
# Embed every chunk with Cohere and index the vectors in a Chroma store.
# NOTE(review): CohereEmbeddings() is built with no arguments, so it presumably
# picks up its API key and model from the environment/defaults — confirm.
embeddings = CohereEmbeddings()
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
)

In [12]:
#Init retriever:
# Similarity search that returns 6 chunks per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=6),
)

Test retriever:

In [13]:
# Smoke-test the retriever with a sample query and show how many chunks came back.
sample_query = "Is Donald Trump dead?"
retrieved_docs = retriever.invoke(sample_query)
len(retrieved_docs)

Failed to get info from https://api.smith.langchain.com: LangSmithError('Failed to GET /info in LangSmith API. latin-1\n…\n0\n1\nordinal not in range(256)')
Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. latin-1\n…\n0\n1\nordinal not in range(256)')


6

In [14]:
# Print the text of each retrieved chunk, one after another.
for hit in retrieved_docs:
    chunk_text = hit.page_content
    print(chunk_text)

aides to trump's election campaign said on sunday morning that he was fine**,** in "great spirits and doing well".
the former president called for national unity and resilience, in a country deeply divided amid the tumultuous 2024 election, where joe biden's re- election campaign as the presumptive democratic nominee has been thrown into crisis by his shaky performance while trump is expected to be anointed as the republican nominee at the party's convention this week with a hard-right agenda.
trump posted on his social media platform: "i knew immediately that something was wrong in that i heard a whizzing sound, shots, and immediately felt the bullet ripping through the skin ... much bleeding took place," then further posted on sunday that "it is more important than ever that we stand united, and show our true character as americans, remaining strong and determined, and not allowing evil to win", also adding that "it was god alone who prevented the unthinkable from happening".
as trum

### **3) Add LLM**

In [None]:
from langchain_cohere import ChatCohere
from langchain_core.messages import HumanMessage

#Init Foundation LLM:
# NOTE(review): ChatCohere() is built with no arguments, so it presumably
# reads its API key and model from the environment/defaults — confirm.
llm = ChatCohere()

#test:
# Send a single human message and print the returned message object.
messages = [HumanMessage(content="knock knock")]
reply = llm.invoke(messages)
print(reply)

In [26]:
from langchain.prompts import ChatPromptTemplate

#Build prompt:
# The template has two placeholders, {question} and {context}; the RAG chain
# must supply both under exactly these names. The text inside the triple
# quotes is sent to the model verbatim, so any edit changes model behaviour.
template = """You are a journalist and you are giving answers to the audience's questions. Use the following pieces of retrieved context to answer the question. If you know the answer, use five sentences maximum and be as detailed as possible. If you don't know the answer, say that you don't have any information on that, then briefly tell the user what you know on the requested topic in two sentences maximum.

Question: {question}

Context: {context}

Answer:
"""

# Wrap the raw string in a chat prompt template object.
prompt = ChatPromptTemplate.from_template(template)

Format documents:

In [17]:
# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents separated by a blank line
        (two newline characters); an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [entry.page_content for entry in docs]
    return separator.join(contents)

Define RAG chain:

In [27]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# First stage: map the incoming question onto the two prompt variables.
# "context" runs the retriever and flattens its documents to text, while
# "question" forwards the input unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build inputs -> fill prompt -> call the LLM -> plain string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [28]:
# Ask the RAG chain a question; the returned string is the cell's output.
question = "Is Donald Trump alive?"
rag_chain.invoke(question)

'Yes, Donald Trump is alive. The former president and current Republican nominee for the 2024 election survived an assassination attempt in western Pennsylvania. Trump sustained a non-life-threatening injury to his right ear and is said to be in "great spirits."'

In [29]:
# Same topic phrased the opposite way, to compare the two answers.
question = "Is Donald Trump dead?"
rag_chain.invoke(question)

'No, Donald Trump is not dead. The former president and current Republican nominee for the 2024 election is in "great spirits and doing well" after surviving an apparent assassination attempt at a rally in western Pennsylvania.'

In [30]:
# Stream the answer and print each piece as it arrives, without added newlines.
question = "Who just won Wimbledon?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Carlos Alcaraz just won Wimbledon, defeating Novak Djokovic in the men's singles final. This is Alcaraz's second consecutive Wimbledon title and fourth major title overall. The final score was 6-2, 6-2, 7-6 (4), with Alcaraz dominating and closing out the match with a brilliant drop shot winner in the tiebreak. At 21 years old, Alcaraz is the first man since Roger Federer to win his first four Grand Slam finals.

In [31]:
# Stream the answer and print each piece as it arrives, without added newlines.
question = "Is Jannik Sinner still playing in Wimbledon?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

I don't have any information on whether Jannik Sinner is still playing in Wimbledon. However, I can tell you that the 2024 Wimbledon tournament is currently ongoing, with the men's singles final having taken place on July 14.

In [32]:
# A question unrelated to the articles, to see how the chain responds.
question = "Who are you?"
rag_chain.invoke(question)

"I am a journalist reporting on various events and stories, including the English celebrations for the football team's victory, the Trump rally shooting, and the endurance runner Russ Cook."

In [33]:
# A deliberately vague question, answered from retrieved context alone.
question = "Which big game?"
rag_chain.invoke(question)

'The big game is the Euro 2024 final between England and Spain at the Olympiastadion in Berlin.'

In [34]:
# A conversational question addressed to the model itself.
question = "Are you ready for the football match?"
rag_chain.invoke(question)

"Yes, I am ready for the final match of Euro 2024 between England and Spain. The game will take place in Berlin, and England's manager Gareth Southgate has stated that his team is ready to face their opponents. This will be a thrilling match, and I am excited to see who will lift the trophy."

In [37]:
# Ask the RAG chain a question; the returned string is the cell's output.
question = "Is Italy still playing in the European Championship?"
rag_chain.invoke(question)

'No, Italy is not still playing in the European Championship. The context mentions that England lost to Italy in the final of the last Euros. However, it does not specify when this took place.'

In [36]:
# Ask the RAG chain a question; the returned string is the cell's output.
question = "When is the final?"
rag_chain.invoke(question)

'The final of the Euros 2024 will take place on Sunday night at the Olympiastadion in Berlin, Germany.'

In [38]:
# Question asked in Italian; stream the answer piece by piece.
question = "Dimmi in italiano chi gioca la finale degli europei di calcio?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

La Spagna affronterà l'Inghilterra nella finale degli Europei di calcio 2024. Questa sarà la quinta apparizione della Spagna in una finale europea, mentre l'Inghilterra ha raggiunto la finale per la prima volta. La Spagna ha vinto tre dei suoi quattro precedenti apparizioni in finale, mentre l'Inghilterra spera di vincere il suo primo titolo europeo. La partita si preannuncia entusiasmante, con entrambe le squadre determinate a conquistare la vittoria.