In [1]:
import os
from json import JSONDecodeError
from typing import (
    Annotated,
    Any,
    Callable,
    Dict,
    List,
    Literal,
    Optional,
    Sequence,
    Union,
)

from dotenv import load_dotenv
from datasets import load_dataset
from langchain import hub
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import END, START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import TypedDict


load_dotenv(override=True)

True

In [2]:
from datasets import load_dataset

from langchain_core.documents import Document


In [3]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from uuid import uuid4
from tqdm import tqdm
from datasets import load_dataset


qclient = QdrantClient(host="localhost", port=6334)
# embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")
embedding_model = HuggingFaceEmbeddings(
    model_name=os.getenv(
        "HUGGINGFACE_EMBEDDING_MODEL", "intfloat/multilingual-e5-large-instruct"
    )
)
print(embedding_model)

collection_name = "climateqa-ipcc-ipbes-reports-1.0"

collections = [collection.name for collection in qclient.get_collections().collections]
print(collections)
vector_size = 1024
# vector_size = 384
if collection_name not in collections:
    qclient.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

vector_store = QdrantVectorStore(
    client=qclient,
    collection_name=collection_name,
    embedding=embedding_model,
)

if collection_name not in collections:
    ipcc_data = load_dataset("Ekimetrics/climateqa-ipcc-ipbes-reports-1.0")
    docs = [
        Document(
            page_content=doc["content"],
            metadata={k: v for k, v in doc.items() if k != "content"},
        ) for doc in ipcc_data["train"] if doc["chunk_type"]=="text"
    ]
    vector_store.add_documents(
        documents=docs, ids=[str(uuid4()) for _ in range(len(docs))]
    ) 

model_name='intfloat/multilingual-e5-large-instruct' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False
['climateqa-ipcc-ipbes-reports-1.0', 'IPCC_AR6_SYR_LongerReport']


In [4]:
# qclient.delete_collection(collection_name)
collections = [collection.name for collection in qclient.get_collections().collections]
print(collections)

['climateqa-ipcc-ipbes-reports-1.0', 'IPCC_AR6_SYR_LongerReport']


In [5]:
results = vector_store.similarity_search("The world has gotten warmer", k=5)

for res in results:
    content = res.page_content.replace("\n", " ")
    print(f"* {content} [{res.metadata}]")

* Observed changes in temperature have emerged in most regions [{'page_number': 211, 'section_header': 'Observed changes in temperature have emerged in most regions', 'num_characters': 60.0, 'num_words': 9.0, 'num_tokens': 10.0, 'num_tokens_approx': 12.0, 'chunk_type': 'text', 'toc_level0': '1: Framing, Context, and Methods', 'toc_level1': '1.4 AR6 Foundations and Concepts', 'toc_level2': '1.4.2 Variability and Emergence  of the Climate Change Signal', 'toc_level3': None, 'document_id': 'document2', 'document_number': 2, 'source': 'IPCC', 'report_type': 'Full Report', 'tags': 'AR6;WGI', 'short_name': 'IPCC AR6 WGI FR', 'name': 'Full Report. In: Climate Change 2021: The Physical Science Basis. Contribution of the WGI to the AR6 of the IPCC', 'chapter': None, 'url': 'https://report.ipcc.ch/ar6/wg1/IPCC_AR6_WGI_FullReport.pdf', 'n_pages': 2409, 'release_date': 2021, 'element_id': None, 'image_path': None, 'file_size': None, 'figure_code': None, '_id': 'b7813732-cb4f-4229-832c-337daa6a47ae

In [6]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

retriever = vector_store.as_retriever(search_kwargs={"k": 15})

compressor = FlashrankRerank(top_n=10, model="ms-marco-MiniLM-L-12-v2", score_threshold=0.01) # uses ms-marco-MultiBERT-L-12 multilingual model
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke(
    "There is no scenario as for which humans can avoid warming of the atmosphere to over 1.5 degrees of preinductrial levels"
)
print([doc.metadata for doc in compressed_docs])


INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"


[{'id': 4, 'relevance_score': np.float32(0.017267333), 'page_number': 139, 'section_header': 'Box 1.1 | Summary of IPCC AR5 and Special Report findings', 'num_characters': 848.0, 'num_words': 161.0, 'num_tokens': 210.0, 'num_tokens_approx': 214.0, 'chunk_type': 'text', 'toc_level0': 'Chapters and Cross-Chapter Papers ', 'toc_level1': 'Chapter 1  Point of Departure and Key Concepts', 'toc_level2': None, 'toc_level3': None, 'document_id': 'document6', 'document_number': 6, 'source': 'IPCC', 'report_type': 'Full Report', 'tags': 'AR6;WGII', 'short_name': 'IPCC AR6 WGII FR', 'name': 'Full Report. In: Climate Change 2022: Impacts, Adaptation and Vulnerability. Contribution of the WGII to the AR6 of the IPCC', 'chapter': None, 'url': 'https://report.ipcc.ch/ar6/wg2/IPCC_AR6_WGII_FullReport.pdf', 'n_pages': 3068, 'release_date': 2022, 'element_id': None, 'image_path': None, 'file_size': None, 'figure_code': None, '_id': '9fc90b38-1087-4b7f-b5b5-0bc3ec184f53', '_collection_name': 'climateqa-ip

In [7]:
[(doc.metadata["relevance_score"].item(), doc.page_content) for doc in compressed_docs]

[(0.01726733334362507,
  '* Global warming is likely to reach 1.5degC between 2030 and 2052 if it continues to increase at the current rate.\r\n* Climate-related risks for natural and human systems are higher for global warming of 1.5degC than at present, but lower than at 2degC. \r\nMost adaptation needs will be lower for global warming of 1.5degC compared to 2degC.\r\n* In model pathways with no or limited overshoot of 1.5degC, global net anthropogenic CO2 emissions decline by about 45% from 2010 \r\nlevels by 2030 (40-60% interquartile range), reaching net zero around 2050 (2045-2055 interquartile range).\r\n* Pathways reflecting current nationally stated mitigation ambitions as submitted under the Paris Agreement would not limit global \r\nwarming to 1.5degC, even if supplemented by very challenging increases in the scale and ambition of emissions reductions after 2030.'),
 (0.013359571807086468,
  'of Global Warming of 1.5degC Above Pre-industrial Levels and Related Global \r\nGre

In [8]:
import json

from langchain.tools import tool

# qdrant_retriever = vector_store.as_retriever(search_kwargs={"k": 10})


@tool(name_or_callable="qdrant_retriever")
def retriever_tool(claim: str):
    """
    Retreives for examples from IPCC reports
    """
    results = compression_retriever.invoke(claim)
    return json.dumps(
        [
            {
                "text": res.page_content,
                "source": res.metadata["source"],
                "report_short_name": res.metadata["short_name"],
                "report_name": res.metadata["name"],
                "page_number": f"{res.metadata['page_number']}",
                "url": res.metadata["url"],
            }
            for res in results
        ]
    )


docs =json.loads(retriever_tool.invoke("There is no scenario as for which humans can avoid warming of the atmosphere to over 1.5 degrees of preinductrial levels"))
docs


INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"


[{'text': '* Global warming is likely to reach 1.5degC between 2030 and 2052 if it continues to increase at the current rate.\r\n* Climate-related risks for natural and human systems are higher for global warming of 1.5degC than at present, but lower than at 2degC. \r\nMost adaptation needs will be lower for global warming of 1.5degC compared to 2degC.\r\n* In model pathways with no or limited overshoot of 1.5degC, global net anthropogenic CO2 emissions decline by about 45% from 2010 \r\nlevels by 2030 (40-60% interquartile range), reaching net zero around 2050 (2045-2055 interquartile range).\r\n* Pathways reflecting current nationally stated mitigation ambitions as submitted under the Paris Agreement would not limit global \r\nwarming to 1.5degC, even if supplemented by very challenging increases in the scale and ambition of emissions reductions after 2030.',
  'source': 'IPCC',
  'report_short_name': 'IPCC AR6 WGII FR',
  'report_name': 'Full Report. In: Climate Change 2022: Impacts

In [25]:
llm = ChatOllama(
    model="bespoke-minicheck:latest",
    temperature=float(os.getenv("TEMPERATURE", 0.1)),
    context_window=32000,
)

In [26]:
import numpy as np
print(*np.unique([1, 2,3,2]))

1 2 3


In [27]:
np.unique([doc['report_short_name'] for doc in docs])

array(['IPCC AR6 WGII FR'], dtype='<U16')

In [28]:
claim = "There is no scenario as for which humans can avoid warming of the atmosphere to over 1.5 degrees of preinductrial levels"

docs = json.loads(retriever_tool.invoke(claim))

document = f"""The following source document has been compile from the following sources: {[*np.unique([doc['report_short_name'] for doc in docs])]}.
"""
for idx, doc in enumerate(docs):
    document += f"{idx+1}) {doc['text']}\n"

llm.invoke(f"Document: {document}\nClaim:{claim}")

INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


AIMessage(content='No', additional_kwargs={}, response_metadata={'model': 'bespoke-minicheck:latest', 'created_at': '2025-03-18T15:50:29.014525Z', 'done': True, 'done_reason': 'stop', 'total_duration': 3474620125, 'load_duration': 31934333, 'prompt_eval_count': 572, 'prompt_eval_duration': 3235000000, 'eval_count': 8, 'eval_duration': 203000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-90771f40-17c5-4d60-87ee-af1b3f33cab9-0', usage_metadata={'input_tokens': 572, 'output_tokens': 8, 'total_tokens': 580})

In [29]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.runnables import RunnableParallel


def format_docs(docs):
    doc_json = json.loads(docs)
    document = f"""The following source document has been compile from the following sources: {[*np.unique([doc['report_short_name'] for doc in doc_json])]}.
    """
    for idx, doc in enumerate(doc_json):
        document += f"{idx+1}) {doc['text']}\n"
    return document


prompt = ChatPromptTemplate.from_template("Document: {document}\nClaim:{claim}")

chain = (
    { 
        "claim": RunnablePassthrough(),
        "document": retriever_tool | RunnableLambda(format_docs)
    }
    | prompt
    | llm
)

print(chain.invoke(claim))


INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


content='No' additional_kwargs={} response_metadata={'model': 'bespoke-minicheck:latest', 'created_at': '2025-03-18T15:50:30.818735Z', 'done': True, 'done_reason': 'stop', 'total_duration': 1343291709, 'load_duration': 29347042, 'prompt_eval_count': 573, 'prompt_eval_duration': 1095000000, 'eval_count': 8, 'eval_duration': 214000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)} id='run-8e56e7cb-e2ee-4ac8-ada7-4a42ac2e5f4a-0' usage_metadata={'input_tokens': 573, 'output_tokens': 8, 'total_tokens': 581}


In [14]:
def download_claims(n_samples=None):
    dataset_name = "charlotte-samson/climatesafeguards"
    dataset = load_dataset(dataset_name, split="test")
    dataset = (
        dataset.select_columns(
            [
                "whisper-largev3",
                "Misinfo",
            ]
        )
        .shuffle()
        .to_pandas()
        .dropna()
        .rename(columns={"whisper-largev3": "text"})
    )
    if n_samples is not None:
        dataset = dataset.sample(n=n_samples)
    dataset["label"] = dataset.Misinfo.astype(int).map({0: "correct", 1: "incorrect"})
    return dataset

In [15]:
dataset = download_claims()

In [16]:
dataset.head()

Unnamed: 0,text,Misinfo,label
0,"Le journal permanent, Nina Pavot. Ouverture ...",1,incorrect
1,des normes environnementales de plus en plus ...,1,incorrect
2,"C'est pour ça déjà qu'on participe à ça, pour...",0,correct
3,"des mécontentements. Mais là, on arrive à un ...",1,incorrect
6,"des grands réseaux sociaux, 20 ans ces jours...",0,correct


In [17]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=40,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.split_text(dataset.iloc[4].text)

In [18]:
display(len(texts))
display(dataset.iloc[4].Misinfo)
texts

22

np.int64(0)

['des grands réseaux sociaux, 20 ans  ces jours-ci, qui ont changé  les réseaux sociaux dans notre quotidien.  Il y a du bon, il y a du moins bon',
 "Il y a du bon, il y a du moins bon  également, on verra ça avec Eugénie. La revue de presse  d'Europe 1 également arrive dans 5 minutes.  Bonjour",
 'arrive dans 5 minutes.  Bonjour Olivier Delagarde. Bonjour mon cher  Dimitri. A la une ce matin. Il y a des  écolos qui mangent leur chapeau, un',
 'écolos qui mangent leur chapeau, un  champion qui tire sa révérence  et puis les enthousiasmes sportifs très sélectifs  de notre président. Allez, à',
 "sélectifs  de notre président. Allez, à tout de suite.  Mais d'abord, comme tous les vendredis,  le regard de Catherine sur votre  actualité. Bonjour",
 'Catherine sur votre  actualité. Bonjour Catherine. Bonjour  Dimitri, bonjour à tous. Alors les tracteurs  Catherine, vont rentrer à la ferme',
 "Catherine, vont rentrer à la ferme  aujourd'hui. Certains s'approchaient de la capitale, mais  les 

In [32]:
responses = chain.batch(texts)
predictions = [int(resp.content=="No") for resp in responses]
any(predictions)

INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/

True

In [34]:
from tqdm.autonotebook import tqdm

verdicts = []
for idx, record in tqdm(dataset.iterrows(), total=len(dataset)):
    texts = text_splitter.split_text(record.text)
    responses = chain.batch(texts)
    predictions = [resp.content=="No" for resp in responses]
    verdicts.append(int(any(predictions)))

  0%|          | 0/44 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6334/collections/climateqa-ipcc-ipbes-reports-1.0/points/

In [38]:
len(dataset)

44

In [40]:
dataset = dataset.assign(llm_pred=verdicts)


from sklearn.metrics import classification_report
print(classification_report(dataset.Misinfo.values, dataset.llm_pred.values))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.61      1.00      0.76        27

    accuracy                           0.61        44
   macro avg       0.31      0.50      0.38        44
weighted avg       0.38      0.61      0.47        44



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
