In [1]:
import pandas as pd

In [2]:
# Load the raw parquet dataset and print its (rows, columns) shape as a quick sanity check.
df = pd.read_parquet("../../data/raw/4_channels_predictions_09_2023_09_2024.parquet")
print(df.shape)

(39131, 11)


In [3]:
# Parse `start` as datetimes (mutates `df` in place), then count rows per
# calendar month (formatted YYYY-MM) to see which period the data covers.
df.start = pd.to_datetime(df.start)
df.start.dt.strftime("%Y-%m").value_counts()
# Optional date-window filter, currently disabled:
# df = df.loc[(df.start >= pd.to_datetime("2023-12-01")) & (df.start <= pd.to_datetime("2024-05-01"))]

start
2024-02    3000
2023-09    2966
2024-01    2966
2023-08    2876
2023-11    2589
2023-05    2501
2024-04    2432
2024-03    2399
2024-08    2393
2023-07    2342
2023-12    2263
2024-07    2241
2023-10    2068
2024-05    2022
2023-06    1947
2024-06    1737
2024-09     334
2023-04      55
Name: count, dtype: int64

In [4]:
# List the available columns; `text` is the column later sent for claim extraction.
df.columns

Index(['start', 'text', 'channel_name', 'channel_is_radio',
       'channel_program_type', 'channel_program', 'themes', 'keywords',
       'num_keywords', 'num_tokens', 'claims'],
      dtype='object')

In [5]:
import litellm
import openai
from litellm import acompletion, completion_cost
from pydantic import BaseModel, Field
from typing import Callable, Awaitable, Union
from functools import wraps
import asyncio
from tqdm.asyncio import tqdm


class MediatreePrediction(BaseModel):
    """Record pairing a predicted label with token-usage and cost figures.

    NOTE(review): this model is not referenced anywhere else in the visible
    part of the notebook — confirm whether it is still needed.
    """

    cards_label_pred: str  # predicted label, stored as a string
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    cost: float  # presumably the monetary cost of the call — TODO confirm unit
    claim_pred: str | None = None  # optional; defaults to None


class MediatreeClaimIdentifier(BaseModel):
    """One extracted claim.

    The three fields and their (French) descriptions mirror the JSON object
    layout requested by the claim-extraction prompt in this notebook.
    """

    # Description: "Short, clear reformulation of the claim to verify".
    claim: str = Field(
        description="Reformulation courte et claire de l'affirmation à vérifier"
    )
    # Description: "A single paragraph summarising the context needed to understand the claim".
    context: str = Field(
        description="Un seul paragraphe résumant le contexte essentiel pour comprendre l'affirmation"
    )
    # Description: "Exact quote from the text containing the claim".
    quote: str = Field(description="Citation exacte du texte contenant l'affirmation")


class MediatreeClaimIdentifierResponse(BaseModel):
    """Container for a list of extracted claims plus token-usage and cost figures.

    Every field accepts ``None`` but none declares a default value.
    NOTE(review): not referenced elsewhere in the visible part of the notebook.
    """

    claims: Union[list[MediatreeClaimIdentifier], None]  # extracted claims, or None
    prompt_tokens: Union[int, None]
    completion_tokens: Union[int, None]
    total_tokens: Union[int, None]
    cost: Union[float, None]  # presumably the monetary cost of the call — TODO confirm unit

In [6]:
import json


def extract_claims_batch_prepare(
    texts: list[str],
    model: str = "gpt-4o-mini",
    max_tokens: int = 2000,
    temperature: float = 0.01,
) -> list[str]:
    """Build one batch request line (a JSON string) per input text.

    Each line is a ``POST /v1/chat/completions`` request whose system message
    is the (French) claim-extraction prompt below and whose user message is the
    stripped text. The request's ``custom_id`` is
    ``claim_extract_text_id_<position>``, where ``<position>`` is the 0-based
    position of the text in ``texts``; downstream cells rely on that suffix to
    map results back to their source row.

    Args:
        texts: Texts to analyse, in the order that defines the ``custom_id``.
        model: Chat model name written into every request body.
        max_tokens: Completion token limit written into every request body.
        temperature: Sampling temperature written into every request body.

    Returns:
        A list of JSON strings, one per text, in input order (empty for empty
        input). Join them with newlines to obtain a JSONL batch file.
    """
    system_prompt = """
Tu es un assistant spécialisé dans l'analyse de désinformation environnementale.

TÂCHE:
Analyse l'extrait de transcription TV/Radio fourni et identifie les affirmations (claims) qui nécessitent une vérification factuelle sur les thèmes suivants:
- Changement climatique
- Transition écologique
- Énergie
- Biodiversité
- Pollution
- Pesticides
- Ressources naturelles (eau, minéraux, etc.)


FORMAT DE RÉPONSE:
Tu dois OBLIGATOIREMENT répondre au format JSON suivant:
{
    "claims": [
        {
            "claim": "Reformulation courte et claire de l'affirmation à vérifier",
            "context": "Un seul paragraphe résumant le contexte essentiel pour comprendre l'affirmation",
            "quote": "Citation exacte du texte contenant l'affirmation"
        }
    ]
}

RÈGLES IMPORTANTES:
1. Inclure UNIQUEMENT les affirmations vérifiables sur les thèmes environnementaux
2. Chaque claim doit être unique
3. Le format JSON doit être strictement respecté
4. Si aucune affirmation à vérifier n'est trouvée, renvoyer un tableau claims vide
5. Maximum 3 claims par analyse

Analyse maintenant le texte suivant:"""
    # Key order is kept stable so the serialized lines are byte-identical to
    # what the hard-coded version produced when the defaults are used.
    return [
        json.dumps(
            {
                "custom_id": f"claim_extract_text_id_{index}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": text.strip()},
                    ],
                    "model": model,
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                    "response_format": {"type": "json_object"},
                },
            }
        )
        for index, text in enumerate(texts)
    ]


# Write the batch requests as JSONL: one request per line, joined with "\n"
# (no trailing newline). This is the file the batch cell uploads.
with open("../../data/processed/batch/claim_extract_batch_lines.jsonl", "w") as f:
    f.write("\n".join(extract_claims_batch_prepare(df.text.tolist())))

In [7]:
import tiktoken

# Token count of the whole serialized batch payload. The batch lines are
# regenerated here, and the count covers the full JSON of every request
# (system prompt repeated per request, JSON envelope included), so it is a
# size estimate of the payload rather than of the transcripts alone.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
len(encoding.encode("\n".join(extract_claims_batch_prepare(df.text.tolist()))))

47438077

In [8]:
import time

# Set to True to submit the batch to OpenAI and download its results; when
# False, the previously downloaded results file is reloaded from disk.
RUN_BATCH = False

result_file_name = (
    "../../data/processed/batch/claim_extract_batch_lines_results.jsonl"
)

if RUN_BATCH:
    client = openai.OpenAI()
    # Context manager so the upload handle is closed even if the request fails.
    with open(
        "../../data/processed/batch/claim_extract_batch_lines.jsonl", "rb"
    ) as batch_input:
        batch_file = client.files.create(
            file=batch_input,
            purpose="batch",
        )
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )

    # Poll once per second until the job reaches a terminal state, showing a
    # small "..." spinner on a single console line.
    idx = 0
    while True:
        batch_job = client.batches.retrieve(batch_job.id)
        if batch_job.status == "completed":
            break
        # Without this check a failed/expired/cancelled batch would be polled forever.
        if batch_job.status in ("failed", "expired", "cancelled"):
            raise RuntimeError(
                f"Batch {batch_job.id} ended with status {batch_job.status!r}: "
                f"{batch_job.errors}"
            )
        idx += 1
        if idx > 3:
            idx = 1
            print(batch_job.status, "   ", end="\r")
        print(batch_job.status, "." * idx, end="\r", sep=" ")
        time.sleep(1)
    print(batch_job)
    result_file_id = batch_job.output_file_id
    if result_file_id is None:
        raise RuntimeError(
            f"Batch {batch_job.id} completed without an output file "
            f"(error_file_id={batch_job.error_file_id})"
        )
    result = client.files.content(result_file_id).content

    with open(result_file_name, "wb") as file:
        file.write(result)
    json_lines = result.decode("utf-8").splitlines()
else:
    # The results were written as raw bytes above, so decode them as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(result_file_name, "r", encoding="utf-8") as file:
        result = file.read()
    json_lines = result.splitlines()

In [9]:
# Flatten the batch results into one record per extracted claim. A text for
# which no claim was returned still yields exactly one placeholder record
# (claim/context/quote set to None) so that every processed text is represented.
records = []
for json_line in json_lines:
    response = json.loads(json_line)
    content = json.loads(
        response["response"]["body"]["choices"][0]["message"]["content"]
    )
    # Read the "claims" key explicitly rather than iterating over every
    # top-level key: an unexpected extra key in the model output must not be
    # parsed as a claim list, and a missing/null "claims" means "no claim".
    claim_list = content.get("claims") or []
    if claim_list:
        records.extend(
            dict(
                custom_id=response["custom_id"],
                claim=claim.get("claim"),
                context=claim.get("context"),
                quote=claim.get("quote"),
            )
            for claim in claim_list
        )
    else:
        records.append(
            dict(custom_id=response["custom_id"], claim=None, context=None, quote=None)
        )

In [10]:
# Preview the first rows of the source data.
df.head()

Unnamed: 0_level_0,start,text,channel_name,channel_is_radio,channel_program_type,channel_program,themes,keywords,num_keywords,num_tokens,claims
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1dcd4b454f8bac42440259ce26a1a2192051186bb5728be489dc654a9a967d1d,2023-09-01 06:26:00,<unk> <unk> <unk> <unk> aerosmith en janvier m...,europe1,True,Information - Magazine,Bonjour,"[""biodiversite_concepts_generaux"", ""adaptation...","[{""keyword"": ""eaux"", ""timestamp"": 169354246806...",1,383,[{'analysis': 'Cette allégation nécessite une ...
0eb5805fa23e0819f817ea10fe1fccd19e61e40a1239cc93f701fd56bd8ea66f,2023-09-01 06:50:00,la très grande majorité d'entre eux ne connais...,europe1,True,Information - Magazine,Bonjour,"[""biodiversite_concepts_generaux"", ""changement...","[{""keyword"": ""eaux"", ""timestamp"": 169354380309...",3,593,[{'analysis': 'L'allégation semble être fondée...
b6d54aefb250671e7754a688411ce9e68badcad88a665be3de78996d13b74fd2,2023-09-01 07:38:00,mais titeuf ne vieillit pas le monde change ti...,europe1,True,Information - Magazine,Europe 1 Matin,"[""changement_climatique_constat""]","[{""keyword"": ""bilan carbone"", ""timestamp"": 169...",1,606,[{'analysis': 'Il est pertinent de vérifier si...
23c2d3b292d9ab0fb0d0b2b8c34f3b88708c79ffab2d6659accf565ba61f48ae,2023-09-01 08:44:00,dû travailler très vite le journal arrive à no...,europe1,True,Information - Magazine,Europe 1 Matin,"[""biodiversite_concepts_generaux"", ""ressources...","[{""keyword"": ""eaux"", ""timestamp"": 169355074908...",3,515,[{'analysis': 'Cette allégation est problémati...
71df0ce2b34afa23391d8e31d35ccd213ae2a881b7ce412813a06a60a2e47d3c,2023-09-01 08:46:00,pas avoir lieu ni même européens existait ni l...,europe1,True,Information - Magazine,Europe 1 Matin,"[""biodiversite_concepts_generaux"", ""changement...","[{""keyword"": ""\u00e9cologiste"", ""timestamp"": 1...",2,536,[{'analysis': 'L'allégation concernant la chut...


In [11]:
# Build the claim table and attach the source-row metadata.
# `idx` is the trailing integer of `custom_id`, i.e. the position of the text
# in the list that was sent to the batch; `df.reset_index()` exposes that same
# position as its index, so joining `idx` against it recovers the source row.
claim_detections = (
    pd.DataFrame(records)
    .assign(idx=lambda frame: frame.custom_id.str.split("_").str[-1].astype(int))
    .join(
        df.reset_index()[
            [
                "start",
                "text",
                "channel_name",
                "channel_is_radio",
                "channel_program_type",
                "channel_program",
            ]
        ],
        on="idx",
        how="left",
    )
)
# Preview, overall shape, and shape of the rows that actually carry a claim.
display(claim_detections.head(10))
display(claim_detections.shape)
display(claim_detections.loc[claim_detections.claim.notna()].shape)

Unnamed: 0,custom_id,claim,context,quote,idx,start,text,channel_name,channel_is_radio,channel_program_type,channel_program
0,claim_extract_text_id_0,,,,0,2023-09-01 06:26:00,<unk> <unk> <unk> <unk> aerosmith en janvier m...,europe1,True,Information - Magazine,Bonjour
1,claim_extract_text_id_1,Les propriétaires de campings peuvent économis...,L'émission discute d'une innovation dans les c...,pour les propriétaires de campings la facture ...,1,2023-09-01 06:50:00,la très grande majorité d'entre eux ne connais...,europe1,True,Information - Magazine,Bonjour
2,claim_extract_text_id_2,,,,2,2023-09-01 07:38:00,mais titeuf ne vieillit pas le monde change ti...,europe1,True,Information - Magazine,Europe 1 Matin
3,claim_extract_text_id_3,La CGT considère le nucléaire comme indispensa...,La CGT a quitté le collectif Alliance Écologie...,or pour la cgt le nucléaire fait partie du mix...,3,2023-09-01 08:44:00,dû travailler très vite le journal arrive à no...,europe1,True,Information - Magazine,Europe 1 Matin
4,claim_extract_text_id_4,Un dérèglement climatique a failli éradiquer l...,L'affirmation suggère qu'un événement climatiq...,vous avez deviné déjà un dérèglement climatiqu...,4,2023-09-01 08:46:00,pas avoir lieu ni même européens existait ni l...,europe1,True,Information - Magazine,Europe 1 Matin
5,claim_extract_text_id_5,,,,5,2023-09-01 11:30:00,petits plats de laurent mariotte le magazine c...,europe1,True,Information - Magazine,Pascal Praud et vous
6,claim_extract_text_id_6,La Renault Clio hybride permet jusqu'à 80% de ...,L'affirmation concerne les capacités de condui...,comme nouvelle renault clio tech full hybride ...,6,2023-09-01 11:44:00,des super produits et des supers prix découvre...,europe1,True,Information - Magazine,Pascal Praud et vous
7,claim_extract_text_id_6,La Renault Clio hybride a une autonomie de 900 km,L'affirmation porte sur l'autonomie totale de ...,et neuf cents km d'autonomie,6,2023-09-01 11:44:00,des super produits et des supers prix découvre...,europe1,True,Information - Magazine,Pascal Praud et vous
8,claim_extract_text_id_7,Les émissions de gaz à effet de serre provoque...,L'affirmation est faite dans le contexte de l'...,Les émissions de gaz à effet de serre provoque...,7,2023-09-01 12:02:00,à la gare sncf d'orange tout près du lycée pro...,europe1,True,Information - Magazine,Pascal Praud et vous
9,claim_extract_text_id_8,,,,8,2023-09-01 13:12:00,été en manque de braderie donc cette année on ...,europe1,True,Information - Magazine,Europe 1 Matin


(38145, 11)

(16623, 11)

In [12]:
# Persist the claim detections. No `index=False` is passed, so the DataFrame
# index is written as an extra leading column.
claim_detections.to_csv("../../data/processed/claim_detections_video_extractions.csv")

In [13]:
# System prompt for the classification step: lists categories 0-7 and asks for
# a JSON object with `cards_label` and `explanation`.
# NOTE(review): the prompt text contains typos ("happing", "are not the
# causing"); they are left untouched because editing the prompt changes what is
# sent to the model and would no longer match previously saved classifications.
SYSTEM_PROMPT = """
You are an environmental science researcher classifying climate-related claims using the CARDS system. Analyze the given text and categorize it according to these disinformation types:

1: Global warming is not happing. Climate change is NOT leading to melting ice (such as glaciers, sea ice, and permafrost), increased extreme weather, or rising sea levels. Cold weather also shows that climate change is not happening.
2: Greenhouse gases from humans are not the causing climate change.
3: The impacts of climate change will not be bad and might even be beneficial.
4: Climate solutions are harmful or unnecessary.
5: Climate science is uncertain, unsound, unreliable, or biased.
6: Climate scientists and proponents of climate action are alarmist, biased, wrong, hypocritical, corrupt, and/or politically motivated.
7: We need fossil fuels for economic growth, prosperity, and to maintain our standard of living.
0: No relevant claim detected.

Return a json with the following structure:
{
    "cards_label": "<category-number>",
    "explanation": "Your explanation for the classification"
}

"""


# User message template; filled with `str.format` using the `claim`, `context`
# and `quote` fields of one extracted claim.
USER_PROMPT_TEMPLATE = """
Claim to analyze:
Claim: {claim}
Context: {context}
Exact quote: {quote}
"""

In [35]:
from pydantic import BaseModel, Field
import asyncio


class MediatreeClaimClassification(BaseModel):
    """Classification of one claim: a label plus the explanation for it.

    Instances are built from the JSON object returned by the classification
    call, whose keys are expected to match these field names.
    """

    # Kept as a string; the system prompt asks for the category number in quotes.
    cards_label: str = Field(description="Label of the disinformation type")
    explanation: str = Field(description="Explanation for the classification")


# Limit concurrent requests to avoid API rate limiting
# (it depends on the model you use and your API tier).
# At most 40 holders at a time; re-running this cell creates a fresh semaphore.
semaphore = asyncio.Semaphore(40)


# Decorator factory used to throttle `acompletion` with a shared semaphore
def with_semaphore(acquire_semaphore):
    """Return a decorator that runs an async callable while holding a semaphore.

    Args:
        acquire_semaphore: Object usable in ``async with`` (here an
            ``asyncio.Semaphore``); it is entered before the wrapped coroutine
            function is awaited and exited once it finishes or raises.

    Returns:
        A decorator that takes an async callable and returns an async wrapper
        with the same metadata (via ``functools.wraps``) and the same result.
    """

    def decorator(func):
        async def guarded_call(*call_args, **call_kwargs):
            async with acquire_semaphore:
                outcome = await func(*call_args, **call_kwargs)
            return outcome

        return wraps(func)(guarded_call)

    return decorator


# Always wrap the pristine library function (`litellm.acompletion`) rather than
# whatever `acompletion` currently points to: re-running this cell would
# otherwise wrap the previous wrapper again and stack one semaphore per re-run.
acompletion = with_semaphore(semaphore)(litellm.acompletion)


async def classify(
    system_prompt: str, user_prompt: str
) -> MediatreeClaimClassification:
    """Classify one claim with a single chat completion in JSON mode.

    Args:
        system_prompt: Instructions describing the categories and the expected
            JSON structure.
        user_prompt: The claim to classify (claim, context and quote).

    Returns:
        The parsed classification (label and explanation).

    Raises:
        json.JSONDecodeError: If the model output is not valid JSON.
        pydantic.ValidationError: If the JSON lacks the expected fields.
    """
    response = await acompletion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="gpt-4o-mini",
        max_tokens=500,
        temperature=0,
        response_format={"type": "json_object"},
    )
    payload = json.loads(response.choices[0].message.content)
    # The prompt asks for a quoted category number, but the model may still
    # emit a bare number. pydantic v2 does not coerce numbers to `str`, so one
    # such answer would raise and abort the whole gather; normalise it here.
    if isinstance(payload, dict):
        label = payload.get("cards_label")
        if isinstance(label, float) and label.is_integer():
            label = int(label)
        if label is not None and not isinstance(label, str):
            payload["cards_label"] = str(label)
    return MediatreeClaimClassification(**payload)


async def classify_claims(claims_df: pd.DataFrame) -> pd.DataFrame:
    """Classify every non-null claim and attach the result to its own row.

    Rows whose ``claim`` is missing are not sent to the model and get missing
    values in the classification columns.

    Classifications are aligned with claims by row position, not by
    ``custom_id``: a single text can yield several claims sharing the same
    ``custom_id``, so joining on it would pair every claim of a text with every
    classification of that text (duplicated rows and mismatched labels).

    Args:
        claims_df: One row per claim, with at least the ``claim``, ``context``
            and ``quote`` columns.

    Returns:
        A DataFrame with the same rows, order and index as ``claims_df`` plus
        the classification columns (``cards_label``, ``explanation``).
    """
    # Work on a positional index so alignment is exact even when the caller's
    # index contains duplicates; the original index is restored before returning.
    positional_df = claims_df.reset_index(drop=True)
    # A single mask drives both the requests and the result index, so the two
    # can never disagree (e.g. on NaN vs None after a CSV round-trip).
    to_classify = positional_df.loc[positional_df.claim.notna()]

    claim_classifications: list[MediatreeClaimClassification] = await tqdm.gather(
        *[
            classify(
                SYSTEM_PROMPT,
                USER_PROMPT_TEMPLATE.format(
                    claim=claim.claim, context=claim.context, quote=claim.quote
                ),
            )
            for claim in to_classify.itertuples()
        ]
    )

    # `tqdm.gather` returns results in input order, so they line up with
    # `to_classify` row by row.
    claim_classifications_df = pd.DataFrame(
        [
            classification.model_dump(exclude_none=True)
            for classification in claim_classifications
        ],
        index=to_classify.index,
    )

    classified_df = pd.merge(
        positional_df,
        claim_classifications_df,
        left_index=True,
        right_index=True,
        how="left",
    )
    classified_df.index = claims_df.index

    return classified_df

In [37]:
# Classify all detected claims (top-level `await` works in a notebook cell);
# this issues one model call per non-null claim.
classified_claims_df = await classify_claims(claim_detections)

100%|██████████| 16623/16623 [13:41<00:00, 20.22it/s]


In [38]:
# Preview the classified claims; rows without a claim have no label or explanation.
classified_claims_df.head()

Unnamed: 0,custom_id,claim,context,quote,idx,start,text,channel_name,channel_is_radio,channel_program_type,channel_program,cards_label,explanation
0,claim_extract_text_id_0,,,,0,2023-09-01 06:26:00,<unk> <unk> <unk> <unk> aerosmith en janvier m...,europe1,True,Information - Magazine,Bonjour,,
1,claim_extract_text_id_1,Les propriétaires de campings peuvent économis...,L'émission discute d'une innovation dans les c...,pour les propriétaires de campings la facture ...,1,2023-09-01 06:50:00,la très grande majorité d'entre eux ne connais...,europe1,True,Information - Magazine,Bonjour,0.0,The claim discusses an innovation in camping t...
2,claim_extract_text_id_2,,,,2,2023-09-01 07:38:00,mais titeuf ne vieillit pas le monde change ti...,europe1,True,Information - Magazine,Europe 1 Matin,,
3,claim_extract_text_id_3,La CGT considère le nucléaire comme indispensa...,La CGT a quitté le collectif Alliance Écologie...,or pour la cgt le nucléaire fait partie du mix...,3,2023-09-01 08:44:00,dû travailler très vite le journal arrive à no...,europe1,True,Information - Magazine,Europe 1 Matin,4.0,The claim suggests that nuclear energy is esse...
4,claim_extract_text_id_4,Un dérèglement climatique a failli éradiquer l...,L'affirmation suggère qu'un événement climatiq...,vous avez deviné déjà un dérèglement climatiqu...,4,2023-09-01 08:46:00,pas avoir lieu ni même européens existait ni l...,europe1,True,Information - Magazine,Europe 1 Matin,0.0,The claim discusses a historical climate event...


In [39]:
# Persist all rows (including those without a claim). The index is written
# too, since `index=False` is not passed.
# NOTE(review): a later cell overwrites this same path with a filtered frame.
classified_claims_df.to_csv("../../data/processed/claim_detections_classified.csv")

In [1]:
import pandas as pd
# Restart point: reload the saved classifications instead of re-running the model calls.
# NOTE(review): column dtypes are inferred again on load, so `cards_label` may
# no longer be a string as it was in memory — confirm before comparing it to "0".
classified_claims_df = pd.read_csv("../../data/processed/claim_detections_classified.csv")

In [42]:
# Keep only rows that carry a claim and save them without the index.
# NOTE(review): this overwrites the file written by the earlier `to_csv` cell
# (same path), so the content of that file depends on which cell ran last.
classified_claims_df.loc[classified_claims_df.claim.notna()].to_csv(
    "../../data/processed/claim_detections_classified.csv", index=False
)

In [2]:
# Stratified 20% sample of the claims, drawn within each label.
# Groups on `cards_label`: no `category` column is ever created in this
# notebook, so grouping on it could not work. `GroupBy.sample` replaces
# `apply(lambda x: x.sample(...))` and keeps every column and the original row
# index; the fixed seed makes the sample reproducible.
stratified_sample = (
    classified_claims_df.loc[classified_claims_df.claim.notna()]
    .groupby("cards_label")
    .sample(frac=0.20, random_state=42)
)


AttributeError: 'DataFrame' object has no attribute 'claim'

In [45]:
# Export 100 random claims whose label is not 0 for manual review.
# The label is compared numerically: after a CSV round-trip `cards_label` is
# no longer the string "0", so `!= "0"` would be true for every row and
# label-0 claims would leak into the "no_0" sample. Labels that are missing or
# not numeric are kept, as they were before. The fixed seed makes the export
# reproducible.
classified_claims_df.loc[
    classified_claims_df.claim.notna()
    & (pd.to_numeric(classified_claims_df.cards_label, errors="coerce") != 0),
    [
        "idx",
        "claim",
        "cards_label",
        "text",
        "context",
        "quote",
    ],
].sample(100, random_state=42).to_csv(
    "../../data/processed/claim_detections_classified_sample_no_0.csv", index=False
)