In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet("../../data/raw/4_channels_predictions_09_2023_09_2024.parquet")
print(df.shape)
df.start = pd.to_datetime(df.start)
cutoff = df.start.max() - pd.Timedelta(days=21)
# df = df[df.start >= cutoff]
# df = df.loc[(df.start >= pd.to_datetime("2023-12-01")) & (df.start <= pd.to_datetime("2024-06-01"))]
print(df.shape)

(39131, 11)
(39131, 11)


In [3]:
display(df.text.str.len().describe())
display(df.num_tokens.describe())

display(df.channel_is_radio.value_counts())
display(df.channel_program_type.value_counts())

count    39131.000000
mean      2118.706882
std        412.775075
min        122.000000
25%       1930.000000
50%       2203.000000
75%       2399.000000
max       3869.000000
Name: text, dtype: float64

count    39131.000000
mean       473.946973
std         93.818349
min         30.000000
25%        430.000000
50%        493.000000
75%        537.500000
max        827.000000
Name: num_tokens, dtype: float64

channel_is_radio
False    23725
True     15406
Name: count, dtype: int64

channel_program_type
Information en continu    19107
Information - Magazine    17420
Information - Journal      2604
Name: count, dtype: int64

In [4]:
import litellm
from litellm import acompletion, completion_cost
from pydantic import BaseModel, Field
from typing import Callable, Awaitable, Union
from functools import wraps
import asyncio
from tqdm.asyncio import tqdm


class MediatreePrediction(BaseModel):
    cards_label_pred: str
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    cost: float
    claim_pred: str | None = None


class MediatreeClaimIdentifier(BaseModel):
    context: str = Field(
        description="la phrase/paragraphe qui contient le contexte general"
    )
    quote: str = Field(description="Citation exacte du texte contenant l'affirmation")


class MediatreeClaimIdentifierResponse(BaseModel):
    claims: Union[list[MediatreeClaimIdentifier], None]
    prompt_tokens: Union[int, None]
    completion_tokens: Union[int, None]
    total_tokens: Union[int, None]
    cost: Union[float, None]


async def report_experiment_results(
    df: pd.DataFrame, predict_experiment: Callable[[str], Awaitable[None]]
) -> None:
    # Copy df to avoid modifications
    df = df.copy()

    # Run the experiment
    mediatree_predictions: list[MediatreeClaimIdentifierResponse] = await tqdm.gather(
        *[predict_experiment(text) for text in df["text"]]
    )
    # Create lists to store individual claims and their metadata
    rows = []

    for idx, pred in zip(df.index, mediatree_predictions):
        if pred is not None and pred.claims is not None:
            for claim in pred.claims:
                rows.append(
                    {
                        "id": idx,
                        "context": claim.context,
                        "quote": claim.quote,
                        "prompt_tokens": pred.prompt_tokens,
                        "completion_tokens": pred.completion_tokens,
                        "total_tokens": pred.total_tokens,
                        "cost": pred.cost,
                    }
                )

    # Create DataFrame with one claim per row
    mediatree_predictions_df = pd.DataFrame(rows)

    if not mediatree_predictions_df.empty:
        # Set id as index if there are any claims
        mediatree_predictions_df = mediatree_predictions_df.set_index("id")
    # df = pd.concat([df, mediatree_predictions_df], axis=1)
    df = df.merge(
        mediatree_predictions_df, left_index=True, right_index=True, how="outer"
    )

    # Show stats and performance
    show_llm_usage(df)

    return df


def show_llm_usage(df: pd.DataFrame) -> None:
    print("\nLLM USAGE\n=========\n")
    print(
        f"Median token usage:\n- Prompt: {int(df["prompt_tokens"].median())}\n"
        f"- Completion: {int(df["completion_tokens"].median())}\n"
        f"- Total: {int(df["total_tokens"].median())}"
    )
    print(f"\nTotal cost: ${df["cost"].sum():.3f}\n")


# Limit concurrent requests to avoid API rate limiting
# (it depends on the model you use and your API tier)
semaphore = asyncio.Semaphore(40)


# Decorator that ensures `acompletion` uses the semaphore
def with_semaphore(acquire_semaphore):
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            async with acquire_semaphore:
                return await func(*args, **kwargs)

        return wrapper

    return decorator


acompletion = with_semaphore(semaphore)(acompletion)

In [27]:
import json


async def extract_claims(text: str) -> MediatreeClaimIdentifierResponse:
    system_prompt = """
Tu es un assistant spécialisé dans l'analyse de désinformation environnementale.

TÂCHE:
Analyse l'extrait de transcription TV/Radio fourni et identifie les affirmations (claims) qui nécessitent une vérification factuelle sur les thèmes suivants:
- Changement climatique
- Transition écologique
- Énergie
- Biodiversité
- Pollution
- Pesticides
- Ressources naturelles (eau, minéraux, etc.)
- Sort la quote seulement si t'est sure qu'elle continent de la désinformation environnementale/climatique.
- Avec le quote il faute sortire le paragraphe qui contient le contexte general utilisant du texte avant et apres le quote. AU MOINS DEUX PHRASES.
- Pour exemple, le suivant texte:
"Merci monsieur Dupont, en fait je ne suis pas d'accord avec vous. Si on regard les dèrnieres jours, en France il fait plus froid que d'abitude ! Il y a pas de signes du rechauffement climatique !"

La quote est: "Il y a pas de signes du rechauffement climatique !"
Et le context est: "Si on regard les dèrnieres jours, en France il fait plus froid que d'abitude ! Il y a pas de signes du rechauffement climatique !"

FORMAT DE RÉPONSE:
Tu dois OBLIGATOIREMENT répondre au format JSON suivant:
{
    "claims": [
        {
            "context": "Un seul paragraphe résumant le contexte essentiel pour comprendre l'affirmation",
            "quote": "l'extrait de texte culpable de desinformation"
        }
    ]
}

RÈGLES IMPORTANTES:
1. Inclure UNIQUEMENT les affirmations vérifiables sur les thèmes environnementaux contenant de la désinformation environnementale/climatique
2. Chaque claim doit être unique
3. Le format JSON doit être strictement respecté
4. Si aucune affirmation à vérifier n'est trouvée, renvoyer les claims comme un json vide {"claims": []}
5. Maximum 3 claims par analyse
6. La quote doit être exactement la meme que dans le texte, et concener que la phrase incriminée.
7. Le context doit être une paragraphe extrait et pas re-elaboré contenant la quote, contenant du texte avant et apres le quote. AU MOINS DEUX PHRASES.

Analyse maintenant le texte suivant:

"""
    response = await acompletion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text.strip()},
        ],
        model="gpt-4o-mini",
        max_tokens=2000,
        temperature=0.01,
        response_format={ "type": "json_object" }
    )
    claim_identifier = response.choices[0].message.content
    claims = (
        [
            MediatreeClaimIdentifier(**claim)
            for claim in json.loads(claim_identifier)["claims"]
        ]
        if json.loads(claim_identifier)["claims"] != []
        else None
    )
    return (
        MediatreeClaimIdentifierResponse(
            claims=claims,
            prompt_tokens=response.usage.prompt_tokens,
            completion_tokens=response.usage.completion_tokens,
            total_tokens=response.usage.total_tokens,
            cost=completion_cost(response),
        )
        if claims is not None
        else None
    )


claim_detections = await report_experiment_results(df.iloc[:5000], extract_claims)

100%|██████████| 5000/5000 [03:02<00:00, 27.39it/s]


LLM USAGE

Median token usage:
- Prompt: 1028
- Completion: 224
- Total: 1251

Total cost: $0.433






In [28]:
claim_detections.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5274 entries, 000cd7d6be597f6a569a45803f2648b8d096c4c7153a7e98ea63f0e827474f8a to fffc87c2d670e5368f1a5e0f43a430c4fdf99bf5042f1c9c3ea7ec9b08f8490e
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   start                 5274 non-null   datetime64[ns]
 1   text                  5274 non-null   object        
 2   channel_name          5274 non-null   object        
 3   channel_is_radio      5274 non-null   bool          
 4   channel_program_type  5274 non-null   object        
 5   channel_program       5274 non-null   object        
 6   themes                5274 non-null   object        
 7   keywords              5274 non-null   object        
 8   num_keywords          5274 non-null   int64         
 9   num_tokens            5274 non-null   int64         
 10  claims                5274 non-null   object        
 11  context         

In [29]:
claim_detections = claim_detections.dropna(subset=["context", "quote"])

In [30]:
claim_detections.shape

(1415, 17)

In [31]:
claim_detections[["context", "quote", "text"]]


Unnamed: 0_level_0,context,quote,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000cd7d6be597f6a569a45803f2648b8d096c4c7153a7e98ea63f0e827474f8a,Les fortes pluies de cet automne ont fait débo...,avec le réchauffement climatique on va avoir d...,un charles guyard arrivent au pire moment comm...
004d1651ab85ba561a2a59e16bd67eb052acf4db8447f6ca7d1e8a22efed7bf0,Il est surveillé par satellite pour savoir com...,l'agriculture et je le cite une catastrophe hu...,capitalisme et le libre marché des normes envi...
004d1651ab85ba561a2a59e16bd67eb052acf4db8447f6ca7d1e8a22efed7bf0,Alors l'écrivaine du monde paysan marie-hélène...,ils veulent la décroissance mais sans en assum...,capitalisme et le libre marché des normes envi...
005ff4ddbceb67699166e6e785ef77df468b2ff8162f1d44e5630e1d7a2b5977,Alors que la zone b termine ses vacances d'hiv...,la crise climatique,les offres en cours conditions en magasin ou s...
00919bd64439601cc4584384cf3f0f61045f4629560b7def73c833f59fca662d,"Christophe Béchu, le ministre de la transition...",vous auriez pu faire exactement la même interv...,les interviews qui qu'il a donné un depuis deu...
...,...,...,...
fee7f2aae94f499a2466a2e0555d199dc0595cf88fdf68bf34990576ab70eca5,alors il y a au moins une annonce qui semble s...,abandon du no du principal indicateur utilisé ...,avant leur évacuation par les crs à saint-quen...
ff072840e917a5764089d0f56b67cd0fe0dbe7c760d9ad67071fe1d440da046c,sécheresse exceptionnelle qui faisait que les ...,les barrages étaient vides,sécheresse exceptionnelle qui faisait que les ...
ff072840e917a5764089d0f56b67cd0fe0dbe7c760d9ad67071fe1d440da046c,il faut rester toujours vigilants c'est pour ç...,les barrages sont pleins,sécheresse exceptionnelle qui faisait que les ...
ff0fe9f038a9ee3e214cb4d126b51df619b490311a04333d83a6c43b7ee884ce,Il y a un embargo sur le ring en ce moment et ...,il y a des sécheresses ou à des inondations qu...,petit bonheur la chance au petit bonheur la ch...


In [32]:
claim_detections[["context", "quote", "text"]].to_csv("../../data/processed/claim_detections_video_extractions.csv")

In [33]:
import tiktoken

INSTRUCTION_PROMPT = """You are an expert in disinformation on environmental and climate subjects, expert in climate science and know everything about the GIEC. I am going to give you series of claims coming from quotes on medias. Among these, some will contain climate disinformation. Your task is to find disinformation claim if they exist, and classify them following the pre-defined categories.

You must respond with a JSON object containing one field:
- "category": the classification label

<categories>
- "0_accepted": No relevant environmental disinformation claim detected.
- "1_not_happening": Global warming is not happing. Climate change is NOT leading to melting ice (such as glaciers, sea ice, and permafrost), increased extreme weather, or rising sea levels. Cold weather also shows that climate change is not happening.
- "2_not_human": Greenhouse gases from humans are not the causing climate change.
- "3_not_bad": The impacts of climate change will not be bad and might even be beneficial.
- "4_solutions_wont_work": Climate solutions are harmful or unnecessary.
- "5_science_is_unreliable": Climate science is uncertain, unsound, unreliable, or biased.
- "6_scientists_are_biased": Climate scientists and proponents of climate action are alarmist, biased, wrong, hypocritical, corrupt, and/or politically motivated.
- "7_need_fossil_fuel": We need fossil fuels for economic growth, prosperity, and to maintain our standard of living.
</categories>
"""

EXAMPLES_PROMPT = """
<examples>

<example>
<extract>
A warmer planet will actually be good for agriculture—more CO2 means bigger crops and longer growing seasons!
</extract>
<highlighted_quote>
more CO2 means bigger crops and longer growing seasons!
</highlighted_quote>
<response>
{
    "category": "3_not_bad"
}
</response>
</example>

<example>
<extract>
The data is clear: the Earth is warming due to human activities, and immediate action is required to mitigate its effects.
</extract>
<highlighted_quote>
the Earth is warming due to human activities, and immediate action is required to mitigate its effects.
</highlighted_quote>
<response>
{
    "category": "0_accepted"
}
</response>
</example>

<example>
<extract>
Global warming is just a myth pushed by elites—if the planet were really heating up, why did we just have the coldest winter in decades?
</extract>
<highlighted_quote>
Global warming is just a myth pushed by elites
</highlighted_quote>
<response>
{
    "category": "1_not_happening"
}
</response>
</example>

<example>
<extract>
These so-called climate experts are just chasing grant money, hyping up fear to keep their funding flowing.
</extract>
<highlighted_quote>
climate experts are just chasing grant money
</highlighted_quote>
<response>
{
    "category": "6_scientists_are_biased"
}
</response>
</example>

<example>
<extract>
Climate models have been wrong so many times; why should we trust them now? They're just guesses based on incomplete data.
</extract>
<highlighted_quote>
Climate models have been wrong so many times
</highlighted_quote>
<response>
{
    "category": "5_science_is_unreliable"
}
</response>
</example>

<example>
<extract>
Climate has always changed naturally throughout history; blaming humans for it is absurd and ignores natural cycles like solar activity.
</extract>
<highlighted_quote>
Climate has always changed naturally throughout history
</highlighted_quote>
<response>
{
    "category": "2_not_human"
}
</response>
</example>

<example>
<extract>
Switching to renewable energy will cripple our economy without making any real difference to the planet's temperature.
</extract>
<highlighted_quote>
Switching to renewable energy will cripple our economy
</highlighted_quote>
<response>
{
    "category": "4_solutions_wont_work"
}
</response>
</example>

<example>
<extract>
Without fossil fuels, our society would collapse—we can't power modern civilization on wind turbines and solar panels alone.
</extract>
<highlighted_quote>
Without fossil fuels, our society would collapse
</highlighted_quote>
<response>
{
    "category": "7_need_fossil_fuel"
}
</response>
</example>

</examples>"""

FEW_SHOT_PROMPT = INSTRUCTION_PROMPT + EXAMPLES_PROMPT

encoding = tiktoken.encoding_for_model("gpt-4o-mini")
len(encoding.encode(FEW_SHOT_PROMPT))

914

In [34]:



USER_PROMPT_TEMPLATE = """
Extract to analyze:
<extract>
{context}
</extract>
<highlighted_quote>
{quote}
</highlighted_quote>
"""


In [35]:
class MediatreeClaimCardsClassification(BaseModel):
    category: str = Field(
        description="Label of the disinformation type"
    )


async def classify(
    system_prompt: str, user_prompt: str
) -> MediatreeClaimCardsClassification:
    response = await acompletion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="gpt-4o-mini",
        max_tokens=1000,
        temperature=0.01,
        response_format={ "type": "json_object" }
    )
    claim_validity = response.choices[0].message.content
    return MediatreeClaimCardsClassification(**json.loads(claim_validity))


async def classify_claims(claims_df: pd.DataFrame) -> pd.DataFrame:
    claim_classifications: list[MediatreeClaimCardsClassification] = await tqdm.gather(
        *[
            classify(
                FEW_SHOT_PROMPT,
                USER_PROMPT_TEMPLATE.format(
                    context=claim.context, quote=claim.quote
                ),
            )
            for claim in claims_df.itertuples()
        ]
    )

    claim_classifications_df = pd.DataFrame(
        [classification.model_dump(exclude_none=True) for classification in claim_classifications],
        index=claims_df.index,
    )

    claim_classifications_df = pd.concat([claims_df, claim_classifications_df], axis=1)

    return claim_classifications_df

In [36]:
classified_claims_df = await classify_claims(claim_detections)


100%|██████████| 1415/1415 [00:29<00:00, 47.92it/s]


In [37]:
classified_claims_df.category.value_counts()

category
0_accepted                 1144
4_solutions_wont_work       138
1_not_happening              66
5_science_is_unreliable      24
3_not_bad                    14
2_not_human                  12
6_scientists_are_biased      11
7_need_fossil_fuel            6
Name: count, dtype: int64

In [40]:
import numpy as np

classified_claims_df["category_group"] = "No disinformation"
classified_claims_df["category_group"] = np.where(classified_claims_df.category.isin(["1_not_happening", "2_not_human", "3_not_bad", "5_science_is_unreliable"]), "Disinformation", classified_claims_df["category_group"])
classified_claims_df["category_group"] = np.where(classified_claims_df.category.isin(["4_solutions_wont_work", "6_scientists_are_biased", "7_need_fossil_fuel"]), "Inaction", classified_claims_df["category_group"])
classified_claims_df.category_group.value_counts()

category_group
No disinformation    1144
Inaction              155
Disinformation        116
Name: count, dtype: int64

In [44]:
classified_claims_df[["category", "category_group", "quote", "context", "text"]].to_csv("../../data/processed/claim_detections_classified.csv")

In [45]:
import pandas as pd
classified_df = pd.read_csv("../../data/processed/claim_detections_classified.csv")

In [46]:
stratified_sample = classified_df.groupby('category_group').apply(
    lambda x: x.sample(n=100)
).reset_index(drop=True).set_index("id")

stratified_sample_no_0 = classified_df[classified_df.category != "0_accepted"].groupby('category_group').apply(
    lambda x: x.sample(n=100)
).reset_index(drop=True).set_index("id")

  stratified_sample = classified_df.groupby('category_group').apply(
  stratified_sample_no_0 = classified_df[classified_df.category != "0_accepted"].groupby('category_group').apply(


In [48]:
sample_claims_df = stratified_sample.sample(frac=1)
sample_claims_df[["category", "category_group", "quote", "context", "text"]].to_csv("../../data/processed/claim_detections_classified_sample.csv")

sample_claims_df_no_0 = stratified_sample_no_0.sample(frac=1)
sample_claims_df_no_0[["category", "category_group", "quote", "context", "text"]].to_csv("../../data/processed/claim_detections_classified_sample_no_0.csv")
