In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet("../../data/raw/4_channels_predictions_09_2023_09_2024.parquet")
print(df.shape)
df.start = pd.to_datetime(df.start)
cutoff = df.start.max() - pd.Timedelta(days=21)
# df = df[df.start >= cutoff]
df = df.loc[(df.start >= pd.to_datetime("2023-12-01")) & (df.start <= pd.to_datetime("2024-06-01"))]
print(df.shape)

(39131, 11)
(15082, 11)


In [3]:
display(df.text.str.len().describe())
display(df.num_tokens.describe())

display(df.channel_is_radio.value_counts())
display(df.channel_program_type.value_counts())

count    15082.000000
mean      2161.461477
std        379.606272
min        213.000000
25%       1978.000000
50%       2231.000000
75%       2424.000000
max       3116.000000
Name: text, dtype: float64

count    15082.000000
mean       482.950007
std         85.832160
min         53.000000
25%        440.000000
50%        498.000000
75%        542.000000
max        732.000000
Name: num_tokens, dtype: float64

channel_is_radio
False    8723
True     6359
Name: count, dtype: int64

channel_program_type
Information - Magazine    7428
Information en continu    6379
Information - Journal     1275
Name: count, dtype: int64

In [16]:
import litellm
from litellm import acompletion, completion_cost
from pydantic import BaseModel, Field
from typing import Callable, Awaitable, Union
from functools import wraps
import asyncio
from tqdm.asyncio import tqdm


class MediatreePrediction(BaseModel):
    cards_label_pred: str
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    cost: float
    claim_pred: str | None = None


class MediatreeClaimIdentifier(BaseModel):
    context: str = Field(
        description="la phrase/paragraphe qui contient le contexte general"
    )
    quote: str = Field(description="Citation exacte du texte contenant l'affirmation")


class MediatreeClaimIdentifierResponse(BaseModel):
    claims: Union[list[MediatreeClaimIdentifier], None]
    prompt_tokens: Union[int, None]
    completion_tokens: Union[int, None]
    total_tokens: Union[int, None]
    cost: Union[float, None]


async def report_experiment_results(
    df: pd.DataFrame, predict_experiment: Callable[[str], Awaitable[None]]
) -> None:
    # Copy df to avoid modifications
    df = df.copy()

    # Run the experiment
    mediatree_predictions: list[MediatreeClaimIdentifierResponse] = await tqdm.gather(
        *[predict_experiment(text) for text in df["text"]]
    )
    # Create lists to store individual claims and their metadata
    rows = []

    for idx, pred in zip(df.index, mediatree_predictions):
        if pred is not None and pred.claims is not None:
            for claim in pred.claims:
                rows.append(
                    {
                        "id": idx,
                        "context": claim.context,
                        "quote": claim.quote,
                        "prompt_tokens": pred.prompt_tokens,
                        "completion_tokens": pred.completion_tokens,
                        "total_tokens": pred.total_tokens,
                        "cost": pred.cost,
                    }
                )

    # Create DataFrame with one claim per row
    mediatree_predictions_df = pd.DataFrame(rows)

    if not mediatree_predictions_df.empty:
        # Set id as index if there are any claims
        mediatree_predictions_df = mediatree_predictions_df.set_index("id")
    # df = pd.concat([df, mediatree_predictions_df], axis=1)
    df = df.merge(
        mediatree_predictions_df, left_index=True, right_index=True, how="outer"
    )

    # Show stats and performance
    show_llm_usage(df)

    return df


def show_llm_usage(df: pd.DataFrame) -> None:
    print("\nLLM USAGE\n=========\n")
    print(
        f"Median token usage:\n- Prompt: {int(df["prompt_tokens"].median())}\n"
        f"- Completion: {int(df["completion_tokens"].median())}\n"
        f"- Total: {int(df["total_tokens"].median())}"
    )
    print(f"\nTotal cost: ${df["cost"].sum():.3f}\n")


# Limit concurrent requests to avoid API rate limiting
# (it depends on the model you use and your API tier)
semaphore = asyncio.Semaphore(40)


# Decorator that ensures `acompletion` uses the semaphore
def with_semaphore(acquire_semaphore):
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            async with acquire_semaphore:
                return await func(*args, **kwargs)

        return wrapper

    return decorator


acompletion = with_semaphore(semaphore)(acompletion)

In [58]:
import json


async def extract_claims(text: str) -> MediatreeClaimIdentifierResponse:
    system_prompt = """
Tu es un assistant spécialisé dans l'analyse de désinformation environnementale.

TÂCHE:
Analyse l'extrait de transcription TV/Radio fourni et identifie les affirmations (claims) qui nécessitent une vérification factuelle sur les thèmes suivants:
- Changement climatique
- Transition écologique
- Énergie
- Biodiversité
- Pollution
- Pesticides
- Ressources naturelles (eau, minéraux, etc.)
- Sort la quote seulement si t'est sure qu'elle continent de la désinformation environnementale/climatique.
- Avec le quote il faute sortire la phrase/paragraphe qui contient le contexte general.
- Pour exemple, le suivant texte:
"Merci monsieur Dupont, en fait je ne suis pas d'accord avec vous. Si on regard les dèrnieres jours, en France il fait plus froid que d'abitude ! Il y a pas de signes du rechauffement climatique !"

La quote est: "Il y a pas de signes du rechauffement climatique !"
Et le context est: "Si on regard les dèrnieres jours, en France il fait plus froid que d'abitude ! Il y a pas de signes du rechauffement climatique !"

FORMAT DE RÉPONSE:
Tu dois OBLIGATOIREMENT répondre au format JSON suivant:
{
    "claims": [
        {
            "context": "Un seul paragraphe résumant le contexte essentiel pour comprendre l'affirmation",
            "quote": "Si on regard les dèrnieres jours, en France il fait plus froid que d'abitude ! Il y a pas de signes du rechauffement climatique !"
        }
    ]
}

RÈGLES IMPORTANTES:
1. Inclure UNIQUEMENT les affirmations vérifiables sur les thèmes environnementaux contenant de la désinformation environnementale/climatique
2. Chaque claim doit être unique
3. Le format JSON doit être strictement respecté
4. Si aucune affirmation à vérifier n'est trouvée, renvoyer les claims comme un json vide {"claims": []}
5. Maximum 3 claims par analyse
6. La quote doit être exactement la meme que dans le texte, et concener que la phrase incriminée.
7. Le context doit être une phrase/paragraphe extrait et pas re-elaboré contenant la quote et plus de contexte.

Analyse maintenant le texte suivant:

"""
    response = await acompletion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text.strip()},
        ],
        model="gpt-4o-mini",
        max_tokens=2000,
        temperature=0.01,
        response_format={ "type": "json_object" }
    )
    claim_identifier = response.choices[0].message.content
    claims = (
        [
            MediatreeClaimIdentifier(**claim)
            for claim in json.loads(claim_identifier)["claims"]
        ]
        if json.loads(claim_identifier)["claims"] != []
        else None
    )
    return (
        MediatreeClaimIdentifierResponse(
            claims=claims,
            prompt_tokens=response.usage.prompt_tokens,
            completion_tokens=response.usage.completion_tokens,
            total_tokens=response.usage.total_tokens,
            cost=completion_cost(response),
        )
        if claims is not None
        else None
    )


claim_detections = await report_experiment_results(df, extract_claims)

100%|██████████| 15082/15082 [09:09<00:00, 27.45it/s]  


LLM USAGE

Median token usage:
- Prompt: 999
- Completion: 180
- Total: 1178

Total cost: $1.443






In [59]:
claim_detections.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16931 entries, 00057a237e27df45620beb329d9203ddba9749a67b3b94e90b85800e8d93c620 to fffc87c2d670e5368f1a5e0f43a430c4fdf99bf5042f1c9c3ea7ec9b08f8490e
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   start                 16931 non-null  datetime64[ns]
 1   text                  16931 non-null  object        
 2   channel_name          16931 non-null  object        
 3   channel_is_radio      16931 non-null  bool          
 4   channel_program_type  16931 non-null  object        
 5   channel_program       16931 non-null  object        
 6   themes                16931 non-null  object        
 7   keywords              16931 non-null  object        
 8   num_keywords          16931 non-null  int64         
 9   num_tokens            16931 non-null  int64         
 10  claims                16931 non-null  object        
 11  context        

In [60]:
claim_detections = claim_detections.dropna(subset=["context", "quote"])

In [61]:
claim_detections.shape

(5474, 17)

In [62]:
claim_detections[["context", "quote", "text"]]


Unnamed: 0_level_0,context,quote,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000c11641b9a40d0bc7ea1e0be6437d1a69cb81043e67ac480e13c4986a294de,Un consensus qui arrivent en poils tard mais q...,un mix cent pourcent renouvelables il est poss...,cop vingt-huit s'engage à faire une transition...
000c11641b9a40d0bc7ea1e0be6437d1a69cb81043e67ac480e13c4986a294de,autre engagement de cette cop vingt-huit tripl...,tripler les investissements dans ai les renouv...,cop vingt-huit s'engage à faire une transition...
000cd7d6be597f6a569a45803f2648b8d096c4c7153a7e98ea63f0e827474f8a,mais même dans les endroits où ils sont bons i...,on va avoir des pluies de plus en plus diluvie...,un charles guyard arrivent au pire moment comm...
004d1651ab85ba561a2a59e16bd67eb052acf4db8447f6ca7d1e8a22efed7bf0,est ce qu'entre écologie et agriculture il fau...,l'agriculture et je le cite une catastrophe hu...,capitalisme et le libre marché des normes envi...
0057473cb2913c896ca5683b0c3a0d067cfc038d11afc381984404aca985ee77,elle annonce moins quatre-vingt-dix pour cent ...,moins quatre-vingt-dix pour cent de gaz à effe...,disent des annonces tu vas devoir vous le disa...
...,...,...,...
ff493499bb701e7c61da520140feea26b73a21c31a661d9883e94cfae39f0f25,Il faut qu'on continue de produire une aliment...,il faut qu'on continue de produire une aliment...,nourrir nos concitoyens nous pénaliser qu'aura...
ff493499bb701e7c61da520140feea26b73a21c31a661d9883e94cfae39f0f25,Aujourd'hui on a toujours voulu une agricultur...,une agriculture plus verte plus vertueuse pour...,nourrir nos concitoyens nous pénaliser qu'aura...
ff680102ed08ce32376fb5f635f846b44aecb52d27d074634c219255a06fd326,faut leur alléger les normes par exemple ce qu...,il faut leur alléger les normes par exemple ce...,faut leur alléger les normes par exemple ce qu...
ff898a4273dbb51f28b4d7b18255104efa23e592064aaa6cfce653989aa4039d,les activistes accusent l'usine de polluer la ...,polluer la nappe phréatique et de consommer de...,américain thomas hermans c'est une action qui ...


In [63]:
claim_detections[["context", "quote", "text"]].to_csv("../../data/processed/claim_detections_video_extractions.csv")

In [64]:
import tiktoken

INSTRUCTION_PROMPT = """You are an expert in disinformation on environmental and climate subjects, expert in climate science and know everything about the GIEC. I am going to give you series of claims coming from quotes on medias. Among these, some will contain climate disinformation. Your task is to find disinformation claim if they exist, and classify them following the pre-defined categories.

You must respond with a JSON object containing one field:
- "category": the classification label

<categories>
- "0_accepted": No relevant environmental disinformation claim detected.
- "1_not_happening": Global warming is not happing. Climate change is NOT leading to melting ice (such as glaciers, sea ice, and permafrost), increased extreme weather, or rising sea levels. Cold weather also shows that climate change is not happening.
- "2_not_human": Greenhouse gases from humans are not the causing climate change.
- "3_not_bad": The impacts of climate change will not be bad and might even be beneficial.
- "4_solutions_wont_work": Climate solutions are harmful or unnecessary.
- "5_science_is_unreliable": Climate science is uncertain, unsound, unreliable, or biased.
- "6_scientists_are_biased": Climate scientists and proponents of climate action are alarmist, biased, wrong, hypocritical, corrupt, and/or politically motivated.
- "7_need_fossil_fuel": We need fossil fuels for economic growth, prosperity, and to maintain our standard of living.
</categories>
"""

EXAMPLES_PROMPT = """
<examples>

<example>
<extract>
A warmer planet will actually be good for agriculture—more CO2 means bigger crops and longer growing seasons!
</extract>
<highlighted_quote>
more CO2 means bigger crops and longer growing seasons!
</highlighted_quote>
<response>
{
    "category": "3_not_bad"
}
</response>
</example>

<example>
<extract>
The data is clear: the Earth is warming due to human activities, and immediate action is required to mitigate its effects.
</extract>
<highlighted_quote>
the Earth is warming due to human activities, and immediate action is required to mitigate its effects.
</highlighted_quote>
<response>
{
    "category": "0_accepted"
}
</response>
</example>

<example>
<extract>
Global warming is just a myth pushed by elites—if the planet were really heating up, why did we just have the coldest winter in decades?
</extract>
<highlighted_quote>
Global warming is just a myth pushed by elites
</highlighted_quote>
<response>
{
    "category": "1_not_happening"
}
</response>
</example>

<example>
<extract>
These so-called climate experts are just chasing grant money, hyping up fear to keep their funding flowing.
</extract>
<highlighted_quote>
climate experts are just chasing grant money
</highlighted_quote>
<response>
{
    "category": "6_scientists_are_biased"
}
</response>
</example>

<example>
<extract>
Climate models have been wrong so many times; why should we trust them now? They're just guesses based on incomplete data.
</extract>
<highlighted_quote>
Climate models have been wrong so many times
</highlighted_quote>
<response>
{
    "category": "5_science_is_unreliable"
}
</response>
</example>

<example>
<extract>
Climate has always changed naturally throughout history; blaming humans for it is absurd and ignores natural cycles like solar activity.
</extract>
<highlighted_quote>
Climate has always changed naturally throughout history
</highlighted_quote>
<response>
{
    "category": "2_not_human"
}
</response>
</example>

<example>
<extract>
Switching to renewable energy will cripple our economy without making any real difference to the planet's temperature.
</extract>
<highlighted_quote>
Switching to renewable energy will cripple our economy
</highlighted_quote>
<response>
{
    "category": "4_solutions_wont_work"
}
</response>
</example>

<example>
<extract>
Without fossil fuels, our society would collapse—we can't power modern civilization on wind turbines and solar panels alone.
</extract>
<highlighted_quote>
Without fossil fuels, our society would collapse
</highlighted_quote>
<response>
{
    "category": "7_need_fossil_fuel"
}
</response>
</example>

</examples>"""

FEW_SHOT_PROMPT = INSTRUCTION_PROMPT + EXAMPLES_PROMPT

encoding = tiktoken.encoding_for_model("gpt-4o-mini")
len(encoding.encode(FEW_SHOT_PROMPT))

914

In [65]:



USER_PROMPT_TEMPLATE = """
Extract to analyze:
<extract>
{context}
</extract>
<highlighted_quote>
{quote}
</highlighted_quote>
"""


In [66]:
class MediatreeClaimCardsClassification(BaseModel):
    category: str = Field(
        description="Label of the disinformation type"
    )


async def classify(
    system_prompt: str, user_prompt: str
) -> MediatreeClaimCardsClassification:
    response = await acompletion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="gpt-4o-mini",
        max_tokens=1000,
        temperature=0.01,
        response_format={ "type": "json_object" }
    )
    claim_validity = response.choices[0].message.content
    return MediatreeClaimCardsClassification(**json.loads(claim_validity))


async def classify_claims(claims_df: pd.DataFrame) -> pd.DataFrame:
    claim_classifications: list[MediatreeClaimCardsClassification] = await tqdm.gather(
        *[
            classify(
                FEW_SHOT_PROMPT,
                USER_PROMPT_TEMPLATE.format(
                    context=claim.context, quote=claim.quote
                ),
            )
            for claim in claims_df.itertuples()
        ]
    )

    claim_classifications_df = pd.DataFrame(
        [classification.model_dump(exclude_none=True) for classification in claim_classifications],
        index=claims_df.index,
    )

    claim_classifications_df = pd.concat([claims_df, claim_classifications_df], axis=1)

    return claim_classifications_df

In [67]:
classified_claims_df = await classify_claims(claim_detections)


100%|██████████| 5474/5474 [02:45<00:00, 33.01it/s]


In [68]:
classified_claims_df.category.value_counts()

category
0_accepted                 4565
4_solutions_wont_work       518
1_not_happening             147
6_scientists_are_biased      76
5_science_is_unreliable      64
2_not_human                  40
3_not_bad                    35
7_need_fossil_fuel           29
Name: count, dtype: int64

In [69]:
classified_claims_df[["category", "quote", "context", "text"]].to_csv("../../data/processed/claim_detections_classified.csv")




In [9]:
import pandas as pd
classified_df = pd.read_csv("../../data/processed/claim_detections_classified.csv")

In [19]:
stratified_sample = classified_df.groupby('category').apply(
    lambda x: x.sample(n=20)
).reset_index(drop=True).set_index("id")

stratified_sample_no_0 = classified_df[classified_df.category != "0_accepted"].groupby('category').apply(
    lambda x: x.sample(n=20)
).reset_index(drop=True).set_index("id")

  stratified_sample = classified_df.groupby('category').apply(
  stratified_sample_no_0 = classified_df[classified_df.category != "0_accepted"].groupby('category').apply(


In [20]:
sample_claims_df = stratified_sample.sample(frac=1)
sample_claims_df[["category", "quote", "context", "text"]].to_csv("../../data/processed/claim_detections_classified_sample.csv")

sample_claims_df_no_0 = stratified_sample_no_0.sample(frac=1)
sample_claims_df_no_0[["category", "quote", "context", "text"]].to_csv("../../data/processed/claim_detections_classified_sample_no_0.csv")
