In [2]:
import pandas as pd

In [7]:
df = pd.read_parquet("../../data/18_channels_2023_09_to_2024_09.parquet")
print(df.shape)
df.start = pd.to_datetime(df.start)
cutoff = df.start.max() - pd.Timedelta(days=21)
df = df[df.start >= cutoff]
print(df.shape)


(185738, 10)
(3022, 10)


In [22]:
display(df.text.str.len().describe())
display(df.num_tokens.describe())

display(df.channel_is_radio.value_counts())
display(df.channel_program_type.value_counts())

count    185738.000000
mean       2136.974200
std         435.559148
min          76.000000
25%        1966.000000
50%        2229.000000
75%        2419.000000
max        3891.000000
Name: text, dtype: float64

count    185738.00000
mean        473.80472
std          97.39974
min          26.00000
25%         433.00000
50%         493.00000
75%         537.00000
max         827.00000
Name: num_tokens, dtype: float64

channel_is_radio
False    109739
True      75999
Name: count, dtype: int64

channel_program_type
Information en continu            113094
Information - Magazine             47678
Information - Journal              21401
Information - Autres émissions      3565
Name: count, dtype: int64

In [17]:
import litellm
from litellm import acompletion, completion_cost
from pydantic import BaseModel, Field
from typing import Callable, Awaitable, Union
from functools import wraps
import asyncio
from tqdm.asyncio import tqdm


class MediatreePrediction(BaseModel):
    cards_label_pred: str
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    cost: float
    claim_pred: str | None = None


class MediatreeClaimIdentifier(BaseModel):
    claim: str = Field(
        description="Reformulation courte et claire de l'affirmation à vérifier"
    )
    context: str = Field(
        description="Un seul paragraphe résumant le contexte essentiel pour comprendre l'affirmation"
    )
    quote: str = Field(description="Citation exacte du texte contenant l'affirmation")


class MediatreeClaimIdentifierResponse(BaseModel):
    claims: Union[list[MediatreeClaimIdentifier], None]
    prompt_tokens: Union[int, None]
    completion_tokens: Union[int, None]
    total_tokens: Union[int, None]
    cost: Union[float, None]


async def report_experiment_results(
    df: pd.DataFrame, predict_experiment: Callable[[str], Awaitable[None]]
) -> None:
    # Copy df to avoid modifications
    df = df.copy()

    # Run the experiment
    mediatree_predictions: list[MediatreeClaimIdentifierResponse] = await tqdm.gather(
        *[predict_experiment(text) for text in df["text"]]
    )
    # Create lists to store individual claims and their metadata
    rows = []
    
    for idx, pred in zip(df.index, mediatree_predictions):
        if pred is not None and pred.claims is not None:
            for claim in pred.claims:
                rows.append({
                    'id': idx,
                    'claim': claim.claim,
                    'context': claim.context,
                    'quote': claim.quote,
                    'prompt_tokens': pred.prompt_tokens,
                    'completion_tokens': pred.completion_tokens,
                    'total_tokens': pred.total_tokens,
                    'cost': pred.cost
                })
    
    # Create DataFrame with one claim per row
    mediatree_predictions_df = pd.DataFrame(rows)
    
    if not mediatree_predictions_df.empty:
        # Set id as index if there are any claims
        mediatree_predictions_df = mediatree_predictions_df.set_index('id')
    # df = pd.concat([df, mediatree_predictions_df], axis=1)
    df = df.merge(mediatree_predictions_df, left_index=True, right_index=True, how="outer")

    # Show stats and performance
    show_llm_usage(df)

    return df


def show_llm_usage(df: pd.DataFrame) -> None:
    print("\nLLM USAGE\n=========\n")
    print(
        f"Median token usage:\n- Prompt: {int(df["prompt_tokens"].median())}\n"
        f"- Completion: {int(df["completion_tokens"].median())}\n"
        f"- Total: {int(df["total_tokens"].median())}"
    )
    print(f"\nTotal cost: ${df["cost"].sum():.3f}\n")


# Limit concurrent requests to avoid API rate limiting
# (it depends on the model you use and your API tier)
semaphore = asyncio.Semaphore(5)


# Decorator that ensures `acompletion` uses the semaphore
def with_semaphore(acquire_semaphore):
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            async with acquire_semaphore:
                return await func(*args, **kwargs)

        return wrapper

    return decorator


acompletion = with_semaphore(semaphore)(acompletion)

In [14]:
import json


async def say(text, sec):
    await asyncio.sleep(sec)
    print(text)


async def extract_claims(text: str) -> MediatreeClaimIdentifierResponse:
    system_prompt = """
Tu es un assistant spécialisé dans l'analyse de désinformation environnementale.

TÂCHE:
Analyse l'extrait de transcription TV/Radio fourni et identifie les affirmations (claims) qui nécessitent une vérification factuelle sur les thèmes suivants:
- Changement climatique
- Transition écologique
- Énergie
- Biodiversité
- Pollution
- Pesticides
- Ressources naturelles (eau, minéraux, etc.)

FORMAT DE RÉPONSE:
Tu dois OBLIGATOIREMENT répondre au format JSON suivant:
{
    "claims": [
        {
            "claim": "Reformulation courte et claire de l'affirmation à vérifier",
            "context": "Un seul paragraphe résumant le contexte essentiel pour comprendre l'affirmation",
            "quote": "Citation exacte du texte contenant l'affirmation"
        }
    ]
}

RÈGLES IMPORTANTES:
1. Inclure UNIQUEMENT les affirmations vérifiables sur les thèmes environnementaux
2. Chaque claim doit être unique
3. Le format JSON doit être strictement respecté
4. Si aucune affirmation à vérifier n'est trouvée, renvoyer un tableau claims vide
5. Maximum 3 claims par analyse

Analyse maintenant le texte suivant:"""
    response = await acompletion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text.strip()},
        ],
        model="gpt-4o-mini",
        max_tokens=2000,
        temperature=0,
    )
    claim_identifier = response.choices[0].message.content
    claims = (
        [
            MediatreeClaimIdentifier(**claim)
            for claim in json.loads(claim_identifier)["claims"]
        ]
        if json.loads(claim_identifier)["claims"] != []
        else None
    )
    return (
        MediatreeClaimIdentifierResponse(
            claims=claims,
            prompt_tokens=response.usage.prompt_tokens,
            completion_tokens=response.usage.completion_tokens,
            total_tokens=response.usage.total_tokens,
            cost=completion_cost(response),
        )
        if claims is not None
        else None
    )


claim_detections = await report_experiment_results(df.iloc[:100], extract_claims)

100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


LLM USAGE

Median token usage:
- Prompt: 740
- Completion: 295
- Total: 1044

Total cost: $0.038






In [16]:

claim_detections.to_csv("../../data/claim_detections_video_extractions.csv")