In [17]:
import json
import re

import pandas as pd

In [18]:
# Read the human-reviewed labels (a CSV file which, judging by its name, was exported
# from the review Excel workbook). The first column of the file is used as the index.
df_labels = pd.read_csv(
    filepath_or_buffer="../../data/raw/4_channels_review_09_2023_09_2024.xlsx - Claims review.csv",
    index_col=0,
)


# Recreate a new "comment" column from the 3 columns we had in Excel
# In LabelStudio, the quote columns would be useless since we can modify the quotes ourselves
def create_label_studio_comment(row: pd.Series) -> str:
    """Merge the three review columns of one record into a single comment.

    The comment is made of up to three lines, in this order:

    1. the CARDS comment (``commentaire_cards``), when present;
    2. the line ``The quote is not correct ❌``, when the quote has a comment
       or is flagged as incorrect in ``quote_is_correct``;
    3. the quote comment (``commentaire_quote``), when present.

    Args:
        row: One record exposing the ``commentaire_cards``, ``quote_is_correct``
            and ``commentaire_quote`` entries. Missing values are expected to be
            NaN/None.

    Returns:
        The merged comment with surrounding whitespace stripped, or an empty
        string when there is nothing to report.
    """
    cards_comment = row["commentaire_cards"]
    quote_is_correct = row["quote_is_correct"]
    quote_comment = row["commentaire_quote"]

    has_cards_comment = not pd.isna(cards_comment)
    has_quote_comment = not pd.isna(quote_comment)

    # `pd.read_csv` parses TRUE/FALSE cells into booleans by default, so the flag
    # can reach us either as a real boolean or as the raw "FALSE" string.
    # Both forms must be recognised, otherwise an incorrect quote that has no
    # comment would be silently ignored.
    if isinstance(quote_is_correct, str):
        quote_flagged_incorrect = quote_is_correct.strip().upper() == "FALSE"
    else:
        quote_flagged_incorrect = (not pd.isna(quote_is_correct)) and (
            not bool(quote_is_correct)
        )

    lines = []
    if has_cards_comment:
        # str() guards against a comment cell that was parsed as a number
        lines.append(str(cards_comment))
    if has_quote_comment or quote_flagged_incorrect:
        lines.append("The quote is not correct ❌")
    if has_quote_comment:
        lines.append(str(quote_comment))

    return "\n".join(lines).strip()


# Build the merged "comment" column row by row, then keep only the reviewed CARDS
# label and that comment, exposing the label under the name "cards_true".
df_labels = (
    df_labels.assign(
        comment=lambda frame: frame.apply(create_label_studio_comment, axis=1)
    )
    .loc[:, ["cards_ground_truth", "comment"]]
    .rename(columns={"cards_ground_truth": "cards_true"})
)

df_labels

Unnamed: 0_level_0,cards_true,comment
id,Unnamed: 1_level_1,Unnamed: 2_level_1
52061c3902c0257c7bfae7086ae50ea3998fea4204bcd6628588e41d71340dfc,0_accepted,The quote is not correct ❌\ntermes honnêtes = ...
34a41bf34b35ee91fc147601fb8c21a366a2f568b9060bbb629698dfd9319801,0_accepted,"Contexte spécifique de la phrase, les mots de ..."
0fb8db32982baea27fa4a92220e76331e703d61852c0e8d95550ef6853ffd842,0_accepted,"Contexte spécifique de la phrase, la claim men..."
4792f93c6614b1e7ef39e301cc6c1d0f4d3d18b9421cc3a9b18aa5c7581c9e02,6_proponents_biased,
74c05fdbca4aeffb643abf0de486b57f8299051d4ef71b58b532791a33aba423,2_humans_not_the_cause,
...,...,...
22d30e47c43d8defe1bdab51e6e4c6374ba982d76de8896c2abb711950121329,0_accepted,The quote is not correct ❌\nProblème de retran...
58feacdc4a4d31023a87bce158ee902d37076d9f5562f3fe41f768ba815b7b6a,4_solutions_harmful_unnecessary,
6d39a0d45c359fd2183c0f1f001b76c4b60d283240e5c34308b71c22c78217ec,0_accepted,Potentiellement plutôt du greenwashing
0718ef4df3559735ad33436f7e1e39802a71d923cbc0b9ec1c82a2ebfb251527,0_accepted,Légitime mais tellement court que ça a l'air d...


In [19]:
# Load the file that contains the extra metadata and the remaining records to label
# Ideally we wouldn't have to do this JOIN step
df_data = pd.read_csv(
    "../../data/raw/4_channels_predictions_with_cards_rewritten_09_2023_09_2024.csv",
    index_col=0,
)

# Number of "**" bold markers found in each rewritten text (counted only once)
bold_marker_count = df_data["improved_text_with_quote"].str.count(r"\*\*")

# Remove the few claims that came up with no quotes in bold.
# The filtered frame is copied explicitly because columns are assigned on
# `df_data` further down: doing that on a filtered slice raises pandas'
# SettingWithCopyWarning on versions without copy-on-write.
df_data = df_data[
    (
        # Keep texts if the number of markers is a multiple of 2
        (bold_marker_count % 2 == 0)
        # Keep texts if they have at least one quoted passage
        & (bold_marker_count >= 2)
    )
].copy()


# Dirty fix to retrieve quote start/end and remove the bold **markers**
def get_quotes_start_end(
    improved_text_with_quote: str,
) -> list[tuple[int, int]]:
    """Locate the **bold** quotes of a text, as offsets in the marker-free text.

    Every ``**quote**`` passage is found with a regular expression and its
    position is shifted to account for the ``**`` markers that precede it, so
    the returned offsets can be used to slice the text once all the ``**``
    markers have been removed.

    Args:
        improved_text_with_quote: Text in which quotes are wrapped in ``**``.

    Returns:
        One ``(start, end)`` tuple per quote, in order of appearance, with
        ``end`` exclusive. The list is empty when no quote is found.
    """
    # NOTE: this should have been done in the previous step
    # re.S lets "." match newlines so that a quote spanning several lines is still
    # paired with its own closing marker. Without it, the closing marker of a
    # multi-line quote would be taken as the opening marker of the next match
    # and every following span would be wrong.
    bold_matches = re.finditer(r"\*\*(.*?)\*\*", improved_text_with_quote, flags=re.S)

    # Each quote is wrapped in an opening and a closing "**"
    markers_width = 2 * len("**")

    quotes_spans = []
    for match_index, bold_match in enumerate(bold_matches):
        # Characters removed before this quote: the markers of all previous quotes
        removed_before = match_index * markers_width
        quote_start = bold_match.start() - removed_before
        # The match also includes this quote's own two markers
        quote_end = bold_match.end() - removed_before - markers_width
        quotes_spans.append((quote_start, quote_end))

    return quotes_spans


# The spans must be computed while the ** markers are still in the text, so the
# "quotes_spans" column is created first and the markers are stripped afterwards.
# Only the columns needed for the review are then kept, under clearer names.
df_data = (
    df_data.assign(
        quotes_spans=lambda frame: frame["improved_text_with_quote"]
        .apply(get_quotes_start_end)
        .tolist(),
        improved_text_with_quote=lambda frame: frame[
            "improved_text_with_quote"
        ].str.replace(r"\*\*", "", regex=True),
    )
    .loc[
        :,
        [
            "start",
            "channel_name",
            "channel_program_type",
            "channel_program",
            "text",
            "improved_text_with_quote",
            "quotes_spans",
            "cards",
            "analysis",
        ],
    ]
    .rename(
        columns={
            "text": "original_text",
            "improved_text_with_quote": "text_to_review",
            "cards": "cards_pred",
            "analysis": "model_analysis",
        }
    )
)

df_data

Unnamed: 0_level_0,start,channel_name,channel_program_type,channel_program,original_text,text_to_review,quotes_spans,cards_pred,model_analysis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
52061c3902c0257c7bfae7086ae50ea3998fea4204bcd6628588e41d71340dfc,2023-09-08 19:34:00,europe1,Information - Magazine,Soir,c'est non seulement que les utilisent pas mais...,Ce n'est pas seulement qu'ils ne les utilisent...,"[(824, 879)]",5_science_uncertain,Cette affirmation pourrait créer un doute sur ...
34a41bf34b35ee91fc147601fb8c21a366a2f568b9060bbb629698dfd9319801,2023-09-15 19:30:00,europe1,Information - Magazine,Soir,jusqu'au trente septembre détaille sofia au qu...,"Jusqu'au trente septembre, détaille Sofia au q...","[(1686, 1777)]",5_science_uncertain,Cette allégation semble suggérer que le réchau...
0fb8db32982baea27fa4a92220e76331e703d61852c0e8d95550ef6853ffd842,2023-09-20 07:50:00,europe1,Information - Magazine,Europe 1 Matin,tente d'échapper à une tempête de pluie d'acid...,On tente d'échapper à une tempête de pluie d'a...,"[(26, 59)]",1_its_not_happening,L'allégation mentionne une tempête de pluie d'...
4792f93c6614b1e7ef39e301cc6c1d0f4d3d18b9421cc3a9b18aa5c7581c9e02,2023-10-02 07:54:00,europe1,Information - Magazine,Europe 1 Matin,de français n'arrive pas à se loger c'est offi...,"Les Français n'arrivent pas à se loger, c'est ...","[(733, 874)]",3_impacts_not_bad,Cette affirmation présente des mesures qui n'o...
74c05fdbca4aeffb643abf0de486b57f8299051d4ef71b58b532791a33aba423,2023-10-05 19:54:00,europe1,Information - Magazine,Soir,climatique le réchauffement collecte de grenob...,Le réchauffement climatique a été collecté à G...,"[(1184, 1246)]",2_humans_not_the_cause,Cette assertion minimise le consensus scientif...
...,...,...,...,...,...,...,...,...,...
22d30e47c43d8defe1bdab51e6e4c6374ba982d76de8896c2abb711950121329,2024-04-23 20:36:00,tf1,Information - Journal,JT 20h + météo,plus chaud de ces dix dernières années le temp...,"Plus chaud de ces dix dernières années, le tem...","[(117, 151)]",1_its_not_happening,Cette affirmation semble exagérée et pourrait ...
58feacdc4a4d31023a87bce158ee902d37076d9f5562f3fe41f768ba815b7b6a,2024-04-28 20:40:00,tf1,Information - Journal,JT 20h + météo,hui ce fameux pacte d'immigration qui a été vo...,"Aujourd'hui, ce fameux pacte d'immigration qui...","[(1313, 1433)]",4_solutions_harmful_unnecessary,Cette affirmation attribue une intention de dé...
6d39a0d45c359fd2183c0f1f001b76c4b60d283240e5c34308b71c22c78217ec,2024-06-16 18:04:00,tf1,Information - Magazine,Sept à huit Life,avec l'aide de nos fournisseurs et si il s'avè...,"Avec l'aide de nos fournisseurs, si il s'avère...","[(161, 329)]",4_solutions_harmful_unnecessary,Cette allégation semble minimiser les impacts ...
0718ef4df3559735ad33436f7e1e39802a71d923cbc0b9ec1c82a2ebfb251527,2024-07-21 17:14:00,tf1,Information - Magazine,Sept à huit Life,france plusieurs millions de maison menace de ...,"En France, plusieurs millions de maisons sont ...","[(207, 248)]",2_humans_not_the_cause,Cette affirmation contredit le consensus scien...


In [20]:
# Attach the human labels to the metadata and predictions. The join is done on the
# index and keeps every row of `df_data`, including those without a label yet.
df = df_data.join(df_labels, how="left")
df

Unnamed: 0_level_0,start,channel_name,channel_program_type,channel_program,original_text,text_to_review,quotes_spans,cards_pred,model_analysis,cards_true,comment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52061c3902c0257c7bfae7086ae50ea3998fea4204bcd6628588e41d71340dfc,2023-09-08 19:34:00,europe1,Information - Magazine,Soir,c'est non seulement que les utilisent pas mais...,Ce n'est pas seulement qu'ils ne les utilisent...,"[(824, 879)]",5_science_uncertain,Cette affirmation pourrait créer un doute sur ...,0_accepted,The quote is not correct ❌\ntermes honnêtes = ...
34a41bf34b35ee91fc147601fb8c21a366a2f568b9060bbb629698dfd9319801,2023-09-15 19:30:00,europe1,Information - Magazine,Soir,jusqu'au trente septembre détaille sofia au qu...,"Jusqu'au trente septembre, détaille Sofia au q...","[(1686, 1777)]",5_science_uncertain,Cette allégation semble suggérer que le réchau...,0_accepted,"Contexte spécifique de la phrase, les mots de ..."
0fb8db32982baea27fa4a92220e76331e703d61852c0e8d95550ef6853ffd842,2023-09-20 07:50:00,europe1,Information - Magazine,Europe 1 Matin,tente d'échapper à une tempête de pluie d'acid...,On tente d'échapper à une tempête de pluie d'a...,"[(26, 59)]",1_its_not_happening,L'allégation mentionne une tempête de pluie d'...,0_accepted,"Contexte spécifique de la phrase, la claim men..."
4792f93c6614b1e7ef39e301cc6c1d0f4d3d18b9421cc3a9b18aa5c7581c9e02,2023-10-02 07:54:00,europe1,Information - Magazine,Europe 1 Matin,de français n'arrive pas à se loger c'est offi...,"Les Français n'arrivent pas à se loger, c'est ...","[(733, 874)]",3_impacts_not_bad,Cette affirmation présente des mesures qui n'o...,6_proponents_biased,
74c05fdbca4aeffb643abf0de486b57f8299051d4ef71b58b532791a33aba423,2023-10-05 19:54:00,europe1,Information - Magazine,Soir,climatique le réchauffement collecte de grenob...,Le réchauffement climatique a été collecté à G...,"[(1184, 1246)]",2_humans_not_the_cause,Cette assertion minimise le consensus scientif...,2_humans_not_the_cause,
...,...,...,...,...,...,...,...,...,...,...,...
22d30e47c43d8defe1bdab51e6e4c6374ba982d76de8896c2abb711950121329,2024-04-23 20:36:00,tf1,Information - Journal,JT 20h + météo,plus chaud de ces dix dernières années le temp...,"Plus chaud de ces dix dernières années, le tem...","[(117, 151)]",1_its_not_happening,Cette affirmation semble exagérée et pourrait ...,0_accepted,The quote is not correct ❌\nProblème de retran...
58feacdc4a4d31023a87bce158ee902d37076d9f5562f3fe41f768ba815b7b6a,2024-04-28 20:40:00,tf1,Information - Journal,JT 20h + météo,hui ce fameux pacte d'immigration qui a été vo...,"Aujourd'hui, ce fameux pacte d'immigration qui...","[(1313, 1433)]",4_solutions_harmful_unnecessary,Cette affirmation attribue une intention de dé...,4_solutions_harmful_unnecessary,
6d39a0d45c359fd2183c0f1f001b76c4b60d283240e5c34308b71c22c78217ec,2024-06-16 18:04:00,tf1,Information - Magazine,Sept à huit Life,avec l'aide de nos fournisseurs et si il s'avè...,"Avec l'aide de nos fournisseurs, si il s'avère...","[(161, 329)]",4_solutions_harmful_unnecessary,Cette allégation semble minimiser les impacts ...,0_accepted,Potentiellement plutôt du greenwashing
0718ef4df3559735ad33436f7e1e39802a71d923cbc0b9ec1c82a2ebfb251527,2024-07-21 17:14:00,tf1,Information - Magazine,Sept à huit Life,france plusieurs millions de maison menace de ...,"En France, plusieurs millions de maisons sont ...","[(207, 248)]",2_humans_not_the_cause,Cette affirmation contredit le consensus scien...,0_accepted,Légitime mais tellement court que ça a l'air d...


In [21]:
# Give some CARDS labels a more explicit name. Labels that are not listed in the
# mapping are left untouched.
rename_map = {
    "2_humans_not_the_cause": "2_humans_are_not_the_cause",
    "3_impacts_not_bad": "3_impacts_are_not_bad",
    "4_solutions_harmful_unnecessary": "4_solutions_are_ineffective_or_harmful",
    "5_science_uncertain": "5_science_is_uncertain",
    "6_proponents_biased": "6_advocates_are_biased",
    "7_fossil_fuels_needed": "7_fossil_fuels_are_needed",
}
# Apply the same renaming to the human labels and to the model predictions
df = df.assign(
    cards_true=df["cards_true"].replace(rename_map),
    cards_pred=df["cards_pred"].replace(rename_map),
)
# Distribution of the human labels, ordered by label name
df["cards_true"].value_counts().sort_index()

cards_true
0_accepted                                118
1_its_not_happening                        31
2_humans_are_not_the_cause                 19
3_impacts_are_not_bad                      15
4_solutions_are_ineffective_or_harmful     88
5_science_is_uncertain                     22
6_advocates_are_biased                     70
7_fossil_fuels_are_needed                   7
Name: count, dtype: int64

In [22]:
# We noticed (too late) that some rewritten texts are much shorter than the original ones.
# Critical information may have been inadvertently removed before the transcripts were labelled,
# and the rewriting may also have introduced things that were never actually said.
# To keep trustworthy labels, records whose length differs by more than 20% are discarded.

# Absolute length difference, as a percentage of the original text length
absolute_size_diff_percent = (
    df["original_text"]
    .str.len()
    .sub(df["text_to_review"].str.len())
    .div(df["original_text"].str.len())
    .mul(100)
    .abs()
)

# Show how the difference is distributed across the records
size_diff_quantiles = [0, 0.25, 0.5, 0.75, 0.8, 0.9, 0.99, 1]
display(absolute_size_diff_percent.quantile(size_diff_quantiles).round(0))


print(
    f"We are discarding {absolute_size_diff_percent.gt(20).sum() / len(absolute_size_diff_percent):.0%} "
    "of our ground truth records because of shorter/longer rewritten transcription"
)
# Keep only the records within the 20% tolerance
df = df.loc[absolute_size_diff_percent <= 20]

0.00     0.0
0.25     6.0
0.50    10.0
0.75    16.0
0.80    19.0
0.90    24.0
0.99    39.0
1.00    71.0
dtype: float64

We are discarding 16% of our ground truth records because of shorter/longer rewritten transcription


In [23]:
# Format the data for LabelStudio
def _build_label_studio_results(
    text_to_review: str,
    quotes_spans: list[tuple[int, int]],
    cards_choice: str,
) -> list[dict]:
    """Build the "result" list shared by the predictions and the annotations.

    Args:
        text_to_review: Text in which the quotes are located.
        quotes_spans: ``(start, end)`` offsets of the quotes in ``text_to_review``.
        cards_choice: CARDS label to select in the ``cards_true`` choices.

    Returns:
        One ``relevant_passages`` labelled span per quote, followed by the
        ``transcription_is_clear`` choice (always "yes") and the ``cards_true``
        choice.
    """
    passages = [
        {
            "value": {
                "start": quote_start,
                "end": quote_end,
                "text": text_to_review[quote_start:quote_end],
                "labels": ["relevant_passages"],
            },
            "from_name": "relevant_passages",
            "to_name": "text_to_review",
            "type": "labels",
        }
        for quote_start, quote_end in quotes_spans
    ]
    return [
        # relevant_passages
        *passages,
        # transcription_is_clear
        {
            "value": {"choices": ["yes"]},
            "from_name": "transcription_is_clear",
            "to_name": "text_to_review",
            "type": "choices",
        },
        # cards_true
        {
            "value": {"choices": [cards_choice]},
            "from_name": "cards_true",
            "to_name": "text_to_review",
            "type": "choices",
        },
    ]


tasks = []
# `record_id` rather than `id`: a loop variable named `id` would keep shadowing
# the builtin for every cell executed afterwards.
for record_id, row in df.iterrows():
    has_ground_truth = not pd.isna(row["cards_true"])

    # Task
    # ====
    task = {
        "data": {
            # Metadata
            "id": record_id,
            "start": str(row["start"]),
            "channel_name": row["channel_name"],
            "channel_program_type": row["channel_program_type"],
            "channel_program": row["channel_program"],
            # Text
            "original_text": row["original_text"],
            "text_to_review": row["text_to_review"],
            # Model analysis
            "model_analysis": row["model_analysis"],
        },
    }

    # Predictions
    # ===========
    # The model prediction pre-selects the predicted CARDS label
    task["predictions"] = [
        {
            "model_version": "december_1st_batch",
            "result": _build_label_studio_results(
                row["text_to_review"], row["quotes_spans"], row["cards_pred"]
            ),
        }
    ]

    # Annotations
    # ===========
    # Only the records that were already reviewed get an annotation
    if has_ground_truth:
        result = _build_label_studio_results(
            row["text_to_review"], row["quotes_spans"], row["cards_true"]
        )
        comment = row["comment"]
        # A reviewed record without remark carries an empty string (not NaN):
        # skip it so that no empty textarea result is imported.
        if not pd.isna(comment) and comment.strip():
            result.append(
                {
                    "value": {"text": comment.strip()},
                    "from_name": "comment",
                    "to_name": "text_to_review",
                    "type": "textarea",
                }
            )
        task["annotations"] = [{"result": result}]

    tasks.append(task)

In [24]:
# Write the LabelStudio tasks to a JSON file, ready to be imported
with open(
    "../../data/raw/4_channels_review_excel_to_labelstudio_09_2023_09_2024.json",
    mode="w",
) as tasks_file:
    json.dump(tasks, fp=tasks_file)

**Label Studio Labelling interface**

```xml
<View>
	<Header value="Données en entrée" />
	<Collapse bordered="true">
		<Panel value="Métadonnées">
			<Text name="id" value="id : $id" />
			<Text name="channel_name"
                value="Chaîne : $channel_name" />
			<Text name="channel_program_type"
                value="Type de programme : $channel_program_type" />
			<Text name="channel_program"
                value="Programme : $channel_program" />
			<Text name="start" value="Date : $start" />
		</Panel>
	</Collapse>
	<Collapse bordered="true">
		<Panel value="Extrait Mediatree original">
			<Text name="original_text" value="$original_text" />
		</Panel>
	</Collapse>
	<Header value="Extrait à examiner" />
	<Labels name="relevant_passages" toName="text_to_review">
		<Label value="relevant_passages" html="Passages intéressants" background="#ED371F" hotkey="p" />
	</Labels>
	<Text name="text_to_review" value="$text_to_review" granularity="word" />
	<Header value="La transcription est claire" />
	<Choices name="transcription_is_clear" toName="text_to_review" choice="single-radio"
        required="true" showInline="true">
		<Choice value="Oui" hotkey="y" alias="yes" selected="true" />
		<Choice value="Non" hotkey="n" alias="no" />
	</Choices>
	<View visibleWhen="choice-selected"
        whenTagName="transcription_is_clear" whenChoiceValue="yes">
		<Header value="Classe CARDS associée à l'extrait" />
		<Collapse bordered="true">
			<Panel value="Analyse du modèle">
				<Text name="model_analysis" value="$model_analysis" />
			</Panel>
		</Collapse>
		<Choices name="cards_true" toName="text_to_review" choice="single-radio" required="true"
            showInline="true">
			<Choice value="0. Accepted" alias="0_accepted" hotkey="0" />
			<Choice value="1. It's not happening" alias="1_its_not_happening" hotkey="1"
                hint="Global warming is not happening. Climate change is NOT leading to melting ice (such as glaciers, sea ice, and permafrost), increased extreme weather, or rising sea levels. Cold weather also shows that climate change is not happening." />
			<Choice value="2. Humans are not the cause" alias="2_humans_are_not_the_cause" hotkey="2"
                hint="Greenhouse gases from humans are not causing climate change." />
			<Choice value="3. Impacts are not bad" alias="3_impacts_are_not_bad" hotkey="3"
                hint="The impacts of climate change will not be bad and might even be beneficial." />
			<Choice value="4. Solutions are ineffective or harmful"
                alias="4_solutions_are_ineffective_or_harmful" hotkey="4"
                hint="Climate solutions are harmful or unnecessary." />
			<Choice value="5. Science is uncertain" alias="5_science_is_uncertain" hotkey="5"
                hint="Climate science is uncertain, unsound, unreliable, or biased." />
			<Choice value="6. Advocates are biased" alias="6_advocates_are_biased" hotkey="6"
                hint="Climate scientists and proponents of climate action are alarmist, biased, wrong, hypocritical, corrupt, and/or politically motivated." />
			<Choice value="7. Fossil fuels are needed" alias="7_fossil_fuels_are_needed" hotkey="7"
                hint="We need fossil fuels for economic growth, prosperity, and to maintain our standard of living." />
		</Choices>
	</View>
	<Header value="Commentaire" />
	<TextArea name="comment" toName="text_to_review" rows="1" editable="true" maxSubmissions="1" />
</View>
```
