In [None]:
from setfit import SetFitModel

# Fetch the fine-tuned SetFit classifier from the 🤗 Hub by its repository id.
model_id = "gmguarino/multilingual-e5-base-climateguard"
model = SetFitModel.from_pretrained(model_id)


In [None]:
# Chunking parameters handed to SentenceSplitter as chunk_size / chunk_overlap.
# CHUNK_SIZE is the upper bound on the size of one chunk.
CHUNK_SIZE = 512
# Overlap requested between consecutive chunks (half of CHUNK_SIZE).
CHUNK_OVERLAP = 256

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from llama_index.core.node_parser import SentenceSplitter

# Splitter that cuts each transcript into overlapping chunks, sized by the
# notebook-level CHUNK_SIZE / CHUNK_OVERLAP settings.
splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Load the raw "test" split of the dataset.
dataset_id = "DataForGood/climateguard"
dataset = load_dataset(dataset_id, split="test")

# One row per chunk. The text is lower-cased and stripped of "." and ","
# before splitting. "id" is the position of the source record in the dataset,
# so chunks can be regrouped per record later; "label" is the record-level
# misinformation flag cast to int and repeated on each of its chunks.
records = [
    {
        "id": idx,
        "text": chunk,
        "label": int(record["misinformation"]),
    }
    for idx, record in enumerate(dataset)
    for chunk in splitter.split_text(
        record["plaintext"].lower().replace(".", "").replace(",", "")
    )
]

claims_dataset = Dataset.from_pandas(pd.DataFrame.from_records(records))

In [None]:
# Peek at the first chunk row; rows carry the keys "id", "text" and "label".
claims_dataset[0]

In [None]:
# # Tokenize helper function
# def tokenize(batch):
#     return tokenizer(
#         batch["text"],
#         # padding="max_length",
#         truncation=True,
#         max_length=CHUNK_SIZE,
#         return_tensors="pt",
#     )


def format_labels(example):
    """Return a one-key update that stores the example's label as an int.

    Args:
        example: Mapping with a "label" entry accepted by ``int()``.

    Returns:
        ``{"label": <int>}``; other keys of ``example`` are not included.
    """
    label_as_int = int(example["label"])
    return {"label": label_as_int}


# Re-cast every label to int, one example at a time (batched=False).
# The labels were already passed through int() when the chunk records were
# assembled, so this pass acts as a safeguard on the label column.
claims_dataset = claims_dataset.map(format_labels, batched=False)



In [None]:
import os

from openai import OpenAI
from dotenv import load_dotenv

# Load settings from a .env file; override=True is passed so that values from
# the file take precedence over variables already present in the environment.
load_dotenv(override=True)
# The OpenAI client is currently disabled; re-enable it together with the
# commented-out request code in the inference loop to score with OpenAI.
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Instruction prefix for the OpenAI-based scorer: it asks for a single 0-10
# score, which parse_response later converts into a binary flag.
# NOTE(review): the prompt announces a transcript "delimited by triple
# backticks", whereas the commented-out prompt construction in the inference
# loop wraps the text in triple single quotes - confirm which delimiter the
# fine-tuned model expects before re-enabling it.
_system_prompt = """You are an assistant helping editors to moderate TV and radio content.
You will be provided with a transcript delimited by triple backticks.
Bear in mind that the transcript may be missing punctuation and may be of very low quality, with incorrect vocabulary, cuts in the wrong places, or may include some phonetic transcription.

Does the text promote climate change misinformation that undermines well-established scientific consensus, such as denying the existence of climate change or the factors that contribute to it ?

Answer just a number on a scale of 0 - does not promote - to 10 - certainly promotes.

text:"""

In [None]:
import re

def parse_response(response: str, threshold: int = 8) -> int:
    """Turn a raw model reply into a binary misinformation flag.

    The first run of digits found after any leading non-digit characters is
    read as the score (so "Score: 9/10" yields 9).

    Args:
        response: Raw text returned by the model. ``None``, an empty string,
            or a reply without any digit is treated as a score of 0.
        threshold: Inclusive cut-off at or above which the score counts as a
            positive. Defaults to 8, the value that used to be hard-coded.

    Returns:
        1 if the parsed score is greater than or equal to ``threshold``,
        otherwise 0.
    """
    # Guard against a missing reply: re.match would raise TypeError on None.
    match = re.match(r"^[^\d]*(\d+)", response) if response else None
    score = int(match.group(1)) if match else 0
    return int(score >= threshold)

In [None]:
import torch
from tqdm.notebook import tqdm

# Accumulators filled by the inference loop below, one entry per chunk
# (`openai_predictions` and `labels` stay empty while the OpenAI path is
# commented out).
predictions = []
openai_predictions = []
labels = []
outputs = []

# Inference only: run the whole loop with autograd disabled.
with torch.no_grad():
    for record in tqdm(claims_dataset):
        # prompt = _system_prompt + f" '''{record["text"]}'''"
        # messages = [{"role": "user", "content": prompt}]

        # Per-class scores for this chunk, moved to the CPU so they can be
        # converted to NumPy below.
        output = model.predict_proba(record["text"]).cpu()
        # Hard label for the same chunk. Cast to a built-in int so the
        # `predictions` list holds plain integers rather than whatever object
        # the model call returns (e.g. a 0-dim tensor), which keeps the pandas
        # column built from it numeric for the later groupby / sklearn steps.
        # NOTE(review): this is a second forward pass over the same text; if
        # the label is always the argmax of `output`, it could be derived
        # from `output` instead - confirm before changing.
        prediction = int(model(record["text"]))
        # response = client.chat.completions.create(
        #     model="ft:gpt-4o-mini-2024-07-18:personal::B1xWiJRm",
        #     messages=messages,
        #     temperature=0,
        # )
        # result = response.choices[0].message.content.strip()
        outputs.append(output.numpy())
        predictions.append(prediction)
        # openai_predictions.append(parse_response(result))


In [None]:
# Collapse chunk-level rows into one row per source record: a record's value
# in each column is the maximum over its chunks, so it is flagged as soon as
# any single chunk is flagged. The chunk text is dropped after aggregation.
df = (
    claims_dataset.to_pandas()
    .assign(predictions=predictions)
    # .assign(openai_predictions=openai_predictions)
    .groupby(["id"])
    .agg("max")
    .drop(columns="text")
)

In [None]:
from sklearn.metrics import classification_report
# Record-level metrics for the SetFit classifier.
print("setfit")
print(classification_report(y_true=df["label"], y_pred=df["predictions"]))

In [None]:
# `outputs` holds the per-chunk results of model.predict_proba, i.e. class
# probabilities rather than logits, so the columns are labelled accordingly.
pd.DataFrame(outputs, columns=["proba_0", "proba_1"]).plot(
    kind="kde",
    title="SetFit per-chunk class probability density",
)

In [None]:
from sklearn.metrics import classification_report
# Compare both classifiers on the record-level labels. The OpenAI column only
# exists when the OpenAI calls in the inference loop (and the matching column
# assignment) are enabled, so check for it instead of raising AttributeError
# on a fresh run.
if "openai_predictions" in df.columns:
    print("OpenAI")
    print(classification_report(df.label, df.openai_predictions))
else:
    print("OpenAI predictions not available - skipping OpenAI report")
print("setfit")
print(classification_report(df.label, df.predictions))

In [None]:
# Distribution of predicted classes per classifier. The OpenAI column is only
# present when the OpenAI path is enabled, so it is shown conditionally.
if "openai_predictions" in df.columns:
    display(df.openai_predictions.value_counts())
display(df.predictions.value_counts())