In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Local checkpoint directory; the tokenizer and the classifier are both read from it.
model_id = "./models/camembertav2-base-climateguard"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)


In [None]:
# The model is only evaluated in this notebook, so turn off gradient tracking
# on every weight.
for weight in model.parameters():
    weight.requires_grad = False

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from llama_index.core.node_parser import SentenceSplitter

# Splits each transcript into overlapping chunks (sizes are in the splitter's own units).
splitter = SentenceSplitter(chunk_size=256, chunk_overlap=64)

# Raw evaluation data: the "test" split of the hub dataset.
dataset_id = "DataForGood/climateguard"
dataset = load_dataset(dataset_id, split="test")

# One row per chunk. "id" is the position of the source record in the split, so
# chunk-level rows can be grouped back to record level later on.
records = []
for idx, record in enumerate(dataset):
    # Lower-case the transcript and strip periods and commas before chunking.
    normalized_text = (
        record["plaintext_whisper"].lower().replace(".", "").replace(",", "")
    )
    chunks = splitter.split_text(normalized_text)
    records.extend(
        {
            "id": idx,
            "text": chunk,
            "label": int(record["misinformation"]),
        }
        for chunk in chunks
    )

claims_dataset = Dataset.from_pandas(pd.DataFrame.from_records(records))

In [None]:
# Sanity check: display the first chunk row (its "id", "text" and "label" fields).
claims_dataset[0]

In [None]:
def tokenize(batch):
    """Tokenize the "text" field of ``batch`` with the notebook's tokenizer.

    The input is truncated to at most 512 tokens and padded up to exactly
    512 tokens; the encoding is returned as PyTorch tensors.

    Args:
        batch: Mapping with a "text" entry holding the text(s) to encode.

    Returns:
        The encoding produced by the tokenizer call.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return tokenizer(batch["text"], **tokenizer_options)


def format_labels(example):
    """Return a one-key update that stores the example's label as a plain ``int``.

    Args:
        example: Mapping with a "label" entry that ``int()`` can convert.

    Returns:
        ``{"label": <int>}``; no other field of ``example`` is included.
    """
    label_as_int = int(example["label"])
    return dict(label=label_as_int)


# Apply format_labels to each example individually (batched=False) so that every
# label is stored as an int.
claims_dataset = claims_dataset.map(function=format_labels, batched=False)



In [None]:
import torch
from tqdm.notebook import tqdm

# Chunk-level results, aligned row by row with claims_dataset:
#   predictions - predicted class index per chunk (plain Python int)
#   outputs     - raw logits tensor per chunk
#   labels      - not filled by this cell; kept so the cell defines the same names
predictions = []
labels = []
outputs = []

with torch.no_grad():  # evaluation only: skip building the autograd graph
    for record in tqdm(claims_dataset):
        # Only one chunk is encoded per forward pass, so padding it to a fixed
        # 512 tokens would just add masked positions that cost compute.
        # Truncation still caps the input at 512 tokens.
        inputs = tokenizer(
            record["text"],
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Pass the encoding by keyword so each tensor reaches the matching
        # forward() parameter whatever the architecture's positional order is.
        output = model(**inputs)
        chunk_logits = output.logits[0]
        outputs.append(chunk_logits)
        # argmax on the tensor itself avoids a NumPy round-trip, which would
        # raise for tensors that do not live on the CPU.
        predictions.append(int(chunk_logits.argmax()))


In [None]:
# Chunk-level frame: one row per text chunk, with the model's predicted class attached.
df = claims_dataset.to_pandas()
df["predictions"] = predictions

# Collapse chunks back to one row per source record ("id"). Taking the max means a
# record's prediction is the highest class index predicted for any of its chunks
# (assuming 0/1 classes: flagged as soon as one chunk is flagged). Every chunk of a
# record carries the same label, so max() leaves the label unchanged.
# Only the numeric columns are aggregated; the previous version also computed a
# lexicographic max over "text" and then dropped it.
df = df.groupby("id")[["label", "predictions"]].max()

In [None]:
from sklearn.metrics import classification_report

# Record-level metrics: df holds one row per source record at this point.
report = classification_report(y_true=df["label"], y_pred=df["predictions"])
print(report)