# Assignment 3
## SemEval 2025 Task 9: The Food Hazard Detection Challenge

The Food Hazard Detection task evaluates explainable classification systems for titles of food-incident reports collected from the web. The implementation below aims to address the aforementioned challenge, utilizing the RoBERTa model.

<br>The assignment refers to the following sub-tasks:
<br>(ST1) Text classification for food hazard prediction, predicting the type of hazard and product.
<br>(ST2) Food hazard and product “vector” detection, predicting the exact hazard and product.
<br>Before we proceed, we need to present the necessary libraries and extensions.

In [None]:
!pip install torch joblib nltk transformers datasets 'accelerate>=0.26.0'

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from datasets import Dataset as HFDataset
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
from scipy.sparse import hstack
import joblib
import nltk
from nltk.corpus import stopwords
import zipfile

We also need to establish the preprocessing and data-cleaning steps that we followed:

In [None]:
def preprocess_data(data):
    """Keep only the columns used by the ST1/ST2 pipelines.

    Args:
        data: DataFrame holding at least the columns 'title', 'text',
            'hazard-category', 'product-category', 'hazard' and 'product'.

    Returns:
        A new, independent DataFrame with exactly those six columns, in
        that order.

    Raises:
        KeyError: if any of the expected columns is missing from ``data``.
    """
    columns = ['title', 'text', 'hazard-category', 'product-category', 'hazard', 'product']
    # An explicit copy detaches the result from `data`, so later column
    # assignments on it (e.g. overwriting 'text' with cleaned text) do not
    # trigger pandas' SettingWithCopyWarning.
    return data[columns].copy()


_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loaded lazily so the corpus only has to exist by the first call,
        # not when this function is defined.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text_extended(text):
    """Strip URLs, numbers, non-letter characters and English stop words.

    Args:
        text: The raw text to clean. Non-string values (for example a NaN
            coming from a missing cell) are treated as empty text.

    Returns:
        The cleaned text with single spaces between the remaining words,
        or an empty string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    # URL Removal
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Number Removal (ranges such as "10-20" first, then standalone numbers)
    text = re.sub(r'\b\d+-\d+\b', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    # Special Character Removal: keep only ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Stop Words Removal (case-insensitive match, original casing kept)
    stop_words = _english_stop_words()
    text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    # Whitespace Removal
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# The NLTK stop-word corpus has to be available before clean_text_extended
# is first called below.
nltk.download('stopwords')

# Load the three splits of the incident-report dataset.
train_data = pd.read_csv('incidents_train.csv')
valid_data = pd.read_csv('incidents_valid.csv')
test_data = pd.read_csv('incidents_test.csv')

# Keep only the text and label columns used by the pipelines.
train_data = preprocess_data(train_data)
valid_data = preprocess_data(valid_data)
test_data = preprocess_data(test_data)

# Clean every split the same way: the models are trained on cleaned text,
# so the test text must go through the same cleaning before prediction.
train_data['text'] = train_data['text'].apply(clean_text_extended)
valid_data['text'] = valid_data['text'].apply(clean_text_extended)
test_data['text'] = test_data['text'].apply(clean_text_extended)

# ST1

In [None]:
def update_label_encoders(encoder, data, column_name):
    """Extend a fitted label encoder with unseen labels and encode a column.

    Labels in ``data[column_name]`` that the encoder does not know yet are
    appended to ``encoder.classes_``. They go at the end so that ids
    already handed out for known labels stay valid, and in sorted order so
    that the ids given to new labels are the same on every run.

    Args:
        encoder: A fitted encoder exposing ``classes_`` and ``transform``.
            Its ``classes_`` attribute is modified in place.
        data: DataFrame containing ``column_name``.
        column_name: Name of the label column to encode.

    Returns:
        The result of ``encoder.transform`` applied to ``data[column_name]``.
    """
    unseen = set(data[column_name].unique()) - set(encoder.classes_)
    if unseen:
        # key=str keeps the ordering well-defined even for mixed label types;
        # dtype=object stops numpy from coercing the labels while appending.
        additions = np.array(sorted(unseen, key=str), dtype=object)
        encoder.classes_ = np.append(encoder.classes_, additions)
    # Every label in the column is now a known class, so no fallback is needed.
    return encoder.transform(data[column_name])


def tokenize_function(examples):
    """Tokenize a batch of (title, text) pairs with the module-level tokenizer.

    Args:
        examples: Batch mapping with "title" and "text" entries, as passed
            by a batched ``Dataset.map`` call.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        or padded to exactly 512 tokens.
    """
    # Pad to a fixed length instead of the longest item in each map batch:
    # the Trainer in this file is created without a padding data collator,
    # so all rows must already share one length to be stacked into batches.
    return tokenizer(
        examples["title"],
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Conversion to Hugging Face Dataset
def create_hf_dataset(df, labels):
    """Build a tokenized Hugging Face dataset from a DataFrame and its labels.

    Args:
        df: DataFrame with at least the "title" and "text" columns. It is
            not modified; a copy is used internally.
        labels: Encoded labels, one per row of ``df``.

    Returns:
        A Hugging Face ``Dataset`` holding the tokenizer outputs and a
        "labels" column, with the raw text and raw label columns removed.
    """
    df = df.copy()
    df["labels"] = labels
    hf_dataset = HFDataset.from_pandas(df)
    hf_dataset = hf_dataset.map(tokenize_function, batched=True)
    # Drop every raw string column, including the fine-grained 'hazard' and
    # 'product' labels. Only columns that are actually present are removed,
    # so frames without those two columns are still accepted.
    raw_columns = ["title", "text", "hazard-category", "product-category", "hazard", "product"]
    present = [name for name in raw_columns if name in hf_dataset.column_names]
    hf_dataset = hf_dataset.remove_columns(present)
    return hf_dataset


def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: A pair of (prediction scores, true label ids).

    Returns:
        A dict with the single key "eval_f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)
    macro_f1 = f1_score(gold, predicted, average="macro")
    return {"eval_f1": macro_f1}


def train_model(train_dataset, valid_dataset, num_labels):
    """Fine-tune a sequence classifier and return the trainer that ran it.

    Reads the module-level ``MODEL_NAME`` and ``training_args``.

    Args:
        train_dataset: Tokenized dataset used for training.
        valid_dataset: Tokenized dataset used for evaluation during training.
        num_labels: Number of output classes of the classification head.

    Returns:
        The ``Trainer`` instance after ``train()`` has completed.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
    )
    trainer_kwargs = {
        "model": classifier,
        "args": training_args,
        "train_dataset": train_dataset,
        "eval_dataset": valid_dataset,
        "compute_metrics": compute_metrics,
    }
    fitted = Trainer(**trainer_kwargs)
    fitted.train()
    return fitted


# Evaluation
def evaluate_model(trainer, dataset, true_labels):
    """Return the macro-averaged F1 score of ``trainer`` on ``dataset``.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes the
            prediction scores as ``.predictions``.
        dataset: Dataset to run predictions on.
        true_labels: Encoded reference labels, aligned with ``dataset``.

    Returns:
        The macro F1 score as computed by ``f1_score``.
    """
    scores = trainer.predict(dataset).predictions
    predicted = np.argmax(scores, axis=-1)
    return f1_score(true_labels, predicted, average='macro')


def get_predictions(trainer, dataset, label_encoder):
    """Predict on ``dataset`` and map the class ids back to label values.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes the
            prediction scores as ``.predictions``.
        dataset: Dataset to run predictions on.
        label_encoder: Encoder providing ``inverse_transform`` for class ids.

    Returns:
        The decoded labels, as returned by ``label_encoder.inverse_transform``.
    """
    scores = trainer.predict(dataset).predictions
    # The predicted class id is the position of the highest score.
    class_ids = np.argmax(scores, axis=-1)
    return label_encoder.inverse_transform(class_ids)


# One encoder per ST1 target: hazard category and product category.
label_encoder_hazard = LabelEncoder()
label_encoder_product = LabelEncoder()

# Fit the encoders on the training labels only.
hazard_labels_train = label_encoder_hazard.fit_transform(train_data['hazard-category'])
product_labels_train = label_encoder_product.fit_transform(train_data['product-category'])

# Encode the validation labels; categories not seen in training are added to the encoders.
hazard_labels_valid = update_label_encoders(label_encoder_hazard, valid_data, 'hazard-category')
product_labels_valid = update_label_encoders(label_encoder_product, valid_data, 'product-category')

# Same for the test labels. After this point the encoders hold every category from all
# three splits, which is the class count later passed to train_model.
hazard_labels_test = update_label_encoders(label_encoder_hazard, test_data, 'hazard-category')
product_labels_test = update_label_encoders(label_encoder_product, test_data, 'product-category')

# Checkpoint name shared by the tokenizer here and the model loaded in train_model.
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenized datasets for the hazard-category classifier.
train_dataset_hazard = create_hf_dataset(train_data, hazard_labels_train)
valid_dataset_hazard = create_hf_dataset(valid_data, hazard_labels_valid)
test_dataset_hazard = create_hf_dataset(test_data, hazard_labels_test)

# Tokenized datasets for the product-category classifier (same text, different labels).
train_dataset_product = create_hf_dataset(train_data, product_labels_train)
valid_dataset_product = create_hf_dataset(valid_data, product_labels_valid)
test_dataset_product = create_hf_dataset(test_data, product_labels_test)

# Training arguments shared by both ST1 trainers (train_model reads this global).
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and save once per epoch so the best epoch can be restored at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=500,
    # Model selection uses the "eval_f1" value returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    save_total_limit=1,
    report_to="none",
)


# Train one classifier per ST1 target. The class count comes from the encoders,
# which at this point also contain labels seen only in the validation/test splits.
trainer_hazard = train_model(train_dataset_hazard, valid_dataset_hazard, num_labels=len(label_encoder_hazard.classes_))
trainer_product = train_model(train_dataset_product, valid_dataset_product, num_labels=len(label_encoder_product.classes_))

# Validation scores: macro F1 per target, then their plain average.
f1_hazard_valid = evaluate_model(trainer_hazard, valid_dataset_hazard, hazard_labels_valid)
f1_product_valid = evaluate_model(trainer_product, valid_dataset_product, product_labels_valid)

final_score_valid = (f1_hazard_valid + f1_product_valid) / 2.0

print(f"Validation F1 Score for Hazard: {f1_hazard_valid}")
print(f"Validation F1 Score for Product: {f1_product_valid}")
print(f"Validation Final Score: {final_score_valid}")

# Test scores, computed the same way.
f1_hazard_test = evaluate_model(trainer_hazard, test_dataset_hazard, hazard_labels_test)
f1_product_test = evaluate_model(trainer_product, test_dataset_product, product_labels_test)
final_score_test = (f1_hazard_test + f1_product_test) / 2.0

print(f"Test F1 Score for Hazard: {f1_hazard_test}")
print(f"Test F1 Score for Product: {f1_product_test}")
print(f"Test Final Score: {final_score_test}")

# Decode the predicted class ids back to category names.
test_hazard_preds = get_predictions(trainer_hazard, test_dataset_hazard, label_encoder_hazard)
test_product_preds = get_predictions(trainer_product, test_dataset_product, label_encoder_product)

output_df = pd.DataFrame({
    "Index": test_data.index,
    "Predicted_Hazard_Category": test_hazard_preds,
    "Predicted_Product_Category": test_product_preds
})

output_df.to_csv("submission.csv", index=False)
# The message names the file that was actually written.
print("Predictions saved to submission.csv")

# ST2

In [None]:
def update_label_encoders(encoder, data, column_name):
    """Extend a fitted label encoder with unseen labels and encode a column.

    Labels in ``data[column_name]`` that the encoder does not know yet are
    appended to ``encoder.classes_``. They go at the end so that ids
    already handed out for known labels stay valid, and in sorted order so
    that the ids given to new labels are the same on every run.

    Args:
        encoder: A fitted encoder exposing ``classes_`` and ``transform``.
            Its ``classes_`` attribute is modified in place.
        data: DataFrame containing ``column_name``.
        column_name: Name of the label column to encode.

    Returns:
        The result of ``encoder.transform`` applied to ``data[column_name]``.
    """
    unseen = set(data[column_name].unique()) - set(encoder.classes_)
    if unseen:
        # key=str keeps the ordering well-defined even for mixed label types;
        # dtype=object stops numpy from coercing the labels while appending.
        additions = np.array(sorted(unseen, key=str), dtype=object)
        encoder.classes_ = np.append(encoder.classes_, additions)
    # Every label in the column is now a known class, so no fallback is needed.
    return encoder.transform(data[column_name])


def tokenize_function(examples):
    """Tokenize a batch of (title, text) pairs with the module-level tokenizer.

    Args:
        examples: Batch mapping with "title" and "text" entries, as passed
            by a batched ``Dataset.map`` call.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        or padded to exactly 512 tokens.
    """
    # Pad to a fixed length instead of the longest item in each map batch:
    # the Trainer in this file is created without a padding data collator,
    # so all rows must already share one length to be stacked into batches.
    return tokenizer(
        examples["title"],
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Conversion to Hugging Face Dataset
def create_hf_dataset(df, labels):
    """Build a tokenized Hugging Face dataset from a DataFrame and its labels.

    Args:
        df: DataFrame with the text columns and the four raw label columns.
            It is not modified; a copy is used internally.
        labels: Encoded labels, one per row of ``df``.

    Returns:
        A Hugging Face ``Dataset`` holding the tokenizer outputs and a
        "labels" column, with the raw text and raw label columns removed.
    """
    labelled = df.copy()
    labelled["labels"] = labels  # Add labels explicitly to the dataset
    raw_columns = ["title", "text", "hazard-category", "product-category", "hazard", "product"]
    return (
        HFDataset.from_pandas(labelled)
        .map(tokenize_function, batched=True)
        .remove_columns(raw_columns)
    )

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: A pair of (prediction scores, true label ids).

    Returns:
        A dict with the single key "eval_f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)
    macro_f1 = f1_score(gold, predicted, average="macro")
    return {"eval_f1": macro_f1}


def train_model(train_dataset, valid_dataset, num_labels):
    """Fine-tune a sequence classifier and return the trainer that ran it.

    Reads the module-level ``MODEL_NAME`` and ``training_args``.

    Args:
        train_dataset: Tokenized dataset used for training.
        valid_dataset: Tokenized dataset used for evaluation during training.
        num_labels: Number of output classes of the classification head.

    Returns:
        The ``Trainer`` instance after ``train()`` has completed.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
    )
    trainer_kwargs = {
        "model": classifier,
        "args": training_args,
        "train_dataset": train_dataset,
        "eval_dataset": valid_dataset,
        "compute_metrics": compute_metrics,
    }
    fitted = Trainer(**trainer_kwargs)
    fitted.train()
    return fitted


# Evaluation
def evaluate_model(trainer, dataset, true_labels):
    """Return the macro-averaged F1 score of ``trainer`` on ``dataset``.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes the
            prediction scores as ``.predictions``.
        dataset: Dataset to run predictions on.
        true_labels: Encoded reference labels, aligned with ``dataset``.

    Returns:
        The macro F1 score as computed by ``f1_score``.
    """
    scores = trainer.predict(dataset).predictions
    predicted = np.argmax(scores, axis=-1)
    return f1_score(true_labels, predicted, average='macro')


# Decoding predictions function
def get_predictions(trainer, dataset, label_encoder):
    """Predict on ``dataset`` and map the class ids back to label values.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes the
            prediction scores as ``.predictions``.
        dataset: Dataset to run predictions on.
        label_encoder: Encoder providing ``inverse_transform`` for class ids.

    Returns:
        The decoded labels, as returned by ``label_encoder.inverse_transform``.
    """
    scores = trainer.predict(dataset).predictions
    # The predicted class id is the position of the highest score.
    class_ids = np.argmax(scores, axis=-1)
    return label_encoder.inverse_transform(class_ids)


# One encoder per ST2 target: the exact hazard and the exact product.
label_encoder_hazard_vector = LabelEncoder()
label_encoder_product_vector = LabelEncoder()

# Fit the encoders on the training labels only.
hazard_vector_labels_train = label_encoder_hazard_vector.fit_transform(train_data['hazard'])
product_vector_labels_train = label_encoder_product_vector.fit_transform(train_data['product'])

# Encode the validation labels; values not seen in training are added to the encoders.
hazard_vector_labels_valid = update_label_encoders(label_encoder_hazard_vector, valid_data, 'hazard')
product_vector_labels_valid = update_label_encoders(label_encoder_product_vector, valid_data, 'product')

# Same for the test labels. After this point the encoders hold every value from all
# three splits, which is the class count later passed to train_model.
hazard_vector_labels_test = update_label_encoders(label_encoder_hazard_vector, test_data, 'hazard')
product_vector_labels_test = update_label_encoders(label_encoder_product_vector, test_data, 'product')

# Re-assigned with the same checkpoint name as in the ST1 cell.
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenized datasets for the hazard classifier.
train_dataset_hazard_vector = create_hf_dataset(train_data, hazard_vector_labels_train)
valid_dataset_hazard_vector = create_hf_dataset(valid_data, hazard_vector_labels_valid)
test_dataset_hazard_vector = create_hf_dataset(test_data, hazard_vector_labels_test)

# Tokenized datasets for the product classifier (same text, different labels).
train_dataset_product_vector = create_hf_dataset(train_data, product_vector_labels_train)
valid_dataset_product_vector = create_hf_dataset(valid_data, product_vector_labels_valid)
test_dataset_product_vector = create_hf_dataset(test_data, product_vector_labels_test)

# Training arguments shared by both ST2 trainers (train_model reads this global).
# NOTE(review): output_dir is the same "./results" used by the ST1 cell, so ST2
# checkpoints are written to the same directory; confirm that is intended.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and save once per epoch so the best epoch can be restored at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=500,
    # Model selection uses the "eval_f1" value returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    save_total_limit=1,
    report_to="none",
)


# Model Training for ST2
# One classifier per target. The class count comes from the encoders, which at this
# point also contain labels seen only in the validation/test splits.
trainer_hazard_vector = train_model(train_dataset_hazard_vector, valid_dataset_hazard_vector, num_labels=len(label_encoder_hazard_vector.classes_))
trainer_product_vector = train_model(train_dataset_product_vector, valid_dataset_product_vector, num_labels=len(label_encoder_product_vector.classes_))

# Validation scores: macro F1 per target, then their plain average.
f1_hazard_vector_valid = evaluate_model(trainer_hazard_vector, valid_dataset_hazard_vector, hazard_vector_labels_valid)
f1_product_vector_valid = evaluate_model(trainer_product_vector, valid_dataset_product_vector, product_vector_labels_valid)
final_score_valid_st2 = (f1_hazard_vector_valid + f1_product_vector_valid) / 2.0

print(f"Validation F1 Score for Hazard Vector (ST2): {f1_hazard_vector_valid}")
print(f"Validation F1 Score for Product Vector (ST2): {f1_product_vector_valid}")
print(f"Validation Final Score (ST2): {final_score_valid_st2}")

# Test scores, computed the same way.
f1_hazard_vector_test = evaluate_model(trainer_hazard_vector, test_dataset_hazard_vector, hazard_vector_labels_test)
f1_product_vector_test = evaluate_model(trainer_product_vector, test_dataset_product_vector, product_vector_labels_test)
final_score_test_st2 = (f1_hazard_vector_test + f1_product_vector_test) / 2.0

print(f"Test F1 Score for Hazard Vector (ST2): {f1_hazard_vector_test}")
print(f"Test F1 Score for Product Vector (ST2): {f1_product_vector_test}")
print(f"Test Final Score (ST2): {final_score_test_st2}")

# Predicted labels for ST2, decoded from class ids back to the original label values
test_hazard_vector_preds = get_predictions(trainer_hazard_vector, test_dataset_hazard_vector, label_encoder_hazard_vector)
test_product_vector_preds = get_predictions(trainer_product_vector, test_dataset_product_vector, label_encoder_product_vector)

# File Submission

In [None]:
# ST1 predictions, keyed by row position
st1_df = pd.DataFrame(
    {
        "Index": range(len(test_hazard_preds)),
        "Predicted_Hazard_Category": test_hazard_preds,
        "Predicted_Product_Category": test_product_preds,
    }
)

# ST2 predictions, keyed the same way
st2_df = pd.DataFrame(
    {
        "Index": range(len(test_hazard_vector_preds)),
        "Predicted_Hazard_Vector": test_hazard_vector_preds,
        "Predicted_Product_Vector": test_product_vector_preds,
    }
)

# Join both sub-task tables on the shared row position and write the combined file.
final_df = st1_df.merge(st2_df, on="Index")
final_df.to_csv("submission.csv", index=False)
print("Updated predictions saved to submission.csv")


# File Zip

In [None]:
csv_filename = "submission.csv"
zip_filename = "submission.zip"

# Write the CSV into a new deflate-compressed archive; the context manager
# closes the archive once the file has been added.
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_filename)

print(f"Zipped file saved as {zip_filename}")