In [31]:
%pip install numpy pandas transformers scikit-learn hf_xet 'accelerate>=0.26.0' datasets
%pip install --upgrade transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

test = pd.read_csv("https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/incidents_test.csv?raw=true")
train = pd.read_csv("https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/incidents_train.csv?raw=true")
valid = pd.read_csv("https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/incidents_valid.csv?raw=true")






In [33]:
import torch
import torch.nn as nn
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
from typing import Dict, List, Optional, Tuple

# 1. Load and Process Dataset
print("Loading dataset...")
data_files = {
    "train": "https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/incidents_train.csv?raw=true", 
    "test": "https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/incidents_test.csv?raw=true"
}
dataset = load_dataset("csv", data_files=data_files)

# 2. Analyze and Preprocess Dataset
print("Analyzing dataset...")

# Examine unique values in key columns
hazard_categories = dataset["train"].unique("hazard-category")
product_categories = dataset["train"].unique("product-category")

print(f"Number of hazard categories: {len(hazard_categories)}")
print(f"Number of product categories: {len(product_categories)}")

# Create label mappings
hazard_label_mapping = {cat: idx for idx, cat in enumerate(hazard_categories)}
product_label_mapping = {cat: idx for idx, cat in enumerate(product_categories)}

# Map string labels to integers
def map_labels(example):
    hazard_label = hazard_label_mapping.get(example["hazard-category"], -1)
    product_label = product_label_mapping.get(example["product-category"], -1)
    return {"hazard_label": hazard_label, "product_label": product_label}

dataset = dataset.map(map_labels)

# 3. Tokenize Dataset
print("Tokenizing dataset...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    # Using only the title field for tokenization as specified
    return tokenizer(
        examples["title"],
        padding="max_length",
        truncation=True,
        max_length=128  # As specified in requirements
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. Custom BERT Model with Two Classification Heads
class BERTWithDualHeads(nn.Module):
    def __init__(self, bert_model_name, num_hazards, num_products):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.1)
        
        # Two classification heads
        self.hazard_classifier = nn.Linear(self.bert.config.hidden_size, num_hazards)
        self.product_classifier = nn.Linear(self.bert.config.hidden_size, num_products)
    
    def forward(self, input_ids, attention_mask, labels=None):
        # Get BERT outputs
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Get the [CLS] token representation
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        
        # Pass through the classification heads
        hazard_logits = self.hazard_classifier(pooled_output)
        product_logits = self.product_classifier(pooled_output)
        
        return {
            "hazard_logits": hazard_logits,
            "product_logits": product_logits
        }

# 5. Custom Dataset for Dual Classification
class DualClassificationDataset(Dataset):
    def __init__(self, tokenized_dataset):
        self.tokenized_dataset = tokenized_dataset
    
    def __len__(self):
        return len(self.tokenized_dataset)
    
    def __getitem__(self, idx):
        item = self.tokenized_dataset[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"]),
            "attention_mask": torch.tensor(item["attention_mask"]),
            "hazard_label": torch.tensor(item["hazard_label"]),
            "product_label": torch.tensor(item["product_label"])
        }

# Create custom datasets
train_dataset = DualClassificationDataset(tokenized_datasets["train"])
test_dataset = DualClassificationDataset(tokenized_datasets["test"])

# 7. Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    all_hazard_preds = []
    all_product_preds = []
    all_hazard_true = []
    all_product_true = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Get predictions
            hazard_preds = torch.argmax(outputs["hazard_logits"], dim=1).cpu().numpy()
            product_preds = torch.argmax(outputs["product_logits"], dim=1).cpu().numpy()
            
            all_hazard_preds.extend(hazard_preds)
            all_product_preds.extend(product_preds)
            all_hazard_true.extend(batch["hazard_label"].numpy())
            all_product_true.extend(batch["product_label"].numpy())
    
    # Convert to numpy arrays for easier manipulation
    all_hazard_preds = np.array(all_hazard_preds)
    all_product_preds = np.array(all_product_preds)
    all_hazard_true = np.array(all_hazard_true)
    all_product_true = np.array(all_product_true)
    
    # Calculate macro-F1 scores
    f1_hazard = f1_score(all_hazard_true, all_hazard_preds, average='macro')
    
    # Calculate product F1 score only for instances where hazard prediction is correct
    correct_hazard_mask = all_hazard_preds == all_hazard_true
    
    if sum(correct_hazard_mask) > 0:
        f1_product = f1_score(
            all_product_true[correct_hazard_mask], 
            all_product_preds[correct_hazard_mask], 
            average='macro'
        )
    else:
        f1_product = 0.0
    
    # Final score as per the requirement
    final_score = (f1_hazard + f1_product) / 2
    
    print(f"Hazard Macro-F1: {f1_hazard:.4f}")
    print(f"Product Macro-F1 (for correct hazards): {f1_product:.4f}")
    print(f"Final Score: {final_score:.4f}")
    
    return final_score

# Hyperparameters
hyperparams = {
    "learning_rate": 2e-5,  # As specified in requirements
    "batch_size": 16,       # As specified in requirements
    "epochs": 3            # As specified in requirements
}

# Initialize model
print("Initializing model...")
model = BERTWithDualHeads(
    bert_model_name="bert-base-uncased", 
    num_hazards=len(hazard_categories),  # 10 hazard categories
    num_products=len(product_categories)  # 22 product categories
)

# Train model
print("Training model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams["learning_rate"])

# Loss function
loss_fn = nn.CrossEntropyLoss()

# DataLoaders
train_loader = DataLoader(
    train_dataset, 
    batch_size=hyperparams["batch_size"], 
    shuffle=True
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=hyperparams["batch_size"]
)

# Training loop
for epoch in range(hyperparams["epochs"]):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        hazard_labels = batch["hazard_label"].to(device)
        product_labels = batch["product_label"].to(device)
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Calculate loss
        hazard_loss = loss_fn(outputs["hazard_logits"], hazard_labels)
        product_loss = loss_fn(outputs["product_logits"], product_labels)
        loss = hazard_loss + product_loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{hyperparams['epochs']}, Loss: {avg_loss:.4f}")
    
    # Evaluate after each epoch
    evaluate_model(model, test_loader, device)

# Save the model
print("Saving model...")
torch.save(model.state_dict(), "bert_food_hazard_model.pt")
print("Model saved successfully!")


Loading dataset...
Analyzing dataset...
Number of hazard categories: 10
Number of product categories: 22
Tokenizing dataset...
Initializing model...
Training model...
Using device: cpu
Epoch 1/3, Loss: 2.7119


RuntimeError: Numpy is not available