In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import os

# GPU setup
# CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to launch kernels synchronously.
# NOTE(review): this is normally a debugging aid and slows GPU training — confirm it is still wanted.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:

# Load datasets
# Read the train / validation / test CSV splits from the Kaggle input directory into DataFrames.
# These three names are reassigned to their preprocessed versions by the preprocessing cell.
train_data = pd.read_csv('/kaggle/input/task9-semeval/incidents_train.csv')
valid_data = pd.read_csv('/kaggle/input/task9-semeval/incidents_valid.csv')
test_data = pd.read_csv('/kaggle/input/task9-semeval/incidents_test.csv')


In [None]:
def preprocess_data(data, is_test=False):
    """Reduce a raw incidents frame to the columns the model pipeline consumes.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. Must contain a ``text`` column and, when ``is_test`` is
        False, ``hazard-category`` and ``product-category`` as well.
    is_test : bool, default False
        * False -- keep ``text`` plus both label columns and drop every row
          that has a missing value in any of the three.
        * True -- keep only ``text`` for *all* rows (so predictions stay
          aligned one-to-one with the input rows) and add both label columns
          filled with the placeholder ``0``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is never modified.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    """
    if not is_test:
        # Column selection with a list already yields a new frame, and
        # dropna() returns another one, so no explicit copy is needed.
        return data[['text', 'hazard-category', 'product-category']].dropna()

    prepared = data[['text']].copy()
    # Rows cannot be dropped here without breaking row alignment of the
    # predictions, so a missing text becomes an empty string instead of
    # reaching the tokenizer as NaN (a float, which it cannot encode).
    prepared['text'] = prepared['text'].fillna('')
    # Placeholder labels: downstream code expects the columns to exist.
    prepared['hazard-category'] = 0
    prepared['product-category'] = 0
    return prepared


In [5]:
# Preprocess datasets
# Training rows keep their labels (rows with a missing text/label are dropped).
train_data = preprocess_data(train_data)
# NOTE(review): the validation split is processed with is_test=True, so its real labels are
# discarded and both label columns hold the placeholder 0 (see the printed output).
# Any loss computed against valid_data therefore compares predictions with class index 0
# rather than with the true categories — confirm this is intended.
valid_data = preprocess_data(valid_data, is_test=True)
test_data = preprocess_data(test_data, is_test=True)

# Quick look at the first rows of the two unlabeled frames.
print(valid_data.head())
print(test_data.head())


                                                text  hazard-category  \
0  Case Number: 017-94   \n            Date Opene...                0   
1  Case Number: 048-94   \n            Date Opene...                0   
2  Case Number: 032-95   \n            Date Opene...                0   
3  PRESENCE OF UNDECLARED NUTS IN ORIGINALE AUGUS...                0   
4  Case Number: 018-98  Recall Notification Repor...                0   

   product-category  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0  
                                                text  hazard-category  \
0  Case Number: 039-94   \n            Date Opene...                0   
1  Case Number: 026-95   \n            Date Opene...                0   
2  Case Number: 028-95   \n            Date Opene...                0   
3  PRA No. 1998/3500 Date published 17 Mar 1998 P...                0   
4  PRA No. 1998/3645 Date published 10 Sep 1998 P...            

In [6]:

# One LabelEncoder per task, fitted on the training labels only.
hazard_encoder = LabelEncoder()
product_encoder = LabelEncoder()
# Replace the label columns of train_data with the encoders' integer codes; the fitted
# encoders are used again later to size the classifier heads and to map predicted
# indices back to category names via inverse_transform.
train_data['hazard-category'] = hazard_encoder.fit_transform(train_data['hazard-category'])
train_data['product-category'] = product_encoder.fit_transform(train_data['product-category'])


In [None]:
# Custom Dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output with its leading batch dimension squeezed away) plus
    ``hazard_label`` and ``product_label`` as ``torch.long`` scalars. A label
    column that is absent from the row is reported as ``0``.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.dataframe = dataframe

    def __len__(self):
        return self.dataframe.shape[0]

    def __getitem__(self, index):
        # Positional lookup, so a non-contiguous index (e.g. after dropna) is fine.
        record = self.dataframe.iloc[index]

        encoded = self.tokenizer(
            record['text'],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch dimension of 1; drop it.
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}

        label_columns = (
            ('hazard_label', 'hazard-category'),
            ('product_label', 'product-category'),
        )
        for out_key, column in label_columns:
            item[out_key] = torch.tensor(record.get(column, 0), dtype=torch.long)

        return item


In [8]:
# Model Definition
class MultiTaskModel(nn.Module):
    """Pretrained base model shared by two linear classification heads.

    ``forward`` returns ``(hazard_logits, product_logits)``; both heads read
    the base model's last hidden state at sequence position 0.
    """

    def __init__(self, model_name, num_hazard_labels, num_product_labels):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        width = self.base_model.config.hidden_size

        # One linear head per task on top of the shared representation.
        self.hazard_classifier = nn.Linear(width, num_hazard_labels)
        self.product_classifier = nn.Linear(width, num_product_labels)

    def forward(self, input_ids, attention_mask):
        hidden_states = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        # First position of every sequence is used as the pooled representation.
        first_token = hidden_states[:, 0, :]

        return self.hazard_classifier(first_token), self.product_classifier(first_token)


In [9]:

# Initialize tokenizer and datasets
# Checkpoint name shared by the tokenizer (here) and the base model (model-initialisation cell).
model_name = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Every text is padded/truncated to this many tokens inside CustomDataset.
max_length = 128


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
# Wrap the preprocessed frames; tokenization happens lazily, one row per __getitem__ call.
train_dataset = CustomDataset(train_data, tokenizer, max_length)
valid_dataset = CustomDataset(valid_data, tokenizer, max_length)


In [11]:
def collate_fn(batch):
    """Stack per-item tensors from CustomDataset into one batch dict.

    ``batch`` is a list of item dicts. The result holds ``input_ids`` and
    ``attention_mask`` stacked along a new first dimension, and the per-item
    ``hazard_label`` / ``product_label`` scalars stacked under the plural
    keys ``hazard_labels`` / ``product_labels``.
    """
    def stack(field):
        return torch.stack([sample[field] for sample in batch])

    return {
        'input_ids': stack('input_ids'),
        'attention_mask': stack('attention_mask'),
        'hazard_labels': stack('hazard_label'),
        'product_labels': stack('product_label'),
    }


In [12]:

# Batches of 16; only the training loader shuffles so validation order matches the frame's row order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [13]:

# Initialize model
# Each head gets one output per class seen by the corresponding encoder at fit time.
num_hazard_labels = len(hazard_encoder.classes_)
num_product_labels = len(product_encoder.classes_)
model = MultiTaskModel(model_name, num_hazard_labels, num_product_labels).to(device)


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [14]:
# Optimizer and Loss
# A single learning rate for every parameter of the model (base model and both heads).
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
# One cross-entropy criterion per task; the training step adds the two losses unweighted.
criterion_hazard = nn.CrossEntropyLoss()
criterion_product = nn.CrossEntropyLoss()
# Multiplies the learning rate by 0.1 when the value passed to scheduler.step() stops
# decreasing (patience=2). The training loop passes the validation loss.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)


In [None]:
# Early Stopping Class
class EarlyStopping:
    """Track a loss across calls and raise a flag once it stops improving.

    Call the instance with the latest loss. A value lower than
    ``best_loss - delta`` counts as an improvement and resets ``counter``;
    anything else increments it. When ``counter`` reaches ``patience``,
    ``early_stop`` is set to True (it is never reset afterwards).
    """

    def __init__(self, patience=3, delta=0):
        self.patience, self.delta = patience, delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        first_call = self.best_loss is None
        if first_call or val_loss < self.best_loss - self.delta:
            # New best: remember it and start counting from zero again.
            self.best_loss, self.counter = val_loss, 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True



In [16]:
def train_epoch(model, dataloader, optimizer, criterion_hazard, criterion_product, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is a dict with ``input_ids``, ``attention_mask``,
    ``hazard_labels`` and ``product_labels``. The per-batch loss is the plain
    sum of the hazard and product criteria; the return value is the sum of
    those batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    fields = ('input_ids', 'attention_mask', 'hazard_labels', 'product_labels')

    for batch in dataloader:
        on_device = {name: batch[name].to(device) for name in fields}

        optimizer.zero_grad()

        hazard_logits, product_logits = model(on_device['input_ids'], on_device['attention_mask'])
        loss = (
            criterion_hazard(hazard_logits, on_device['hazard_labels'])
            + criterion_product(product_logits, on_device['product_labels'])
        )

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)


In [17]:
def validate_epoch(model, dataloader, device, criterion_hazard=None, criterion_product=None):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Parameters
    ----------
    model : callable returning ``(hazard_logits, product_logits)``
        Put into eval mode before the pass.
    dataloader : iterable of batch dicts
        Each batch provides ``input_ids``, ``attention_mask``,
        ``hazard_labels`` and ``product_labels``.
    device : torch.device
        Device the batch tensors are moved to.
    criterion_hazard, criterion_product : callable, optional
        Loss functions for the two heads. They used to be read from notebook
        globals, which made the function fail on a fresh kernel unless the
        optimizer cell had run. When omitted, a plain ``nn.CrossEntropyLoss()``
        is used for each, which is what those globals are set to.

    Returns
    -------
    tuple
        ``(mean_loss, hazard_preds, product_preds)`` where ``mean_loss`` is the
        summed per-batch loss divided by ``len(dataloader)`` and the two lists
        hold the argmax class index of every example, in loader order.
    """
    if criterion_hazard is None:
        criterion_hazard = nn.CrossEntropyLoss()
    if criterion_product is None:
        criterion_product = nn.CrossEntropyLoss()

    model.eval()
    total_loss = 0
    all_hazard_preds = []
    all_product_preds = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            hazard_labels = batch['hazard_labels'].to(device)
            product_labels = batch['product_labels'].to(device)

            hazard_logits, product_logits = model(input_ids, attention_mask)
            loss_hazard = criterion_hazard(hazard_logits, hazard_labels)
            loss_product = criterion_product(product_logits, product_labels)
            # Same unweighted sum of the two task losses as in training.
            total_loss += (loss_hazard + loss_product).item()

            hazard_preds = torch.argmax(hazard_logits, dim=1).cpu().numpy()
            product_preds = torch.argmax(product_logits, dim=1).cpu().numpy()

            all_hazard_preds.extend(hazard_preds)
            all_product_preds.extend(product_preds)

    return total_loss / len(dataloader), all_hazard_preds, all_product_preds


In [None]:
# Initialize variables for tracking the best train loss
# NOTE(review): checkpointing and early stopping below are keyed on the *training* loss.
# In the logged run it decreases every epoch, so a checkpoint is written every epoch and
# the stop condition is not reached in the epochs shown.
best_train_loss = float('inf')
patience = 3 
patience_counter = 0


epochs = 20
for epoch in range(epochs):
  
    train_loss = train_epoch(model, train_loader, optimizer, criterion_hazard, criterion_product, device)
    # NOTE(review): valid_loader wraps valid_data, whose label columns are the placeholder 0
    # written by preprocess_data(is_test=True); val_loss is therefore measured against
    # class index 0, not against true labels. The per-epoch predictions returned here are
    # not used inside the loop.
    val_loss, hazard_preds, product_preds = validate_epoch(model, valid_loader, device)
    
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss}, Validation Loss = {val_loss}")
    
   
    # The plateau scheduler is driven by val_loss (see the note above about what it measures).
    scheduler.step(val_loss)
    
   
    # Log the learning rate currently in effect for every parameter group.
    for param_group in optimizer.param_groups:
        print(f"Learning rate: {param_group['lr']}")
    
    
    # Save a checkpoint whenever the training loss reaches a new minimum.
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), 'best_model_ver2.pth')
        print("Best model saved.")
        patience_counter = 0 
    else:
        patience_counter += 1
    
    # Stop after `patience` consecutive epochs without a new training-loss minimum.
    if patience_counter >= patience:
        print(f"Stopping training early after {epoch + 1} epochs due to no improvement in train loss.")
        break


Epoch 1: Train Loss = 2.0871595853717073, Validation Loss = 12.262657589382595
Learning rate: 2e-05
Best model saved.
Epoch 2: Train Loss = 0.9904616326562263, Validation Loss = 14.479464530944824
Learning rate: 2e-05
Best model saved.
Epoch 3: Train Loss = 0.665567483706107, Validation Loss = 15.54702811770969
Learning rate: 2e-05
Best model saved.
Epoch 4: Train Loss = 0.45717340575402265, Validation Loss = 14.7837057378557
Learning rate: 2.0000000000000003e-06
Best model saved.
Epoch 5: Train Loss = 0.2471741163552557, Validation Loss = 16.28653205765618
Learning rate: 2.0000000000000003e-06
Best model saved.
Epoch 6: Train Loss = 0.19184493334413324, Validation Loss = 16.686556180318195
Learning rate: 2.0000000000000003e-06
Best model saved.
Epoch 7: Train Loss = 0.1579879388716892, Validation Loss = 16.88746600680881
Learning rate: 2.0000000000000004e-07
Best model saved.
Epoch 8: Train Loss = 0.13693741443069102, Validation Loss = 16.920150836308796
Learning rate: 2.0000000000000

In [19]:
# Re-inspect the unlabeled frames; both still carry the placeholder 0 labels (see output).
print(valid_data.head())
print(test_data.head())


                                                text  hazard-category  \
0  Case Number: 017-94   \n            Date Opene...                0   
1  Case Number: 048-94   \n            Date Opene...                0   
2  Case Number: 032-95   \n            Date Opene...                0   
3  PRESENCE OF UNDECLARED NUTS IN ORIGINALE AUGUS...                0   
4  Case Number: 018-98  Recall Notification Repor...                0   

   product-category  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0  
                                                text  hazard-category  \
0  Case Number: 039-94   \n            Date Opene...                0   
1  Case Number: 026-95   \n            Date Opene...                0   
2  Case Number: 028-95   \n            Date Opene...                0   
3  PRA No. 1998/3500 Date published 17 Mar 1998 P...                0   
4  PRA No. 1998/3645 Date published 10 Sep 1998 P...            

In [None]:

# Persist the current in-memory weights (state_dict only, not the model object) under a
# separate file name from the per-epoch 'best_model_ver2.pth' checkpoint.
torch.save(model.state_dict(), 'multitask_model_ver2.pth')


In [None]:

# Rebuild the validation dataset/loader with the same arguments as before training
# (unshuffled, so predictions come back in the frame's row order).
valid_dataset = CustomDataset(valid_data, tokenizer, max_length)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


def predict_valid(model, dataloader, device):
    """Return the argmax class indices of both heads for every example.

    The model is switched to eval mode and run without gradient tracking.
    Only ``input_ids`` and ``attention_mask`` are read from each batch; label
    entries, if present, are ignored. The result is
    ``(hazard_indices, product_indices)``, two lists in loader order.
    """
    model.eval()
    hazard_indices, product_indices = [], []

    with torch.no_grad():
        for batch in dataloader:
            hazard_logits, product_logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            hazard_batch = hazard_logits.argmax(dim=1).cpu().numpy()
            product_batch = product_logits.argmax(dim=1).cpu().numpy()

            hazard_indices.extend(hazard_batch)
            product_indices.extend(product_batch)

    return hazard_indices, product_indices


# Predict class indices for every validation row with the model currently in memory.
hazard_preds, product_preds = predict_valid(model, valid_loader, device)


# Map indices back to category names and overwrite the placeholder label columns in place.
# After this cell valid_data holds predictions, not zeros, in those two columns.
valid_data['hazard-category'] = hazard_encoder.inverse_transform(hazard_preds)
valid_data['product-category'] = product_encoder.inverse_transform(product_preds)


# Keep only the two predicted columns for the output file.
pred_df = valid_data[['hazard-category', 'product-category']]


# Written relative to the current working directory, without the index column.
pred_df.to_csv('validation_predictions.csv', index=False)


print(pred_df.head())


  hazard-category              product-category
0      biological  meat, egg and dairy products
1      biological  meat, egg and dairy products
2      biological  meat, egg and dairy products
3       allergens             ices and desserts
4      biological  meat, egg and dairy products


In [None]:

# Reload the predictions written by the previous cell.
# NOTE(review): that cell wrote 'validation_predictions.csv' relative to the working directory,
# while this one reads an absolute path — presumably the working directory is /kaggle/working; confirm.
pred_df = pd.read_csv('/kaggle/working/validation_predictions.csv')  


# Turn the row index into a leading column whose header is the empty string.
pred_df.reset_index(inplace=True)
pred_df.rename(columns={'index': ''}, inplace=True)


# Fix the column order: row number first, then the two predicted categories.
pred_df = pred_df[['', 'hazard-category', 'product-category']]


pred_df.to_csv('submission_with_stt_ver2.csv', index=False)


print(pred_df.head())


     hazard-category              product-category
0  0      biological  meat, egg and dairy products
1  1      biological  meat, egg and dairy products
2  2      biological  meat, egg and dairy products
3  3       allergens             ices and desserts
4  4      biological  meat, egg and dairy products


In [None]:

# Evaluation-phase inference: load the unlabeled evaluation file.
eval_data = pd.read_csv('/kaggle/input/eval-data/incidents.csv')  


# Keep only the text column and add placeholder label columns.
eval_data = preprocess_data(eval_data, is_test=True)  


# 'hazard' and 'product' are not predicted by this notebook; they are written as empty strings.
eval_data['hazard'] = ""  
eval_data['product'] = ""


# Unshuffled loader so predictions line up with the rows of eval_data.
eval_dataset = CustomDataset(eval_data, tokenizer, max_length)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


# predict_valid only reads input_ids / attention_mask, so it is reused here for the evaluation split.
hazard_preds, product_preds = predict_valid(model, eval_loader, device)


# Replace the placeholder labels with the predicted category names.
eval_data['hazard-category'] = hazard_encoder.inverse_transform(hazard_preds)
eval_data['product-category'] = product_encoder.inverse_transform(product_preds)


submission_df = eval_data[['hazard-category','product-category','hazard', 'product']]


submission_file = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_file, index=False)


# Package the CSV as 'submission.csv' inside a zip archive.
import zipfile
with zipfile.ZipFile('/kaggle/working/submission_eval_phase.zip', 'w') as zipf:
    zipf.write(submission_file, arcname='submission.csv')


print(submission_df.head())

  hazard-category              product-category hazard product
0      biological  meat, egg and dairy products               
1      biological  meat, egg and dairy products               
2      biological  meat, egg and dairy products               
3      biological                       seafood               
4  foreign bodies  meat, egg and dairy products               
