In [1]:
! pip install pandas scikit-learn
!pip install -U imbalanced-learn



In [2]:
from google.colab import drive
# Mount Google Drive at /content/gdrive. Later cells read and write
# 'gdrive/MyDrive/...' as *relative* paths, so they presumably rely on the
# working directory being /content -- TODO confirm before running elsewhere.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the labelled incidents; the first CSV column becomes the row index.
data = pd.read_csv(
    'gdrive/MyDrive/SemEval25/food_recall_incidents.csv',
    index_col=0,
)

# Hold out 20% of the rows as a development set (fixed seed for a repeatable split).
trainset, devset = train_test_split(
    data,
    test_size=0.2,
    random_state=2024,
)

# Display one randomly drawn training row as a quick sanity check.
trainset.sample()

Unnamed: 0,year,month,day,title,product,product-category,product-title,hazard,hazard-category,hazard-title,language,country
3802,2019,10,17,Hanf PROTEIN,hemp protein powder,"dietetic foods, food supplements, fortified foods","(5,11)",product category/characteristics,fraud,"(0,3)",de,de


In [4]:
# Read the incidents to be labelled; the first CSV column becomes the row index.
testset = pd.read_csv(
    'gdrive/MyDrive/SemEval25/incidents.csv',
    index_col=0,
)

# Display one randomly drawn test row as a quick sanity check.
testset.sample()

Unnamed: 0,year,month,day,country,title,text
934,2021,12,3,us,BUBS Naturals is Recalling Two Lots of Fountai...,FINAL BUBS Naturals Press Release 12/03/2021 B...


In [5]:
from imblearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler

# Classifier pipeline reused for every target column:
#   vect -> character n-gram (2 to 5) TF-IDF features
#   sm   -> RandomOverSampler step
#   clf  -> random forest
# NOTE: despite the `_lr` suffix in the variable name, the final estimator is
# a RandomForestClassifier.
text_clf_lr = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(
            strip_accents='unicode',
            analyzer='char',
            ngram_range=(2, 5),
            max_df=0.5,
            min_df=5,
        ),
    ),
    ('sm', RandomOverSampler(random_state=0)),
    ('clf', RandomForestClassifier(random_state=0)),
])

In [8]:
from sklearn.metrics import classification_report, f1_score

# For each target column: refit the shared pipeline on the incident titles,
# report macro/micro F1 on the development set, then label the test set.
for label in ('hazard-category', 'product-category', 'hazard', 'product'):
  print(label.upper())

  # refit from scratch for this target
  text_clf_lr.fit(trainset.title, trainset[label])

  # development scores: keep the predictions on devset under 'predictions-<label>'
  pred_col = 'predictions-' + label
  devset[pred_col] = text_clf_lr.predict(devset.title)
  for average in ('macro', 'micro'):
    score = f1_score(devset[label], devset[pred_col], zero_division=0, average=average)
    print(f'  {average}: {score:.2f}')

  # test-set predictions go into a column named after the target
  testset[label] = text_clf_lr.predict(testset.title)

HAZARD-CATEGORY
  macro: 0.58
  micro: 0.76
PRODUCT-CATEGORY
  macro: 0.48
  micro: 0.66
HAZARD
  macro: 0.32
  micro: 0.56
PRODUCT
  macro: 0.18
  micro: 0.37


In [9]:
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  """Combine hazard and product macro-F1 into a single score.

  The hazard part is the macro-F1 over all samples. The product part is the
  macro-F1 restricted to the samples whose hazard was predicted correctly.
  The result is the mean of the two.

  Args:
    hazards_true: true hazard labels (Series, array or list).
    products_true: true product labels, same length and order.
    hazards_pred: predicted hazard labels, same length and order.
    products_pred: predicted product labels, same length and order.

  Returns:
    float: (f1_hazards + f1_products) / 2. If no hazard is predicted
    correctly, the product part is taken as 0.0.
  """
  # Work on plain arrays so the comparison and masking below are positional
  # and also behave for lists (a list == list comparison yields one bool).
  hazards_true = np.asarray(hazards_true)
  products_true = np.asarray(products_true)
  hazards_pred = np.asarray(hazards_pred)
  products_pred = np.asarray(products_pred)

  # compute f1 for hazards:
  # zero_division=0 yields the same value as the default ("warn" acts like 0)
  # but without the warning, matching the f1_score calls used for the
  # development scores elsewhere in this notebook.
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro',
    zero_division=0
  )

  # compute f1 for products, only on rows where the hazard is correct:
  hazard_correct = hazards_pred == hazards_true
  if hazard_correct.any():
    f1_products = f1_score(
      products_true[hazard_correct],
      products_pred[hazard_correct],
      average='macro',
      zero_division=0
    )
  else:
    # No row qualifies: avoid calling f1_score on empty input.
    f1_products = 0.0

  return (f1_hazards + f1_products) / 2.

# Sub-task 1 scores the category columns, sub-task 2 the fine-grained columns;
# predictions are read from the matching 'predictions-<column>' columns of devset.
for task_no, (hazard_col, product_col) in enumerate(
    [('hazard-category', 'product-category'), ('hazard', 'product')],
    start=1,
):
  score = compute_score(
    devset[hazard_col],
    devset[product_col],
    devset['predictions-' + hazard_col],
    devset['predictions-' + product_col],
  )
  print(f"Score Sub-Task {task_no}: {score:.3f}")

Score Sub-Task 1: 0.566
Score Sub-Task 2: 0.284


In [13]:
import os
from shutil import make_archive

# NOTE(review): this creates ./submission/, but nothing below writes into it --
# the CSV is written to the Drive folder instead.
os.makedirs('./submission/', exist_ok=True)
# Write the four predicted label columns of the test set (with its index) to Drive:
testset[['hazard-category', 'product-category', 'hazard', 'product']].to_csv('gdrive/MyDrive/SemEval25/submission.csv')

# The archive step is disabled, so this cell produces no zip file.
# NOTE(review): if re-enabled, the base name '/submission' is an absolute path
# at the filesystem root -- confirm that is intended.
#make_archive('/submission', 'zip', './submission')