In [None]:
! pip install pandas scikit-learn
!pip install -U imbalanced-learn



In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
# NOTE(review): later cells read and write 'gdrive/MyDrive/...' as a relative
# path, which only resolves to this mount when the working directory is /content.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

# Read the labelled incidents; the first CSV column is used as the index.
data = pd.read_csv(
    'gdrive/MyDrive/SemEval25/food_recall_incidents.csv',
    index_col=0,
)

# Hold out 20% of the rows as a development set; the seed is fixed so the
# same split is produced on every run.
trainset, devset = train_test_split(
    data,
    random_state=2024,
    test_size=0.2,
)

# Show one randomly chosen training row.
trainset.sample()

Unnamed: 0,year,month,day,title,product,product-category,product-title,hazard,hazard-category,hazard-title,language,country
1410,2015,10,27,CA National Services Pty Ltd—Peach Coconut Drink,juices,non-alcoholic beverages,,milk and products thereof,allergens,"(0,27)",en,au


In [None]:
# Read the test incidents; the first CSV column is used as the index.
testset = pd.read_csv(
    'gdrive/MyDrive/SemEval25/incidents.csv',
    index_col=0,
)

# Show one randomly chosen test row.
testset.sample()

Unnamed: 0,year,month,day,country,title,text
308,2016,2,16,ca,Ingredion corn starch products recalled due to...,Notice This archive of previously issued food ...


In [None]:
# Select the rows whose (not yet cleaned) title contains the marker text.
# The mask gets its own name instead of `filter`, which would shadow the builtin.
# `na=False` makes missing titles count as "no match" so the mask stays purely
# boolean; `regex=False` matches the marker as plain text.
is_recall_notification = data['title'].str.contains('Recall Notification:', na=False, regex=False)
filtered_data = data[is_recall_notification]
# Separator-like symbols that are turned into a space. The brackets are
# escaped so that `[` and `]` are members of the character class instead of
# terminating it early.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]|@,;]')

def clean_text(text):
  """Normalise a title for vectorisation.

  The text is lower-cased, separator symbols are replaced by a space, and every
  remaining run of non-word characters is collapsed into a single space, so
  word boundaries survive the cleaning.

  Args:
    text: the raw title. Non-string values (e.g. NaN for a missing title)
      are treated as empty text.

  Returns:
    The cleaned string without leading or trailing whitespace.
  """
  if not isinstance(text, str):
    return ''
  text = text.lower()  # lowercase text
  text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separator symbols -> space
  # Collapse to a space rather than deleting, otherwise neighbouring words
  # would be glued together.
  text = re.sub(r'\W+', ' ', text)
  return text.strip()

# Clean the titles of every frame that is used downstream. `trainset` and
# `devset` were split off from `data` before this point, so cleaning `data`
# alone would never reach the model; `testset` is cleaned as well so that
# training and prediction see text preprocessed the same way.
for frame in (data, trainset, devset, testset):
  frame['title'] = frame['title'].apply(clean_text)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler

# Text classifier: TF-IDF over character 2- to 5-grams feeding a linear SVM.
# NOTE: despite the `_lr` suffix, the classifier step is a LinearSVC.
text_clf_lr = Pipeline([
    # `stop_words` is not passed: it only applies to analyzer='word' and is
    # ignored (with a warning) for character n-grams.
    ('vect', TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(2, 5), max_df=0.5, min_df=5, norm='l2')),
    # Oversampling experiment, currently disabled:
    # ('sm', RandomOverSampler(random_state=0)),
    # Fixed seed so repeated fits give the same model.
    ('clf', LinearSVC(multi_class='crammer_singer', random_state=2024)),
])

In [None]:
from sklearn.metrics import classification_report, f1_score

# Fit the same pipeline once per target column, report dev-set F1, and store
# the dev and test predictions before the pipeline is refitted for the next label.
for label in ('hazard-category', 'product-category'):
  print(label.upper())
  text_clf_lr.fit(trainset.title, trainset[label])

  # development-set predictions are kept in a 'predictions-<label>' column:
  pred_col = 'predictions-' + label
  devset[pred_col] = text_clf_lr.predict(devset.title)

  # macro and micro averaged F1, in that order:
  for averaging in ('macro', 'micro'):
    dev_f1 = f1_score(devset[label], devset[pred_col], zero_division=0, average=averaging)
    print(f'  {averaging}: {dev_f1:.2f}')

  # test-set predictions are written straight into the label column:
  testset[label] = text_clf_lr.predict(testset.title)

HAZARD-CATEGORY




  macro: 0.62
  micro: 0.80
PRODUCT-CATEGORY




  macro: 0.57
  micro: 0.73


In [None]:
# No model in this notebook predicts 'hazard' or 'product'; both columns are
# filled with a single-space placeholder so they exist when the submission
# file is written.
for placeholder_col in ('hazard', 'product'):
  testset[placeholder_col] = ' '

# Show one randomly chosen test row with the added columns.
testset.sample()

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
896,2021,7,25,ca,Harvest Fresh brand Zucchini Spirals recalled ...,Food Recall Warning - Harvest Fresh brand Zucc...,biological,fruits and vegetables,,


In [None]:
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  """Combine hazard and product macro-F1 into a single score.

  The hazard F1 is computed over all samples. The product F1 is computed only
  over the samples whose hazard was predicted correctly. The result is the
  mean of the two values.

  Args:
    hazards_true: true hazard labels.
    products_true: true product labels.
    hazards_pred: predicted hazard labels.
    products_pred: predicted product labels.
    All four are expected to support element-wise `==` and boolean-mask
    indexing (e.g. identically indexed pandas Series or numpy arrays).

  Returns:
    The average of the two macro-F1 values as a float.
  """
  # compute f1 for hazards (zero_division=0 scores undefined classes as 0
  # without emitting a warning, matching the dev-score cell):
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro',
    zero_division=0
  )

  # compute f1 for products, restricted to correctly predicted hazards:
  hazard_correct = hazards_pred == hazards_true
  if not hazard_correct.any():
    # No hazard was predicted correctly, so there is nothing to score.
    f1_products = 0.
  else:
    f1_products = f1_score(
      products_true[hazard_correct],
      products_pred[hazard_correct],
      average='macro',
      zero_division=0
    )

  return (f1_hazards + f1_products) / 2.

# Sub-task 1 is scored on the category-level labels of the development set.
subtask1_score = compute_score(
  devset['hazard-category'],
  devset['product-category'],
  devset['predictions-hazard-category'],
  devset['predictions-product-category'],
)
print(f"Score Sub-Task 1: {subtask1_score:.3f}")
# Sub-task 2 scoring is disabled; no 'predictions-hazard' / 'predictions-product'
# columns are produced in this notebook.
#print(f"Score Sub-Task 2: {compute_score(devset['hazard'], devset['product'], devset['predictions-hazard'], devset['predictions-product']):.3f}")

Score Sub-Task 1: 0.639


In [None]:
import os
from shutil import make_archive

# Create a local ./submission/ directory (no error if it already exists).
# Note that the CSV below is written to the gdrive path, not into this folder.
os.makedirs('./submission/', exist_ok=True)

# Write the four label columns, together with the index, to the submission CSV.
submission_columns = ['hazard-category', 'product-category', 'hazard', 'product']
testset[submission_columns].to_csv('gdrive/MyDrive/SemEval25/submission.csv')

# zip the folder (zipfile can be directly uploaded to codalab):
#make_archive('/submission', 'zip', './submission')