In [1]:
# %pip install numpy pandas transformers scikit-learn hf_xet 'accelerate>=0.26.0' datasets
# %pip install --upgrade transformers

In [2]:
import sys
import os

# Extend the import search path so that the `from utils import *` below can resolve.
# NOTE: despite its name, `script_dir` is the PARENT of the current working
# directory (dirname of the cwd), and `parent_dir` is one level above that.
# `parent_dir` is not used in the visible part of the notebook; it is kept in
# case other cells rely on it.
script_dir = os.path.dirname(os.path.abspath(os.getcwd()))
parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir))

# Only register the directory once: without this guard every re-run of the
# cell appends another duplicate entry to sys.path.
if script_dir not in sys.path:
    sys.path.append(script_dir)
from utils import *

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
# import matplotlib.pyplot as plt
# import seaborn as sns
from collections import Counter
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised through
# the `warnings` module (e.g. from scikit-learn during fitting or scoring) are
# hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [3]:
# Base URL of the dataset CSVs on GitHub (must end with "/").
DATA_PATH = "https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/"

# Load the three splits straight from the URL.
# The URLs are built by plain string concatenation: os.path.join is meant for
# filesystem paths (its separator depends on the OS) and only produced a valid
# URL here because DATA_PATH already ends with "/".
# The "?raw=true" query presumably makes GitHub serve the raw CSV rather than
# the HTML blob page — confirm if the download ever returns HTML.
train = pd.read_csv(f"{DATA_PATH}incidents_train.csv?raw=true")
valid = pd.read_csv(f"{DATA_PATH}incidents_valid.csv?raw=true")
test = pd.read_csv(f"{DATA_PATH}incidents_test.csv?raw=true")

# Configuration
# Single place for the knobs read by the cells below: which subtask to run,
# which text fields feed the vectorizer, the TF-IDF vocabulary cap and the seed.
CONFIG = {
    'st1_task': True,  # True for ST1 (categories), False for ST2 (vectors)
    'use_both_text_title': True,  # Whether to use title+text or only the title
    'max_features': 10000,  # Cap on the TF-IDF vocabulary size, for performance
    'random_state': 42
}

In [4]:
# Run banner: announce the selected subtask, the text fields in use and the
# size of each data split.
print("=== FOOD HAZARD DETECTION - IMPROVED BASELINE ===")
if CONFIG['st1_task']:
    print("Task: ST1 (Categories)")
else:
    print("Task: ST2 (Vectors)")
if CONFIG['use_both_text_title']:
    print("Features: Title + Text")
else:
    print("Features: Title only")

print("Train size: {}, Valid size: {}, Test size: {}".format(len(train), len(valid), len(test)))

# 2. DATA ANALYSIS & PREPROCESSING
print("\n2. Data analysis...")

# Choose the label columns for the selected subtask:
# ST1 uses the coarse '*-category' columns, ST2 the 'hazard'/'product' columns.
hazard_col, product_col, task_name = (
    ('hazard-category', 'product-category', "ST1")
    if CONFIG['st1_task']
    else ('hazard', 'product', "ST2")
)

# Label frequencies in the training split (value_counts puts the most
# frequent label first, which the "Most common" lines below rely on).
hazard_counts = train[hazard_col].value_counts()
product_counts = train[product_col].value_counts()

print("\n" + task_name + " Statistics:")
print("Unique hazards: {}".format(len(hazard_counts)))
print("Unique products: {}".format(len(product_counts)))
print("Most common hazard: {} ({} samples)".format(hazard_counts.index[0], hazard_counts.iloc[0]))
print("Most common product: {} ({} samples)".format(product_counts.index[0], product_counts.iloc[0]))

# 3. FEATURE ENGINEERING
print("\n3. Feature engineering...")

# Build one text per incident for each split.
# prepare_text_features is not defined in this notebook; it presumably comes
# from the `from utils import *` at the top — verify there how CONFIG is used.
train_texts, valid_texts, test_texts = (
    prepare_text_features(split, CONFIG) for split in (train, valid, test)
)

# TF-IDF vectorization
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,
    max_df=0.95,
    max_features=CONFIG['max_features'],
)

# The vocabulary is fitted on the training texts only and then applied
# unchanged to the validation and test texts.
X_train = vectorizer.fit_transform(train_texts)
X_valid, X_test = (vectorizer.transform(texts) for texts in (valid_texts, test_texts))

print("TF-IDF shape: {}".format(X_train.shape))

# 4. LABEL PREPARATION
print("\n4. Label preparation...")

# Hazard labels for each split (column chosen by the selected subtask)
y_train_hazard, y_valid_hazard, y_test_hazard = (
    split[hazard_col] for split in (train, valid, test)
)

# Product labels for each split
y_train_product, y_valid_product, y_test_product = (
    split[product_col] for split in (train, valid, test)
)

# 5. MODEL TRAINING
print("\n5. Training models...")

# Class weights to counter label imbalance: one 'balanced' weight per label
# that occurs in the training split, for each of the two targets.
hazard_classes, product_classes = np.unique(y_train_hazard), np.unique(y_train_product)

hazard_weights = compute_class_weight('balanced', classes=hazard_classes, y=y_train_hazard)
product_weights = compute_class_weight('balanced', classes=product_classes, y=y_train_product)

# label -> weight mappings in the form LogisticRegression accepts
hazard_weight_dict = {label: weight for label, weight in zip(hazard_classes, hazard_weights)}
product_weight_dict = {label: weight for label, weight in zip(product_classes, product_weights)}

# Two independent classifiers sharing the same TF-IDF features, one per target
hazard_model = LogisticRegression(
    random_state=CONFIG['random_state'],
    max_iter=1000,
    class_weight=hazard_weight_dict,
)

product_model = LogisticRegression(
    random_state=CONFIG['random_state'],
    max_iter=1000,
    class_weight=product_weight_dict,
)

print("Training hazard classifier...")
hazard_model.fit(X_train, y_train_hazard)

print("Training product classifier...")
product_model.fit(X_train, y_train_product)

# 7. PREDICTIONS AND EVALUATION (announced as step 6 in the printed log)
print("\n6. Making predictions...")

# Predictions on the validation set, then on the test set
hazard_pred_valid, product_pred_valid = (
    clf.predict(X_valid) for clf in (hazard_model, product_model)
)
hazard_pred_test, product_pred_test = (
    clf.predict(X_test) for clf in (hazard_model, product_model)
)

# Score the validation set.
# compute_food_hazard_score is not defined in this notebook (presumably from
# utils); it is used here as returning a mapping with the keys 'f1_hazards',
# 'f1_products' and 'final_score'.
print("\n=== VALIDATION RESULTS ===")
valid_scores = compute_food_hazard_score(
    y_valid_hazard.values,
    y_valid_product.values,
    hazard_pred_valid,
    product_pred_valid,
)

print("Hazard F1: {:.4f}".format(valid_scores['f1_hazards']))
print("Product F1: {:.4f}".format(valid_scores['f1_products']))
print("Final Score: {:.4f}".format(valid_scores['final_score']))

# Score the test set the same way
print("\n=== TEST RESULTS ===")
test_scores = compute_food_hazard_score(
    y_test_hazard.values,
    y_test_product.values,
    hazard_pred_test,
    product_pred_test,
)

print("Hazard F1: {:.4f}".format(test_scores['f1_hazards']))
print("Product F1: {:.4f}".format(test_scores['f1_products']))
print("Final Score: {:.4f}".format(test_scores['final_score']))

# 8. DETAILED ANALYSIS FOR REPORT
print("\n=== DETAILED ANALYSIS FOR REPORT ===")

# Comparison against a majority-class baseline: each dummy model always
# predicts the most frequent training label of its target.
from sklearn.dummy import DummyClassifier

dummy_hazard = DummyClassifier(strategy='most_frequent').fit(X_train, y_train_hazard)
dummy_product = DummyClassifier(strategy='most_frequent').fit(X_train, y_train_product)

dummy_hazard_pred = dummy_hazard.predict(X_test)
dummy_product_pred = dummy_product.predict(X_test)

# Score the baseline on the test set with the same scorer as the real models
dummy_scores = compute_food_hazard_score(
    y_test_hazard.values,
    y_test_product.values,
    dummy_hazard_pred,
    dummy_product_pred,
)

print("\nBaseline (Majority Classifier): {:.4f}".format(dummy_scores['final_score']))
print("Our Model: {:.4f}".format(test_scores['final_score']))
print("Improvement: {:.4f}".format(test_scores['final_score'] - dummy_scores['final_score']))

# Error analysis: the most frequently confused hazard classes on the test set
print(f"\n=== ERROR ANALYSIS ({task_name}) ===")

# (true label, predicted label) for every misclassified test example
hazard_errors = [
    (true_label, pred_label)
    for true_label, pred_label in zip(y_test_hazard, hazard_pred_test)
    if true_label != pred_label
]

# Top 5 most frequently confused (true -> predicted) hazard pairs
if hazard_errors:
    hazard_error_counter = Counter(hazard_errors)
    print("\nTop 5 most confused hazard pairs:")
    for (true_h, pred_h), count in hazard_error_counter.most_common(5):
        print(f"  {true_h} → {pred_h}: {count} times")

# Per-class F1, aligned element-by-element with hazard_classes
# (the labels that occur in the training split).
hazard_f1_per_class = f1_score(y_test_hazard, hazard_pred_test, average=None, labels=hazard_classes)

# Number of test examples carrying each true hazard label
hazard_test_support = y_test_hazard.value_counts()

# Rank only classes that actually occur in the test set. A training label
# with no test examples gets F1 = 0 by convention, so without this filter
# the "worst" list fills up with meaningless n=0 entries.
worst_hazard_classes = sorted(
    (
        (class_name, f1)
        for class_name, f1 in zip(hazard_classes, hazard_f1_per_class)
        if hazard_test_support.get(class_name, 0) > 0
    ),
    key=lambda pair: pair[1],
)[:3]

print("\nWorst performing hazard classes:")
for class_name, f1 in worst_hazard_classes:
    class_count = hazard_test_support[class_name]
    print(f"  {class_name}: F1={f1:.3f} (n={class_count})")

# 9. RESULTS SUMMARY FOR REPORT
print("", "=" * 50, sep="\n")
print("SUMMARY FOR REPORT")
print("=" * 50)

# One flat record describing this run; every score is stored as a string
# rounded to 4 decimals. Key order is the column order of the saved CSV.
results_summary = {
    'Task': task_name,
    'Features': 'Title + Text' if CONFIG['use_both_text_title'] else 'Title only',
    'Method': 'TF-IDF + Logistic Regression with Class Weights',
    'Validation_F1_Hazard': format(valid_scores['f1_hazards'], '.4f'),
    'Validation_F1_Product': format(valid_scores['f1_products'], '.4f'),
    'Validation_Final_Score': format(valid_scores['final_score'], '.4f'),
    'Test_F1_Hazard': format(test_scores['f1_hazards'], '.4f'),
    'Test_F1_Product': format(test_scores['f1_products'], '.4f'),
    'Test_Final_Score': format(test_scores['final_score'], '.4f'),
    'Baseline_Score': format(dummy_scores['final_score'], '.4f'),
    'Improvement': format(test_scores['final_score'] - dummy_scores['final_score'], '.4f'),
}

for key, value in results_summary.items():
    print("{}: {}".format(key, value))

# 10. SAVE RESULTS
# Single-row CSV in the current working directory, named after the subtask
results_path = 'results_' + task_name.lower() + '.csv'
results_df = pd.DataFrame([results_summary])
results_df.to_csv(results_path, index=False)
print("\nResults saved to: " + results_path)

print("\n=== EXPERIMENT COMPLETED ===")
print("Configuration used: {}".format(CONFIG))
print("Final {} Score: {:.4f}".format(task_name, test_scores['final_score']))

# Quick comparison with the competition results: the reference figures are
# hard-coded per subtask, followed by this run's final test score.
print("\nReference (Competition):")
if CONFIG['st1_task']:
    print("Best ST1 result: 0.8223 (Anastasia)")
    print("BERT baseline: ~0.667")
else:
    print("Best ST2 result: 0.5473 (SRCB)")
    print("BERT baseline: ~0.498")
print("Your result: {:.4f}".format(test_scores['final_score']))

=== FOOD HAZARD DETECTION - IMPROVED BASELINE ===
Task: ST1 (Categories)
Features: Title + Text
Train size: 5082, Valid size: 565, Test size: 997

2. Data analysis...

ST1 Statistics:
Unique hazards: 10
Unique products: 22
Most common hazard: allergens (1854 samples)
Most common product: meat, egg and dairy products (1434 samples)

3. Feature engineering...
Creating TF-IDF features...
TF-IDF shape: (5082, 10000)

4. Label preparation...

5. Training models...
Training hazard classifier...
Training product classifier...

6. Making predictions...

=== VALIDATION RESULTS ===
Hazard F1: 0.6892
Product F1: 0.5335
Final Score: 0.6113

=== TEST RESULTS ===
Hazard F1: 0.6173
Product F1: 0.5783
Final Score: 0.5978

=== DETAILED ANALYSIS FOR REPORT ===

Baseline (Majority Classifier): 0.0352
Our Model: 0.5978
Improvement: 0.5626

=== ERROR ANALYSIS (ST1) ===

Top 5 most confused hazard pairs:
  allergens → fraud: 31 times
  biological → organoleptic aspects: 11 times
  fraud → allergens: 11 time

In [None]:
# Configuration for this run: same knobs as before, but with the subtask
# switched to ST2.
CONFIG = {
    'st1_task': False,  # True for ST1 (categories), False for ST2 (vectors)
    'use_both_text_title': True,  # Whether to use title+text or only the title
    'max_features': 10000,  # Cap on the TF-IDF vocabulary size, for performance
    'random_state': 42
}

# Run banner: selected subtask and text fields
print("=== FOOD HAZARD DETECTION - IMPROVED BASELINE ===")
if CONFIG['st1_task']:
    print("Task: ST1 (Categories)")
else:
    print("Task: ST2 (Vectors)")
if CONFIG['use_both_text_title']:
    print("Features: Title + Text")
else:
    print("Features: Title only")

# 1. LOAD DATA
print("\n1. Loading datasets...")
# Base URL of the dataset CSVs on GitHub (must end with "/").
DATA_PATH = "https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/"

# Load the three splits straight from the URL.
# The URLs are built by plain string concatenation: os.path.join is meant for
# filesystem paths (its separator depends on the OS) and only produced a valid
# URL here because DATA_PATH already ends with "/".
# NOTE(review): the same three files are already downloaded by an earlier
# cell; this repeat keeps the cell self-contained at the cost of a second fetch.
train = pd.read_csv(f"{DATA_PATH}incidents_train.csv?raw=true")
valid = pd.read_csv(f"{DATA_PATH}incidents_valid.csv?raw=true")
test = pd.read_csv(f"{DATA_PATH}incidents_test.csv?raw=true")

print(f"Train size: {len(train)}, Valid size: {len(valid)}, Test size: {len(test)}")

# 2. DATA ANALYSIS & PREPROCESSING
print("\n2. Data analysis...")

# Choose the label columns for the selected subtask:
# ST1 uses the coarse '*-category' columns, ST2 the 'hazard'/'product' columns.
hazard_col, product_col, task_name = (
    ('hazard-category', 'product-category', "ST1")
    if CONFIG['st1_task']
    else ('hazard', 'product', "ST2")
)

# Label frequencies in the training split (value_counts puts the most
# frequent label first, which the "Most common" lines below rely on).
hazard_counts = train[hazard_col].value_counts()
product_counts = train[product_col].value_counts()

print("\n" + task_name + " Statistics:")
print("Unique hazards: {}".format(len(hazard_counts)))
print("Unique products: {}".format(len(product_counts)))
print("Most common hazard: {} ({} samples)".format(hazard_counts.index[0], hazard_counts.iloc[0]))
print("Most common product: {} ({} samples)".format(product_counts.index[0], product_counts.iloc[0]))

# 3. FEATURE ENGINEERING
print("\n3. Feature engineering...")

# Build one text per incident for each split.
# prepare_text_features is not defined in this notebook; it presumably comes
# from the `from utils import *` at the top — verify there how CONFIG is used.
train_texts, valid_texts, test_texts = (
    prepare_text_features(split, CONFIG) for split in (train, valid, test)
)

# TF-IDF vectorization
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,
    max_df=0.95,
    max_features=CONFIG['max_features'],
)

# The vocabulary is fitted on the training texts only and then applied
# unchanged to the validation and test texts.
X_train = vectorizer.fit_transform(train_texts)
X_valid, X_test = (vectorizer.transform(texts) for texts in (valid_texts, test_texts))

print("TF-IDF shape: {}".format(X_train.shape))

# 4. LABEL PREPARATION
print("\n4. Label preparation...")

# Hazard labels for each split (column chosen by the selected subtask)
y_train_hazard, y_valid_hazard, y_test_hazard = (
    split[hazard_col] for split in (train, valid, test)
)

# Product labels for each split
y_train_product, y_valid_product, y_test_product = (
    split[product_col] for split in (train, valid, test)
)

# 5. MODEL TRAINING
print("\n5. Training models...")

# Class weights to counter label imbalance: one 'balanced' weight per label
# that occurs in the training split, for each of the two targets.
hazard_classes, product_classes = np.unique(y_train_hazard), np.unique(y_train_product)

hazard_weights = compute_class_weight('balanced', classes=hazard_classes, y=y_train_hazard)
product_weights = compute_class_weight('balanced', classes=product_classes, y=y_train_product)

# label -> weight mappings in the form LogisticRegression accepts
hazard_weight_dict = {label: weight for label, weight in zip(hazard_classes, hazard_weights)}
product_weight_dict = {label: weight for label, weight in zip(product_classes, product_weights)}

# Two independent classifiers sharing the same TF-IDF features, one per target
hazard_model = LogisticRegression(
    random_state=CONFIG['random_state'],
    max_iter=1000,
    class_weight=hazard_weight_dict,
)

product_model = LogisticRegression(
    random_state=CONFIG['random_state'],
    max_iter=1000,
    class_weight=product_weight_dict,
)

print("Training hazard classifier...")
hazard_model.fit(X_train, y_train_hazard)

print("Training product classifier...")
product_model.fit(X_train, y_train_product)

# 6. EVALUATION FUNCTION
# No scorer is defined in this cell: compute_food_hazard_score, used below,
# presumably comes from the `from utils import *` at the top of the notebook.


# 7. PREDICTIONS AND EVALUATION (announced as step 6 in the printed log)
print("\n6. Making predictions...")

# Predictions on the validation set, then on the test set
hazard_pred_valid, product_pred_valid = (
    clf.predict(X_valid) for clf in (hazard_model, product_model)
)
hazard_pred_test, product_pred_test = (
    clf.predict(X_test) for clf in (hazard_model, product_model)
)

# Score the validation set; the scorer is used here as returning a mapping
# with the keys 'f1_hazards', 'f1_products' and 'final_score'.
print("\n=== VALIDATION RESULTS ===")
valid_scores = compute_food_hazard_score(
    y_valid_hazard.values,
    y_valid_product.values,
    hazard_pred_valid,
    product_pred_valid,
)

print("Hazard F1: {:.4f}".format(valid_scores['f1_hazards']))
print("Product F1: {:.4f}".format(valid_scores['f1_products']))
print("Final Score: {:.4f}".format(valid_scores['final_score']))

# Score the test set the same way
print("\n=== TEST RESULTS ===")
test_scores = compute_food_hazard_score(
    y_test_hazard.values,
    y_test_product.values,
    hazard_pred_test,
    product_pred_test,
)

print("Hazard F1: {:.4f}".format(test_scores['f1_hazards']))
print("Product F1: {:.4f}".format(test_scores['f1_products']))
print("Final Score: {:.4f}".format(test_scores['final_score']))

# 8. DETAILED ANALYSIS FOR REPORT
print("\n=== DETAILED ANALYSIS FOR REPORT ===")

# Comparison against a majority-class baseline: each dummy model always
# predicts the most frequent training label of its target.
from sklearn.dummy import DummyClassifier

dummy_hazard = DummyClassifier(strategy='most_frequent').fit(X_train, y_train_hazard)
dummy_product = DummyClassifier(strategy='most_frequent').fit(X_train, y_train_product)

dummy_hazard_pred = dummy_hazard.predict(X_test)
dummy_product_pred = dummy_product.predict(X_test)

# Score the baseline on the test set with the same scorer as the real models
dummy_scores = compute_food_hazard_score(
    y_test_hazard.values,
    y_test_product.values,
    dummy_hazard_pred,
    dummy_product_pred,
)

print("\nBaseline (Majority Classifier): {:.4f}".format(dummy_scores['final_score']))
print("Our Model: {:.4f}".format(test_scores['final_score']))
print("Improvement: {:.4f}".format(test_scores['final_score'] - dummy_scores['final_score']))

# Error analysis: the most frequently confused hazard classes on the test set
print(f"\n=== ERROR ANALYSIS ({task_name}) ===")

# (true label, predicted label) for every misclassified test example
hazard_errors = [
    (true_label, pred_label)
    for true_label, pred_label in zip(y_test_hazard, hazard_pred_test)
    if true_label != pred_label
]

# Top 5 most frequently confused (true -> predicted) hazard pairs
if hazard_errors:
    hazard_error_counter = Counter(hazard_errors)
    print("\nTop 5 most confused hazard pairs:")
    for (true_h, pred_h), count in hazard_error_counter.most_common(5):
        print(f"  {true_h} → {pred_h}: {count} times")

# Per-class F1, aligned element-by-element with hazard_classes
# (the labels that occur in the training split).
hazard_f1_per_class = f1_score(y_test_hazard, hazard_pred_test, average=None, labels=hazard_classes)

# Number of test examples carrying each true hazard label
hazard_test_support = y_test_hazard.value_counts()

# Rank only classes that actually occur in the test set. A training label
# with no test examples gets F1 = 0 by convention, so without this filter
# the "worst" list fills up with meaningless n=0 entries — likely in this
# fine-grained setting, where many labels are rare.
worst_hazard_classes = sorted(
    (
        (class_name, f1)
        for class_name, f1 in zip(hazard_classes, hazard_f1_per_class)
        if hazard_test_support.get(class_name, 0) > 0
    ),
    key=lambda pair: pair[1],
)[:3]

print("\nWorst performing hazard classes:")
for class_name, f1 in worst_hazard_classes:
    class_count = hazard_test_support[class_name]
    print(f"  {class_name}: F1={f1:.3f} (n={class_count})")

# 9. RESULTS SUMMARY FOR REPORT
print("", "=" * 50, sep="\n")
print("SUMMARY FOR REPORT")
print("=" * 50)

# One flat record describing this run; every score is stored as a string
# rounded to 4 decimals. Key order is the column order of the saved CSV.
results_summary = {
    'Task': task_name,
    'Features': 'Title + Text' if CONFIG['use_both_text_title'] else 'Title only',
    'Method': 'TF-IDF + Logistic Regression with Class Weights',
    'Validation_F1_Hazard': format(valid_scores['f1_hazards'], '.4f'),
    'Validation_F1_Product': format(valid_scores['f1_products'], '.4f'),
    'Validation_Final_Score': format(valid_scores['final_score'], '.4f'),
    'Test_F1_Hazard': format(test_scores['f1_hazards'], '.4f'),
    'Test_F1_Product': format(test_scores['f1_products'], '.4f'),
    'Test_Final_Score': format(test_scores['final_score'], '.4f'),
    'Baseline_Score': format(dummy_scores['final_score'], '.4f'),
    'Improvement': format(test_scores['final_score'] - dummy_scores['final_score'], '.4f'),
}

for key, value in results_summary.items():
    print("{}: {}".format(key, value))

# 10. SAVE RESULTS
# Single-row CSV in the current working directory, named after the subtask
results_path = 'results_' + task_name.lower() + '.csv'
results_df = pd.DataFrame([results_summary])
results_df.to_csv(results_path, index=False)
print("\nResults saved to: " + results_path)

print("\n=== EXPERIMENT COMPLETED ===")
print("Configuration used: {}".format(CONFIG))
print("Final {} Score: {:.4f}".format(task_name, test_scores['final_score']))

# Quick comparison with the competition results: the reference figures are
# hard-coded per subtask, followed by this run's final test score.
print("\nReference (Competition):")
if CONFIG['st1_task']:
    print("  Best ST1 result: 0.8223 (Anastasia)")
    print("  BERT baseline: ~0.667")
else:
    print("  Best ST2 result: 0.5473 (SRCB)")
    print("  BERT baseline: ~0.498")
print("  Your result: {:.4f}".format(test_scores['final_score']))