In [1]:
%pip install numpy pandas transformers scikit-learn hf_xet 'accelerate>=0.26.0' datasets
%pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
Downloading transformers-4.52.3-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed transformers-4.52.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import json
import warnings
warnings.filterwarnings('ignore')

print("=== SIMPLE FOOD HAZARD DETECTION - COMPATIBILITY MODE ===")
print(f"PyTorch version: {torch.__version__}")

# Run configuration.
#   st1_task          True  -> ST1 (coarse hazard/product categories)
#                     False -> ST2 (fine-grained hazard/product labels)
#   use_bert          also run the small DistilBERT demo branch
#   max_features      vocabulary cap for the TF-IDF vectorizer
#   test_bert_loading probe whether the transformer weights can be loaded
CONFIG = dict(
    st1_task=True,
    use_bert=False,
    max_features=10000,
    test_bert_loading=False,
)

if CONFIG['st1_task']:
    task_name = "ST1"
else:
    task_name = "ST2"
method_label = 'BERT' if CONFIG['use_bert'] else 'TF-IDF + LogReg'
print(f"Task: {task_name}")
print(f"Method: {method_label}")

# 1. Load the three official splits straight from the task's GitHub repo.
print("\n1. Loading data...")
DATA_URL_TEMPLATE = (
    "https://github.com/food-hazard-detection-semeval-2025/"
    "food-hazard-detection-semeval-2025.github.io/blob/main/data/"
    "incidents_{split}.csv?raw=true"
)
train, valid, test = (
    pd.read_csv(DATA_URL_TEMPLATE.format(split=split_name))
    for split_name in ("train", "valid", "test")
)

print(f"✅ Data loaded - Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}")

# Target columns depend on the sub-task.
hazard_col, product_col = (
    ('hazard-category', 'product-category')
    if CONFIG['st1_task']
    else ('hazard', 'product')
)

print(f"Target columns: {hazard_col}, {product_col}")

# 2. Label distribution of the training split.
print("\n2. Data analysis...")
hazard_counts = train[hazard_col].value_counts()
product_counts = train[product_col].value_counts()

print(f"Hazard classes: {len(hazard_counts)}")
print(f"Product classes: {len(product_counts)}")

# value_counts() sorts descending, so position 0 is the majority class and
# position -1 the rarest one.
top_hazard, top_hazard_n = hazard_counts.index[0], hazard_counts.iloc[0]
top_product, top_product_n = product_counts.index[0], product_counts.iloc[0]
print(f"Most common hazard: {top_hazard} ({top_hazard_n} samples)")
print(f"Most common product: {top_product} ({top_product_n} samples)")

# Majority-to-minority count ratio as a rough imbalance indicator.
rarest_hazard_n = hazard_counts.iloc[-1]
rarest_product_n = product_counts.iloc[-1]
imbalance_ratio_h = top_hazard_n / rarest_hazard_n
imbalance_ratio_p = top_product_n / rarest_product_n
print(f"Hazard imbalance ratio: {imbalance_ratio_h:.1f}x")
print(f"Product imbalance ratio: {imbalance_ratio_p:.1f}x")

# 3. Text preprocessing
print("\n3. Text preparation...")

def prepare_text(df):
    """Build one normalised text string per row from ``title`` and ``text``.

    For every row the title and the (optional) ``text`` column are joined
    with a space, lower-cased, and all whitespace runs (including newlines
    and carriage returns) are collapsed to single spaces.

    Missing cells (``None`` / ``NaN``) contribute an empty string instead of
    the literal token ``"nan"``, which would otherwise leak into the
    vocabulary of any downstream vectorizer.

    Args:
        df: DataFrame with a ``title`` column and, optionally, a ``text``
            column.

    Returns:
        list[str]: cleaned texts, in row order.

    Raises:
        KeyError: if ``df`` has no ``title`` column.
    """
    def _as_text(cell):
        # pd.isna covers None, float NaN and pd.NA.
        return "" if pd.isna(cell) else str(cell)

    titles = df['title']
    # A frame without a 'text' column is treated as having empty bodies.
    bodies = df['text'] if 'text' in df.columns else [''] * len(df)

    texts = []
    for title, body in zip(titles, bodies):
        combined = _as_text(title) + " " + _as_text(body)
        # str.split() with no argument splits on any whitespace, so this also
        # removes '\n' / '\r' and trims leading/trailing blanks.
        texts.append(' '.join(combined.lower().split()))
    return texts

train_texts = prepare_text(train)
valid_texts = prepare_text(valid)
test_texts = prepare_text(test)

# Word-count average is taken over the first 100 training documents only.
sample_word_counts = [len(t.split()) for t in train_texts[:100]]
print(f"✅ Texts prepared - Average length: {np.mean(sample_word_counts):.1f} words")

BERT_MODEL_NAME = 'distilbert-base-uncased'  # Lighter model
SEED = 42

# 4. Test BERT loading if requested
if CONFIG['test_bert_loading']:
    print("\n4. Testing BERT loading...")
    try:
        from transformers import AutoTokenizer, AutoModel
        tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
        model = AutoModel.from_pretrained(BERT_MODEL_NAME)
        print("✅ BERT loading successful! You can set use_bert=True")
        CONFIG['use_bert'] = True
    except Exception as e:
        # Deliberate best-effort probe: any failure (missing package, no
        # network, incompatible binaries) falls back to the TF-IDF pipeline.
        print(f"❌ BERT loading failed: {e}")
        print("Continuing with TF-IDF...")
        CONFIG['use_bert'] = False

# 5. Model Training - TF-IDF Version (Always works)
# This baseline is trained unconditionally: the evaluation, majority-baseline
# and reporting code below reads X_train / y_* / *_pred_* regardless of
# CONFIG['use_bert'], and the BERT demo branch explicitly falls back to these
# TF-IDF results. Previously the BERT branch left all of them undefined.
print("\n5. Training TF-IDF + LogReg model...")

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=CONFIG['max_features'],
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    stop_words='english'
)

print("  Creating TF-IDF features...")
# Fit on the training split only; valid/test are transformed with the
# training vocabulary.
X_train = vectorizer.fit_transform(train_texts)
X_valid = vectorizer.transform(valid_texts)
X_test = vectorizer.transform(test_texts)

print(f"  ✅ TF-IDF shape: {X_train.shape}")

# Prepare labels
y_train_hazard = train[hazard_col].values
y_valid_hazard = valid[hazard_col].values
y_test_hazard = test[hazard_col].values

y_train_product = train[product_col].values
y_valid_product = valid[product_col].values
y_test_product = test[product_col].values

# Class weights for imbalanced data
hazard_classes = np.unique(y_train_hazard)
product_classes = np.unique(y_train_product)

hazard_weights = compute_class_weight('balanced', classes=hazard_classes, y=y_train_hazard)
product_weights = compute_class_weight('balanced', classes=product_classes, y=y_train_product)

hazard_weight_dict = dict(zip(hazard_classes, hazard_weights))
product_weight_dict = dict(zip(product_classes, product_weights))

print(f"  Class weights computed - Hazard: {len(hazard_weight_dict)}, Product: {len(product_weight_dict)}")

# Train models
print("  Training hazard classifier...")
hazard_model = LogisticRegression(
    class_weight=hazard_weight_dict,
    max_iter=1000,
    random_state=SEED
)
hazard_model.fit(X_train, y_train_hazard)

print("  Training product classifier...")
product_model = LogisticRegression(
    class_weight=product_weight_dict,
    max_iter=1000,
    random_state=SEED
)
product_model.fit(X_train, y_train_product)

print("  ✅ Models trained")

# Predictions
print("\n6. Making predictions...")
hazard_pred_valid = hazard_model.predict(X_valid)
product_pred_valid = product_model.predict(X_valid)

hazard_pred_test = hazard_model.predict(X_test)
product_pred_test = product_model.predict(X_test)

# 6. Simple BERT Version (if compatible)
# Optional demo that runs in addition to the TF-IDF baseline above. It only
# trains for a handful of batches and produces no predictions; evaluation
# keeps using the TF-IDF outputs.
if CONFIG['use_bert']:
    print("\n5. Training Simple BERT...")

    from transformers import AutoTokenizer, AutoModel
    from torch.utils.data import Dataset, DataLoader

    # Simple dataset
    class SimpleDataset(Dataset):
        """Tokenised texts paired with integer hazard/product labels.

        Args:
            texts: sequence of input strings.
            hazard_labels / product_labels: per-text label values.
            tokenizer: callable Hugging Face tokenizer.
            max_length: pad/truncate every sample to this many tokens.
            hazard_to_id / product_to_id: optional label->id mappings. Pass
                the training set's mappings when building valid/test
                datasets so that ids mean the same thing in every split.
                When omitted, a mapping is built from this split's labels.
        """

        # Default ``ignore_index`` of nn.CrossEntropyLoss; used for labels
        # that are absent from the supplied mapping.
        IGNORE_INDEX = -100

        def __init__(self, texts, hazard_labels, product_labels, tokenizer, max_length=128,
                     hazard_to_id=None, product_to_id=None):
            self.texts = texts
            self.hazard_labels = hazard_labels
            self.product_labels = product_labels
            self.tokenizer = tokenizer
            self.max_length = max_length

            # Create label mappings (sorted for a deterministic id order)
            if hazard_to_id is None:
                hazard_to_id = {h: i for i, h in enumerate(sorted(set(hazard_labels)))}
            if product_to_id is None:
                product_to_id = {p: i for i, p in enumerate(sorted(set(product_labels)))}

            self.hazard_to_id = hazard_to_id
            self.product_to_id = product_to_id

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            encoding = self.tokenizer(
                self.texts[idx],
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

            hazard_id = self.hazard_to_id.get(self.hazard_labels[idx], self.IGNORE_INDEX)
            product_id = self.product_to_id.get(self.product_labels[idx], self.IGNORE_INDEX)

            return {
                # return_tensors='pt' yields a leading batch dim of 1; flatten it away.
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'hazard_label': torch.tensor(hazard_id, dtype=torch.long),
                'product_label': torch.tensor(product_id, dtype=torch.long)
            }

    # Simple BERT model
    class SimpleBERT(nn.Module):
        """Shared transformer encoder with one linear head per target."""

        def __init__(self, model_name, num_hazards, num_products):
            super().__init__()
            self.bert = AutoModel.from_pretrained(model_name)
            hidden_size = self.bert.config.hidden_size
            self.hazard_head = nn.Linear(hidden_size, num_hazards)
            self.product_head = nn.Linear(hidden_size, num_products)

        def forward(self, input_ids, attention_mask):
            """Return ``(hazard_logits, product_logits)`` for a batch."""
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            hidden = outputs.last_hidden_state
            # Masked mean pooling: samples are padded to max_length, so a plain
            # mean over the sequence axis would average in padding positions.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
            return self.hazard_head(pooled), self.product_head(pooled)

    # Initialize
    torch.manual_seed(SEED)  # reproducible head init and batch shuffling
    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

    train_dataset = SimpleDataset(train_texts, y_train_hazard, y_train_product, tokenizer)
    # Re-use the training label ids for the other splits.
    shared_label_maps = {
        'hazard_to_id': train_dataset.hazard_to_id,
        'product_to_id': train_dataset.product_to_id,
    }
    valid_dataset = SimpleDataset(valid_texts, y_valid_hazard, y_valid_product, tokenizer, **shared_label_maps)
    test_dataset = SimpleDataset(test_texts, y_test_hazard, y_test_product, tokenizer, **shared_label_maps)

    model = SimpleBERT(BERT_MODEL_NAME, len(train_dataset.hazard_to_id), len(train_dataset.product_to_id))

    # Simple training (just 1 epoch for demo)
    device = torch.device('cpu')  # Force CPU to avoid device issues
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    MAX_DEMO_BATCHES = 50

    print("  Training (1 epoch for demo)...")
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Stop after exactly MAX_DEMO_BATCHES batches (indices 0..49).
        if batch_idx >= MAX_DEMO_BATCHES:
            break

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        hazard_labels = batch['hazard_label'].to(device)
        product_labels = batch['product_label'].to(device)

        optimizer.zero_grad()
        hazard_logits, product_logits = model(input_ids, attention_mask)

        # Joint objective: unweighted sum of the two classification losses.
        loss = criterion(hazard_logits, hazard_labels) + criterion(product_logits, product_labels)
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"    Batch {batch_idx}/{MAX_DEMO_BATCHES}, Loss: {loss.item():.4f}")

    print("  ✅ BERT training completed (demo)")

    # Simple predictions for BERT would go here...
    # For now, fall back to TF-IDF results
    print("  Note: Using TF-IDF results for evaluation")

# 7. Evaluation Function
def compute_food_hazard_score(hazards_true, products_true, hazards_pred, products_pred):
    """Score hazard and product predictions (described in this notebook as
    the official SemEval scoring function).

    The hazard score is the macro-F1 over all samples. The product score is
    the macro-F1 restricted to the samples whose hazard was predicted
    correctly; it is 0.0 when no hazard prediction is correct. The final
    score is the mean of the two.

    Args:
        hazards_true, products_true: ground-truth labels (any 1-D array-like).
        hazards_pred, products_pred: predicted labels, aligned with the
            ground truth.

    Returns:
        dict with float entries ``f1_hazards``, ``f1_products`` and
        ``final_score``.
    """
    # Coerce to arrays so that plain lists work too: with lists, ``==`` would
    # yield a single bool and boolean-mask indexing would fail.
    hazards_true = np.asarray(hazards_true)
    products_true = np.asarray(products_true)
    hazards_pred = np.asarray(hazards_pred)
    products_pred = np.asarray(products_pred)

    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    correct_hazard_mask = hazards_pred == hazards_true
    if correct_hazard_mask.any():
        f1_products = f1_score(
            products_true[correct_hazard_mask],
            products_pred[correct_hazard_mask],
            average='macro'
        )
    else:
        f1_products = 0.0

    # Plain floats keep the result directly JSON-serialisable.
    f1_hazards = float(f1_hazards)
    f1_products = float(f1_products)
    return {
        'f1_hazards': f1_hazards,
        'f1_products': f1_products,
        'final_score': (f1_hazards + f1_products) / 2
    }

# 8. Results
print("\n7. Evaluation...")

# Validation results
valid_scores = compute_food_hazard_score(
    y_valid_hazard, y_valid_product,
    hazard_pred_valid, product_pred_valid
)

# Test results
test_scores = compute_food_hazard_score(
    y_test_hazard, y_test_product,
    hazard_pred_test, product_pred_test
)

for split_title, split_scores in (("VALIDATION", valid_scores), ("TEST", test_scores)):
    print(f"\n=== {split_title} RESULTS ===")
    print(f"Hazard F1: {split_scores['f1_hazards']:.4f}")
    print(f"Product F1: {split_scores['f1_products']:.4f}")
    print(f"Final Score: {split_scores['final_score']:.4f}")

# 9. Comparison with baselines
print("\n8. Baseline comparison...")

# Majority classifier: always predicts the most frequent training label.
from sklearn.dummy import DummyClassifier
dummy_hazard = DummyClassifier(strategy='most_frequent')
dummy_product = DummyClassifier(strategy='most_frequent')

dummy_hazard.fit(X_train, y_train_hazard)
dummy_product.fit(X_train, y_train_product)

dummy_hazard_pred = dummy_hazard.predict(X_test)
dummy_product_pred = dummy_product.predict(X_test)

dummy_scores = compute_food_hazard_score(
    y_test_hazard, y_test_product,
    dummy_hazard_pred, dummy_product_pred
)

improvement = test_scores['final_score'] - dummy_scores['final_score']
print(f"Majority Baseline: {dummy_scores['final_score']:.4f}")
print(f"Our Model: {test_scores['final_score']:.4f}")
# ':+' prints the sign itself, so a regression shows as "-0.0123" rather than
# the malformed "+-0.0123" produced by a hard-coded '+'.
print(f"Improvement: {improvement:+.4f}")

# 10. Competition comparison
# Reference scores are hard-coded in this notebook; they are not fetched.
print(f"\n=== COMPETITION COMPARISON ({task_name}) ===")
if CONFIG['st1_task']:
    competition_best, competition_bert, best_team = 0.8223, 0.667, "Anastasia"
else:
    competition_best, competition_bert, best_team = 0.5473, 0.498, "SRCB"
print(f"Competition Best ({best_team}): {competition_best:.4f}")
print(f"Competition BERT Baseline: {competition_bert:.4f}")

print(f"Your Result: {test_scores['final_score']:.4f}")

# The comparison itself is identical for both sub-tasks.
if test_scores['final_score'] > competition_bert:
    print(f"🎉 You beat the BERT baseline by {test_scores['final_score'] - competition_bert:.4f}!")
gap_to_best = competition_best - test_scores['final_score']
print(f"Gap to best: {gap_to_best:.4f}")

# 11. Save results
results_summary = {
    'task': task_name,
    'method': 'TF-IDF + LogReg with Class Weights',
    'config': CONFIG,
    'data_stats': {
        'train_size': len(train),
        'valid_size': len(valid),
        'test_size': len(test),
        'hazard_classes': len(hazard_counts),
        'product_classes': len(product_counts),
        'hazard_imbalance': float(imbalance_ratio_h),
        'product_imbalance': float(imbalance_ratio_p)
    },
    'results': {
        'validation': valid_scores,
        'test': test_scores,
        'baseline': dummy_scores,
        'improvement': float(improvement)
    },
    'competition_comparison': {
        'competition_best': float(competition_best),
        'competition_bert': float(competition_bert),
        'our_result': float(test_scores['final_score']),
        'gap_to_best': float(gap_to_best)
    }
}

filename = f'simple_results_{task_name.lower()}.json'
# Explicit encoding keeps the file identical across platforms; default=str is
# a safety net for any value json cannot serialise natively.
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2, default=str)

print(f"\n✅ Results saved to {filename}")

print("\n=== EXPERIMENT COMPLETED ===")
print("Method: TF-IDF + Logistic Regression")
print(f"Final {task_name} Score: {test_scores['final_score']:.4f}")

# Quick suggestions for improvement
print("\n💡 NEXT STEPS FOR BETTER RESULTS:")
print("1. Try different TF-IDF parameters (max_features, ngram_range)")
print("2. Add more feature engineering (text length, country, date)")
print("3. Try ensemble methods (combine multiple models)")
print("4. If BERT works, try: 'test_bert_loading': True")
print("5. For ST2 task, set: 'st1_task': False")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/michal/miniconda/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/michal/miniconda/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/michal/miniconda/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/Users/m

=== SIMPLE FOOD HAZARD DETECTION - COMPATIBILITY MODE ===
PyTorch version: 2.2.2
Task: ST1
Method: TF-IDF + LogReg

1. Loading data...
✅ Data loaded - Train: 5082, Valid: 565, Test: 997
Target columns: hazard-category, product-category

2. Data analysis...
Hazard classes: 10
Product classes: 22
Most common hazard: allergens (1854 samples)
Most common product: meat, egg and dairy products (1434 samples)
Hazard imbalance ratio: 618.0x
Product imbalance ratio: 286.8x

3. Text preparation...
✅ Texts prepared - Average length: 169.4 words

5. Training TF-IDF + LogReg model...
  Creating TF-IDF features...
  ✅ TF-IDF shape: (5082, 10000)
  Class weights computed - Hazard: 10, Product: 22
  Training hazard classifier...
  Training product classifier...
  ✅ Models trained

6. Making predictions...

7. Evaluation...

=== VALIDATION RESULTS ===
Hazard F1: 0.6892
Product F1: 0.5335
Final Score: 0.6113

=== TEST RESULTS ===
Hazard F1: 0.6173
Product F1: 0.5783
Final Score: 0.5978

8. Baseline comp

In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import json
import warnings
warnings.filterwarnings('ignore')

print("=== SIMPLE FOOD HAZARD DETECTION - COMPATIBILITY MODE ===")
print(f"PyTorch version: {torch.__version__}")

# Run configuration.
#   st1_task          True  -> ST1 (coarse hazard/product categories)
#                     False -> ST2 (fine-grained hazard/product labels)
#   use_bert          also run the small DistilBERT demo branch
#   max_features      vocabulary cap for the TF-IDF vectorizer
#   test_bert_loading probe whether the transformer weights can be loaded
CONFIG = dict(
    st1_task=False,
    use_bert=False,
    max_features=10000,
    test_bert_loading=False,
)

if CONFIG['st1_task']:
    task_name = "ST1"
else:
    task_name = "ST2"
method_label = 'BERT' if CONFIG['use_bert'] else 'TF-IDF + LogReg'
print(f"Task: {task_name}")
print(f"Method: {method_label}")

# 1. Load the three official splits straight from the task's GitHub repo.
print("\n1. Loading data...")
DATA_URL_TEMPLATE = (
    "https://github.com/food-hazard-detection-semeval-2025/"
    "food-hazard-detection-semeval-2025.github.io/blob/main/data/"
    "incidents_{split}.csv?raw=true"
)
train, valid, test = (
    pd.read_csv(DATA_URL_TEMPLATE.format(split=split_name))
    for split_name in ("train", "valid", "test")
)

print(f"✅ Data loaded - Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}")

# Target columns depend on the sub-task.
hazard_col, product_col = (
    ('hazard-category', 'product-category')
    if CONFIG['st1_task']
    else ('hazard', 'product')
)

print(f"Target columns: {hazard_col}, {product_col}")

# 2. Label distribution of the training split.
print("\n2. Data analysis...")
hazard_counts = train[hazard_col].value_counts()
product_counts = train[product_col].value_counts()

print(f"Hazard classes: {len(hazard_counts)}")
print(f"Product classes: {len(product_counts)}")

# value_counts() sorts descending, so position 0 is the majority class and
# position -1 the rarest one.
top_hazard, top_hazard_n = hazard_counts.index[0], hazard_counts.iloc[0]
top_product, top_product_n = product_counts.index[0], product_counts.iloc[0]
print(f"Most common hazard: {top_hazard} ({top_hazard_n} samples)")
print(f"Most common product: {top_product} ({top_product_n} samples)")

# Majority-to-minority count ratio as a rough imbalance indicator.
rarest_hazard_n = hazard_counts.iloc[-1]
rarest_product_n = product_counts.iloc[-1]
imbalance_ratio_h = top_hazard_n / rarest_hazard_n
imbalance_ratio_p = top_product_n / rarest_product_n
print(f"Hazard imbalance ratio: {imbalance_ratio_h:.1f}x")
print(f"Product imbalance ratio: {imbalance_ratio_p:.1f}x")

# 3. Text preprocessing
print("\n3. Text preparation...")

def prepare_text(df):
    """Build one normalised text string per row from ``title`` and ``text``.

    For every row the title and the (optional) ``text`` column are joined
    with a space, lower-cased, and all whitespace runs (including newlines
    and carriage returns) are collapsed to single spaces.

    Missing cells (``None`` / ``NaN``) contribute an empty string instead of
    the literal token ``"nan"``, which would otherwise leak into the
    vocabulary of any downstream vectorizer.

    Args:
        df: DataFrame with a ``title`` column and, optionally, a ``text``
            column.

    Returns:
        list[str]: cleaned texts, in row order.

    Raises:
        KeyError: if ``df`` has no ``title`` column.
    """
    def _as_text(cell):
        # pd.isna covers None, float NaN and pd.NA.
        return "" if pd.isna(cell) else str(cell)

    titles = df['title']
    # A frame without a 'text' column is treated as having empty bodies.
    bodies = df['text'] if 'text' in df.columns else [''] * len(df)

    texts = []
    for title, body in zip(titles, bodies):
        combined = _as_text(title) + " " + _as_text(body)
        # str.split() with no argument splits on any whitespace, so this also
        # removes '\n' / '\r' and trims leading/trailing blanks.
        texts.append(' '.join(combined.lower().split()))
    return texts

train_texts = prepare_text(train)
valid_texts = prepare_text(valid)
test_texts = prepare_text(test)

# Word-count average is taken over the first 100 training documents only.
sample_word_counts = [len(t.split()) for t in train_texts[:100]]
print(f"✅ Texts prepared - Average length: {np.mean(sample_word_counts):.1f} words")

BERT_MODEL_NAME = 'distilbert-base-uncased'  # Lighter model
SEED = 42

# 4. Test BERT loading if requested
if CONFIG['test_bert_loading']:
    print("\n4. Testing BERT loading...")
    try:
        from transformers import AutoTokenizer, AutoModel
        tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
        model = AutoModel.from_pretrained(BERT_MODEL_NAME)
        print("✅ BERT loading successful! You can set use_bert=True")
        CONFIG['use_bert'] = True
    except Exception as e:
        # Deliberate best-effort probe: any failure (missing package, no
        # network, incompatible binaries) falls back to the TF-IDF pipeline.
        print(f"❌ BERT loading failed: {e}")
        print("Continuing with TF-IDF...")
        CONFIG['use_bert'] = False

# 5. Model Training - TF-IDF Version (Always works)
# This baseline is trained unconditionally: the evaluation, majority-baseline
# and reporting code below reads X_train / y_* / *_pred_* regardless of
# CONFIG['use_bert'], and the BERT demo branch explicitly falls back to these
# TF-IDF results. Previously the BERT branch left all of them undefined.
print("\n5. Training TF-IDF + LogReg model...")

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=CONFIG['max_features'],
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    stop_words='english'
)

print("  Creating TF-IDF features...")
# Fit on the training split only; valid/test are transformed with the
# training vocabulary.
X_train = vectorizer.fit_transform(train_texts)
X_valid = vectorizer.transform(valid_texts)
X_test = vectorizer.transform(test_texts)

print(f"  ✅ TF-IDF shape: {X_train.shape}")

# Prepare labels
y_train_hazard = train[hazard_col].values
y_valid_hazard = valid[hazard_col].values
y_test_hazard = test[hazard_col].values

y_train_product = train[product_col].values
y_valid_product = valid[product_col].values
y_test_product = test[product_col].values

# Class weights for imbalanced data
hazard_classes = np.unique(y_train_hazard)
product_classes = np.unique(y_train_product)

hazard_weights = compute_class_weight('balanced', classes=hazard_classes, y=y_train_hazard)
product_weights = compute_class_weight('balanced', classes=product_classes, y=y_train_product)

hazard_weight_dict = dict(zip(hazard_classes, hazard_weights))
product_weight_dict = dict(zip(product_classes, product_weights))

print(f"  Class weights computed - Hazard: {len(hazard_weight_dict)}, Product: {len(product_weight_dict)}")

# Train models
print("  Training hazard classifier...")
hazard_model = LogisticRegression(
    class_weight=hazard_weight_dict,
    max_iter=1000,
    random_state=SEED
)
hazard_model.fit(X_train, y_train_hazard)

print("  Training product classifier...")
product_model = LogisticRegression(
    class_weight=product_weight_dict,
    max_iter=1000,
    random_state=SEED
)
product_model.fit(X_train, y_train_product)

print("  ✅ Models trained")

# Predictions
print("\n6. Making predictions...")
hazard_pred_valid = hazard_model.predict(X_valid)
product_pred_valid = product_model.predict(X_valid)

hazard_pred_test = hazard_model.predict(X_test)
product_pred_test = product_model.predict(X_test)

# 6. Simple BERT Version (if compatible)
# Optional demo that runs in addition to the TF-IDF baseline above. It only
# trains for a handful of batches and produces no predictions; evaluation
# keeps using the TF-IDF outputs.
if CONFIG['use_bert']:
    print("\n5. Training Simple BERT...")

    from transformers import AutoTokenizer, AutoModel
    from torch.utils.data import Dataset, DataLoader

    # Simple dataset
    class SimpleDataset(Dataset):
        """Tokenised texts paired with integer hazard/product labels.

        Args:
            texts: sequence of input strings.
            hazard_labels / product_labels: per-text label values.
            tokenizer: callable Hugging Face tokenizer.
            max_length: pad/truncate every sample to this many tokens.
            hazard_to_id / product_to_id: optional label->id mappings. Pass
                the training set's mappings when building valid/test
                datasets so that ids mean the same thing in every split.
                When omitted, a mapping is built from this split's labels.
        """

        # Default ``ignore_index`` of nn.CrossEntropyLoss; used for labels
        # that are absent from the supplied mapping.
        IGNORE_INDEX = -100

        def __init__(self, texts, hazard_labels, product_labels, tokenizer, max_length=128,
                     hazard_to_id=None, product_to_id=None):
            self.texts = texts
            self.hazard_labels = hazard_labels
            self.product_labels = product_labels
            self.tokenizer = tokenizer
            self.max_length = max_length

            # Create label mappings (sorted for a deterministic id order)
            if hazard_to_id is None:
                hazard_to_id = {h: i for i, h in enumerate(sorted(set(hazard_labels)))}
            if product_to_id is None:
                product_to_id = {p: i for i, p in enumerate(sorted(set(product_labels)))}

            self.hazard_to_id = hazard_to_id
            self.product_to_id = product_to_id

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            encoding = self.tokenizer(
                self.texts[idx],
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

            hazard_id = self.hazard_to_id.get(self.hazard_labels[idx], self.IGNORE_INDEX)
            product_id = self.product_to_id.get(self.product_labels[idx], self.IGNORE_INDEX)

            return {
                # return_tensors='pt' yields a leading batch dim of 1; flatten it away.
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'hazard_label': torch.tensor(hazard_id, dtype=torch.long),
                'product_label': torch.tensor(product_id, dtype=torch.long)
            }

    # Simple BERT model
    class SimpleBERT(nn.Module):
        """Shared transformer encoder with one linear head per target."""

        def __init__(self, model_name, num_hazards, num_products):
            super().__init__()
            self.bert = AutoModel.from_pretrained(model_name)
            hidden_size = self.bert.config.hidden_size
            self.hazard_head = nn.Linear(hidden_size, num_hazards)
            self.product_head = nn.Linear(hidden_size, num_products)

        def forward(self, input_ids, attention_mask):
            """Return ``(hazard_logits, product_logits)`` for a batch."""
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            hidden = outputs.last_hidden_state
            # Masked mean pooling: samples are padded to max_length, so a plain
            # mean over the sequence axis would average in padding positions.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
            return self.hazard_head(pooled), self.product_head(pooled)

    # Initialize
    torch.manual_seed(SEED)  # reproducible head init and batch shuffling
    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

    train_dataset = SimpleDataset(train_texts, y_train_hazard, y_train_product, tokenizer)
    # Re-use the training label ids for the other splits.
    shared_label_maps = {
        'hazard_to_id': train_dataset.hazard_to_id,
        'product_to_id': train_dataset.product_to_id,
    }
    valid_dataset = SimpleDataset(valid_texts, y_valid_hazard, y_valid_product, tokenizer, **shared_label_maps)
    test_dataset = SimpleDataset(test_texts, y_test_hazard, y_test_product, tokenizer, **shared_label_maps)

    model = SimpleBERT(BERT_MODEL_NAME, len(train_dataset.hazard_to_id), len(train_dataset.product_to_id))

    # Simple training (just 1 epoch for demo)
    device = torch.device('cpu')  # Force CPU to avoid device issues
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    MAX_DEMO_BATCHES = 50

    print("  Training (1 epoch for demo)...")
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Stop after exactly MAX_DEMO_BATCHES batches (indices 0..49).
        if batch_idx >= MAX_DEMO_BATCHES:
            break

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        hazard_labels = batch['hazard_label'].to(device)
        product_labels = batch['product_label'].to(device)

        optimizer.zero_grad()
        hazard_logits, product_logits = model(input_ids, attention_mask)

        # Joint objective: unweighted sum of the two classification losses.
        loss = criterion(hazard_logits, hazard_labels) + criterion(product_logits, product_labels)
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"    Batch {batch_idx}/{MAX_DEMO_BATCHES}, Loss: {loss.item():.4f}")

    print("  ✅ BERT training completed (demo)")

    # Simple predictions for BERT would go here...
    # For now, fall back to TF-IDF results
    print("  Note: Using TF-IDF results for evaluation")

# 7. Evaluation Function
def compute_food_hazard_score(hazards_true, products_true, hazards_pred, products_pred):
    """Score hazard and product predictions (described in this notebook as
    the official SemEval scoring function).

    The hazard score is the macro-F1 over all samples. The product score is
    the macro-F1 restricted to the samples whose hazard was predicted
    correctly; it is 0.0 when no hazard prediction is correct. The final
    score is the mean of the two.

    Args:
        hazards_true, products_true: ground-truth labels (any 1-D array-like).
        hazards_pred, products_pred: predicted labels, aligned with the
            ground truth.

    Returns:
        dict with float entries ``f1_hazards``, ``f1_products`` and
        ``final_score``.
    """
    # Coerce to arrays so that plain lists work too: with lists, ``==`` would
    # yield a single bool and boolean-mask indexing would fail.
    hazards_true = np.asarray(hazards_true)
    products_true = np.asarray(products_true)
    hazards_pred = np.asarray(hazards_pred)
    products_pred = np.asarray(products_pred)

    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    correct_hazard_mask = hazards_pred == hazards_true
    if correct_hazard_mask.any():
        f1_products = f1_score(
            products_true[correct_hazard_mask],
            products_pred[correct_hazard_mask],
            average='macro'
        )
    else:
        f1_products = 0.0

    # Plain floats keep the result directly JSON-serialisable.
    f1_hazards = float(f1_hazards)
    f1_products = float(f1_products)
    return {
        'f1_hazards': f1_hazards,
        'f1_products': f1_products,
        'final_score': (f1_hazards + f1_products) / 2
    }

# 8. Results
print("\n7. Evaluation...")

# Validation results
valid_scores = compute_food_hazard_score(
    y_valid_hazard, y_valid_product,
    hazard_pred_valid, product_pred_valid
)

# Test results
test_scores = compute_food_hazard_score(
    y_test_hazard, y_test_product,
    hazard_pred_test, product_pred_test
)

for split_title, split_scores in (("VALIDATION", valid_scores), ("TEST", test_scores)):
    print(f"\n=== {split_title} RESULTS ===")
    print(f"Hazard F1: {split_scores['f1_hazards']:.4f}")
    print(f"Product F1: {split_scores['f1_products']:.4f}")
    print(f"Final Score: {split_scores['final_score']:.4f}")

# 9. Comparison with baselines
print("\n8. Baseline comparison...")

# Majority classifier: always predicts the most frequent training label.
from sklearn.dummy import DummyClassifier
dummy_hazard = DummyClassifier(strategy='most_frequent')
dummy_product = DummyClassifier(strategy='most_frequent')

dummy_hazard.fit(X_train, y_train_hazard)
dummy_product.fit(X_train, y_train_product)

dummy_hazard_pred = dummy_hazard.predict(X_test)
dummy_product_pred = dummy_product.predict(X_test)

dummy_scores = compute_food_hazard_score(
    y_test_hazard, y_test_product,
    dummy_hazard_pred, dummy_product_pred
)

improvement = test_scores['final_score'] - dummy_scores['final_score']
print(f"Majority Baseline: {dummy_scores['final_score']:.4f}")
print(f"Our Model: {test_scores['final_score']:.4f}")
# ':+' prints the sign itself, so a regression shows as "-0.0123" rather than
# the malformed "+-0.0123" produced by a hard-coded '+'.
print(f"Improvement: {improvement:+.4f}")

# 10. Competition comparison
# Reference scores are hard-coded in this notebook; they are not fetched.
print(f"\n=== COMPETITION COMPARISON ({task_name}) ===")
if CONFIG['st1_task']:
    competition_best, competition_bert, best_team = 0.8223, 0.667, "Anastasia"
else:
    competition_best, competition_bert, best_team = 0.5473, 0.498, "SRCB"
print(f"Competition Best ({best_team}): {competition_best:.4f}")
print(f"Competition BERT Baseline: {competition_bert:.4f}")

print(f"Your Result: {test_scores['final_score']:.4f}")

# The comparison itself is identical for both sub-tasks.
if test_scores['final_score'] > competition_bert:
    print(f"🎉 You beat the BERT baseline by {test_scores['final_score'] - competition_bert:.4f}!")
gap_to_best = competition_best - test_scores['final_score']
print(f"Gap to best: {gap_to_best:.4f}")

# 11. Save results
results_summary = {
    'task': task_name,
    'method': 'TF-IDF + LogReg with Class Weights',
    'config': CONFIG,
    'data_stats': {
        'train_size': len(train),
        'valid_size': len(valid),
        'test_size': len(test),
        'hazard_classes': len(hazard_counts),
        'product_classes': len(product_counts),
        'hazard_imbalance': float(imbalance_ratio_h),
        'product_imbalance': float(imbalance_ratio_p)
    },
    'results': {
        'validation': valid_scores,
        'test': test_scores,
        'baseline': dummy_scores,
        'improvement': float(improvement)
    },
    'competition_comparison': {
        'competition_best': float(competition_best),
        'competition_bert': float(competition_bert),
        'our_result': float(test_scores['final_score']),
        'gap_to_best': float(gap_to_best)
    }
}

filename = f'simple_results_{task_name.lower()}.json'
# Explicit encoding keeps the file identical across platforms; default=str is
# a safety net for any value json cannot serialise natively.
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2, default=str)

print(f"\n✅ Results saved to {filename}")

print("\n=== EXPERIMENT COMPLETED ===")
print("Method: TF-IDF + Logistic Regression")
print(f"Final {task_name} Score: {test_scores['final_score']:.4f}")

# Quick suggestions for improvement
print("\n💡 NEXT STEPS FOR BETTER RESULTS:")
print("1. Try different TF-IDF parameters (max_features, ngram_range)")
print("2. Add more feature engineering (text length, country, date)")
print("3. Try ensemble methods (combine multiple models)")
print("4. If BERT works, try: 'test_bert_loading': True")
print("5. For ST2 task, set: 'st1_task': False")

=== SIMPLE FOOD HAZARD DETECTION - COMPATIBILITY MODE ===
PyTorch version: 2.2.2
Task: ST2
Method: TF-IDF + LogReg

1. Loading data...
✅ Data loaded - Train: 5082, Valid: 565, Test: 997
Target columns: hazard, product

2. Data analysis...
Hazard classes: 128
Product classes: 1022
Most common hazard: listeria monocytogenes (665 samples)
Most common product: ice cream (185 samples)
Hazard imbalance ratio: 221.7x
Product imbalance ratio: 185.0x

3. Text preparation...
✅ Texts prepared - Average length: 169.4 words

5. Training TF-IDF + LogReg model...
  Creating TF-IDF features...
  ✅ TF-IDF shape: (5082, 10000)
  Class weights computed - Hazard: 128, Product: 1022
  Training hazard classifier...
  Training product classifier...
  ✅ Models trained

6. Making predictions...

7. Evaluation...

=== VALIDATION RESULTS ===
Hazard F1: 0.3874
Product F1: 0.0797
Final Score: 0.2336

=== TEST RESULTS ===
Hazard F1: 0.4241
Product F1: 0.0850
Final Score: 0.2546

8. Baseline comparison...
Majority B