In [None]:
# %pip install numpy pandas transformers scikit-learn hf_xet 'accelerate>=0.26.0' datasets
# %pip install --upgrade transformers

In [None]:
import sys
import os

# Despite its name, script_dir is the PARENT of the current working directory
# (dirname of the absolute cwd). It is appended to sys.path, presumably so the
# wildcard `utils` import that follows can be resolved from there.
script_dir = os.path.dirname(os.path.abspath(os.getcwd()))
# One level above script_dir; not referenced again in the visible cells.
parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir))
sys.path.append(script_dir)
from utils import *

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import warnings

from xgboost import XGBClassifier

import json

warnings.filterwarnings('ignore')

In [None]:
# Settings for the "enhanced TF-IDF + engineered features" experiment.
CONFIG = dict(
    st1_task=True,             # Change to False for ST2
    enhanced_tfidf=True,
    feature_engineering=True,
    max_features=15000,        # Increased from 10000
    ngram_range=(1, 3),        # Added trigrams
    min_df=1,                  # More inclusive
    max_df=0.9,                # Less restrictive
    random_state=42,
)

print("\n1. Loading data...")

DATA_PATH = "https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/data/"

# Download the three CSV splits in train -> valid -> test order.
# DATA_PATH already ends in "/", so os.path.join just appends the file name.
train, valid, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in (
        "incidents_train.csv?raw=true",
        "incidents_valid.csv?raw=true",
        "incidents_test.csv?raw=true",
    )
)

In [4]:
# Label of the active sub-task; reused in the banner and in the results file name.
if CONFIG['st1_task']:
    task_name = "ST1"
else:
    task_name = "ST2"

# Run banner: task, enabled options and the row count of every loaded split.
print(
    "\n".join(
        (
            "=== ENHANCED FOOD HAZARD DETECTION - FIXED ===",
            f"Task: {task_name}",
            f"Enhanced TF-IDF: {CONFIG['enhanced_tfidf']}",
            f"Feature Engineering: {CONFIG['feature_engineering']}",
            f"Dataset sizes - Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}",
        )
    )
)

# Task selection: ST1 reads the '*-category' label columns, ST2 the plain ones.
hazard_col, product_col = (
    ('hazard-category', 'product-category')
    if CONFIG['st1_task']
    else ('hazard', 'product')
)

# 2. Enhanced Text Preprocessing
print("\n2. Enhanced text preprocessing...")

# Pass each split through enhanced_text_preparation (presumably provided by the
# wildcard utils import) in train -> valid -> test order, rebinding the same names.
# NOTE(review): the input frames are overwritten, so re-running this cell feeds
# already-processed frames back into the helper.
train, valid, test = (
    enhanced_text_preparation(split_frame) for split_frame in (train, valid, test)
)

print("Enhanced text preprocessing completed")

# 3. Safe Feature Engineering (no problematic features)
print("\n3. Safe feature engineering...")

if CONFIG['feature_engineering']:
    print("  Creating safe engineered features...")
    # One engineered-feature frame per split, all built by the same helper.
    train_features, valid_features, test_features = map(
        create_safe_features, (train, valid, test)
    )

    # Summarise the training frame: column count and the first ten column names.
    n_engineered = train_features.shape[1]
    leading_names = list(train_features.columns[:10])
    print(f"Created {n_engineered} safe engineered features")
    print(f"Feature names: {leading_names}...")

# 4. Enhanced TF-IDF
print("\n4. Enhanced TF-IDF vectorization...")

# Collect the vectorizer settings in one dict so the summary line printed below
# always describes the vectorizer that is actually built.
if CONFIG['enhanced_tfidf']:
    tfidf_params = {
        'max_features': CONFIG['max_features'],
        'ngram_range': CONFIG['ngram_range'],
        'min_df': CONFIG['min_df'],
        'max_df': CONFIG['max_df'],
        'stop_words': 'english',
        'sublinear_tf': True,
        'norm': 'l2',
        'smooth_idf': True,
    }
else:
    # Fallback settings used when the enhanced configuration is switched off.
    tfidf_params = {
        'max_features': 5000,
        'ngram_range': (1, 2),
        'min_df': 2,
        'max_df': 0.95,
        'stop_words': 'english',
    }

vectorizer = TfidfVectorizer(**tfidf_params)

# Report the effective settings. Printing the CONFIG values unconditionally would
# be wrong whenever the fallback branch is active.
print(f"  TF-IDF config: max_features={tfidf_params['max_features']}, ngrams={tfidf_params['ngram_range']}")

# Create TF-IDF features: fit on the training split only, then apply the fitted
# vectorizer to validation and test.
X_train_tfidf = vectorizer.fit_transform(train['combined_text'])
X_valid_tfidf = vectorizer.transform(valid['combined_text'])
X_test_tfidf = vectorizer.transform(test['combined_text'])

print(f"TF-IDF shape: {X_train_tfidf.shape}")

# 5. Safe Feature Combination
print("\n5. Combining features safely...")

if CONFIG['feature_engineering']:
    # Keep only the engineered columns that are present in all three splits.
    shared_train_valid = train_features.columns.intersection(valid_features.columns)
    common_columns = shared_train_valid.intersection(test_features.columns)

    train_features_safe, valid_features_safe, test_features_safe = (
        feature_frame[common_columns]
        for feature_frame in (train_features, valid_features, test_features)
    )

    print(f"  Using {len(common_columns)} common engineered features")

    # Scale engineered features: statistics come from the training split only.
    scaler = StandardScaler()
    train_features_scaled = scaler.fit_transform(train_features_safe)
    valid_features_scaled, test_features_scaled = (
        scaler.transform(feature_frame)
        for feature_frame in (valid_features_safe, test_features_safe)
    )

    # Combine TF-IDF + engineered features column-wise, one matrix per split.
    X_train = hstack((X_train_tfidf, train_features_scaled))
    X_valid = hstack((X_valid_tfidf, valid_features_scaled))
    X_test = hstack((X_test_tfidf, test_features_scaled))

    n_tfidf_columns = X_train_tfidf.shape[1]
    print(f"Combined features shape: {X_train.shape}")
    print(f"TF-IDF: {n_tfidf_columns}, Engineered: {len(common_columns)}")
else:
    # Without engineered features the TF-IDF matrices are used unchanged.
    X_train, X_valid, X_test = X_train_tfidf, X_valid_tfidf, X_test_tfidf

# 6. Prepare Labels
print("\n6. Preparing labels...")

# Pull the hazard and product label arrays out of every split (train, valid, test).
y_train_hazard, y_valid_hazard, y_test_hazard = (
    split_frame[hazard_col].values for split_frame in (train, valid, test)
)
y_train_product, y_valid_product, y_test_product = (
    split_frame[product_col].values for split_frame in (train, valid, test)
)

# Show class distribution
hazard_counts = pd.Series(y_train_hazard).value_counts()
product_counts = pd.Series(y_train_product).value_counts()

# Ratio of the first to the last value_counts() entry, i.e. most vs. least
# frequent class under pandas' default descending sort.
hazard_imbalance = hazard_counts.iloc[0] / hazard_counts.iloc[-1]
product_imbalance = product_counts.iloc[0] / product_counts.iloc[-1]

print(f"  Hazard classes: {len(hazard_counts)} (imbalance: {hazard_imbalance:.1f}x)")
print(f"  Product classes: {len(product_counts)} (imbalance: {product_imbalance:.1f}x)")

# 7. Enhanced Model Training
print("\n7. Training enhanced models...")

# Class weights: 'balanced' weights per label, computed from the training labels.
hazard_classes = np.unique(y_train_hazard)
product_classes = np.unique(y_train_product)

hazard_weights = compute_class_weight('balanced', classes=hazard_classes, y=y_train_hazard)
product_weights = compute_class_weight('balanced', classes=product_classes, y=y_train_product)

hazard_weight_dict = {label: weight for label, weight in zip(hazard_classes, hazard_weights)}
product_weight_dict = {label: weight for label, weight in zip(product_classes, product_weights)}

# Settings shared by both classifiers; only class_weight differs between them.
# NOTE(review): recent scikit-learn releases reportedly deprecate solver='liblinear'
# for problems with more than two classes - confirm against the installed version.
shared_logreg_kwargs = dict(
    max_iter=2000,
    C=1.0,
    solver='liblinear',
    random_state=CONFIG['random_state'],
)

# Enhanced Logistic Regression models
print("  Training enhanced hazard classifier...")
hazard_model = LogisticRegression(class_weight=hazard_weight_dict, **shared_logreg_kwargs)
hazard_model.fit(X_train, y_train_hazard)

print("  Training enhanced product classifier...")
product_model = LogisticRegression(class_weight=product_weight_dict, **shared_logreg_kwargs)
product_model.fit(X_train, y_train_product)

print("Enhanced models trained")

# 8. Predictions
print("\n8. Making predictions...")

# Hazard and product predictions for the validation split, then for the test split.
hazard_pred_valid, product_pred_valid = (
    classifier.predict(X_valid) for classifier in (hazard_model, product_model)
)
hazard_pred_test, product_pred_test = (
    classifier.predict(X_test) for classifier in (hazard_model, product_model)
)

print("\n9. Results evaluation...")

# Validation results
valid_scores = compute_food_hazard_score(
    y_valid_hazard, y_valid_product, hazard_pred_valid, product_pred_valid
)

# Test results
test_scores = compute_food_hazard_score(
    y_test_hazard, y_test_product, hazard_pred_test, product_pred_test
)

# Each score mapping is read through its 'f1_hazards', 'f1_products' and
# 'final_score' entries.
print("\n=== ENHANCED RESULTS ===")
for split_title, split_scores in (("Validation", valid_scores), ("Test", test_scores)):
    print(f"\n{split_title} Results:")
    print(f"Hazard F1: {split_scores['f1_hazards']:.4f}")
    print(f"Product F1: {split_scores['f1_products']:.4f}")
    print(f"Final Score: {split_scores['final_score']:.4f}")

# 10. Comparison with Previous Results
print("\n=== IMPROVEMENT ANALYSIS ===")

# Reference scores per task: previous baseline run, competition BERT baseline,
# competition best.
previous_score, competition_bert, competition_best = (
    (0.5978, 0.667, 0.8223) if CONFIG['st1_task'] else (0.2546, 0.498, 0.5473)
)

enhanced_final_score = test_scores['final_score']
improvement = enhanced_final_score - previous_score
print(f"Previous baseline: {previous_score:.4f}")
print(f"Enhanced model: {enhanced_final_score:.4f}")
print(f"Improvement: {improvement:+.4f}")

if improvement > 0:
    print(f"{improvement:.4f} improvement achieved!")
    if improvement > 0.05:
        print("SIGNIFICANT improvement!")
else:
    # Also reached when the improvement is exactly zero.
    print(f"{abs(improvement):.4f} decrease")

print(f"\nCompetition Comparison ({task_name}):")
print(f"Competition BERT baseline: {competition_bert:.4f}")
print(f"Competition best: {competition_best:.4f}")
print(f"Your enhanced result: {enhanced_final_score:.4f}")

if enhanced_final_score > competition_bert:
    print(f"You beat the BERT baseline by {enhanced_final_score - competition_bert:.4f}!")
else:
    gap = competition_bert - enhanced_final_score
    print(f"Gap to BERT baseline: {gap:.4f}")

gap_to_best = competition_best - enhanced_final_score
print(f"Gap to best result: {gap_to_best:.4f}")

# 11. Feature Analysis
print("\n=== FEATURE ANALYSIS ===")

if CONFIG['feature_engineering']:
    print("Enhanced features used:")
    feature_list = train_features_safe.columns.tolist()
    # Show first 15 names, then a single line summarising the remainder.
    for position, feature_name in enumerate(feature_list[:15], start=1):
        print(f"  {position:2d}. {feature_name}")
    if len(feature_list) > 15:
        print(f"... and {len(feature_list)-15} more features")

# Quick feature effectiveness test
print("\nFeature contribution analysis:")
print(f"Total features: {X_train.shape[1]}")
print(f"TF-IDF features: {X_train_tfidf.shape[1]}")
if CONFIG['feature_engineering']:
    print(f"Engineered features: {len(common_columns)}")
    # NOTE(review): `improvement` is the delta against the previous baseline as a
    # whole, not an isolated measurement of the engineered features.
    print(f"Feature engineering impact: {improvement:.4f}")

# 12. Save Results
# Assemble the JSON payload section by section, then write it to disk.
final_test_score = test_scores['final_score']

improvements_section = {
    'enhanced_tfidf': CONFIG['enhanced_tfidf'],
    'feature_engineering': CONFIG['feature_engineering'],
    'total_features': X_train.shape[1],
    'tfidf_features': X_train_tfidf.shape[1],
    'engineered_features': len(common_columns) if CONFIG['feature_engineering'] else 0,
}

results_section = {
    'validation': valid_scores,
    'test': test_scores,
    'improvement_over_baseline': float(improvement),
    'previous_baseline': float(previous_score),
}

competition_section = {
    'competition_bert': float(competition_bert),
    'competition_best': float(competition_best),
    'beats_bert_baseline': bool(final_test_score > competition_bert),
    'gap_to_best': float(gap_to_best),
}

results_summary = {
    'task': task_name,
    'method': 'Enhanced TF-IDF + LogReg + Safe Feature Engineering',
    'config': CONFIG,
    'improvements': improvements_section,
    'results': results_section,
    'competition_comparison': competition_section,
}

filename = f"enhanced_safe_results_{task_name.lower()}.json"

# default=str stringifies any value the json module cannot encode natively.
with open(filename, 'w') as results_file:
    json.dump(results_summary, results_file, indent=2, default=str)

print(f"\nEnhanced results saved to {filename}")

print("\n=== EXPERIMENT COMPLETED ===")
print(f"Enhanced {task_name} Score: {final_test_score:.4f}")
print(f"Improvement: {improvement:+.4f}")

# Actionable next steps: pick the advice text from the size of the improvement.
print("\nNEXT STEPS BASED ON RESULTS:")
if improvement > 0.05:
    next_step_lines = (
        "EXCELLENT improvement! Ready for:",
        "1. XGBoost ensemble (combine with LogReg)",
        "2. Try ST2 with same approach",
        "3. Advanced feature engineering",
    )
elif improvement > 0:
    next_step_lines = (
        "Good improvement! Try:",
        "1. XGBoost model",
        "2. Different TF-IDF parameters",
        "3. Ensemble methods",
    )
else:
    next_step_lines = (
        "No improvement. Debug options:",
        "1. Reduce complexity (fewer features)",
        "2. Different ngram_range (1,2)",
        "3. Traditional parameters",
    )
print("\n".join(next_step_lines))

# Performance summary: one status line chosen from task-specific score thresholds.
final_test_score = test_scores['final_score']
if CONFIG['st1_task']:
    if final_test_score > 0.67:
        status_line = "\nSTATUS: BERT baseline beaten! Ready for advanced techniques."
    elif final_test_score > 0.63:
        status_line = "\nSTATUS: Close to BERT baseline. One more improvement should do it."
    else:
        status_line = "\nSTATUS: Need more work to reach BERT baseline."
else:
    if final_test_score > 0.35:
        status_line = "\nSTATUS: Good progress on difficult ST2 task."
    else:
        status_line = "\nSTATUS: ST2 still challenging, try data augmentation."
print(status_line)

=== ENHANCED FOOD HAZARD DETECTION - FIXED ===
Task: ST1
Enhanced TF-IDF: True
Feature Engineering: True

1. Loading data...
Dataset sizes - Train: 5082, Valid: 565, Test: 997

2. Enhanced text preprocessing...
✅ Enhanced text preprocessing completed

3. Safe feature engineering...
  Creating safe engineered features...
  ✅ Created 28 safe engineered features
  Feature names: ['text_length', 'word_count', 'title_length', 'title_word_count', 'title_text_ratio', 'title_words_ratio', 'year', 'month', 'day', 'is_summer']...

4. Enhanced TF-IDF vectorization...
  TF-IDF config: max_features=15000, ngrams=(1, 3)
  ✅ TF-IDF shape: (5082, 15000)

5. Combining features safely...
  Using 28 common engineered features
  ✅ Combined features shape: (5082, 15028)
  TF-IDF: 15000, Engineered: 28

6. Preparing labels...
  Hazard classes: 10 (imbalance: 618.0x)
  Product classes: 22 (imbalance: 286.8x)

7. Training enhanced models...
  Training enhanced hazard classifier...
  Training enhanced product 

In [None]:
# Back-to-basics + XGBoost approach: settings for the second experiment.
CONFIG = dict(
    st1_task=True,                    # Change to False for ST2
    models_to_try=['logreg', 'xgb'],  # Which models to test
    ensemble=True,                    # Whether to combine models
    grid_search_tfidf=True,           # Try different TF-IDF params
    random_state=42,
)

In [5]:
# Label of the active sub-task; reused in the banner and in the results file name.
if CONFIG['st1_task']:
    task_name = "ST1"
else:
    task_name = "ST2"

# Run banner: task, requested models and whether they will be combined.
print(
    "\n".join(
        (
            "=== OPTIMIZED SIMPLE MODEL + ENSEMBLE ===",
            f"Task: {task_name}",
            f"Models to try: {CONFIG['models_to_try']}",
            f"Ensemble: {CONFIG['ensemble']}",
        )
    )
)

# Task selection: ST1 reads the '*-category' label columns, ST2 the plain ones.
hazard_col, product_col = (
    ('hazard-category', 'product-category')
    if CONFIG['st1_task']
    else ('hazard', 'product')
)

print(f"Dataset sizes - Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}")

# 2. Simple but effective text preprocessing
print("\n2. Simple text preprocessing...")

# Pass each split through simple_but_effective_preprocessing (presumably from the
# wildcard utils import) in train -> valid -> test order, rebinding the same names.
# NOTE(review): the frames are overwritten in place of the earlier versions, so
# the input here is whatever previous cells left in train/valid/test.
train, valid, test = (
    simple_but_effective_preprocessing(split_frame)
    for split_frame in (train, valid, test)
)

print("Simple preprocessing completed")

# 3. TF-IDF Parameter Grid Search
print("\n3. TF-IDF parameter optimization...")

if CONFIG['grid_search_tfidf']:
    # Candidate TF-IDF configurations, each scored by the validation macro-F1 of a
    # quick hazard classifier.
    tfidf_configs = [
        {'name': 'original', 'max_features': 10000, 'ngram_range': (1, 2), 'min_df': 2, 'max_df': 0.95},
        {'name': 'more_features', 'max_features': 15000, 'ngram_range': (1, 2), 'min_df': 2, 'max_df': 0.95},
        {'name': 'trigrams', 'max_features': 10000, 'ngram_range': (1, 3), 'min_df': 2, 'max_df': 0.95},
        {'name': 'less_restrictive', 'max_features': 10000, 'ngram_range': (1, 2), 'min_df': 1, 'max_df': 0.9},
        {'name': 'balanced', 'max_features': 12000, 'ngram_range': (1, 2), 'min_df': 1, 'max_df': 0.92}
    ]

    best_config = None
    # Start below every attainable F1 so the first candidate is always accepted.
    # Starting at 0 with a strict ">" would leave best_config as None if every
    # candidate scored exactly 0.0, and the lookups below would then fail.
    best_score = float('-inf')
    tfidf_results = []

    # Labels for quick validation
    y_train_hazard = train[hazard_col].values
    y_valid_hazard = valid[hazard_col].values
    y_train_product = train[product_col].values
    y_valid_product = valid[product_col].values

    # The balanced class weights depend only on the training labels, so they are
    # computed once here instead of once per candidate configuration.
    hazard_classes = np.unique(y_train_hazard)
    hazard_weights = compute_class_weight('balanced', classes=hazard_classes, y=y_train_hazard)
    hazard_weight_dict = dict(zip(hazard_classes, hazard_weights))

    print("Testing TF-IDF configurations...")

    for config in tfidf_configs:
        print(f"Testing {config['name']}...")

        # Create vectorizer
        vectorizer = TfidfVectorizer(
            max_features=config['max_features'],
            ngram_range=config['ngram_range'],
            min_df=config['min_df'],
            max_df=config['max_df'],
            stop_words='english'
        )

        # Fit on train only, then transform validation with the fitted vocabulary
        X_train_temp = vectorizer.fit_transform(train['combined_text'])
        X_valid_temp = vectorizer.transform(valid['combined_text'])

        # Quick LogReg test
        quick_model = LogisticRegression(
            class_weight=hazard_weight_dict,
            max_iter=1000,
            random_state=CONFIG['random_state']
        )
        quick_model.fit(X_train_temp, y_train_hazard)

        # Quick evaluation
        hazard_pred_temp = quick_model.predict(X_valid_temp)
        quick_f1 = f1_score(y_valid_hazard, hazard_pred_temp, average='macro')

        tfidf_results.append({
            'config': config['name'],
            'f1_hazard': quick_f1,
            'params': config
        })

        print(f"      {config['name']}: Hazard F1 = {quick_f1:.4f}")

        if quick_f1 > best_score:
            best_score = quick_f1
            best_config = config

    print(f"Best TF-IDF config: {best_config['name']} (F1: {best_score:.4f})")

else:
    # Use original configuration; 'name' is included so both branches produce a
    # best_config with the same keys.
    best_config = {'name': 'original', 'max_features': 10000, 'ngram_range': (1, 2), 'min_df': 2, 'max_df': 0.95}

# 4. Create final TF-IDF features
print("\n4. Creating final TF-IDF features...")

# Rebuild the vectorizer from the selected configuration's four tunable settings.
tuned_param_names = ('max_features', 'ngram_range', 'min_df', 'max_df')
vectorizer = TfidfVectorizer(
    stop_words='english',
    **{param_name: best_config[param_name] for param_name in tuned_param_names},
)

# Fit on the training split only; validation and test reuse the fitted vectorizer.
X_train = vectorizer.fit_transform(train['combined_text'])
X_valid, X_test = (
    vectorizer.transform(split_frame['combined_text']) for split_frame in (valid, test)
)

print(f"Final TF-IDF shape: {X_train.shape}")

# 5. Prepare labels: hazard and product label arrays for every split.
y_train_hazard, y_valid_hazard, y_test_hazard = (
    split_frame[hazard_col].values for split_frame in (train, valid, test)
)
y_train_product, y_valid_product, y_test_product = (
    split_frame[product_col].values for split_frame in (train, valid, test)
)

# 6. Train Multiple Models
print("\n5. Training multiple models...")

# Registries filled by the per-model sections: estimators and their predictions.
models, predictions = {}, {}

# Class weights: 'balanced' weights per label, computed from the training labels.
hazard_classes, product_classes = np.unique(y_train_hazard), np.unique(y_train_product)

hazard_weights = compute_class_weight('balanced', classes=hazard_classes, y=y_train_hazard)
product_weights = compute_class_weight('balanced', classes=product_classes, y=y_train_product)

hazard_weight_dict = {label: weight for label, weight in zip(hazard_classes, hazard_weights)}
product_weight_dict = {label: weight for label, weight in zip(product_classes, product_weights)}

# Logistic Regression
if 'logreg' in CONFIG['models_to_try']:
    print("  Training Logistic Regression...")

    # Both classifiers share these settings; only class_weight differs.
    logreg_kwargs = dict(max_iter=1000, C=1.0, random_state=42)
    models['logreg_hazard'] = LogisticRegression(class_weight=hazard_weight_dict, **logreg_kwargs)
    models['logreg_product'] = LogisticRegression(class_weight=product_weight_dict, **logreg_kwargs)

    logreg_hazard_clf = models['logreg_hazard']
    logreg_product_clf = models['logreg_product']

    logreg_hazard_clf.fit(X_train, y_train_hazard)
    logreg_product_clf.fit(X_train, y_train_product)

    # Store predictions keyed as '<target>_<split>' for evaluation and ensembling.
    predictions['logreg'] = dict(
        hazard_valid=logreg_hazard_clf.predict(X_valid),
        product_valid=logreg_product_clf.predict(X_valid),
        hazard_test=logreg_hazard_clf.predict(X_test),
        product_test=logreg_product_clf.predict(X_test),
    )

# XGBoost
if 'xgb' in CONFIG['models_to_try']:
    print("  Training XGBoost...")

    # XGBoost with class weights (supplied per sample at fit time, see below)
    xgb_kwargs = dict(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1
    )
    models['xgb_hazard'] = XGBClassifier(**xgb_kwargs)
    models['xgb_product'] = XGBClassifier(**xgb_kwargs)

    # Convert sparse matrix to dense for XGBoost
    X_train_dense = X_train.toarray()
    X_valid_dense = X_valid.toarray()
    X_test_dense = X_test.toarray()

    # Calculate sample weights for XGBoost: each row gets its class's balanced weight
    hazard_sample_weights = np.array([hazard_weight_dict[y] for y in y_train_hazard])
    product_sample_weights = np.array([product_weight_dict[y] for y in y_train_product])

    # XGBClassifier expects integer class ids 0..n_classes-1 and rejects raw string
    # labels. np.unique(..., return_inverse=True) yields the sorted label names and,
    # for every training row, the index of its label in that array.
    hazard_label_names, y_train_hazard_ids = np.unique(y_train_hazard, return_inverse=True)
    product_label_names, y_train_product_ids = np.unique(y_train_product, return_inverse=True)

    models['xgb_hazard'].fit(X_train_dense, y_train_hazard_ids, sample_weight=hazard_sample_weights)
    models['xgb_product'].fit(X_train_dense, y_train_product_ids, sample_weight=product_sample_weights)

    def _decode_xgb_predictions(model, features, label_names):
        """Predict integer class ids and map them back to the original labels."""
        class_ids = np.asarray(model.predict(features)).astype(int)
        return label_names[class_ids]

    # Predictions are stored as original label values so they can be scored and
    # ensembled together with the other models' outputs.
    predictions['xgb'] = {
        'hazard_valid': _decode_xgb_predictions(models['xgb_hazard'], X_valid_dense, hazard_label_names),
        'product_valid': _decode_xgb_predictions(models['xgb_product'], X_valid_dense, product_label_names),
        'hazard_test': _decode_xgb_predictions(models['xgb_hazard'], X_test_dense, hazard_label_names),
        'product_test': _decode_xgb_predictions(models['xgb_product'], X_test_dense, product_label_names)
    }

print("All models trained")

# 7. Individual Model Evaluation
print("\n6. Individual model evaluation...")

# Per-model validation and test score mappings, keyed by model name.
model_results = {}

for model_name, model_preds in predictions.items():
    print(f"\n  {model_name.upper()} Results:")

    # Validation
    valid_scores = compute_food_hazard_score(
        y_valid_hazard, y_valid_product,
        model_preds['hazard_valid'], model_preds['product_valid'],
    )

    # Test
    test_scores = compute_food_hazard_score(
        y_test_hazard, y_test_product,
        model_preds['hazard_test'], model_preds['product_test'],
    )

    model_results[model_name] = dict(validation=valid_scores, test=test_scores)

    # One summary line per split: final score, then hazard (H) and product (P) F1.
    for split_label, split_scores in (("Validation", valid_scores), ("Test", test_scores)):
        print(f"{split_label}: {split_scores['final_score']:.4f} (H: {split_scores['f1_hazards']:.4f}, P: {split_scores['f1_products']:.4f})")

# 8. Ensemble if requested
if CONFIG['ensemble'] and len(predictions) > 1:
    print("\n7. Ensemble combination...")

    from collections import Counter

    def _majority_vote(votes):
        """Return the most frequent label in `votes`; ties go to the smallest label.

        Pure-Python replacement for scipy.stats.mode, which does not accept
        non-numeric (e.g. string) labels in recent SciPy releases. The tie-break
        matches mode's "smallest of the most common values" convention.
        """
        tally = Counter(votes)
        top_count = max(tally.values())
        return min(label for label, count in tally.items() if count == top_count)

    def _vote_per_sample(prediction_key):
        """Majority-vote one prediction set (e.g. 'hazard_valid') across all models."""
        per_model = [predictions[name][prediction_key] for name in predictions]
        return [_majority_vote(sample_votes) for sample_votes in zip(*per_model)]

    # Simple majority voting.
    # NOTE(review): with exactly two models every disagreement is a tie, which the
    # tie-break resolves to the smaller label rather than to the better model.
    ensemble_hazard_valid = _vote_per_sample('hazard_valid')
    ensemble_product_valid = _vote_per_sample('product_valid')
    ensemble_hazard_test = _vote_per_sample('hazard_test')
    ensemble_product_test = _vote_per_sample('product_test')

    # Evaluate ensemble
    ensemble_valid = compute_food_hazard_score(
        y_valid_hazard, y_valid_product,
        np.array(ensemble_hazard_valid), np.array(ensemble_product_valid)
    )

    ensemble_test = compute_food_hazard_score(
        y_test_hazard, y_test_product,
        np.array(ensemble_hazard_test), np.array(ensemble_product_test)
    )

    model_results['ensemble'] = {
        'validation': ensemble_valid,
        'test': ensemble_test
    }

    print(f"ENSEMBLE Results:")
    print(f"Validation: {ensemble_valid['final_score']:.4f} (H: {ensemble_valid['f1_hazards']:.4f}, P: {ensemble_valid['f1_products']:.4f})")
    print(f"Test: {ensemble_test['final_score']:.4f} (H: {ensemble_test['f1_hazards']:.4f}, P: {ensemble_test['f1_products']:.4f})")

# 9. Best Model Selection and Final Analysis
print(f"\n=== FINAL RESULTS COMPARISON ===")

# Find best model: choose on the VALIDATION score, then report that model's test
# score. Choosing on the test score itself would make the reported number
# optimistically biased, because the test set would have been used for selection.
best_model = max(model_results.keys(), key=lambda x: model_results[x]['validation']['final_score'])
best_score = model_results[best_model]['test']['final_score']

print(f"\nModel Performance Summary:")
for model_name, results in model_results.items():
    test_score = results['test']['final_score']
    print(f"  {model_name:15s}: {test_score:.4f}")

print(f"\nBEST MODEL: {best_model.upper()}")
print(f"Best Test Score: {best_score:.4f}")

# Comparison with previous results: the baseline is task-specific, using the same
# reference values as the first experiment (0.5978 for ST1, 0.2546 for ST2).
previous_baseline = 0.5978 if CONFIG['st1_task'] else 0.2546  # Original TF-IDF result
improvement = best_score - previous_baseline

print(f"\n=== IMPROVEMENT ANALYSIS ===")
print(f"Previous baseline: {previous_baseline:.4f}")
print(f"Best new model: {best_score:.4f}")
print(f"Improvement: {improvement:+.4f}")

if improvement > 0:
    print(f"{improvement:.4f} improvement achieved!")
    if improvement > 0.05:
        print(f"SIGNIFICANT improvement!")
else:
    # Also reached when the improvement is exactly zero.
    print(f"{abs(improvement):.4f} decrease")

# Competition comparison: published reference scores per task
if CONFIG['st1_task']:
    competition_bert = 0.667
    competition_best = 0.8223
else:
    competition_bert = 0.498
    competition_best = 0.5473

print(f"\nCompetition Comparison ({task_name}):")
print(f"Competition BERT baseline: {competition_bert:.4f}")
print(f"Competition best: {competition_best:.4f}")
print(f"Your best result: {best_score:.4f}")

if best_score > competition_bert:
    print(f"You beat the BERT baseline by {best_score - competition_bert:.4f}!")
else:
    gap = competition_bert - best_score
    print(f"Gap to BERT baseline: {gap:.4f}")

# 10. Save Results
# Assemble the JSON payload step by step, then write it to disk.
if CONFIG['grid_search_tfidf']:
    tfidf_optimization_log = tfidf_results
else:
    tfidf_optimization_log = None

competition_section = {
    'beats_bert_baseline': bool(best_score > competition_bert),
    'gap_to_best': float(competition_best - best_score),
}

results_summary = {
    'task': task_name,
    'method': 'Optimized Simple + Ensemble',
    'config': CONFIG,
    'tfidf_optimization': tfidf_optimization_log,
    'best_tfidf_config': best_config,
    'model_results': model_results,
    'best_model': best_model,
    'best_score': float(best_score),
    'improvement_over_baseline': float(improvement),
    'competition_comparison': competition_section,
}

filename = f'optimized_results_{task_name.lower()}.json'
# default=str stringifies any value the json module cannot encode natively.
with open(filename, 'w') as results_file:
    json.dump(results_summary, results_file, indent=2, default=str)

print(f"\nResults saved to {filename}")

print("\n=== EXPERIMENT COMPLETED ===")
print(f"Best {task_name} Score: {best_score:.4f}")
print(f"Best Model: {best_model}")

# Next steps recommendation: advice text depends on whether the BERT baseline was
# beaten and, failing that, on the size of the improvement.
print("\nNEXT STEPS:")
if best_score > competition_bert:
    next_step_lines = (
        "EXCELLENT! You beat BERT baseline. Try:",
        "1. ST2 with same approach",
        "2. Data augmentation for even better results",
        "3. More advanced ensemble methods",
    )
elif improvement > 0.03:
    next_step_lines = (
        "Good progress! Try:",
        "1. More XGBoost hyperparameter tuning",
        "2. Different ensemble methods (weighted voting)",
        "3. ST2 application",
    )
else:
    next_step_lines = (
        "Need different approach. Consider:",
        "1. Data augmentation",
        "2. Different text preprocessing",
        "3. Alternative models (Random Forest)",
    )
print("\n".join(next_step_lines))

# Remaining distance to the two competition reference scores; the BERT gap is
# clamped at zero once the baseline has been passed.
bert_distance = max(0, competition_bert - best_score)
best_distance = competition_best - best_score
print("\nCurrent distance to competition:")
print(f"BERT baseline: {bert_distance:.3f} points away")
print(f"Best result: {best_distance:.3f} points away")

ModuleNotFoundError: No module named 'xgboost'