# Model-Specific Top-15 Fusion: Greedy-Based Early Fusion

================================================================================
PURPOSE: Train models using greedy-selected top-15 features per model
================================================================================

**Methodology:**
- For each model×task combination, use greedy-selected top-15 features
- Select the best 2 classifiers per model×task (ranked by dev-set macro F1 from notebook 3)
- Train on Train+Dev combined data (final training)
- **CRITICAL**: Fusion uses only probabilities (soft labels), NO hard predictions — this notebook itself stores just the trained models and their metadata
- Save trained models for use in 5. notebook Type 2 evaluation

**Workflow:**
1. Load the best classifiers from the notebook 3 results (`03_train_evaluate.ipynb` → `all_results_dev.pkl`)
2. Load the greedy-selected features from notebook 3.5 (`03_5_ablation_study.ipynb` → `selected_features_all.json`)
3. For each model×task:
   - Extract top-15 greedy-selected features
   - Select best 2 classifiers (by macro F1)
   - Load Train+Dev features and labels
   - Train on combined Train+Dev data
   - Save trained models (only probabilities, no hard predictions)
4. Save metadata (model paths, selected features, F1 scores)

**Outputs:**
- Trained models: `models/fusion/method2_{model}_{task}_{classifier}.pkl`
- Metadata: `results/method2_trained_models.json` (Drive + GitHub)


In [None]:
# ============================================================================
# SETUP: Repository Clone, Drive Mount, and Path Configuration
# ============================================================================
import shutil
import os
import subprocess
import time
import requests
import zipfile
import sys
import json
import pickle
from pathlib import Path
from google.colab import drive
import numpy as np

# Start from a clean checkout on every run: delete any previous copy of the
# repository, clone it again, then pull the latest commits.
# NOTE(review): `!git clone` clones into the current working directory, which is
# presumably /content on Colab — confirm if this notebook is run elsewhere.
!rm -rf /content/semeval-context-tree-modular
!git clone https://github.com/EonTechie/semeval-context-tree-modular.git
!cd /content/semeval-context-tree-modular && git pull

# Repository configuration
# repo_dir: where the checkout is expected to live
# repo_url: remote used by the subprocess-based `git clone` fallback
# zip_url:  archive of the `main` branch, used when `git clone` is not possible
repo_dir = '/content/semeval-context-tree-modular'
repo_url = 'https://github.com/EonTechie/semeval-context-tree-modular.git'
zip_url = 'https://github.com/EonTechie/semeval-context-tree-modular/archive/refs/heads/main.zip'

# Clone repository (if not already present)
# This only runs when no checkout exists at repo_dir. It retries `git clone`
# through subprocess and, if every attempt fails, downloads the `main` branch
# as a ZIP archive and renames the extracted folder to repo_dir.
if not os.path.exists(repo_dir):
    print("Cloning repository from GitHub...")
    max_retries = 2
    clone_success = False
    last_clone_error = None  # reason of the most recent failed attempt, for reporting

    for attempt in range(max_retries):
        try:
            result = subprocess.run(
                ['git', 'clone', repo_url],
                cwd='/content',
                capture_output=True,
                text=True,
                timeout=60
            )
        except (OSError, subprocess.SubprocessError) as e:
            # git missing, timeout, or another process-level failure
            last_clone_error = str(e)
        else:
            if result.returncode == 0:
                print("Repository cloned successfully via git")
                clone_success = True
                break
            last_clone_error = result.stderr.strip()

        # A failed or timed-out clone can leave a partial checkout behind. Since
        # repo_dir did not exist when this branch was entered, anything there now
        # is such a leftover; remove it so the retry and the ZIP fallback's
        # rename are not blocked by it.
        shutil.rmtree(repo_dir, ignore_errors=True)
        if attempt < max_retries - 1:
            time.sleep(3)

    # Fallback: Download as ZIP if git clone fails
    if not clone_success:
        print("Git clone failed. Downloading repository as ZIP archive...")
        if last_clone_error:
            print(f"  Last git error: {last_clone_error}")
        zip_path = '/tmp/repo.zip'
        try:
            with requests.get(zip_url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall('/content')
            extracted_dir = '/content/semeval-context-tree-modular-main'
            if not os.path.exists(extracted_dir):
                # Without this check a missing folder would be reported as success
                raise FileNotFoundError(
                    f"Expected extracted directory not found: {extracted_dir}"
                )
            os.rename(extracted_dir, repo_dir)
            print("Repository downloaded and extracted successfully")
        except Exception as e:
            raise RuntimeError(f"Failed to obtain repository: {e}") from e
        finally:
            # Remove the temporary archive on both success and failure
            if os.path.exists(zip_path):
                os.remove(zip_path)

# Mount Google Drive (if not already mounted)
# Mounting stays best-effort (this never raises), but a failure is no longer
# silent: if the call errors and the Drive folder is not visible, a warning with
# the underlying error is printed so later "file not found" errors on DATA_PATH
# can be traced back to the mount.
try:
    drive.mount('/content/drive', force_remount=False)
except Exception as e:
    if not os.path.isdir('/content/drive/MyDrive'):
        print(f"⚠ Warning: Google Drive mount failed and /content/drive/MyDrive is not available: {e}")

# Configure paths
# BASE_PATH: the cloned repository; DATA_PATH: the Drive folder handed to
# StorageManager as its data location.
BASE_PATH = Path('/content/semeval-context-tree-modular')
DATA_PATH = Path('/content/drive/MyDrive/semeval_data')

# Add repository to Python path so the `src` package can be imported
if str(BASE_PATH) not in sys.path:
    sys.path.insert(0, str(BASE_PATH))

# Verify imports work
try:
    from src.storage.manager import StorageManager
    from src.models.classifiers import get_classifier_dict, train_classifiers
    from src.features.extraction import get_feature_names
except ImportError as e:
    # Re-raise with path diagnostics, explicitly chained to the original error
    raise ImportError(
        f"Failed to import required modules. "
        f"Repository path: {BASE_PATH}, "
        f"Python path: {sys.path[:3]}, "
        f"Error: {e}"
    ) from e

# Initialize StorageManager
storage = StorageManager(
    base_path=str(BASE_PATH),
    data_path=str(DATA_PATH),
    github_path=str(BASE_PATH)
)

print("Setup complete")
print(f"  Repository: {BASE_PATH}")
print(f"  Data storage: {DATA_PATH}")


# ============================================================================
# CONFIGURE MODELS, TASKS, AND CLASSIFIERS
# ============================================================================


In [None]:
# Models to process
# Each name is looked up in the notebook-3 results, in the greedy-feature JSON
# (as "{model}_{task}") and passed to storage.load_features().
MODELS = ['bert', 'bert_political', 'bert_ambiguity', 'roberta', 'deberta', 'xlnet']

# Tasks to process
TASKS = ['clarity', 'evasion']

# Label lists
# Stored alongside each trained model under the 'label_list' key.
# NOTE(review): confirm these match the label values present in the dataset splits.
CLARITY_LABELS = ['Ambivalent', 'Clear Non-Reply', 'Clear Reply']
EVASION_LABELS = ['Direct Answer', 'Indirect Answer', 'No Answer']

# Random seed (passed to get_classifier_dict and train_classifiers)
RANDOM_STATE = 42

# Number of top classifiers to use per model×task
# NOTE: the selection step only records the best and 2nd-best classifier, so
# values above 2 do not train additional classifiers.
TOP_K_CLASSIFIERS = 2

# Number of greedy-selected features to use (the selected list is truncated to this length)
TOP_N_FEATURES = 15

print(f"Models: {MODELS}")
print(f"Tasks: {TASKS}")
print(f"Top-K Classifiers: {TOP_K_CLASSIFIERS}")
print(f"Top-N Features: {TOP_N_FEATURES}")


# ============================================================================
# STEP 1: LOAD BEST CLASSIFIERS FROM 3. NOTEBOOK RESULTS
# ============================================================================


In [None]:
# Read the per-classifier dev results produced by notebook 3.
# The pickle is expected at: data_path/results/all_results_dev.pkl
expected_path = storage.data_path / 'results/all_results_dev.pkl'
print(f"Looking for all_results_dev.pkl at: {expected_path}")
print(f"File exists: {expected_path.exists()}")

if expected_path.exists():
    # Report the size in MB as a quick sanity check on the file
    file_size = expected_path.stat().st_size / (1024 * 1024)
    print(f"File size: {file_size:.2f} MB")

all_results = storage.load_all_results_dict(filename='all_results_dev.pkl')

# An empty/falsy result is treated as "file missing"
if not all_results:
    raise FileNotFoundError(
        f"all_results_dev.pkl not found at: {expected_path}\n"
        f"Make sure you have run 03_train_evaluate.ipynb first.\n"
        f"The file should be saved to: {storage.data_path / 'results'}"
    )

# Summarise the loaded structure: model -> task -> classifier.
# all_results is guaranteed non-empty here, so the first model always exists.
print(f"\n✓ Loaded all_results_dev.pkl successfully")
print(f"  Number of models: {len(all_results)}")
first_model = next(iter(all_results))
print(f"  First model: {first_model}")
print(f"  Tasks for {first_model}: {list(all_results[first_model].keys())}")
if all_results[first_model]:
    first_task = next(iter(all_results[first_model]))
    print(f"  Classifiers for {first_model}-{first_task}: {list(all_results[first_model][first_task].keys())}")

# Rank classifiers per model×task by dev macro F1 and remember the top two
best_classifiers = {}

for model in MODELS:
    if model not in all_results:
        print(f"⚠ Warning: Model '{model}' not found in results")
        continue

    best_classifiers[model] = {}

    for task in TASKS:
        if task not in all_results[model]:
            print(f"⚠ Warning: Task '{task}' not found for model '{model}'")
            continue

        results = all_results[model][task]

        # (classifier name, macro F1) pairs, highest F1 first; a classifier
        # without a recorded macro F1 is scored as 0.0
        classifier_scores = sorted(
            (
                (clf_name, result.get('metrics', {}).get('macro_f1', 0.0))
                for clf_name, result in results.items()
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Keep only the leading TOP_K_CLASSIFIERS entries
        top_classifiers = classifier_scores[:TOP_K_CLASSIFIERS]
        has_best = len(top_classifiers) > 0
        has_second = len(top_classifiers) > 1

        summary = {'all_scores': classifier_scores}
        if has_best:
            summary['best_clf'], summary['best_f1'] = top_classifiers[0]
        else:
            summary['best_clf'], summary['best_f1'] = None, 0.0
        if has_second:
            summary['2nd_best_clf'], summary['2nd_best_f1'] = top_classifiers[1]
        else:
            summary['2nd_best_clf'], summary['2nd_best_f1'] = None, 0.0
        best_classifiers[model][task] = summary

        print(f"{model} - {task}:")
        if has_best:
            print(f"  Best: {top_classifiers[0][0]} (F1={top_classifiers[0][1]:.4f})")
        else:
            print("  No classifiers found")
        if has_second:
            print(f"  2nd:  {top_classifiers[1][0]} (F1={top_classifiers[1][1]:.4f})")
        else:
            print("  No 2nd classifier")

print(f"\n✓ Loaded best classifiers for {len(best_classifiers)} models")


# ============================================================================
# STEP 2: LOAD GREEDY-SELECTED FEATURES FROM 3.5. NOTEBOOK
# ============================================================================


In [None]:
# Read the greedy forward-selection output written by notebook 3.5
ablation_dir = storage.data_path / 'results/ablation'
greedy_features_path = ablation_dir / 'selected_features_all.json'

if not greedy_features_path.exists():
    raise FileNotFoundError(
        f"Greedy selected features not found: {greedy_features_path}\n"
        f"Make sure you have run 03_5_ablation_study.ipynb (Greedy Forward Selection)"
    )

with greedy_features_path.open('r') as features_file:
    selected_features_dict = json.load(features_file)

# Re-key the flat "{model}_{task}" entries as model -> task -> feature names
greedy_features = {}
for model in MODELS:
    greedy_features[model] = {}
    for task in TASKS:
        key = f"{model}_{task}"
        if key not in selected_features_dict:
            # Missing combinations get an empty list
            print(f"⚠ Warning: No greedy features found for {key}")
            greedy_features[model][task] = []
            continue

        selected_features = selected_features_dict[key].get('selected_features', [])
        # Keep only the first TOP_N_FEATURES entries of the selection
        greedy_features[model][task] = selected_features[:TOP_N_FEATURES]
        print(f"{model} - {task}: {len(greedy_features[model][task])} features")

print(f"\n✓ Loaded greedy-selected features")


# ============================================================================
# STEP 3: GET FEATURE NAME TO INDEX MAPPING
# ============================================================================


In [None]:
# Full feature-name list (the original note says 19 features; the printed total
# below shows the actual count) and a name -> column-index lookup into it
all_feature_names = get_feature_names()
feature_name_to_idx = dict(zip(all_feature_names, range(len(all_feature_names))))

print(f"Total features: {len(all_feature_names)}")
print(f"Feature names: {all_feature_names}")


# ============================================================================
# STEP 4: TRAIN MODELS WITH GREEDY-SELECTED FEATURES
# ============================================================================


In [None]:
# CHECKPOINT: make sure every output location exists before any training starts
models_fusion_dir = storage.data_path / 'models/fusion'
results_dir = storage.data_path / 'results'
github_results_dir = storage.github_path / 'results'

for output_dir in (models_fusion_dir, results_dir, github_results_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

print("Created output directories:")
print(f"  Drive models: {models_fusion_dir}")
print(f"  Drive results: {results_dir}")
print(f"  GitHub results: {github_results_dir}")

# Classifier name -> classifier instance, built with the shared random seed
classifiers = get_classifier_dict(random_state=RANDOM_STATE)

# Metadata collected during training: model -> task -> classifier -> info
results_method2 = {}

# Process each model×task combination
# For every combination that has both a best classifier and greedy-selected
# features: slice the stored feature matrices down to the selected columns,
# stack train+dev, train the best (and, if present, 2nd-best) classifier on the
# combined data, pickle each trained model and record its metadata.
for model in MODELS:
    if model not in best_classifiers:
        continue

    results_method2[model] = {}

    for task in TASKS:
        if task not in best_classifiers[model]:
            continue

        # Get best classifiers for this model×task
        best_info = best_classifiers[model][task]
        best_clf_name = best_info['best_clf']
        second_clf_name = best_info['2nd_best_clf']

        if best_clf_name is None:
            print(f"⚠ Skipping {model} - {task}: No best classifier found")
            continue

        # Get greedy-selected features
        requested_feature_names = greedy_features[model][task]

        if len(requested_feature_names) == 0:
            print(f"⚠ Skipping {model} - {task}: No greedy features found")
            continue

        # Keep names and indices in lockstep: names that are not part of the
        # known feature list are dropped from BOTH, so the saved feature names
        # always describe exactly the columns the model was trained on.
        unknown_feature_names = [
            name for name in requested_feature_names if name not in feature_name_to_idx
        ]
        if unknown_feature_names:
            print(f"⚠ Warning: {model} - {task}: ignoring unknown features {unknown_feature_names}")
        selected_feature_names = [
            name for name in requested_feature_names if name in feature_name_to_idx
        ]
        selected_indices = [feature_name_to_idx[name] for name in selected_feature_names]

        if len(selected_indices) == 0:
            print(f"⚠ Skipping {model} - {task}: No valid feature indices")
            continue

        print(f"\n{'='*80}")
        print(f"MODEL: {model.upper()} | TASK: {task.upper()}")
        print(f"{'='*80}")
        print(f"Selected features ({len(selected_indices)}): {selected_feature_names}")
        print(f"Best classifiers: {best_clf_name}, {second_clf_name if second_clf_name else 'N/A'}")

        # Select appropriate label list and dataset key for this task
        if task == 'clarity':
            label_list = CLARITY_LABELS
            label_key = 'clarity_label'
        else:  # evasion
            label_list = EVASION_LABELS
            label_key = 'evasion_label'

        # Load splits
        train_ds = storage.load_split('train', task=task)
        dev_ds = storage.load_split('dev', task=task)

        # Load features
        X_train_full = storage.load_features(model, task, 'train')
        X_dev_full = storage.load_features(model, task, 'dev')

        # Extract selected features only
        X_train = X_train_full[:, selected_indices]
        X_dev = X_dev_full[:, selected_indices]

        # Extract labels
        y_train = np.array([train_ds[i][label_key] for i in range(len(train_ds))])
        y_dev = np.array([dev_ds[i][label_key] for i in range(len(dev_ds))])

        # Fail early, with context, if stored features and split labels are out of sync
        if X_train.shape[0] != len(y_train) or X_dev.shape[0] != len(y_dev):
            raise ValueError(
                f"Feature/label row mismatch for {model} - {task}: "
                f"train {X_train.shape[0]} vs {len(y_train)}, "
                f"dev {X_dev.shape[0]} vs {len(y_dev)}"
            )

        # Combine train+dev for final training
        X_train_combined = np.vstack([X_train, X_dev])
        y_train_combined = np.concatenate([y_train, y_dev])

        print(f"Training: {X_train_combined.shape[0]} samples (train+dev combined)")
        print(f"Features: {X_train_combined.shape[1]} (selected from {X_train_full.shape[1]})")

        # Train best 2 classifiers
        classifiers_to_train = [best_clf_name]
        if second_clf_name:
            classifiers_to_train.append(second_clf_name)

        results_method2[model][task] = {}

        for clf_name in classifiers_to_train:
            print(f"\nTraining {clf_name}...")

            # Get classifier instance (explicit error instead of a bare KeyError)
            if clf_name not in classifiers:
                raise KeyError(
                    f"Classifier '{clf_name}' selected for {model} - {task} is not "
                    f"provided by get_classifier_dict(); available: {list(classifiers)}"
                )
            clf = classifiers[clf_name]

            # Train using train_classifiers function (handles label encoding)
            # train_classifiers needs a dev set; no dev predictions are used here,
            # so the combined training data is passed again as a placeholder.
            training_results = train_classifiers(
                X_train_combined, y_train_combined,
                X_train_combined, y_train_combined,  # Dummy dev (we don't use predictions)
                classifiers={clf_name: clf},
                random_state=RANDOM_STATE
            )

            trained_model = training_results[clf_name]['model']

            # Save trained model together with everything needed to apply it:
            # the feature columns it expects and the label encoding
            model_filename = f"method2_{model}_{task}_{clf_name}.pkl"
            model_save_path = models_fusion_dir / model_filename

            with open(model_save_path, 'wb') as f:
                pickle.dump({
                    'model': trained_model,
                    'selected_indices': selected_indices,
                    'selected_feature_names': selected_feature_names,
                    'label_list': label_list,
                    'label_encoder': training_results[clf_name]['label_encoder']
                }, f)

            print(f"  ✓ Saved: {model_filename}")

            # Store metadata (the dev F1 comes from the notebook-3 ranking)
            results_method2[model][task][clf_name] = {
                'selected_features': selected_feature_names,
                'selected_indices': selected_indices,
                'model_path': str(model_save_path),
                'macro_f1_dev': best_info['best_f1'] if clf_name == best_clf_name else best_info['2nd_best_f1']
            }

print(f"\n{'='*80}")
print("✓ Training complete for all model×task combinations")
print(f"{'='*80}")

# ============================================================================
# STEP 5: SAVE METADATA (DRIVE + GITHUB)
# ============================================================================

# Helper function to make JSON serializable
def make_json_serializable(obj):
    """Recursively convert NumPy arrays and scalars to JSON-serializable Python types.

    Args:
        obj: Any value; dicts, lists and tuples are traversed recursively.

    Returns:
        A structure in which NumPy arrays are lists, NumPy integer/floating/bool
        scalars are ``int``/``float``/``bool``, tuples are lists, and dictionary
        keys that are NumPy scalars are converted like values. Anything else is
        returned unchanged.
    """
    import numpy as np
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, (bool, np.bool_)):
        return bool(obj)
    if isinstance(obj, dict):
        # json.dump rejects NumPy scalar keys (e.g. np.int64), so convert those
        # too. Other keys are left as they are: tuples, for instance, must stay
        # hashable and cannot be turned into lists.
        return {
            (make_json_serializable(key) if isinstance(key, np.generic) else key):
                make_json_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [make_json_serializable(item) for item in obj]
    if hasattr(obj, 'item'):  # remaining numpy scalar types
        return obj.item()
    return obj

# Turn the collected metadata into plain Python types before dumping it
results_serializable = make_json_serializable(results_method2)

# The same JSON document is written to two places: Drive and the repo checkout
results_path_drive = results_dir / 'method2_trained_models.json'
results_path_github = github_results_dir / 'method2_trained_models.json'

# Drive copy
with results_path_drive.open('w') as drive_file:
    json.dump(results_serializable, drive_file, indent=2)

print(f"✓ Saved metadata to Drive: {results_path_drive}")

# GitHub copy (identical content)
with results_path_github.open('w') as github_file:
    json.dump(results_serializable, github_file, indent=2)

print(f"✓ Saved metadata to GitHub: {results_path_github}")

# Final summary banner
print(f"\n{'='*80}")
print("✓ METHOD 2 TRAINING COMPLETE")
print(f"{'='*80}")
print(f"Trained models saved to: {models_fusion_dir}")
print(f"Metadata saved to: {results_path_drive} (Drive) and {results_path_github} (GitHub)")
