# Add UL Features to Existing Baseline

**Goal**: Add unsupervised-learning (UL) features to your existing baseline model (63% F1)

**Approach**:
1. Load your existing features (from 24_rebuild_temporal_split.ipynb)
2. Load UL features (from 50_unsupervised_learning.ipynb)
3. Merge them together (excluding leakage features)
4. Save enhanced feature sets
5. Re-run your existing classification notebook

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

import warnings
# Silence every warning category so notebook output stays readable.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning
# messages raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## 1. Load Existing Features

In [None]:
# Directory that holds the pickled temporal-split feature sets
feature_dir = Path('../data/features')

# Report what is on disk before any load is attempted
if feature_dir.exists():
    print(f"✓ Feature directory found: {feature_dir}")
    print(f"\nAvailable files:")
    pickle_paths = sorted(feature_dir.glob('*.pkl'))
    for pickle_path in pickle_paths:
        print(f"  - {pickle_path.name}")
else:
    # Nothing to load yet: point the user at the notebook that generates the features
    print("⚠️  Feature directory doesn't exist!")
    print("Run notebooks/24_rebuild_temporal_split.ipynb first to generate features.")

In [None]:
# Read the baseline feature matrices and classification labels from feature_dir
X_train = pd.read_pickle(feature_dir / 'X_train_temporal.pkl')
X_test = pd.read_pickle(feature_dir / 'X_test_temporal.pkl')
y_train_cls = pd.read_pickle(feature_dir / 'y_train_cls_temporal.pkl')
y_test_cls = pd.read_pickle(feature_dir / 'y_test_cls_temporal.pkl')

# Shape summary; for the labels also show the positive-class share as a percentage
print("Baseline Features:")
for matrix_name, matrix in (("X_train", X_train), ("X_test", X_test)):
    print(f"  {matrix_name}: {matrix.shape}")
for label_name, labels in (("y_train", y_train_cls), ("y_test", y_test_cls)):
    print(f"  {label_name}: {labels.shape} (high-impact: {labels.mean()*100:.1f}%)")

# Numbered listing of the baseline columns (1-based)
baseline_columns = X_train.columns
print(f"\nBaseline feature names ({len(baseline_columns)}):")
for position, column_name in enumerate(baseline_columns, start=1):
    print(f"  {position}. {column_name}")

## 2. Load UL Features

In [None]:
# Read the processed dataset that carries the unsupervised-learning columns
ul_data_path = '../data/processed/data_with_unsupervised_features.pkl'
df_ul = pd.read_pickle(ul_data_path)

print(f"Data with UL features: {df_ul.shape}")

# Span of the 'Year' column, reported as min-max
year_column = df_ul['Year']
print(f"Date range: {year_column.min()}-{year_column.max()}")

In [None]:
# Identify UL features (EXCLUDE leakage features!)
# Only string labels can be matched by prefix/substring; filtering first keeps
# a non-string column label from raising AttributeError on .startswith/.lower.
string_columns = [c for c in df_ul.columns if isinstance(c, str)]

ul_topic_features = [c for c in string_columns if c.startswith('topic_')]
ul_nmf_features = [c for c in string_columns if c.startswith('nmf_topic_')]
ul_pca_features = [c for c in string_columns if c.startswith('pca_')]
# Any column mentioning "cluster", except venue-related ones
ul_cluster_features = [
    c for c in string_columns
    if 'cluster' in c.lower() and 'venue' not in c.lower()
]
ul_other_features = ['dominant_topic', 'dominant_topic_weight', 'nmf_dominant_topic']
# Keep only the "other" features that actually exist in df_ul (computed once, reused below)
ul_other_present = [f for f in ul_other_features if f in df_ul.columns]

# CRITICAL: Exclude data leakage features!
leakage_features = ['citation_zscore', 'is_citation_outlier']

candidate_features = (
    ul_topic_features
    + ul_nmf_features
    + ul_pca_features
    + ul_cluster_features
    + ul_other_present
)

# A column can match more than one pattern (e.g. a name containing both
# 'topic_' as prefix and 'cluster'); dict.fromkeys drops such repeats while
# preserving order, so selecting these columns never yields duplicate columns.
# Leakage features are removed in the same pass.
all_ul_features = [
    f for f in dict.fromkeys(candidate_features) if f not in leakage_features
]

print(f"Unsupervised Learning Features (Leakage-Free):")
print(f"  LDA topics: {len(ul_topic_features)}")
print(f"  NMF topics: {len(ul_nmf_features)}")
print(f"  PCA components: {len(ul_pca_features)}")
print(f"  Cluster labels: {len(ul_cluster_features)}")
print(f"  Other: {len(ul_other_present)}")
print(f"  Total: {len(all_ul_features)}")
print(f"\n⚠️  EXCLUDED (data leakage): {leakage_features}")

## 3. Align and Merge Features

In [None]:
# Reuse the row labels of the existing split so UL rows line up with the baseline rows
train_indices = X_train.index
test_indices = X_test.index

for split_label, split_index in (("Train", train_indices), ("Test", test_indices)):
    print(f"{split_label} indices: {len(split_index)}")

# Select the chosen UL columns for each split; .copy() detaches the result from df_ul
ul_train = df_ul.loc[train_indices, all_ul_features].copy()
ul_test = df_ul.loc[test_indices, all_ul_features].copy()

print("\nUL features extracted:")
for frame_label, ul_frame in (("ul_train", ul_train), ("ul_test", ul_test)):
    print(f"  {frame_label}: {ul_frame.shape}")

In [None]:
# Append the UL columns to the right of the baseline columns (column-wise concat)
X_train_enhanced = pd.concat([X_train, ul_train], axis="columns")
X_test_enhanced = pd.concat([X_test, ul_test], axis="columns")

enhanced_frames = (
    ("X_train_enhanced", X_train_enhanced),
    ("X_test_enhanced", X_test_enhanced),
)

print("Enhanced Feature Sets:")
for frame_label, enhanced_frame in enhanced_frames:
    print(f"  {frame_label}: {enhanced_frame.shape}")

# How many columns came from each source
baseline_count = X_train.shape[1]
ul_count = ul_train.shape[1]
total_count = X_train_enhanced.shape[1]
print("\nFeature breakdown:")
print(f"  Baseline features: {baseline_count}")
print(f"  UL features added: {ul_count}")
print(f"  Total features: {total_count}")

# Total number of NaN cells per enhanced frame
print("\nMissing values:")
for frame_label, enhanced_frame in enhanced_frames:
    print(f"  {frame_label}: {enhanced_frame.isna().sum().sum()}")

## 4. Handle Missing Values (if any)

In [None]:
# Fill missing values with the TRAINING median (if any).
#
# Columns are selected when either split has NaNs, so gaps that appear only in
# the test set are filled too. The median always comes from the training
# frame so no test-set statistics leak into the features.
train_has_nan = X_train_enhanced.isna().any()
test_has_nan = X_test_enhanced.isna().any()
cols_to_fill = [
    col for col in X_train_enhanced.columns
    if train_has_nan[col] or test_has_nan.get(col, False)
]

if cols_to_fill:
    print("Filling missing values with median...")

    for col in cols_to_fill:
        median_val = X_train_enhanced[col].median()
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary Series and does
        # not update the parent DataFrame under pandas Copy-on-Write.
        X_train_enhanced[col] = X_train_enhanced[col].fillna(median_val)
        X_test_enhanced[col] = X_test_enhanced[col].fillna(median_val)
        print(f"  Filled {col} with {median_val}")

    print(f"\n✓ All missing values filled")
else:
    print("✓ No missing values - data is clean!")

## 5. Save Enhanced Features

In [None]:
# Output locations, all inside feature_dir
train_out = feature_dir / 'X_train_temporal_with_ul.pkl'
test_out = feature_dir / 'X_test_temporal_with_ul.pkl'
feature_list_out = feature_dir / 'ul_features_list.txt'

# Persist the enhanced feature matrices
X_train_enhanced.to_pickle(train_out)
X_test_enhanced.to_pickle(test_out)

# Plain-text record of the UL columns that were appended: title, rule, one name per line
header_lines = [
    f"Unsupervised Learning Features ({len(all_ul_features)})\n",
    "=" * 60 + "\n\n",
]
with open(feature_list_out, 'w') as f:
    f.writelines(header_lines)
    f.writelines(f"{feat}\n" for feat in all_ul_features)

print("✓ Enhanced features saved:")
for saved_path in (train_out, test_out, feature_list_out):
    print(f"  {saved_path}")

## 6. Quick Comparison Test

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score


def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
    """Fit the quick-check random forest and score it on the evaluation split.

    Both the baseline and the enhanced run use this helper so they are
    guaranteed to share identical hyperparameters and random seed.

    Returns:
        Tuple ``(clf, y_pred, y_proba, f1, auc, precision, recall)`` where
        ``y_proba`` is the predicted probability of the second class.
    """
    clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1, class_weight='balanced')
    clf.fit(X_fit, y_fit)
    y_pred = clf.predict(X_eval)
    y_proba = clf.predict_proba(X_eval)[:, 1]
    return (
        clf,
        y_pred,
        y_proba,
        f1_score(y_eval, y_pred),
        roc_auc_score(y_eval, y_proba),
        precision_score(y_eval, y_pred),
        recall_score(y_eval, y_pred),
    )


def _pct_change(new, old):
    """Return the relative change from *old* to *new* in percent (nan if *old* is 0)."""
    # A zero baseline makes the relative change undefined; report nan instead of
    # raising ZeroDivisionError / producing inf.
    return (new - old) / old * 100 if old else float('nan')


print("Quick comparison: Baseline vs Enhanced...\n")

# Test baseline
print("Training baseline model...")
(clf_baseline, y_pred_baseline, y_proba_baseline,
 baseline_f1, baseline_auc, baseline_precision, baseline_recall) = _fit_and_score(
    X_train, y_train_cls, X_test, y_test_cls
)

print(f"Baseline Results:")
print(f"  F1: {baseline_f1:.4f}")
print(f"  Precision: {baseline_precision:.4f}")
print(f"  Recall: {baseline_recall:.4f}")
print(f"  ROC-AUC: {baseline_auc:.4f}")

# Test enhanced
print(f"\nTraining enhanced model (+UL features)...")
(clf_enhanced, y_pred_enhanced, y_proba_enhanced,
 enhanced_f1, enhanced_auc, enhanced_precision, enhanced_recall) = _fit_and_score(
    X_train_enhanced, y_train_cls, X_test_enhanced, y_test_cls
)

# Absolute deltas versus the baseline are shown in parentheses
print(f"Enhanced Results:")
print(f"  F1: {enhanced_f1:.4f} ({enhanced_f1-baseline_f1:+.4f})")
print(f"  Precision: {enhanced_precision:.4f} ({enhanced_precision-baseline_precision:+.4f})")
print(f"  Recall: {enhanced_recall:.4f} ({enhanced_recall-baseline_recall:+.4f})")
print(f"  ROC-AUC: {enhanced_auc:.4f} ({enhanced_auc-baseline_auc:+.4f})")

# Summary
print("\n" + "="*60)
print("IMPROVEMENT SUMMARY")
print("="*60)
f1_improvement = _pct_change(enhanced_f1, baseline_f1)
auc_improvement = _pct_change(enhanced_auc, baseline_auc)

print(f"F1 Score: {baseline_f1:.4f} → {enhanced_f1:.4f} ({f1_improvement:+.2f}%)")
print(f"ROC-AUC: {baseline_auc:.4f} → {enhanced_auc:.4f} ({auc_improvement:+.2f}%)")

if enhanced_f1 > baseline_f1:
    print(f"\n✅ UL features improved F1 by {f1_improvement:.2f}%")
elif enhanced_f1 < baseline_f1:
    print(f"\n⚠️  UL features decreased F1 by {abs(f1_improvement):.2f}% (might need feature selection)")
else:
    # A tie is neither an improvement nor a decrease; say so explicitly
    print("\n⚠️  UL features left F1 unchanged (might need feature selection)")

## 7. Next Steps

Now you can:

1. **Run your existing classification notebook** (30_classification_models.ipynb):
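   - Load `X_train_temporal_with_ul.pkl` and `X_test_temporal_with_ul.pkl` instead of `X_train_temporal.pkl` and `X_test_temporal.pkl` (this notebook does not write new label files)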
   - Compare all your models with enhanced features

2. **Feature selection** (if needed):
   - Select only top N UL features by correlation
   - Use feature importance from Random Forest
   - Try different UL feature combinations

3. **Hyperparameter tuning**:
   - Re-tune your models with the enhanced feature set
   - May need different max_depth, n_estimators, etc.