# Unsupervised Learning Feature Integration

**Goal**: Integrate unsupervised learning features into supervised models and measure performance improvement

**⚠️ CRITICAL - Data Leakage Prevention**:
- We EXCLUDE `citation_zscore` and `is_citation_outlier` features
- These are derived from the target variable (Citations) and would cause data leakage
- Using them would give artificially high performance that won't generalize

**Approach**:
1. Load data with UL features
2. Remove leakage features (citation_zscore, is_citation_outlier)
3. Select top-performing UL features by absolute correlation with `Citations`
4. Train baseline models (without UL features)
5. Train enhanced models (with UL features)
6. Compare performance and analyze improvements
7. Feature importance analysis
8. Recommendations for production models

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, f1_score, precision_score, recall_score,
    mean_absolute_error, mean_squared_error, r2_score
)

# Silence every Python warning for the rest of the session so cell output stays
# readable. NOTE(review): this is a blanket filter, so deprecation warnings from
# the libraries used below are hidden too — narrow it when debugging.
import warnings
warnings.filterwarnings('ignore')

# Show all DataFrame columns when a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Plot appearance: 'seaborn-v0_8' matplotlib style sheet + seaborn 'husl' palette.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

%matplotlib inline

## 1. Load Data

In [None]:
# Read the processed dataset that already carries the unsupervised-learning columns.
# NOTE(review): unpickling can execute arbitrary code — only load files produced
# by this project's own pipeline.
df = pd.read_pickle('../data/processed/data_with_unsupervised_features.pkl')

# Quick sanity summary: size, publication-year span and citation statistics.
publication_years = df['Year']
citation_counts = df['Citations']

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.shape[1]}")
print(f"\nDate range: {publication_years.min()} - {publication_years.max()}")
print(f"Citation stats: mean={citation_counts.mean():.1f}, median={citation_counts.median():.0f}")

In [None]:
# Group the unsupervised-learning (UL) columns by the naming scheme of each technique.
ul_topic_features = [name for name in df.columns if name.startswith('topic_')]
ul_nmf_features = [name for name in df.columns if name.startswith('nmf_topic_')]
ul_pca_features = [name for name in df.columns if name.startswith('pca_')]
# Any column mentioning "cluster" counts, except 'venue_cluster'.
ul_cluster_features = [
    name for name in df.columns
    if name != 'venue_cluster' and 'cluster' in name
]
ul_other_features = ['dominant_topic', 'dominant_topic_weight', 'nmf_dominant_topic']
present_other_features = [name for name in ul_other_features if name in df.columns]

# CRITICAL: citation_zscore and is_citation_outlier are computed from the target
# (Citations). Feeding them to a model would hand it the answer (DATA LEAKAGE),
# so they must never appear in the UL feature list.
leakage_features = ['citation_zscore', 'is_citation_outlier']
print(f"⚠️  EXCLUDING LEAKAGE FEATURES: {leakage_features}")

# Concatenate the groups in a fixed order, filtering leakage columns as a safety net.
ul_feature_groups = (
    ul_topic_features,
    ul_nmf_features,
    ul_pca_features,
    ul_cluster_features,
    present_other_features,
)
all_ul_features = [
    name
    for group in ul_feature_groups
    for name in group
    if name not in leakage_features
]

print(f"\nUnsupervised Learning Features (Leakage-Free):")
group_sizes = (
    ("LDA topics", len(ul_topic_features)),
    ("NMF topics", len(ul_nmf_features)),
    ("PCA components", len(ul_pca_features)),
    ("Cluster labels", len(ul_cluster_features)),
    ("Other", len(present_other_features)),
    ("Total UL features", len(all_ul_features)),
)
for group_label, group_size in group_sizes:
    print(f"  {group_label}: {group_size}")

## 2. Feature Selection - Top UL Features

In [None]:
# Calculate correlations with Citations.
# Feature selection must only look at the training period: ranking features on
# rows that later form the test set would leak test information into the
# "top UL features" and inflate the measured gain of the enhanced models.
# NOTE: keep this cutoff identical to `split_year` used for the temporal split.
selection_cutoff_year = 2022
selection_rows = df['Year'] < selection_cutoff_year

ul_correlations = (
    df.loc[selection_rows, all_ul_features + ['Citations']]
    .corr()['Citations']
    .drop('Citations')
)
# Constant columns produce NaN correlations; drop them so they can never be
# picked as a "top" feature.
ul_correlations_abs = ul_correlations.abs().dropna().sort_values(ascending=False)

print("Top 30 UL Features by Correlation with Citations:")
print(ul_correlations_abs.head(30))

# Select top features (fewer than top_n if not enough valid correlations exist)
top_n = 20
top_ul_features = ul_correlations_abs.head(top_n).index.tolist()

print(f"\nSelected top {len(top_ul_features)} UL features for modeling")

In [None]:
# Horizontal bar chart of the 30 strongest absolute correlations, strongest on top.
fig, ax = plt.subplots(figsize=(10, 8))
ul_correlations_abs.head(30).plot(kind='barh', ax=ax)
ax.set_xlabel('Absolute Correlation with Citations')
ax.set_title('Top 30 Unsupervised Features by Correlation')
# Dashed reference line at |r| = 0.05.
ax.axvline(x=0.05, color='r', linestyle='--', alpha=0.5, label='5% threshold')
ax.legend()
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 3. Prepare Baseline Features

Identify existing engineered features (non-UL) for baseline comparison

In [None]:
# Identify baseline features: every numeric column that is not a UL feature,
# a target, a target-derived (leakage) column, free text, or an identifier.
exclude_cols = (
    all_ul_features +
    # Target-derived columns. They are filtered out of `all_ul_features`, so
    # without this line they would slip into the baseline as ordinary numeric
    # columns and leak the answer into BOTH baseline and enhanced models.
    leakage_features +
    ['Citations', 'high_impact', 'log_citations', 'citation_bin', 'citation_category'] +  # targets
    ['Title', 'Abstract', 'Authors', 'Author full names', 'Author(s) ID', 'Source title'] +  # text/metadata
    ['EID', 'DOI', 'PubMed ID', 'Link', 'Cited by', 'Document Type']  # identifiers
)
excluded_lookup = set(exclude_cols)  # O(1) membership tests

# Get numeric columns for modeling
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
baseline_features = [c for c in numeric_cols if c not in excluded_lookup]

# Fail loudly if a leakage column ever reaches the model inputs.
leaked = sorted(set(baseline_features) & set(leakage_features))
if leaked:
    raise ValueError(f"Leakage features present in baseline features: {leaked}")

print(f"Available baseline features: {len(baseline_features)}")
print(f"\nBaseline features:")
for i, feat in enumerate(baseline_features, 1):
    print(f"  {i}. {feat}")

## 4. Create Target Variables

In [None]:
# Binary target: 1 for papers at or above the 75th percentile of Citations.
citation_threshold = df['Citations'].quantile(0.75)
df['high_impact'] = df['Citations'].ge(citation_threshold).astype(int)

# Regression target: log(1 + Citations).
df['log_citations'] = np.log1p(df['Citations'])

# Class balance and target range for a quick sanity check.
n_high_impact = df['high_impact'].sum()
n_regular = (df['high_impact'] == 0).sum()
high_impact_share = df['high_impact'].mean()
log_citations = df['log_citations']

print(f"Classification target: high_impact (>= {citation_threshold:.0f} citations)")
print(f"  High impact: {n_high_impact} papers ({high_impact_share*100:.1f}%)")
print(f"  Regular: {n_regular} papers ({(1-high_impact_share)*100:.1f}%)")
print(f"\nRegression target: log(Citations + 1)")
print(f"  Range: {log_citations.min():.2f} - {log_citations.max():.2f}")

## 5. Train/Test Split - Temporal Split

In [None]:
# Temporal hold-out: fit on papers published before the cutoff year and
# evaluate on papers published in or after it.
split_year = 2022

train_mask = df['Year'] < split_year
test_mask = df['Year'] >= split_year

df_train = df.loc[train_mask].copy()
df_test = df.loc[test_mask].copy()

print(f"Temporal Split (cutoff year: {split_year}):")
for split_label, split_frame in (("Training set", df_train), ("Test set", df_test)):
    print(f"  {split_label}: {len(split_frame)} papers ({split_frame['Year'].min()}-{split_frame['Year'].max()})")

print(f"\nTarget distribution:")
for split_label, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"  {split_label} high_impact: {split_frame['high_impact'].mean()*100:.1f}%")

## 6. Baseline Models (No UL Features)

In [None]:
# Prepare data - handle missing values
def prepare_features(df, features, fill_values=None):
    """Return a copy of ``df[features]`` with missing values imputed.

    Args:
        df: Source DataFrame.
        features: Column names to extract, in the desired output order.
        fill_values: Optional per-column fill values (dict or Series keyed by
            column name), e.g. medians computed on the training split so the
            test split is imputed without looking at its own statistics.
            When omitted, each numeric column is filled with its own median
            in ``df`` (the original behaviour).

    Returns:
        A new DataFrame; ``df`` itself is never modified.
    """
    X = df[features].copy()
    if fill_values is None:
        fill_values = X.median(numeric_only=True)
    # Assign the result instead of calling ``X[col].fillna(..., inplace=True)``:
    # an in-place fill on a column selection is chained assignment, which is
    # deprecated and silently does nothing under pandas Copy-on-Write.
    return X.fillna(fill_values)

# Baseline design matrices for both sides of the temporal split.
X_train_baseline, X_test_baseline = (
    prepare_features(split_frame, baseline_features)
    for split_frame in (df_train, df_test)
)

# Targets: binary label for classification, log-citations for regression.
y_train_class, y_test_class = df_train['high_impact'], df_test['high_impact']
y_train_reg, y_test_reg = df_train['log_citations'], df_test['log_citations']

print(f"Baseline feature matrix: {X_train_baseline.shape}")
print(f"Features: {baseline_features}")

In [None]:
# Fit the reference classifier on baseline features only.
print("Training Baseline Classification Model (Random Forest)...")

baseline_clf = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train_baseline, y_train_class)

# Hard labels feed F1/precision/recall; positive-class probability feeds ROC-AUC.
y_pred_baseline = baseline_clf.predict(X_test_baseline)
y_pred_proba_baseline = baseline_clf.predict_proba(X_test_baseline)[:, 1]

baseline_f1, baseline_precision, baseline_recall = (
    scorer(y_test_class, y_pred_baseline)
    for scorer in (f1_score, precision_score, recall_score)
)
baseline_auc = roc_auc_score(y_test_class, y_pred_proba_baseline)

print("\nBaseline Classification Results:")
baseline_clf_scores = (
    ("F1 Score", baseline_f1),
    ("Precision", baseline_precision),
    ("Recall", baseline_recall),
    ("ROC-AUC", baseline_auc),
)
for metric_name, metric_value in baseline_clf_scores:
    print(f"  {metric_name}: {metric_value:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_class, y_pred_baseline))

In [None]:
# Fit the reference regressor (target: log-citations) on baseline features only.
print("Training Baseline Regression Model (Random Forest)...")

baseline_reg = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train_baseline, y_train_reg)

y_pred_reg_baseline = baseline_reg.predict(X_test_baseline)

# Error metrics on the held-out years (all in log-citation units).
baseline_mae = mean_absolute_error(y_test_reg, y_pred_reg_baseline)
baseline_mse = mean_squared_error(y_test_reg, y_pred_reg_baseline)
baseline_rmse = np.sqrt(baseline_mse)
baseline_r2 = r2_score(y_test_reg, y_pred_reg_baseline)

print("\nBaseline Regression Results:")
for metric_name, metric_value in (("MAE", baseline_mae), ("RMSE", baseline_rmse), ("R²", baseline_r2)):
    print(f"  {metric_name}: {metric_value:.4f}")

## 7. Enhanced Models (With Top UL Features)

In [None]:
# Enhanced design matrices: baseline columns followed by the selected UL columns.
enhanced_features = [*baseline_features, *top_ul_features]

X_train_enhanced, X_test_enhanced = (
    prepare_features(split_frame, enhanced_features)
    for split_frame in (df_train, df_test)
)

n_ul_added = len(top_ul_features)
n_baseline = len(baseline_features)
print(f"Enhanced feature matrix: {X_train_enhanced.shape}")
print(f"Added {n_ul_added} UL features to {n_baseline} baseline features")

In [None]:
# Same classifier configuration as the baseline, now fed baseline + UL features.
print("Training Enhanced Classification Model (Random Forest + UL)...")

enhanced_clf = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train_enhanced, y_train_class)

# Hard labels feed F1/precision/recall; positive-class probability feeds ROC-AUC.
y_pred_enhanced = enhanced_clf.predict(X_test_enhanced)
y_pred_proba_enhanced = enhanced_clf.predict_proba(X_test_enhanced)[:, 1]

enhanced_f1, enhanced_precision, enhanced_recall = (
    scorer(y_test_class, y_pred_enhanced)
    for scorer in (f1_score, precision_score, recall_score)
)
enhanced_auc = roc_auc_score(y_test_class, y_pred_proba_enhanced)

# Report each score together with its change relative to the baseline model.
print("\nEnhanced Classification Results:")
enhanced_vs_baseline_clf = (
    ("F1 Score", enhanced_f1, baseline_f1),
    ("Precision", enhanced_precision, baseline_precision),
    ("Recall", enhanced_recall, baseline_recall),
    ("ROC-AUC", enhanced_auc, baseline_auc),
)
for metric_name, enhanced_value, baseline_value in enhanced_vs_baseline_clf:
    print(f"  {metric_name}: {enhanced_value:.4f} (Δ={enhanced_value-baseline_value:+.4f})")

print("\nClassification Report:")
print(classification_report(y_test_class, y_pred_enhanced))

In [None]:
# Same regressor configuration as the baseline, now fed baseline + UL features.
print("Training Enhanced Regression Model (Random Forest + UL)...")

enhanced_reg = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train_enhanced, y_train_reg)

y_pred_reg_enhanced = enhanced_reg.predict(X_test_enhanced)

# Error metrics on the held-out years (all in log-citation units).
enhanced_mae = mean_absolute_error(y_test_reg, y_pred_reg_enhanced)
enhanced_mse = mean_squared_error(y_test_reg, y_pred_reg_enhanced)
enhanced_rmse = np.sqrt(enhanced_mse)
enhanced_r2 = r2_score(y_test_reg, y_pred_reg_enhanced)

# Report each score together with its change relative to the baseline model.
print("\nEnhanced Regression Results:")
enhanced_vs_baseline_reg = (
    ("MAE", enhanced_mae, baseline_mae),
    ("RMSE", enhanced_rmse, baseline_rmse),
    ("R²", enhanced_r2, baseline_r2),
)
for metric_name, enhanced_value, baseline_value in enhanced_vs_baseline_reg:
    print(f"  {metric_name}: {enhanced_value:.4f} (Δ={enhanced_value-baseline_value:+.4f})")

## 8. Performance Comparison

In [None]:
# Create comparison dataframe: one row per metric, baseline vs enhanced.
comparison_df = pd.DataFrame({
    'Metric': ['F1 Score', 'Precision', 'Recall', 'ROC-AUC', 'MAE', 'RMSE', 'R²'],
    'Baseline': [baseline_f1, baseline_precision, baseline_recall, baseline_auc,
                 baseline_mae, baseline_rmse, baseline_r2],
    'Enhanced': [enhanced_f1, enhanced_precision, enhanced_recall, enhanced_auc,
                 enhanced_mae, enhanced_rmse, enhanced_r2]
})

# "Improvement" must mean "better" for every row. MAE and RMSE are error
# metrics (lower is better), so their raw delta is negated; otherwise a model
# that got worse would be reported as an improvement.
lower_is_better = comparison_df['Metric'].isin(['MAE', 'RMSE'])
raw_delta = comparison_df['Enhanced'] - comparison_df['Baseline']
comparison_df['Improvement'] = raw_delta.where(~lower_is_better, -raw_delta)

# Relative change is taken against |Baseline| so a negative baseline (e.g.
# R² < 0) cannot flip the sign; a zero baseline yields NaN instead of ±inf.
baseline_magnitude = comparison_df['Baseline'].abs().replace(0, np.nan)
comparison_df['Improvement %'] = (comparison_df['Improvement'] / baseline_magnitude * 100).round(2)

print("="*80)
print("PERFORMANCE COMPARISON: Baseline vs Enhanced (with UL features)")
print("="*80)
print(comparison_df.to_string(index=False))

In [None]:
# 2x2 figure: one row per task (classification, regression); the left panel
# shows grouped baseline/enhanced bars, the right panel the "Improvement %".
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
width = 0.35

class_metrics = ['F1 Score', 'Precision', 'Recall', 'ROC-AUC']
reg_metrics = ['MAE', 'RMSE', 'R²']
class_data = comparison_df[comparison_df['Metric'].isin(class_metrics)]
reg_data = comparison_df[comparison_df['Metric'].isin(reg_metrics)]

# (axes row, metric names, table rows, task label, delta-bar color, tick-label options)
comparison_panels = (
    (axes[0], class_metrics, class_data, 'Classification', 'green', {'rotation': 45}),
    (axes[1], reg_metrics, reg_data, 'Regression', 'orange', {}),
)

for (bars_ax, delta_ax), metric_names, metric_rows, task_label, delta_color, tick_options in comparison_panels:
    x = np.arange(len(metric_names))

    # Grouped bars: baseline to the left of each tick, enhanced to the right.
    bars_ax.bar(x - width/2, metric_rows['Baseline'], width, label='Baseline', alpha=0.8)
    bars_ax.bar(x + width/2, metric_rows['Enhanced'], width, label='Enhanced', alpha=0.8)
    bars_ax.set_xlabel('Metric')
    bars_ax.set_ylabel('Score')
    bars_ax.set_title(f'{task_label} Metrics Comparison')
    bars_ax.set_xticks(x)
    bars_ax.set_xticklabels(metric_names, **tick_options)
    bars_ax.legend()
    bars_ax.grid(axis='y', alpha=0.3)

    # Relative change per metric, with a zero reference line.
    metric_rows.set_index('Metric')['Improvement %'].plot(
        kind='barh', ax=delta_ax, color=delta_color, alpha=0.7
    )
    delta_ax.set_xlabel('Improvement (%)')
    delta_ax.set_title(f'{task_label}: % Improvement with UL Features')
    delta_ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
    delta_ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Feature Importance Analysis

In [None]:
def _ranked_importances(model):
    """Pair each enhanced feature with the model's importance (highest first) and flag UL features."""
    ranked = pd.DataFrame({
        'feature': enhanced_features,
        'importance': model.feature_importances_,
    }).sort_values('importance', ascending=False)
    ranked['is_ul'] = ranked['feature'].isin(top_ul_features)
    return ranked


# Importance tables for the two enhanced models.
feature_importance_clf = _ranked_importances(enhanced_clf)
feature_importance_reg = _ranked_importances(enhanced_reg)

print("Top 20 Most Important Features (Classification):")
print(feature_importance_clf.head(20).to_string(index=False))

print("\nTop 20 Most Important Features (Regression):")
print(feature_importance_reg.head(20).to_string(index=False))

In [None]:
# Visualize feature importance: one horizontal bar chart per enhanced model,
# with UL features highlighted in color and baseline features in gray.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

top_20_clf = feature_importance_clf.head(20)
top_20_reg = feature_importance_reg.head(20)
colors_clf = ['skyblue' if is_ul else 'lightgray' for is_ul in top_20_clf['is_ul']]
colors_reg = ['orange' if is_ul else 'lightgray' for is_ul in top_20_reg['is_ul']]

importance_panels = (
    (axes[0], top_20_clf, colors_clf, 'Top 20 Features: Classification\n(Blue = UL features)'),
    (axes[1], top_20_reg, colors_reg, 'Top 20 Features: Regression\n(Orange = UL features)'),
)

for ax, top_features, bar_colors, title in importance_panels:
    # Derive bar positions from the data: head(20) returns fewer rows when the
    # model has fewer than 20 features, and a fixed range(20) would then raise
    # a shape-mismatch error in barh.
    positions = range(len(top_features))
    ax.barh(positions, top_features['importance'], color=bar_colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(title)
    ax.invert_yaxis()  # most important feature on top
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Analyze UL feature contribution: keep only the rows flagged as UL features.
# Both tables are already sorted by importance (descending), so row 0 is the
# strongest UL feature.
ul_importance_clf = feature_importance_clf[feature_importance_clf['is_ul']]
ul_importance_reg = feature_importance_reg[feature_importance_reg['is_ul']]

print("UL Feature Importance Summary:")
for task_label, ul_importance in (("Classification", ul_importance_clf), ("Regression", ul_importance_reg)):
    print(f"\n{task_label}:")
    print(f"  Total UL importance: {ul_importance['importance'].sum():.4f}")
    print(f"  Mean UL importance: {ul_importance['importance'].mean():.4f}")
    # Guard the positional lookup: with no UL feature selected, iloc[0] would
    # raise IndexError and abort the whole cell.
    if ul_importance.empty:
        print("  Top UL feature: n/a (no UL features in the model)")
    else:
        strongest = ul_importance.iloc[0]
        print(f"  Top UL feature: {strongest['feature']} ({strongest['importance']:.4f})")

print(f"\nTop 10 UL Features (Classification):")
print(ul_importance_clf.head(10)[['feature', 'importance']].to_string(index=False))

print(f"\nTop 10 UL Features (Regression):")
print(ul_importance_reg.head(10)[['feature', 'importance']].to_string(index=False))

## 10. Ablation Study - Different UL Feature Combinations

In [None]:
# Ablation: retrain the same two Random Forests on several feature subsets and
# record the test-set metrics for each subset.
print("Testing different UL feature combinations...\n")

feature_combos = {'Baseline Only': baseline_features}
for top_k in (5, 10, 20):
    feature_combos[f'+ Top {top_k} UL'] = baseline_features + top_ul_features[:top_k]
feature_combos['+ All Topics'] = baseline_features + ul_topic_features + ul_nmf_features
feature_combos['+ All PCA'] = baseline_features + ul_pca_features
feature_combos['+ All UL'] = baseline_features + all_ul_features

# Hyper-parameters shared by the classifier and the regressor.
shared_rf_params = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)

ablation_results = []

for combo_name, features in feature_combos.items():
    print(f"Testing: {combo_name} ({len(features)} features)...")

    X_train = prepare_features(df_train, features)
    X_test = prepare_features(df_test, features)

    # Classification: hard labels plus positive-class probabilities.
    clf = RandomForestClassifier(class_weight='balanced', **shared_rf_params)
    clf.fit(X_train, y_train_class)
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]

    # Regression on log-citations.
    reg = RandomForestRegressor(**shared_rf_params)
    reg.fit(X_train, y_train_reg)
    y_pred_reg = reg.predict(X_test)

    combo_metrics = {'Combination': combo_name, 'Num Features': len(features)}
    combo_metrics['F1'] = f1_score(y_test_class, y_pred)
    combo_metrics['Precision'] = precision_score(y_test_class, y_pred)
    combo_metrics['Recall'] = recall_score(y_test_class, y_pred)
    combo_metrics['ROC-AUC'] = roc_auc_score(y_test_class, y_pred_proba)
    combo_metrics['MAE'] = mean_absolute_error(y_test_reg, y_pred_reg)
    combo_metrics['RMSE'] = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
    combo_metrics['R²'] = r2_score(y_test_reg, y_pred_reg)
    ablation_results.append(combo_metrics)

ablation_df = pd.DataFrame(ablation_results)

banner = "="*100
print("\n" + banner)
print("ABLATION STUDY RESULTS")
print(banner)
print(ablation_df.to_string(index=False))

In [None]:
# One bar chart per headline metric, each showing every feature combination.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# (ablation_df column, y-axis label, bar color), in row-major panel order.
ablation_panels = (
    ('F1', 'F1 Score', 'steelblue'),
    ('ROC-AUC', 'ROC-AUC', 'darkgreen'),
    ('MAE', 'MAE (Lower is better)', 'coral'),
    ('R²', 'R²', 'purple'),
)

for ax, (metric_column, y_label, bar_color) in zip(axes.flat, ablation_panels):
    ablation_df.plot(x='Combination', y=metric_column, kind='bar', ax=ax, legend=False, color=bar_color)
    ax.set_ylabel(y_label)
    # Titles read "F1 Score by ...", "ROC-AUC by ...", "MAE by ...", "R² by ...".
    title_subject = 'F1 Score' if metric_column == 'F1' else metric_column
    ax.set_title(f'{title_subject} by Feature Combination')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Summary & Recommendations

In [None]:
print("="*100)
print("FINAL SUMMARY & RECOMMENDATIONS")
print("="*100)

# Best performing combination per headline metric (row labels of ablation_df)
best_f1_idx = ablation_df['F1'].idxmax()
best_auc_idx = ablation_df['ROC-AUC'].idxmax()
best_r2_idx = ablation_df['R²'].idxmax()

print("\n1. BEST PERFORMING COMBINATIONS:")
print(f"   - Best F1 Score: {ablation_df.loc[best_f1_idx, 'Combination']} (F1={ablation_df.loc[best_f1_idx, 'F1']:.4f})")
print(f"   - Best ROC-AUC: {ablation_df.loc[best_auc_idx, 'Combination']} (AUC={ablation_df.loc[best_auc_idx, 'ROC-AUC']:.4f})")
print(f"   - Best R²: {ablation_df.loc[best_r2_idx, 'Combination']} (R²={ablation_df.loc[best_r2_idx, 'R²']:.4f})")

# Calculate improvements of the best-F1 combination over 'Baseline Only'.
# Relative changes fall back to 0 when the baseline score is not positive.
baseline_row = ablation_df[ablation_df['Combination'] == 'Baseline Only'].iloc[0]
best_row = ablation_df.loc[best_f1_idx]

f1_improvement = ((best_row['F1'] - baseline_row['F1']) / baseline_row['F1'] * 100) if baseline_row['F1'] > 0 else 0
auc_improvement = ((best_row['ROC-AUC'] - baseline_row['ROC-AUC']) / baseline_row['ROC-AUC'] * 100) if baseline_row['ROC-AUC'] > 0 else 0
r2_improvement_abs = best_row['R²'] - baseline_row['R²']

print("\n2. PERFORMANCE IMPROVEMENTS:")
print(f"   - F1 Score: {f1_improvement:+.2f}%")
print(f"   - ROC-AUC: {auc_improvement:+.2f}%")
print(f"   - R² (absolute): {r2_improvement_abs:+.4f}")

# Interpret results
print("\n3. INTERPRETATION:")
if baseline_row['R²'] < 0:
    print(f"   ⚠️  WARNING: Baseline R² is negative ({baseline_row['R²']:.4f})")
    print(f"   → This means the regression model performs worse than predicting the mean")
    print(f"   → Consider: different features, feature engineering, or different model")

if baseline_row['F1'] < 0.5:
    print(f"   ⚠️  WARNING: F1 score is low ({baseline_row['F1']:.4f})")
    print(f"   → Citation prediction is inherently difficult")
    print(f"   → Consider: focusing on ranking (ROC-AUC) instead of classification")

# The same flag drives the interpretation and the recommendations below, so
# the two sections can never contradict each other.
modest_gain = f1_improvement < 5 and auc_improvement < 5
if modest_gain:
    print(f"   ℹ️  UL features provide modest improvement (<5%)")
    print(f"   → They add some signal but aren't game-changing")
    print(f"   → Consider cost/benefit of feature complexity")

# Top contributing UL features
top_ul_clf = ul_importance_clf.head(5) if len(ul_importance_clf) > 0 else None
if top_ul_clf is not None:
    print("\n4. TOP 5 UL FEATURES (Classification):")
    for _, row in top_ul_clf.iterrows():
        print(f"   - {row['feature']}: {row['importance']:.4f}")

print("\n5. RECOMMENDATIONS:")
if modest_gain:
    # Only call the UL contribution "marginal" when the numbers say so.
    print(f"   → Focus on improving baseline features first")
    print(f"   → UL features add marginal value - use selectively")
else:
    print(f"   → UL features add meaningful value - adopt the '{best_row['Combination']}' feature set")
print(f"   → Consider different problem framing:")
print(f"      • Ranking papers instead of binary classification")
print(f"      • Predicting citation percentile instead of raw count")
print(f"      • Multi-class classification (low/medium/high impact)")
print(f"\n6. NEXT STEPS:")
# Previously printed unconditionally, even when every R² was positive.
if ablation_df['R²'].min() < 0:
    print(f"   → Investigate why R² is negative (feature engineering issue?)")
print(f"   → Try gradient boosting models (XGBoost, LightGBM)")
print(f"   → Add domain-specific features (author h-index, journal impact factor)")
print(f"   → Consider temporal dynamics (citation accumulation over time)")

print("\n" + "="*100)

## 12. Save Results

In [None]:
# Save comparison results as CSV files next to the processed data.
comparison_df.to_csv('../data/processed/ul_integration_comparison.csv', index=False)
ablation_df.to_csv('../data/processed/ul_ablation_study.csv', index=False)
feature_importance_clf.to_csv('../data/processed/ul_feature_importance_clf.csv', index=False)
feature_importance_reg.to_csv('../data/processed/ul_feature_importance_reg.csv', index=False)

# Save recommended feature list: the best-F1 combination from the ablation
# study plus the strongest UL features of the enhanced classifier.
recommended_features = best_row['Combination']
# Explicit encoding: the platform default differs between systems and can fail
# on (or garble) non-ASCII feature names.
with open('../data/processed/recommended_ul_features.txt', 'w', encoding='utf-8') as f:
    f.write(f"Best performing combination: {recommended_features}\n")
    f.write(f"Number of features: {best_row['Num Features']}\n\n")
    f.write("Top 20 UL features by importance:\n")
    f.writelines(f"  - {feat}\n" for feat in ul_importance_clf.head(20)['feature'])

print("Results saved to data/processed/:")
print("  - ul_integration_comparison.csv")
print("  - ul_ablation_study.csv")
print("  - ul_feature_importance_clf.csv")
print("  - ul_feature_importance_reg.csv")
print("  - recommended_ul_features.txt")