# Experiment 15: Domain-Specific Models

**Hypothesis**: Training separate models per research domain will improve F1 beyond the baseline of 62.54%.

**Motivation**: Wu et al. (2023) demonstrated that citation patterns vary significantly across research domains. Domain-specific models can capture field-specific citation dynamics better than a universal model.

**Method**:
1. Group papers by ASJC research field into 5-6 major domains
2. Train separate LogisticRegression models per domain
3. Compare domain-specific F1 scores vs. baseline (62.54%)
4. Calculate overall F1 on the pooled test predictions across all domains

**Expected Outcome**: F1 improvement to 63-66% if domain segmentation captures field-specific patterns.

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, 
    f1_score, 
    precision_score, 
    recall_score, 
    accuracy_score,
    roc_auc_score,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides library warnings
# (e.g. convergence or deprecation notices) — consider narrowing it.
warnings.filterwarnings('ignore')

# Display settings: never truncate columns, and show up to 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Load Data and Analyze ASJC Field Distribution

In [None]:
# Cleaned paper-level dataset.
df = pd.read_pickle('../data/processed/cleaned_data.pkl')

# Feature matrix, classification target and metadata, read in that order.
X_all, y_cls, metadata = map(pd.read_pickle, (
    '../data/features/X_all.pkl',
    '../data/features/y_classification.pkl',
    '../data/features/metadata.pkl',
))

# Quick sanity check on the shapes of what was loaded.
for label, loaded in (("Dataset", df), ("Features", X_all), ("Target", y_cls)):
    print(f"{label}: {loaded.shape}")

In [None]:
# List every column whose name mentions "asjc" or "field" (case-insensitive),
# then summarise how well the 'ASJC field name' column is populated.
print("\n=== ASJC Field Columns ===")
asjc_cols = [
    name for name in df.columns
    if any(token in name.lower() for token in ('asjc', 'field'))
]
print(asjc_cols)

if 'ASJC field name' not in df.columns:
    # Without the field column there is nothing to summarise; show what exists.
    print("\nWarning: 'ASJC field name' not found. Available columns:")
    print(df.columns.tolist())
else:
    asjc_field = df['ASJC field name']

    print("\n=== Top 20 ASJC Fields ===")
    field_dist = asjc_field.value_counts().head(20)
    print(field_dist)

    n_unique = asjc_field.nunique()
    n_present = asjc_field.notna().sum()
    n_missing = asjc_field.isna().sum()
    print(f"\nTotal unique fields: {n_unique}")
    print(f"Papers with field data: {n_present} / {len(df)}")
    print(f"Missing field data: {n_missing}")

## 2. Create Domain Groupings

Group ASJC fields into broader research domains based on common categorization: seven named domains, plus an `Other` bucket for fields that are unmapped or missing.

In [None]:
# Lookup table: ASJC field name -> broader research domain.
# The table is written domain-first (each domain followed by its member
# fields) and inverted by the comprehension, so the resulting dict has one
# entry per ASJC field, in the order the fields are listed here.
domain_mapping = {
    asjc_field: domain_name
    for domain_name, asjc_fields in (
        ('Medicine & Health', (
            'Medicine',
            'Nursing',
            'Health Professions',
            'Dentistry',
            'Pharmacology, Toxicology and Pharmaceutics',
            'Immunology and Microbiology',
            'Neuroscience',
        )),
        ('Life Sciences', (
            'Biochemistry, Genetics and Molecular Biology',
            'Agricultural and Biological Sciences',
            'Environmental Science',
        )),
        ('Natural Sciences', (
            'Chemistry',
            'Physics and Astronomy',
            'Earth and Planetary Sciences',
            'Mathematics',
        )),
        ('Engineering & Technology', (
            'Engineering',
            'Computer Science',
            'Materials Science',
            'Chemical Engineering',
            'Energy',
        )),
        ('Social Sciences', (
            'Social Sciences',
            'Psychology',
            'Economics, Econometrics and Finance',
            'Business, Management and Accounting',
            'Decision Sciences',
        )),
        ('Arts & Humanities', (
            'Arts and Humanities',
        )),
        ('Multidisciplinary', (
            'Multidisciplinary',
        )),
    )
    for asjc_field in asjc_fields
}

# Attach a 'domain' column to df by translating each paper's ASJC field name
# through domain_mapping. Fields that are missing or not in the mapping
# become 'Other'.
if 'ASJC field name' in df.columns:
    # Map and fill in one expression, then assign the result to the column.
    # Calling fillna(..., inplace=True) on df['domain'] is a chained in-place
    # operation: it acts on an intermediate Series, which pandas warns about
    # and which does not update df when Copy-on-Write is active, leaving NaN
    # domains behind.
    df['domain'] = df['ASJC field name'].map(domain_mapping).fillna('Other')

    print("=== Domain Distribution ===")
    domain_dist = df['domain'].value_counts()
    print(domain_dist)
    print(f"\nTotal domains: {df['domain'].nunique()}")
else:
    print("Cannot create domain mapping without ASJC field name column")

## 3. Prepare Train/Test Splits with Domain Labels

In [None]:
# Read the pre-computed temporal train/test split (features, then targets).
X_train, X_test, y_train, y_test = map(pd.read_pickle, (
    '../data/features/X_train_temporal.pkl',
    '../data/features/X_test_temporal.pkl',
    '../data/features/y_train_cls_temporal.pkl',
    '../data/features/y_test_cls_temporal.pkl',
))

print(f"Train: {X_train.shape}, Test: {X_test.shape}")

# Look up each split's domain labels in df via the split's row index.
if 'domain' not in df.columns:
    print("\nError: Domain labels not created")
else:
    domains_train = df.loc[X_train.index, 'domain']
    domains_test = df.loc[X_test.index, 'domain']

    train_domain_counts = domains_train.value_counts()
    print("\n=== Train Set Domain Distribution ===")
    print(train_domain_counts)

    test_domain_counts = domains_test.value_counts()
    print("\n=== Test Set Domain Distribution ===")
    print(test_domain_counts)

## 4. Baseline Model (Universal, No Domain Segmentation)

In [None]:
banner = "=" * 80
print(banner)
print("BASELINE: Universal Model (No Domain Segmentation)")
print(banner)

# One logistic regression fitted on the whole training split, with class
# weights balanced.
model_baseline = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
)
model_baseline.fit(X_train, y_train)

# Positive-class probability, binarised at the 0.54 decision threshold.
y_pred_proba_baseline = model_baseline.predict_proba(X_test)[:, 1]
y_pred_baseline = (y_pred_proba_baseline >= 0.54).astype(int)

# Label-based metrics use the thresholded predictions; ROC-AUC uses the
# raw probabilities.
baseline_results = {
    metric_name: scorer(y_test, y_pred_baseline)
    for metric_name, scorer in (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
}
baseline_results['ROC-AUC'] = roc_auc_score(y_test, y_pred_proba_baseline)

print("\nBaseline Results:")
for metric_name, score in baseline_results.items():
    print(f"  {metric_name:12s}: {score*100:.2f}%")

print("\nConfusion Matrix:")
baseline_cm = confusion_matrix(y_test, y_pred_baseline)
print(baseline_cm)

## 5. Domain-Specific Models

In [None]:
print("\n" + "="*80)
print("DOMAIN-SPECIFIC MODELS")
print("="*80)

# Fitted model and evaluation metrics per domain, keyed by domain name.
# Domains that fail either guard below appear in neither dict.
domain_models = {}
domain_results = {}

for domain in sorted(domains_train.unique()):
    print(f"\n--- {domain} ---")

    # Row selectors for this domain in each split.
    in_train = domains_train == domain
    in_test = domains_test == domain

    X_train_domain, y_train_domain = X_train[in_train], y_train[in_train]
    X_test_domain, y_test_domain = X_test[in_test], y_test[in_test]

    n_train, n_test = len(X_train_domain), len(X_test_domain)
    print(f"Train: {n_train} papers, Test: {n_test} papers")

    # Guard 1: fewer than 50 test papers for this domain.
    if n_test < 50:
        print(f"  ‚ö†Ô∏è  Skipping - too few test samples (<50)")
        continue

    # Guard 2: either class has fewer than 10 training examples.
    n_positive = y_train_domain.sum()
    n_negative = len(y_train_domain) - n_positive
    if min(n_positive, n_negative) < 10:
        print(f"  ‚ö†Ô∏è  Skipping - insufficient class balance (need >10 samples per class)")
        continue

    # Same hyper-parameters as the universal baseline, fitted on this domain only.
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        n_jobs=-1,
        random_state=42,
    )
    model.fit(X_train_domain, y_train_domain)
    domain_models[domain] = model

    # Positive-class probability, binarised at the 0.54 threshold.
    y_pred_proba = model.predict_proba(X_test_domain)[:, 1]
    y_pred = (y_pred_proba >= 0.54).astype(int)

    results = {
        'n_train': n_train,
        'n_test': n_test,
        'accuracy': accuracy_score(y_test_domain, y_pred),
        'precision': precision_score(y_test_domain, y_pred, zero_division=0),
        'recall': recall_score(y_test_domain, y_pred, zero_division=0),
        'f1': f1_score(y_test_domain, y_pred, zero_division=0),
    }
    # ROC-AUC is only computed when both classes occur in the domain's test labels.
    if len(np.unique(y_test_domain)) > 1:
        results['roc_auc'] = roc_auc_score(y_test_domain, y_pred_proba)
    else:
        results['roc_auc'] = np.nan

    domain_results[domain] = results

    print(f"  F1: {results['f1']*100:.2f}%, ROC-AUC: {results['roc_auc']*100:.2f}%")

## 6. Calculate Overall Domain-Specific Performance

In [None]:
# Assemble one prediction per test paper by routing each paper to the model
# trained for its domain.
#
# Papers whose domain has no trained model (the domain was skipped during
# training, or it does not occur in the training split) fall back to the
# universal baseline predictions. Leaving them at a zero initialisation would
# score them as "negative with probability 0" and would unfairly depress the
# pooled metrics compared with the baseline.
y_pred_domain_all = np.array(y_pred_baseline, dtype=int)
y_pred_proba_domain_all = np.array(y_pred_proba_baseline, dtype=float)
covered_by_domain_model = np.zeros(len(y_test), dtype=bool)

for domain, model in domain_models.items():
    # Positional boolean mask, so it indexes the numpy arrays and X_test alike.
    test_mask = (domains_test == domain).to_numpy()
    if test_mask.any():
        X_test_domain = X_test[test_mask]
        y_pred_proba = model.predict_proba(X_test_domain)[:, 1]
        y_pred = (y_pred_proba >= 0.54).astype(int)

        y_pred_domain_all[test_mask] = y_pred
        y_pred_proba_domain_all[test_mask] = y_pred_proba
        covered_by_domain_model |= test_mask

# Make the size of the fallback visible so the comparison can be interpreted.
n_fallback = int((~covered_by_domain_model).sum())
if n_fallback:
    print(
        f"Note: {n_fallback} of {len(y_test)} test papers belong to domains "
        f"without a trained model; baseline predictions are used for them."
    )

# Overall metrics on the pooled predictions.
# NOTE(review): ROC-AUC here ranks probabilities that come from different
# models, so it is only meaningful if their scores are comparably calibrated.
overall_results = {
    'Accuracy': accuracy_score(y_test, y_pred_domain_all),
    'Precision': precision_score(y_test, y_pred_domain_all),
    'Recall': recall_score(y_test, y_pred_domain_all),
    'F1': f1_score(y_test, y_pred_domain_all),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_domain_all)
}

print("\n" + "="*80)
print("OVERALL DOMAIN-SPECIFIC RESULTS")
print("="*80)
print("\nOverall Results (Weighted across all domains):")
for metric, value in overall_results.items():
    print(f"  {metric:12s}: {value*100:.2f}%")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_domain_all))

## 7. Compare Baseline vs Domain-Specific

In [None]:
print("\n" + "="*80)
print("COMPARISON: BASELINE vs DOMAIN-SPECIFIC")
print("="*80)

# Side-by-side metrics, with 'Change' = domain-specific minus baseline.
comparison_df = pd.DataFrame({
    'Baseline (Universal)': baseline_results,
    'Domain-Specific': overall_results,
    'Change': {k: overall_results[k] - baseline_results[k] for k in baseline_results.keys()}
})

# Format as percentages. Only the 'Change' column carries an explicit sign;
# a forced "+" on the absolute scores would make them look like deltas
# (e.g. "+62.54%").
comparison_df_display = comparison_df.copy()
for col in comparison_df_display.columns:
    pct_format = "{:+.2f}%" if col == 'Change' else "{:.2f}%"
    comparison_df_display[col] = comparison_df[col].map(
        lambda x, pct_format=pct_format: pct_format.format(x * 100) if isinstance(x, float) else x
    )

print("\n", comparison_df_display)

# Determine improvement (F1 expressed as a fraction, so 0.01 == 1 point)
f1_change = overall_results['F1'] - baseline_results['F1']

print("\n" + "="*80)
if f1_change > 0.01:  # More than 1 percentage point
    print(f"‚úÖ IMPROVEMENT: +{f1_change*100:.2f} F1 points")
    print(f"   Domain-specific models perform better!")
    print(f"   F1: {baseline_results['F1']*100:.2f}% ‚Üí {overall_results['F1']*100:.2f}%")
elif f1_change > 0:
    print(f"‚ö†Ô∏è  SLIGHT IMPROVEMENT: +{f1_change*100:.2f} F1 points")
    print(f"   Marginal benefit from domain segmentation")
else:
    print(f"‚ùå NO IMPROVEMENT: {f1_change*100:+.2f} F1 points")
    print(f"   Domain segmentation did not help with current dataset size")
    print(f"   Likely due to small sample sizes per domain")
print("="*80)

## 8. Per-Domain Performance Analysis

In [None]:
# Build a per-domain table comparing the universal baseline with the
# domain-specific model on the same test papers, then plot it.
domain_comparison = []

for domain in sorted(domain_results.keys()):
    # Score the baseline's predictions restricted to this domain's test papers
    test_mask = domains_test == domain
    y_test_domain = y_test[test_mask]
    y_pred_baseline_domain = y_pred_baseline[test_mask]

    baseline_f1_domain = f1_score(y_test_domain, y_pred_baseline_domain, zero_division=0)
    domain_f1 = domain_results[domain]['f1']

    domain_comparison.append({
        'Domain': domain,
        'Test Size': domain_results[domain]['n_test'],
        'Baseline F1': baseline_f1_domain,
        'Domain-Specific F1': domain_f1,
        'Change': domain_f1 - baseline_f1_domain
    })

# Explicit columns keep the frame well-formed when no domain model was
# trained; otherwise sort_values('Change') raises KeyError on an empty frame.
domain_comp_df = pd.DataFrame(
    domain_comparison,
    columns=['Domain', 'Test Size', 'Baseline F1', 'Domain-Specific F1', 'Change'],
)
domain_comp_df = domain_comp_df.sort_values('Change', ascending=False)

print("\n=== Per-Domain F1 Comparison ===")
print(domain_comp_df.to_string(index=False))

# Visualize: grouped bars per domain plus a reference line at the overall baseline F1
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(domain_comp_df))
width = 0.35

ax.bar(x - width/2, domain_comp_df['Baseline F1']*100, width, label='Baseline (Universal)', alpha=0.8)
ax.bar(x + width/2, domain_comp_df['Domain-Specific F1']*100, width, label='Domain-Specific', alpha=0.8)

ax.set_xlabel('Research Domain')
ax.set_ylabel('F1 Score (%)')
ax.set_title('F1 Score Comparison: Baseline vs Domain-Specific Models')
ax.set_xticks(x)
ax.set_xticklabels(domain_comp_df['Domain'], rotation=45, ha='right')
ax.axhline(y=baseline_results['F1']*100, color='red', linestyle='--', 
           label=f"Overall Baseline: {baseline_results['F1']*100:.2f}%", alpha=0.5)
# The legend is created after every labelled artist exists; calling it before
# axhline() would leave the reference line out of the legend.
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Conclusion & Interpretation

In [None]:
# Print a conclusion that matches the measured F1 change. Figures quoted in
# the text are taken from this run's results rather than hard-coded, so the
# summary cannot contradict the numbers printed earlier.
print("\nüí° CONCLUSION:")
print("-" * 80)

if f1_change > 0.01:
    print("‚úÖ Domain-specific modeling IMPROVED performance!")
    print(f"\n   Overall F1 increased by {f1_change*100:.2f} points")
    print(f"   This validates Wu et al.'s (2023) findings on domain segmentation.")
    print(f"\n   Recommendation: With a larger dataset (20,000+ papers), domain-specific")
    print(f"   models could achieve F1 of 65-70%.")
else:
    # Range of per-domain training-set sizes actually used in this run.
    train_sizes = [res['n_train'] for res in domain_results.values()]
    if train_sizes:
        train_size_note = f"{min(train_sizes)}-{max(train_sizes)} papers"
    else:
        train_size_note = "no domain model could be trained"

    print("‚ö†Ô∏è  Domain-specific modeling did NOT improve performance.")
    print(f"\n   Change: {f1_change*100:+.2f} F1 points (not significant)")
    print(f"\n   Likely reasons:")
    print(f"   1. Dataset size: Training sets per domain are small ({train_size_note})")
    print(f"   2. Wu et al. used 4M+ papers, enabling robust domain-specific models")
    print(f"   3. Current universal model already captures most patterns")
    print(f"\n   Recommendation: Domain segmentation requires larger dataset.")
    print(f"   Baseline ({baseline_results['F1']*100:.2f}% F1) remains optimal for current data size.")

print("-" * 80)