# Fixed: XGBoost In-Domain vs Out-Domain Analysis

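This version keeps label encoding consistent across sites: a single label encoder is fitted on the `va34` labels from all sites and reused for every model, so a given class always maps to the same integer code regardless of which site the training data comes from.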

In [None]:
# Setup: Clone repository and install packages
import os

# Clone repository if not already present
if not os.path.exists('/content/tabicl'):
    print("Cloning repository...")
    !git clone https://github.com/cliu238/tabicl.git
    print("Repository cloned successfully!")
else:
    print("Repository already exists.")

# Change to repository directory
%cd /content/tabicl

# Install required packages
print("\nInstalling packages...")
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn plotly -q
print("Packages installed!")

# Verify data files exist
print("\nChecking data files:")
!ls -lh processed_data/*.csv

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Set style and random seed
# Use matplotlib's 'default' style sheet for the figures drawn in this notebook
plt.style.use('default')
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable
np.random.seed(42)

print("✅ All libraries imported successfully!")

In [None]:
# Load the processed dataset and print a short summary of it
df = pd.read_csv('processed_data/adult_numeric_20250729_155457.csv')

# Headline numbers: frame size, rows per site, number of target classes,
# and the total count of missing cells
print("📊 Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\n🏥 Sites distribution:")
print(df['site'].value_counts())
print(f"\n🎯 Target (va34): {df['va34'].nunique()} unique classes")
print(f"Missing values: {df.isna().sum().sum()}")

# Number of distinct va34 classes observed at each site
print("\n📊 Classes per site:")
for site in df['site'].unique():
    site_classes = df.loc[df['site'] == site, 'va34'].nunique()
    print(f"{site}: {site_classes} unique classes")

In [None]:
# Preprocessing
print("🔧 Preprocessing data...")

# Remove the 'cod5' column when the dataset contains one
if 'cod5' in df.columns:
    df = df.drop(columns='cod5')
    print("✅ Dropped 'cod5' column")

# A single encoder fitted on the va34 labels of ALL sites, so every model
# trained later shares one label -> integer mapping
global_le = LabelEncoder().fit(df['va34'])
print(f"\n Total unique labels across all sites: {len(global_le.classes_)}")

# Split the frame into site identifiers, target vector and feature matrix
sites = df['site']
y = df['va34']
X = df.drop(columns=['va34', 'site'])

print(f"\n📐 Data shapes:")
print(f"Features: {X.shape}")
print(f"Target: {y.shape}")
print(f"Sites: {sites.unique().tolist()}")

In [None]:
def create_domain_splits(df, test_size=0.2, random_state=42):
    """Create a train/test split for every site found in ``df``.

    The split strategy depends on how much data a site has:

    * fewer than 10 rows: no split is made; all rows of the site are used as
      both the training and the test set (so evaluation on such a site is
      in-sample);
    * fewer than 50 rows, or some class with fewer than 2 samples: a plain
      (unstratified) random split;
    * otherwise: a split stratified on the target, falling back to a plain
      split if stratification is rejected.

    Args:
        df: DataFrame with a 'site' column, a 'va34' target column and the
            feature columns (every remaining column).
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        Dict mapping each site to a dict with the keys 'X_train', 'X_test',
        'y_train', 'y_test' (the split) and 'full_X', 'full_y' (all rows of
        that site).
    """
    domain_splits = {}

    for site in df['site'].unique():
        site_data = df[df['site'] == site]
        X_site = site_data.drop(['va34', 'site'], axis=1)
        y_site = site_data['va34']

        print(f"\nProcessing {site}:")
        print(f"  Samples: {len(site_data)}")
        print(f"  Unique classes: {y_site.nunique()}")

        # Stratification needs at least two samples of every class
        min_samples_per_class = y_site.value_counts().min()

        if len(site_data) < 50 or min_samples_per_class < 2:
            print("  Using simple split (too few samples for stratification)")
            if len(site_data) < 10:
                # Very small dataset: use all rows for training and testing
                X_train, X_test, y_train, y_test = X_site, X_site, y_site, y_site
            else:
                X_train, X_test, y_train, y_test = train_test_split(
                    X_site, y_site, test_size=test_size, random_state=random_state
                )
        else:
            print("  Using stratified split")
            try:
                X_train, X_test, y_train, y_test = train_test_split(
                    X_site, y_site, test_size=test_size,
                    random_state=random_state, stratify=y_site
                )
            except ValueError:
                # train_test_split rejects an impossible stratification with
                # ValueError; only that case falls back to a plain split, so
                # unrelated failures (and KeyboardInterrupt) still propagate.
                print("  Stratification failed, using simple split")
                X_train, X_test, y_train, y_test = train_test_split(
                    X_site, y_site, test_size=test_size, random_state=random_state
                )

        domain_splits[site] = {
            'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test,
            'full_X': X_site, 'full_y': y_site
        }

    return domain_splits

def train_xgboost_model(X_train, y_train, global_le):
    """Fit an XGBoost multi-class classifier on globally encoded labels.

    Args:
        X_train: Training features, passed unchanged to ``XGBClassifier.fit``.
        y_train: Raw training labels; converted to integer codes with
            ``global_le.transform``.
        global_le: Fitted label encoder shared by all models; its
            ``classes_`` determines ``num_class``.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    # Encode with the shared encoder so the codes mean the same for every model
    encoded_targets = global_le.transform(y_train)

    # NOTE(review): y_train may not contain every class known to global_le;
    # some XGBoost releases may reject label sets that are not exactly
    # 0..k-1 -- confirm against the installed version.
    classifier = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(global_le.classes_),  # all classes, not only those in y_train
        max_depth=4,
        learning_rate=0.1,
        n_estimators=100,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        verbosity=0,
        use_label_encoder=False,
        eval_metric='mlogloss',
    )
    classifier.fit(X_train, encoded_targets)
    return classifier

def evaluate_model(model, global_le, X_test, y_test):
    """Score ``model`` on a test set using the shared label encoder.

    Args:
        model: Object exposing ``predict(X_test)``.
        global_le: Fitted label encoder used to encode ``y_test``.
        X_test: Test features.
        y_test: Raw test labels.

    Returns:
        Dict with 'accuracy', 'balanced_accuracy', 'f1_macro', 'predictions'
        and 'true_labels' (the encoded test labels). If anything raises
        during encoding, prediction or scoring, the error is printed and a
        dict with all three metrics set to 0.0 and empty lists is returned
        instead.
    """
    try:
        # Encode the ground truth with the shared encoder, then predict
        true_codes = global_le.transform(y_test)
        predicted_codes = model.predict(X_test)

        scores = {
            'accuracy': accuracy_score(true_codes, predicted_codes),
            'balanced_accuracy': balanced_accuracy_score(true_codes, predicted_codes),
            # zero_division=0 covers classes that are never predicted
            'f1_macro': f1_score(
                true_codes, predicted_codes, average='macro', zero_division=0
            ),
        }
    except Exception as err:
        print(f"    Error in evaluation: {err}")
        return {
            'accuracy': 0.0,
            'balanced_accuracy': 0.0,
            'f1_macro': 0.0,
            'predictions': [],
            'true_labels': []
        }

    scores['predictions'] = predicted_codes
    scores['true_labels'] = true_codes
    return scores

# Reached only if the helper definitions earlier in this cell ran without raising
print("✅ Functions defined successfully!")

In [None]:
# Build the per-site train/test splits and list their sizes
print("Creating domain splits...")
print("="*60)
domain_data = create_domain_splits(df)

print("\n" + "="*60)
print("📁 Domain splits summary:")
for site, data in domain_data.items():
    n_train = len(data['X_train'])
    n_test = len(data['X_test'])
    n_total = len(data['full_X'])
    print(f"{site:10} - Train: {n_train:4}, Test: {n_test:4}, Total: {n_total:4}")

In [None]:
# In-domain evaluation: train on a site's training split, test on the same
# site's test split, always with the shared label encoder
print("🎯 Evaluating In-Domain Performance...")
print("="*50)

in_domain_results = {}

for site, splits in domain_data.items():
    print(f"\nTraining for {site}...")

    # Report how many distinct classes this site's training split contains
    site_classes = splits['y_train'].nunique()
    print(f"  Training classes: {site_classes}")

    model = train_xgboost_model(splits['X_train'], splits['y_train'], global_le)
    results = evaluate_model(model, global_le, splits['X_test'], splits['y_test'])

    # Keep the fitted model next to its metrics
    results['model'] = model
    in_domain_results[site] = results

    print(f"  ✓ Accuracy: {results['accuracy']:.4f}")
    print(f"  ✓ Balanced Acc: {results['balanced_accuracy']:.4f}")
    print(f"  ✓ F1 Macro: {results['f1_macro']:.4f}")

In [None]:
# Grouped bar chart of the in-domain metrics, followed by summary statistics
sites_list = list(in_domain_results)
accuracies = [metrics['accuracy'] for metrics in in_domain_results.values()]
balanced_accs = [metrics['balanced_accuracy'] for metrics in in_domain_results.values()]
f1_scores = [metrics['f1_macro'] for metrics in in_domain_results.values()]

# One group of three bars per site
x = np.arange(len(sites_list))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
bar_specs = [
    (x - width, accuracies, 'Accuracy', '#2E7D32'),
    (x, balanced_accs, 'Balanced Accuracy', '#1976D2'),
    (x + width, f1_scores, 'F1 Macro', '#F57C00'),
]
bars1, bars2, bars3 = [
    ax.bar(positions, values, width, label=label, color=color, alpha=0.8)
    for positions, values, label, color in bar_specs
]

ax.set_xlabel('Site', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('In-Domain Performance by Site', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(sites_list)
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1])

# Write each bar's value just above it; bars without a positive height get no label
for bar in (*bars1, *bars2, *bars3):
    height = bar.get_height()
    if not height > 0:
        continue
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
            f'{height:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Mean and (NumPy default) standard deviation of each metric across sites
print("\n📊 In-Domain Performance Summary:")
print(f"Average Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Average Balanced Accuracy: {np.mean(balanced_accs):.4f} ± {np.std(balanced_accs):.4f}")
print(f"Average F1 Macro: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

In [None]:
# Out-domain evaluation: fit on all data of one site, score on all data of
# every other site
print("🔄 Evaluating Out-Domain Performance...")
print("(Train on one site, test on others)")
print("="*50)

out_domain_results = {}

for train_site, train_split in domain_data.items():
    out_domain_results[train_site] = {}

    # The training site is never a test target, so its full data is used
    print(f"\nTraining on {train_site}...")
    model = train_xgboost_model(
        train_split['full_X'], train_split['full_y'], global_le
    )

    for test_site, test_split in domain_data.items():
        if test_site != train_site:
            results = evaluate_model(
                model, global_le, test_split['full_X'], test_split['full_y']
            )
            out_domain_results[train_site][test_site] = results
            print(f"  {train_site} → {test_site}: {results['accuracy']:.4f}")

In [None]:
# Cross-domain (leave-one-site-out) evaluation: fit on the pooled data of all
# other sites, score on the held-out site
print("🌐 Evaluating Cross-Domain Performance...")
print("(Train on multiple sites, test on held-out)")
print("="*50)

cross_domain_results = {}

for held_out_site, held_out_split in domain_data.items():
    train_sites = [s for s in domain_data.keys() if s != held_out_site]

    # Stack the full data of every training site row-wise
    X_train_combined = pd.concat(
        [domain_data[s]['full_X'] for s in train_sites], axis=0
    )
    y_train_combined = pd.concat(
        [domain_data[s]['full_y'] for s in train_sites], axis=0
    )

    print(f"\nTraining on {', '.join(train_sites)}")
    print(f"  Combined training size: {len(X_train_combined)} samples")

    model = train_xgboost_model(X_train_combined, y_train_combined, global_le)

    results = evaluate_model(
        model, global_le, held_out_split['full_X'], held_out_split['full_y']
    )

    # Keep the model and the list of training sites next to the metrics
    results['model'] = model
    results['train_sites'] = train_sites
    cross_domain_results[held_out_site] = results

    print(f"  Testing on {held_out_site}: {results['accuracy']:.4f}")

In [None]:
# Final report: compare accuracy per site across the three scenarios
print("\n" + "="*80)
print(" "*20 + "FINAL PERFORMANCE COMPARISON")
print("="*80)

# One row per (site, scenario), in the order In-Domain, Out-Domain (Avg),
# Cross-Domain
comparison_data = []
for site in domain_data:
    site_rows = [('In-Domain', in_domain_results[site]['accuracy'])]

    # Mean accuracy on this site of the models trained on each other single site
    out_accs = [
        out_domain_results[train][site]['accuracy']
        for train in domain_data
        if train != site
    ]
    if out_accs:
        site_rows.append(('Out-Domain (Avg)', np.mean(out_accs)))

    site_rows.append(('Cross-Domain', cross_domain_results[site]['accuracy']))

    comparison_data.extend(
        {'Site': site, 'Scenario': label, 'Accuracy': value}
        for label, value in site_rows
    )

df_comparison = pd.DataFrame(comparison_data)

# Grouped bar chart, one colour per scenario
scenario_colors = {
    'In-Domain': '#2E7D32',
    'Out-Domain (Avg)': '#F57C00',
    'Cross-Domain': '#1976D2'
}
fig = px.bar(
    df_comparison,
    x='Site',
    y='Accuracy',
    color='Scenario',
    title='Performance Comparison: In-Domain vs Out-Domain vs Cross-Domain',
    barmode='group',
    height=500,
    color_discrete_map=scenario_colors,
)
fig.update_layout(yaxis_range=[0, 1])
fig.show()

# Per-scenario statistics over sites (pandas defaults for mean/std/min/max)
print("\n📊 Overall Performance Summary:")
print("="*50)

for scenario in ['In-Domain', 'Out-Domain (Avg)', 'Cross-Domain']:
    scenario_data = df_comparison.loc[df_comparison['Scenario'] == scenario, 'Accuracy']
    if scenario_data.empty:
        continue
    print(f"\n{scenario}:")
    print(f"  Mean: {scenario_data.mean():.4f}")
    print(f"  Std:  {scenario_data.std():.4f}")
    print(f"  Min:  {scenario_data.min():.4f}")
    print(f"  Max:  {scenario_data.max():.4f}")