# Final Fix: XGBoost In-Domain vs Out-Domain Analysis

This version handles non-consecutive class labels by remapping the `va34` codes to consecutive integers starting at 0 before training.

In [None]:
# Setup: Clone repository and install packages
# NOTE: the `!cmd` shell escapes and the `%cd` magic below are IPython syntax,
# so this cell only runs inside a notebook/IPython session.
import os

# Clone repository if not already present
# The check uses the absolute path /content/tabicl while `git clone` writes into
# the current working directory — assumes the session starts in /content
# (TODO confirm for the target environment).
if not os.path.exists('/content/tabicl'):
    print("Cloning repository...")
    !git clone https://github.com/cliu238/tabicl.git
    print("Repository cloned successfully!")
else:
    print("Repository already exists.")

# Change to repository directory
# Later cells read 'processed_data/...' relative to this directory.
%cd /content/tabicl

# Install required packages
print("\nInstalling packages...")
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn plotly -q
print("Packages installed!")

# Verify data files exist
# `ls` only lists matches; a missing file shows up as shell output here, not as
# a Python exception.
print("\nChecking data files:")
!ls -lh processed_data/*.csv

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Set style and random seed
# This seeds NumPy's global RNG only; train_test_split and XGBClassifier further
# down are each given their own explicit random_state=42.
plt.style.use('default')
np.random.seed(42)

print("✅ All libraries imported successfully!")

In [None]:
# Load and explore the dataset
# The path is relative to the current working directory (the setup cell changes
# into the repository root first).
df = pd.read_csv('processed_data/adult_numeric_20250729_155457.csv')

print("📊 Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\n🏥 Sites distribution:")
# Row count per distinct value of the 'site' column
print(df['site'].value_counts())
# 'va34' is the classification target used throughout the notebook
print(f"\n🎯 Target (va34): {df['va34'].nunique()} unique classes")
print(f"Unique target values: {sorted(df['va34'].unique())}")
# Total number of missing cells across the whole frame
print(f"Missing values: {df.isnull().sum().sum()}")

In [None]:
# Preprocessing with proper label encoding
print("🔧 Preprocessing data...")

# 'cod5' is discarded up front whenever the column is present
if 'cod5' in df.columns:
    df = df.drop(columns='cod5')
    print("✅ Dropped 'cod5' column")

# The raw va34 codes are ranked in sorted order and each code is replaced by
# its rank, giving consecutive integer labels that start at 0.
print("\n🔄 Converting labels to consecutive integers...")
reverse_mapping = dict(enumerate(sorted(df['va34'].unique())))  # encoded -> raw
label_mapping = {label: idx for idx, label in reverse_mapping.items()}  # raw -> encoded

df['va34_encoded'] = df['va34'].map(label_mapping)

print(f"Original labels: {list(label_mapping.keys())[:10]}...")
print(f"Encoded labels: {list(range(len(label_mapping)))}")
print(f"Total number of classes: {len(label_mapping)}")

# Split the frame into model inputs, encoded/raw targets, and the site column
X = df.drop(columns=['va34', 'va34_encoded', 'site'])
y, y_original, sites = df['va34_encoded'], df['va34'], df['site']

print(f"\n📐 Data shapes:")
print(f"Features: {X.shape}")
print(f"Target: {y.shape}")
print(f"Sites: {sites.unique().tolist()}")

In [None]:
def create_domain_splits(df, test_size=0.2, random_state=42):
    """Partition ``df`` by site and build a train/test split for each one.

    Args:
        df: Frame containing the columns 'site', 'va34' and 'va34_encoded';
            every remaining column is treated as a feature.
        test_size: Fraction of each site's rows held out for testing.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Dict keyed by site value. Each entry holds 'X_train', 'X_test',
        'y_train', 'y_test' plus the unsplit 'full_X' / 'full_y' for the site.
        Targets are the encoded labels ('va34_encoded').
    """
    non_feature_cols = ['va34', 'va34_encoded', 'site']
    split_keys = ('X_train', 'X_test', 'y_train', 'y_test')
    splits_by_site = {}

    for site in df['site'].unique():
        rows = df[df['site'] == site]
        features = rows.drop(non_feature_cols, axis=1)
        labels = rows['va34_encoded']

        print(f"\nProcessing {site}:")
        print(f"  Samples: {len(rows)}")
        print(f"  Unique classes: {rows['va34'].nunique()}")

        if len(rows) < 10:
            # Too few rows to hold anything out: the whole site serves as both
            # the training and the test set.
            parts = (features, features, labels, labels)
        else:
            parts = train_test_split(
                features, labels, test_size=test_size, random_state=random_state
            )

        splits_by_site[site] = dict(
            zip(split_keys, parts), full_X=features, full_y=labels
        )

    return splits_by_site

class _GlobalLabelModel:
    """Present a classifier trained on local labels in the global label space.

    ``estimator`` is fitted on labels 0..k-1, where k is the number of distinct
    classes in its training set; ``classes_[i]`` is the global label that local
    index ``i`` stands for. ``predict`` returns global labels. Any other
    attribute is looked up on the wrapped estimator unchanged, so e.g.
    ``predict_proba`` columns follow ``classes_``, not the global encoding.
    """

    def __init__(self, estimator, classes):
        self.estimator = estimator  # None when only one class was seen
        self.classes_ = np.asarray(classes)

    def predict(self, X):
        """Return one global class label per row of ``X``."""
        if self.estimator is None:
            # A single training class: that class is the only possible answer.
            return np.full(len(X), self.classes_[0])
        local_idx = np.asarray(self.estimator.predict(X)).astype(int).ravel()
        return self.classes_[local_idx]

    def __getattr__(self, name):
        # Only reached for attributes missing on the wrapper itself. Going
        # through __dict__ avoids recursion while the instance is half-built.
        estimator = self.__dict__.get('estimator')
        if estimator is None:
            raise AttributeError(name)
        return getattr(estimator, name)


def train_xgboost_model(X_train, y_train, num_classes):
    """Train an XGBoost classifier on a possibly incomplete set of classes.

    The scikit-learn wrapper of XGBoost validates that the training labels are
    exactly 0..k-1. A training set that lacks some of the globally encoded
    classes (a single site, for instance) violates that even though every
    label is a valid global code. The labels are therefore re-encoded to
    0..k-1 for fitting, and the returned model maps predictions back.

    Args:
        X_train: Feature matrix.
        y_train: Integer class labels in the global encoding.
        num_classes: Size of the global label space. Kept for interface
            compatibility; the class count used for training is derived from
            ``y_train``.

    Returns:
        An object whose ``predict(X)`` yields labels in the same encoding as
        ``y_train``. The fitted ``XGBClassifier`` is available as
        ``.estimator`` (``None`` if ``y_train`` held a single class).

    Raises:
        ValueError: If ``y_train`` is empty.
    """
    classes_present, y_local = np.unique(
        np.asarray(y_train).ravel(), return_inverse=True
    )
    n_present = len(classes_present)

    if n_present == 0:
        raise ValueError("Cannot train a classifier on an empty label set")
    if n_present == 1:
        # Nothing to learn; avoid fitting a degenerate one-class booster.
        return _GlobalLabelModel(None, classes_present)

    params = {
        'max_depth': 4,
        'learning_rate': 0.1,
        'n_estimators': 100,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'random_state': 42,
        'verbosity': 0,
        'use_label_encoder': False,
    }
    if n_present > 2:
        params.update(
            objective='multi:softprob',
            num_class=n_present,
            eval_metric='mlogloss',
        )
    else:
        # Exactly two classes: use the binary objective so that predict()
        # returns a flat vector of 0/1 indices.
        params.update(objective='binary:logistic', eval_metric='logloss')

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_local.ravel())
    return _GlobalLabelModel(model, classes_present)

def evaluate_model(model, X_test, y_test):
    """Score ``model`` on a labelled set, falling back to zeros on any error.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        Dict with 'accuracy', 'balanced_accuracy', 'f1_macro', the raw
        'predictions' and the 'true_labels'. If prediction or scoring raises,
        the error is printed and the three metrics are 0.0 with empty lists
        for predictions and labels.
    """
    metric_names = ('accuracy', 'balanced_accuracy', 'f1_macro')
    try:
        predicted = model.predict(X_test)

        # Macro-F1 is averaged over every class that occurs in either the
        # truth or the predictions.
        label_union = np.union1d(y_test, predicted)

        scores = {
            'accuracy': accuracy_score(y_test, predicted),
            'balanced_accuracy': balanced_accuracy_score(y_test, predicted),
            'f1_macro': f1_score(
                y_test, predicted,
                labels=label_union, average='macro', zero_division=0,
            ),
        }
    except Exception as e:
        print(f"    Error in evaluation: {e}")
        return {
            **dict.fromkeys(metric_names, 0.0),
            'predictions': [],
            'true_labels': [],
        }

    return {**scores, 'predictions': predicted, 'true_labels': y_test}

print("✅ Functions defined successfully!")

In [None]:
# Create domain splits
# Builds one train/test partition per site from the preprocessed frame; the
# resulting dict is keyed by site and reused by every evaluation cell below.
print("Creating domain splits...")
print("="*60)
domain_data = create_domain_splits(df)

print("\n" + "="*60)
print("📁 Domain splits summary:")
# One line per site: training rows, test rows, and all rows for that site
for site, data in domain_data.items():
    print(f"{site:10} - Train: {len(data['X_train']):4}, Test: {len(data['X_test']):4}, Total: {len(data['full_X']):4}")

In [None]:
# In-domain performance evaluation
print("🎯 Evaluating In-Domain Performance...")
print("="*50)

num_classes = len(label_mapping)  # size of the global label space
in_domain_results = {}

# For each site: fit on its training split, score on its own test split
for site, data in domain_data.items():
    print(f"\nTraining for {site}...")

    # How many distinct classes this site's training split actually contains
    train_classes = data['y_train'].nunique()
    print(f"  Training samples: {len(data['y_train'])}")
    print(f"  Unique training classes: {train_classes}")

    model = train_xgboost_model(data['X_train'], data['y_train'], num_classes)
    results = evaluate_model(model, data['X_test'], data['y_test'])

    # Keep the fitted model next to its metrics
    results['model'] = model
    in_domain_results[site] = results

    print(f"  ✓ Accuracy: {results['accuracy']:.4f}")
    print(f"  ✓ Balanced Acc: {results['balanced_accuracy']:.4f}")
    print(f"  ✓ F1 Macro: {results['f1_macro']:.4f}")

In [None]:
# Visualize in-domain results
# Pull the three metrics out of in_domain_results, one value per site, in the
# same site order for all three lists.
sites_list = list(in_domain_results.keys())
accuracies = [in_domain_results[s]['accuracy'] for s in sites_list]
balanced_accs = [in_domain_results[s]['balanced_accuracy'] for s in sites_list]
f1_scores = [in_domain_results[s]['f1_macro'] for s in sites_list]

# Create bar plot
# Three bars per site, each `width` wide, centred on the site's x position.
x = np.arange(len(sites_list))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - width, accuracies, width, label='Accuracy', color='#2E7D32', alpha=0.8)
bars2 = ax.bar(x, balanced_accs, width, label='Balanced Accuracy', color='#1976D2', alpha=0.8)
bars3 = ax.bar(x + width, f1_scores, width, label='F1 Macro', color='#F57C00', alpha=0.8)

ax.set_xlabel('Site', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('In-Domain Performance by Site', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(sites_list)
ax.legend()
ax.grid(True, alpha=0.3)
# All three metrics lie in [0, 1]
ax.set_ylim([0, 1])

# Add value labels
# Each label is centred horizontally on its bar and placed just above its top.
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Mean ± spread across sites. np.std defaults to the population standard
# deviation (ddof=0).
print("\n📊 In-Domain Performance Summary:")
print(f"Average Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Average Balanced Accuracy: {np.mean(balanced_accs):.4f} ± {np.std(balanced_accs):.4f}")
print(f"Average F1 Macro: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

In [None]:
# Out-domain performance evaluation
print("\n🔄 Evaluating Out-Domain Performance...")
print("(Train on one site, test on others)")
print("="*50)

# out_domain_results[train_site][test_site] -> metrics dict
out_domain_results = {}

for train_site, train_data in domain_data.items():
    per_target = out_domain_results[train_site] = {}

    # One model per source site, fitted on all of that site's rows
    print(f"\nTraining on {train_site}...")
    model = train_xgboost_model(
        train_data['full_X'], train_data['full_y'], num_classes
    )

    # Score that model against the complete data of every other site
    for test_site, test_data in domain_data.items():
        if test_site != train_site:
            results = evaluate_model(
                model, test_data['full_X'], test_data['full_y']
            )
            per_target[test_site] = results
            print(f"  {train_site} → {test_site}: {results['accuracy']:.4f}")

In [None]:
# Create out-domain performance matrix
# Rows are training sites, columns are test sites. The diagonal carries the
# in-domain accuracy; every other entry is the single-site transfer accuracy.
sites_list = list(domain_data)
n_sites = len(sites_list)
out_matrix = np.zeros((n_sites, n_sites))

for i, train_site in enumerate(sites_list):
    for j, test_site in enumerate(sites_list):
        if i == j:
            cell_result = in_domain_results[train_site]
        else:
            cell_result = out_domain_results[train_site][test_site]
        out_matrix[i, j] = cell_result['accuracy']

# Visualize matrix
plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(
    out_matrix,
    annot=True,
    fmt='.3f',
    xticklabels=sites_list,
    yticklabels=sites_list,
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Accuracy'},
    linewidths=0.5,
    linecolor='gray',
)

plt.title('Domain Transfer Performance Matrix\n(Train → Test)', fontsize=14, fontweight='bold')
plt.ylabel('Train Site', fontsize=12)
plt.xlabel('Test Site', fontsize=12)

# Outline each diagonal (in-domain) cell with a blue unit square
for i in range(n_sites):
    heatmap_axes.add_patch(
        plt.Rectangle(xy=(i, i), width=1, height=1, fill=False, edgecolor='blue', lw=3)
    )

plt.tight_layout()
plt.show()

print("\n💡 Blue boxes = In-domain performance (diagonal)")

In [None]:
# Cross-domain performance evaluation
print("\n🌐 Evaluating Cross-Domain Performance...")
print("(Train on multiple sites, test on held-out)")
print("="*50)

cross_domain_results = {}

# Leave one site out: pool every other site's full data for training, then
# score on the complete data of the held-out site.
for held_out_site in domain_data:
    train_sites = [s for s in domain_data if s != held_out_site]

    X_train_combined = pd.concat(
        (domain_data[s]['full_X'] for s in train_sites), axis=0
    )
    y_train_combined = pd.concat(
        (domain_data[s]['full_y'] for s in train_sites), axis=0
    )

    print(f"\nTraining on {', '.join(train_sites)}")
    print(f"  Combined training size: {len(X_train_combined)} samples")

    model = train_xgboost_model(X_train_combined, y_train_combined, num_classes)

    held_out = domain_data[held_out_site]
    results = evaluate_model(model, held_out['full_X'], held_out['full_y'])

    # Store the model and the list of training sites alongside the metrics
    results.update(model=model, train_sites=train_sites)
    cross_domain_results[held_out_site] = results

    print(f"  Testing on {held_out_site}: {results['accuracy']:.4f}")

In [None]:
# Final performance comparison
# Collects one accuracy per (site, scenario) into a long-format frame, plots
# it, and prints summary statistics per scenario.
# NOTE(review): the in-domain accuracy comes from each site's held-out test
# split, whereas the out-domain and cross-domain accuracies are measured on a
# site's full data — keep that in mind when comparing the bars.
print("\n" + "="*80)
print(" "*20 + "FINAL PERFORMANCE COMPARISON")
print("="*80)

# Create comparison dataframe
comparison_data = []
for site in domain_data.keys():
    # In-domain
    comparison_data.append({
        'Site': site,
        'Scenario': 'In-Domain',
        'Accuracy': in_domain_results[site]['accuracy']
    })
    
    # Out-domain average
    # Mean accuracy on this site over the models trained on each other single
    # site; skipped when there is no other site.
    out_accs = [out_domain_results[train][site]['accuracy'] 
               for train in domain_data.keys() if train != site]
    if out_accs:
        comparison_data.append({
            'Site': site,
            'Scenario': 'Out-Domain (Avg)',
            'Accuracy': np.mean(out_accs)
        })
    
    # Cross-domain
    # Accuracy on this site of the model trained on all other sites pooled
    comparison_data.append({
        'Site': site,
        'Scenario': 'Cross-Domain',
        'Accuracy': cross_domain_results[site]['accuracy']
    })

df_comparison = pd.DataFrame(comparison_data)

# Create visualization
# Grouped bars: one group per site, one bar per scenario.
fig = px.bar(df_comparison, x='Site', y='Accuracy', color='Scenario',
             title='Performance Comparison: In-Domain vs Out-Domain vs Cross-Domain',
             barmode='group', height=500,
             color_discrete_map={
                 'In-Domain': '#2E7D32',
                 'Out-Domain (Avg)': '#F57C00',
                 'Cross-Domain': '#1976D2'
             })
fig.update_layout(yaxis_range=[0, 1])
fig.show()

# Print final summary
print("\n📊 Overall Performance Summary:")
print("="*50)

# Per-scenario statistics across sites.
# NOTE(review): Series.std() defaults to the sample standard deviation
# (ddof=1), while the in-domain summary printed earlier uses np.std (ddof=0).
for scenario in ['In-Domain', 'Out-Domain (Avg)', 'Cross-Domain']:
    scenario_data = df_comparison[df_comparison['Scenario'] == scenario]['Accuracy']
    if len(scenario_data) > 0:
        print(f"\n{scenario}:")
        print(f"  Mean: {scenario_data.mean():.4f}")
        print(f"  Std:  {scenario_data.std():.4f}")
        print(f"  Min:  {scenario_data.min():.4f}")
        print(f"  Max:  {scenario_data.max():.4f}")

# Calculate domain shift
# Scenario means taken over sites
in_domain_mean = df_comparison[df_comparison['Scenario'] == 'In-Domain']['Accuracy'].mean()
out_domain_mean = df_comparison[df_comparison['Scenario'] == 'Out-Domain (Avg)']['Accuracy'].mean()
cross_domain_mean = df_comparison[df_comparison['Scenario'] == 'Cross-Domain']['Accuracy'].mean()

# Absolute differences between scenario means; the percentage in parentheses
# is relative to the in-domain mean for the two "drop" lines and to the
# out-domain mean for the "gain" line.
print("\n📉 Domain Shift Analysis:")
print(f"  In-Domain → Out-Domain drop: {(in_domain_mean - out_domain_mean):.4f} ({(in_domain_mean - out_domain_mean)/in_domain_mean*100:.1f}%)")
print(f"  In-Domain → Cross-Domain drop: {(in_domain_mean - cross_domain_mean):.4f} ({(in_domain_mean - cross_domain_mean)/in_domain_mean*100:.1f}%)")
print(f"  Cross-Domain vs Out-Domain gain: {(cross_domain_mean - out_domain_mean):.4f} ({(cross_domain_mean - out_domain_mean)/out_domain_mean*100:.1f}%)")

print("\n✅ Analysis Complete!")