# Complete XGBoost In-Domain vs Out-Domain Analysis

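This notebook clones the `tabicl` repository, installs the required packages, and then trains XGBoost classifiers on the `va34` target to compare three scenarios per site: in-domain (train and test on the same site), out-domain (train on one site, test on another), and cross-domain (train on all other sites, test on the held-out site).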

In [None]:
# Setup: Clone repository and install packages
import os

# Clone repository if not already present
if not os.path.exists('/content/tabicl'):
    print("Cloning repository...")
    !git clone https://github.com/cliu238/tabicl.git
    print("Repository cloned successfully!")
else:
    print("Repository already exists.")

# Change to repository directory
%cd /content/tabicl

# Install required packages
print("\nInstalling packages...")
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn plotly -q
print("Packages installed!")

# Verify data files exist
print("\nChecking data files:")
!ls -lh processed_data/*.csv

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Set the matplotlib style used by every figure below and seed NumPy.
# NOTE(review): the 'seaborn-v0_8-darkgrid' style name presumably requires a
# recent matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
# Seeds NumPy's global random number generator only. create_domain_splits and
# train_xgboost_model do not rely on it: they pass random_state=42 explicitly.
np.random.seed(42)

print("✅ All libraries imported successfully!")

In [None]:
# Read the pre-processed dataset and print a short summary of it:
# shape, rows per site, number of target classes and total missing cells.
df = pd.read_csv('processed_data/adult_numeric_20250729_155457.csv')

print(
    "📊 Dataset Overview:",
    f"Shape: {df.shape}",
    f"\n🏥 Sites distribution:",
    df['site'].value_counts(),
    f"\n🎯 Target (va34): {df['va34'].nunique()} unique classes",
    f"Missing values: {df.isna().sum().sum()}",
    sep="\n",
)

In [None]:
# Preprocessing: remove the unused 'cod5' column, then split the frame into
# the feature matrix, the 'va34' target and the per-row site label.
print("🔧 Preprocessing data...")

# 'cod5' is removed so that it is not used as a feature; the guard keeps this
# cell safe to re-run after the column is already gone.
if 'cod5' in df.columns:
    df = df.drop(columns='cod5')
    print("✅ Dropped 'cod5' column")

# Everything except the target and the site label is a feature
X = df.drop(columns=['va34', 'site'])
y, sites = df['va34'], df['site']

print(
    f"\n📐 Data shapes:",
    f"Features: {X.shape}",
    f"Target: {y.shape}",
    f"Sites: {sites.unique().tolist()}",
    sep="\n",
)

In [None]:
# Visualize site distribution: two side-by-side bar charts showing the number
# of samples and the number of distinct va34 classes for each site.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

site_counts = df['site'].value_counts()
classes_per_site = df.groupby('site')['va34'].nunique().sort_index()

# (axes, data, face colour, edge colour, title, y label, label offset above bar)
panel_specs = [
    (ax1, site_counts, 'skyblue', 'navy',
     'Samples per Site', 'Number of Samples', 20),
    (ax2, classes_per_site, 'lightcoral', 'darkred',
     'Unique Classes per Site', 'Number of Unique Classes', 0.5),
]

for panel_ax, series, face, edge, title, y_label, label_offset in panel_specs:
    panel_ax.bar(series.index, series.values, color=face, edgecolor=edge, alpha=0.7)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_xlabel('Site')
    # Write each bar's value just above it; bars sit at positions 0..n-1
    for position, (_, value) in enumerate(series.items()):
        panel_ax.text(position, value + label_offset, str(value),
                      ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 🎯 Model Training Functions

In [None]:
def create_domain_splits(df, test_size=0.2, random_state=42,
                         small_site_threshold=50, small_site_test_rows=10):
    """Create a train/test split for every site in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'site' column, a 'va34' target column, and the feature
        columns (everything else).
    test_size : float
        Fraction of rows held out for testing on regular-sized sites.
    random_state : int or None
        Seed used for every split so results are reproducible.
    small_site_threshold : int
        Sites with fewer rows than this use the small-site split below.
    small_site_test_rows : int
        Maximum number of rows held out for testing on a small site.

    Returns
    -------
    dict
        ``{site: {'X_train', 'X_test', 'y_train', 'y_test', 'full_X', 'full_y'}}``.
        'full_X' / 'full_y' are all rows of the site; the train and test parts
        never share a row.

    Notes
    -----
    Small sites are split by a seeded shuffle without stratification, holding
    out at most ``small_site_test_rows`` rows and never more than half of the
    site. A site with a single row therefore gets an empty test set.
    """
    domain_splits = {}

    for site in df['site'].unique():
        site_data = df[df['site'] == site]
        X_site = site_data.drop(['va34', 'site'], axis=1)
        y_site = site_data['va34']
        n_rows = len(site_data)

        if n_rows < small_site_threshold:
            # Hold the test rows OUT of the training set. Evaluating on rows
            # the model was trained on would inflate the in-domain scores.
            n_test = min(small_site_test_rows, n_rows // 2)
            order = np.random.RandomState(random_state).permutation(n_rows)
            test_pos, train_pos = order[:n_test], order[n_test:]
            X_train, X_test = X_site.iloc[train_pos], X_site.iloc[test_pos]
            y_train, y_test = y_site.iloc[train_pos], y_site.iloc[test_pos]
        else:
            # Prefer a stratified split; train_test_split raises ValueError
            # when it cannot stratify (e.g. a class with a single member), in
            # which case fall back to a plain random split.
            try:
                X_train, X_test, y_train, y_test = train_test_split(
                    X_site, y_site, test_size=test_size,
                    random_state=random_state, stratify=y_site
                )
            except ValueError:
                X_train, X_test, y_train, y_test = train_test_split(
                    X_site, y_site, test_size=test_size, random_state=random_state
                )

        domain_splits[site] = {
            'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test,
            'full_X': X_site, 'full_y': y_site
        }

    return domain_splits

def train_xgboost_model(X_train, y_train):
    """Fit a regularised multi-class XGBoost classifier.

    The raw labels in ``y_train`` are first mapped to consecutive integers
    with a LabelEncoder; the classifier is trained on those encoded labels.

    Returns
    -------
    (model, le)
        The fitted ``xgb.XGBClassifier`` and the fitted ``LabelEncoder`` needed
        to translate between raw labels and the model's class indices.
    """
    le = LabelEncoder()
    encoded_labels = le.fit_transform(y_train)

    model = xgb.XGBClassifier(
        objective='multi:softprob',
        # One output per distinct label seen in the training data
        num_class=len(np.unique(encoded_labels)),
        max_depth=4,
        learning_rate=0.1,
        n_estimators=100,
        # Row / column subsampling plus L1 / L2 penalties act as regularisation
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        verbosity=0,
    )
    model.fit(X_train, encoded_labels)

    return model, le

def evaluate_model(model, le, X_test, y_test):
    """Score ``model`` on a test set and return accuracy-style metrics.

    Parameters
    ----------
    model
        Classifier fitted on labels encoded by ``le`` (see train_xgboost_model).
    le : LabelEncoder
        Encoder fitted on the training labels.
    X_test, y_test
        Test features and their raw (un-encoded) labels.

    Returns
    -------
    dict
        ``{'accuracy', 'balanced_accuracy', 'f1_macro'}``. All three are 0.0
        for an empty test set.

    Notes
    -----
    Predictions are decoded back to raw labels and compared with ``y_test`` in
    the original label space. Encoding ``y_test`` with ``le`` instead fails as
    soon as the test set contains a label that was absent from training, which
    is routine when testing on a different site; such rows are simply counted
    as misclassified here. Genuine failures (for example a feature mismatch)
    are raised rather than silently reported as a score of zero.
    """
    y_true = np.asarray(y_test)
    if y_true.size == 0:
        # Nothing to score; keep the all-zero result shape callers expect.
        return {'accuracy': 0.0, 'balanced_accuracy': 0.0, 'f1_macro': 0.0}

    # The model predicts encoded class indices; map them back to raw labels.
    y_pred = le.inverse_transform(model.predict(X_test))

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0)
    }

print("✅ Functions defined successfully!")

## 📊 In-Domain Performance Analysis

In [None]:
# Build the per-site train/test splits and report their sizes
print("Creating domain splits...")
domain_data = create_domain_splits(df)

print("\n📁 Domain splits created:")
for site in domain_data:
    data = domain_data[site]
    print(f"{site:10} - Train: {len(data['X_train']):4}, Test: {len(data['X_test']):4}")

In [None]:
# In-domain performance: for every site, train on its training split and
# score on its own held-out test split.
print("🎯 Evaluating In-Domain Performance...")
print("=" * 50)

in_domain_results = {}

for site, split in domain_data.items():
    print(f"\nTraining for {site}...")

    model, le = train_xgboost_model(split['X_train'], split['y_train'])
    results = evaluate_model(model, le, split['X_test'], split['y_test'])

    # Keep the fitted model and encoder next to the metrics for later reuse
    results.update(model=model, le=le)
    in_domain_results[site] = results

    print(f"  ✓ Accuracy: {results['accuracy']:.4f}")
    print(f"  ✓ Balanced Acc: {results['balanced_accuracy']:.4f}")
    print(f"  ✓ F1 Macro: {results['f1_macro']:.4f}")

In [None]:
# Grouped bar chart of the three in-domain metrics for every site
sites_list = list(in_domain_results)
accuracies, balanced_accs, f1_scores = (
    [in_domain_results[s][metric] for s in sites_list]
    for metric in ('accuracy', 'balanced_accuracy', 'f1_macro')
)

x = np.arange(len(sites_list))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))

# (horizontal offset from the group centre, values, legend label, colour)
bar_specs = [
    (-width, accuracies, 'Accuracy', '#2E7D32'),
    (0, balanced_accs, 'Balanced Accuracy', '#1976D2'),
    (width, f1_scores, 'F1 Macro', '#F57C00'),
]
bars1, bars2, bars3 = [
    ax.bar(x + offset, values, width, label=label, color=colour, alpha=0.8)
    for offset, values, label, colour in bar_specs
]

ax.set_xlabel('Site', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('In-Domain Performance by Site', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(sites_list)
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 1])

# Print each bar's value just above its top edge
for bar in [*bars1, *bars2, *bars3]:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
            f'{height:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 🔄 Out-Domain Performance Analysis

In [None]:
# Out-domain performance: train on ALL rows of one site, then score that
# model on all rows of every other site.
print("🔄 Evaluating Out-Domain Performance...")
print("(Train on one site, test on others)")
print("=" * 50)

out_domain_results = {}

for train_site, train_split in domain_data.items():
    per_test_site = {}
    out_domain_results[train_site] = per_test_site

    print(f"\nTraining on {train_site}...")
    model, le = train_xgboost_model(train_split['full_X'], train_split['full_y'])

    for test_site, test_split in domain_data.items():
        # The diagonal (same site) is covered by the in-domain evaluation
        if test_site != train_site:
            results = evaluate_model(
                model, le, test_split['full_X'], test_split['full_y']
            )
            per_test_site[test_site] = results
            print(f"  {train_site} → {test_site}: {results['accuracy']:.4f}")

In [None]:
# Accuracy matrix with train sites as rows and test sites as columns.
# Off-diagonal cells are out-domain accuracies; the diagonal is filled with
# the in-domain accuracy of the corresponding site.
sites_list = list(domain_data)
n_sites = len(sites_list)

out_matrix = np.array(
    [
        [
            in_domain_results[train_site]['accuracy']
            if train_site == test_site
            else out_domain_results[train_site][test_site]['accuracy']
            for test_site in sites_list
        ]
        for train_site in sites_list
    ],
    dtype=float,
).reshape(n_sites, n_sites)

# Visualize matrix
plt.figure(figsize=(10, 8))
heatmap_options = dict(
    annot=True, fmt='.3f',
    xticklabels=sites_list, yticklabels=sites_list,
    cmap='RdYlGn', vmin=0, vmax=1,
    cbar_kws={'label': 'Accuracy'},
    linewidths=0.5, linecolor='gray',
)
sns.heatmap(out_matrix, **heatmap_options)

plt.title('Domain Transfer Performance Matrix\n(Train → Test)', fontsize=14, fontweight='bold')
plt.ylabel('Train Site', fontsize=12)
plt.xlabel('Test Site', fontsize=12)

# Outline the diagonal (in-domain) cells in blue
heat_ax = plt.gca()
for i in range(n_sites):
    outline = plt.Rectangle((i, i), 1, 1, fill=False, edgecolor='blue', lw=3)
    heat_ax.add_patch(outline)

plt.tight_layout()
plt.show()

print("\n💡 Blue boxes = In-domain performance (diagonal)")

## 🌐 Cross-Domain Performance Analysis

In [None]:
# Cross-domain (leave-one-site-out) performance: train on the pooled rows of
# every other site and score on all rows of the held-out site.
print("🌐 Evaluating Cross-Domain Performance...")
print("(Train on multiple sites, test on held-out)")
print("=" * 50)

cross_domain_results = {}

for held_out_site, held_out_split in domain_data.items():
    train_sites = [s for s in domain_data if s != held_out_site]

    # Stack the full data of all remaining sites into one training set
    X_train_combined = pd.concat(
        [domain_data[s]['full_X'] for s in train_sites], axis=0
    )
    y_train_combined = pd.concat(
        [domain_data[s]['full_y'] for s in train_sites], axis=0
    )

    print(f"\nTraining on {', '.join(train_sites)}")
    print(f"  Combined training size: {len(X_train_combined)} samples")

    model, le = train_xgboost_model(X_train_combined, y_train_combined)

    results = evaluate_model(
        model, le, held_out_split['full_X'], held_out_split['full_y']
    )

    # Store the model and the list of training sites alongside the metrics
    results.update(model=model, train_sites=train_sites)
    cross_domain_results[held_out_site] = results

    print(f"  Testing on {held_out_site}: {results['accuracy']:.4f}")

## 📈 Performance Comparison

In [None]:
# Side-by-side comparison of the three scenarios, one row per (site, scenario)
sites_list = list(domain_data)

performance_data = []

for site in sites_list:
    # Accuracy on `site` of each model trained on a different single site
    out_accs = [out_domain_results[train][site]['accuracy']
                for train in sites_list if train != site]

    scenario_accuracies = (
        ('In-Domain', in_domain_results[site]['accuracy']),
        ('Out-Domain (Avg)', np.mean(out_accs) if out_accs else 0),
        ('Cross-Domain', cross_domain_results[site]['accuracy']),
    )
    performance_data.extend(
        {'Site': site, 'Scenario': scenario, 'Accuracy': accuracy}
        for scenario, accuracy in scenario_accuracies
    )

df_perf = pd.DataFrame(performance_data)

# Grouped bar chart: one group per site, one bar per scenario
scenario_colours = {
    'In-Domain': '#2E7D32',
    'Out-Domain (Avg)': '#F57C00',
    'Cross-Domain': '#1976D2'
}
fig = px.bar(
    df_perf,
    x='Site',
    y='Accuracy',
    color='Scenario',
    title='Performance Comparison: In-Domain vs Out-Domain vs Cross-Domain',
    barmode='group',
    height=500,
    color_discrete_map=scenario_colours,
)

fig.update_layout(
    xaxis_tickangle=-45,
    yaxis_range=[0, 1],
    yaxis_title="Accuracy",
    font=dict(size=12)
)

fig.show()

# Summary table: sites as rows, scenarios as columns
print("\n📊 Performance Summary:")
pivot_table = df_perf.pivot(index='Site', columns='Scenario', values='Accuracy')
print(pivot_table.round(4))

## 📉 Domain Shift Analysis

In [None]:
# Quantify how much accuracy each site loses when the model was trained
# elsewhere, relative to its in-domain accuracy.
domain_shift_data = {}

for site in sites_list:
    in_acc = in_domain_results[site]['accuracy']
    cross_acc = cross_domain_results[site]['accuracy']

    # Mean accuracy on this site of the models trained on each other site
    out_accs = [out_domain_results[train][site]['accuracy']
                for train in sites_list if train != site]
    avg_out_acc = np.mean(out_accs) if out_accs else 0

    # Relative drop is reported as 0 when the in-domain accuracy is not positive
    if in_acc > 0:
        relative_drop = (in_acc - cross_acc) / in_acc * 100
    else:
        relative_drop = 0

    domain_shift_data[site] = {
        'In-Domain': in_acc,
        'Cross-Domain': cross_acc,
        'Out-Domain (Avg)': avg_out_acc,
        'In→Cross Drop': in_acc - cross_acc,
        'In→Out Drop': in_acc - avg_out_acc,
        'Relative Drop (%)': relative_drop
    }

# Transpose so that sites are rows and metrics are columns
df_shift = pd.DataFrame(domain_shift_data).T

# Two stacked panels: accuracy drops (top) and absolute accuracies (bottom)
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
drop_ax, abs_ax = axes

x = np.arange(len(sites_list))
width = 0.35

# (offset from the group centre, df_shift column / legend label, colour)
drop_specs = (
    (-width/2, 'In→Cross Drop', '#FF6B6B'),
    (width/2, 'In→Out Drop', '#4ECDC4'),
)
for offset, column, colour in drop_specs:
    drop_ax.bar(x + offset, df_shift[column], width,
                label=column, color=colour, alpha=0.7)
drop_ax.set_xlabel('Site', fontsize=12)
drop_ax.set_ylabel('Accuracy Drop', fontsize=12)
drop_ax.set_title('Performance Degradation due to Domain Shift', fontsize=14, fontweight='bold')
drop_ax.set_xticks(x)
drop_ax.set_xticklabels(sites_list)
drop_ax.legend()
drop_ax.grid(True, alpha=0.3)
drop_ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# (df_shift column / legend label, line format, colour)
line_specs = (
    ('In-Domain', 'o-', '#2E7D32'),
    ('Cross-Domain', 's-', '#1976D2'),
    ('Out-Domain (Avg)', '^-', '#F57C00'),
)
for column, line_format, colour in line_specs:
    abs_ax.plot(sites_list, df_shift[column], line_format,
                label=column, linewidth=2, markersize=8, color=colour)
abs_ax.set_xlabel('Site', fontsize=12)
abs_ax.set_ylabel('Accuracy', fontsize=12)
abs_ax.set_title('Absolute Performance Across Scenarios', fontsize=14, fontweight='bold')
abs_ax.legend()
abs_ax.grid(True, alpha=0.3)
abs_ax.set_ylim([0, 1])

plt.tight_layout()
plt.show()

print("\n📊 Domain Shift Analysis:")
print(df_shift.round(4))

## 📊 Final Report

In [None]:
# Generate final report: overall averages, degradation between scenarios,
# best/worst sites and best/worst single-site transfer pairs.
print("="*80)
print(" "*20 + "XGBOOST DOMAIN ANALYSIS FINAL REPORT")
print("="*80)

# Calculate overall statistics
in_domain_avg = np.mean([in_domain_results[s]['accuracy'] for s in sites_list])
cross_domain_avg = np.mean([cross_domain_results[s]['accuracy'] for s in sites_list])

# Every ordered (train, test) pair of distinct sites with its accuracy
transfer_pairs = [
    (train, test, out_domain_results[train][test]['accuracy'])
    for train in sites_list
    for test in sites_list
    if train != test
]
out_domain_all = [score for _, _, score in transfer_pairs]
# With fewer than two sites there are no pairs; report NaN instead of
# averaging an empty list.
out_domain_avg = np.mean(out_domain_all) if out_domain_all else float('nan')

print(f"\n📊 DATASET OVERVIEW:")
print(f"  • Total samples: {len(df):,}")
print(f"  • Number of features: {X.shape[1]}")
print(f"  • Number of classes: {y.nunique()}")
print(f"  • Number of sites: {len(sites_list)}")

print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"  • Average In-Domain Accuracy:    {in_domain_avg:.4f}")
print(f"  • Average Cross-Domain Accuracy: {cross_domain_avg:.4f}")
print(f"  • Average Out-Domain Accuracy:   {out_domain_avg:.4f}")

print(f"\n📉 PERFORMANCE DEGRADATION:")
drop_in_cross = in_domain_avg - cross_domain_avg
drop_in_out = in_domain_avg - out_domain_avg
# Relative drops are undefined when the in-domain average is zero
rel_drop_in_cross = drop_in_cross / in_domain_avg * 100 if in_domain_avg else float('nan')
rel_drop_in_out = drop_in_out / in_domain_avg * 100 if in_domain_avg else float('nan')
# The sign is produced by the format spec: a drop prints as "-0.0500", while a
# gain (negative drop) prints as "+0.0200" instead of a misleading "--0.0200".
print(f"  • In-Domain → Cross-Domain: {-drop_in_cross:+.4f} ({rel_drop_in_cross:.1f}%)")
print(f"  • In-Domain → Out-Domain:   {-drop_in_out:+.4f} ({rel_drop_in_out:.1f}%)")

print(f"\n🏆 BEST & WORST PERFORMERS:")
best_in = max(sites_list, key=lambda x: in_domain_results[x]['accuracy'])
worst_in = min(sites_list, key=lambda x: in_domain_results[x]['accuracy'])
best_cross = max(sites_list, key=lambda x: cross_domain_results[x]['accuracy'])
worst_cross = min(sites_list, key=lambda x: cross_domain_results[x]['accuracy'])

print(f"  In-Domain:")
print(f"    • Best:  {best_in} ({in_domain_results[best_in]['accuracy']:.4f})")
print(f"    • Worst: {worst_in} ({in_domain_results[worst_in]['accuracy']:.4f})")
print(f"  Cross-Domain:")
print(f"    • Best:  {best_cross} ({cross_domain_results[best_cross]['accuracy']:.4f})")
print(f"    • Worst: {worst_cross} ({cross_domain_results[worst_cross]['accuracy']:.4f})")

# Find best/worst transfer pairs. max()/min() always return a pair when one
# exists, including when every score is 0.0 or 1.0; ties keep the first pair.
print(f"\n🔄 DOMAIN TRANSFER PAIRS:")
if transfer_pairs:
    best_train, best_test, best_score = max(transfer_pairs, key=lambda pair: pair[2])
    worst_train, worst_test, worst_score = min(transfer_pairs, key=lambda pair: pair[2])
    best_pair = (best_train, best_test)
    worst_pair = (worst_train, worst_test)
    print(f"  • Best transfer:  {best_pair[0]} → {best_pair[1]} ({best_score:.4f})")
    print(f"  • Worst transfer: {worst_pair[0]} → {worst_pair[1]} ({worst_score:.4f})")
else:
    best_pair, best_score = None, 0
    worst_pair, worst_score = None, 1
    print("  • Not available: at least two sites are required")

print(f"\n💡 KEY INSIGHTS:")
print(f"  1. Cross-domain training improves over single-domain by {(cross_domain_avg - out_domain_avg):.4f}")
print(f"  2. Average domain shift causes {rel_drop_in_cross:.1f}% performance drop")
print(f"  3. Site '{best_cross}' shows best robustness to domain shift")
print(f"  4. Site '{worst_cross}' is most affected by domain shift")

if best_pair is not None and best_score > 0.5:
    print(f"  5. Sites '{best_pair[0]}' and '{best_pair[1]}' have good domain similarity")
else:
    print(f"  5. Significant distribution shift exists across all domain pairs")

print("\n" + "="*80)
print("✅ Analysis Complete!")
print("="*80)

In [None]:
# Optional: save the result tables as CSV files under 'results/'
try:
    # Surrounding whitespace is ignored so that answers like " y" are accepted
    save = input("\nSave results to CSV? (y/n): ").strip().lower() == 'y'
except (EOFError, NotImplementedError):
    # No interactive stdin is available (e.g. the notebook is executed
    # headlessly; ipykernel then raises StdinNotImplementedError, a
    # NotImplementedError subclass). Default to not saving so that an
    # unattended "Run All" still completes.
    save = False

if save:
    import os
    os.makedirs('results', exist_ok=True)

    # Save performance summary
    df_perf.to_csv('results/performance_summary.csv', index=False)
    df_shift.to_csv('results/domain_shift_analysis.csv')

    # Save transfer matrix (rows = train site, columns = test site)
    pd.DataFrame(out_matrix, index=sites_list, columns=sites_list).to_csv(
        'results/transfer_matrix.csv'
    )

    print("✅ Results saved to 'results/' directory")
else:
    print("Results not saved.")