# XGBoost In-Domain vs Out-Domain Performance Analysis

This notebook analyzes XGBoost performance across different domains (sites) using the VA dataset.

## Dataset Information
- File: `adult_numeric_20250729_155457.csv`
- Target: `va34` (34 classes)
- Sites: AP, Bohol, Dar, Mexico, Pemba, UP
- Features: 169 (the columns remaining after dropping `cod5`, `site`, and the target `va34`)

In [None]:
# Install required packages
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn plotly -q

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("Libraries imported successfully!")

## 1. Data Loading and Exploration

In [None]:
# Read the processed VA dataset from disk.
# On Google Colab, first upload the CSV via the file browser or a mounted Drive:
# from google.colab import files
# uploaded = files.upload()

df = pd.read_csv('processed_data/adult_numeric_20250729_155457.csv')

# Quick overview: dimensions, a column preview, rows per site,
# number of distinct target labels and the total count of missing cells.
print(f"Dataset shape: {df.shape}")

first_ten_columns = list(df.columns[:10])
print(f"\nColumn names: {first_ten_columns}...")  # only the first 10 columns

print(f"\nSites distribution:")
rows_per_site = df['site'].value_counts()
print(rows_per_site)

n_target_classes = df['va34'].nunique()
print(f"\nTarget classes: {n_target_classes} unique classes")

n_missing_cells = df.isnull().sum().sum()
print(f"Missing values: {n_missing_cells}")

In [None]:
# Data preprocessing
# 'cod5' is removed when present; otherwise an independent copy of the raw
# frame is used so that df itself is never modified.
cod5_present = 'cod5' in df.columns
df_clean = df.drop(columns='cod5') if cod5_present else df.copy()
print(
    "Dropped 'cod5' column"
    if cod5_present
    else "'cod5' column not found, proceeding with all columns"
)

# Feature matrix (everything except target and site), target, and site labels
X = df_clean.drop(columns=['va34', 'site'])
y = df_clean['va34']
sites = df_clean['site']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTarget value distribution (top 10):")
top_target_counts = y.value_counts().head(10)
print(top_target_counts)

In [None]:
# Per-site summary of the target column: number of rows, number of distinct
# classes, and the size of the most frequent class.
total_classes = df_clean['va34'].nunique()
site_stats = (
    df_clean.groupby('site')
    .agg({'va34': ['count', 'nunique', lambda labels: labels.value_counts().iloc[0]]})
    .round(2)
)
site_stats.columns = ['Sample_Count', 'Unique_Classes', 'Most_Common_Class_Count']
# Fraction of all target classes that appear at each site
site_stats['Class_Coverage'] = site_stats['Unique_Classes'] / total_classes

print("Site Statistics:")
print(site_stats)

# Two side-by-side bar charts: samples per site and unique classes per site.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panel_specs = [
    # (column, bar colour, title, y-axis label, vertical offset of the value label)
    ('Sample_Count', 'skyblue', 'Samples per Site', 'Number of Samples', 20),
    ('Unique_Classes', 'lightcoral', 'Unique Classes per Site', 'Number of Unique Classes', 0.5),
]
for panel, (column, colour, title, y_label, label_offset) in zip(axes, panel_specs):
    heights = site_stats[column]
    panel.bar(site_stats.index, heights, color=colour, alpha=0.7)
    panel.set_title(title)
    panel.set_ylabel(y_label)
    panel.set_xlabel('Site')
    # Write the integer value just above each bar
    for position, height in enumerate(heights):
        panel.text(position, height + label_offset, str(int(height)), ha='center')

plt.tight_layout()
plt.show()

## 2. Domain Splitting Strategy

In [None]:
def create_domain_splits(df, test_size=0.2, min_samples=50, random_state=42):
    """Create a train/test split for every site found in ``df['site']``.

    Args:
        df: DataFrame containing the feature columns plus 'va34' (target)
            and 'site' (domain label).
        test_size: Fraction of each site's rows placed in the test split.
        min_samples: Sites with fewer rows than this are not split (see below).
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        dict: site -> {'X_train', 'X_test', 'y_train', 'y_test',
        'full_X', 'full_y'}, where the 'full_*' entries hold all rows of the site.

    Notes:
        * Sites below ``min_samples`` train on all of their rows and reuse the
          first 10 rows as the test set. Those test rows are also training
          rows, so in-domain scores for such sites are optimistic.
        * A stratified split is attempted when every class has at least two
          rows. If ``train_test_split`` still rejects it with ``ValueError``
          (for example when the test partition would be smaller than the
          number of classes), a plain random split is used instead of
          aborting.
    """
    domain_splits = {}

    for site in df['site'].unique():
        site_data = df[df['site'] == site]
        X_site = site_data.drop(['va34', 'site'], axis=1)
        y_site = site_data['va34']

        # Check if site has enough samples
        if len(site_data) < min_samples:
            print(f"Warning: {site} has only {len(site_data)} samples (< {min_samples}), using all for training")
            domain_splits[site] = {
                # Positional slice: small test set kept only for structural consistency
                'X_train': X_site, 'X_test': X_site.iloc[:10],
                'y_train': y_site, 'y_test': y_site.iloc[:10],
                'full_X': X_site, 'full_y': y_site
            }
            continue

        split_kwargs = {'test_size': test_size, 'random_state': random_state}
        split = None

        # Stratify only when every class can appear in both partitions
        if y_site.value_counts().min() >= 2:
            try:
                split = train_test_split(X_site, y_site, stratify=y_site, **split_kwargs)
            except ValueError as err:
                # Stratification can still be infeasible, e.g. more classes
                # than rows in the test partition.
                print(f"Warning: {site} could not be stratified ({err}); using a random split")

        if split is None:
            # Regular split without stratification
            split = train_test_split(X_site, y_site, **split_kwargs)

        X_train, X_test, y_train, y_test = split

        domain_splits[site] = {
            'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test,
            'full_X': X_site, 'full_y': y_site
        }

    return domain_splits

# Create domain splits
domain_data = create_domain_splits(df_clean)

print(f"Created splits for {len(domain_data)} sites:")
for site, data in domain_data.items():
    print(f"{site:10} - Train: {len(data['X_train']):4}, Test: {len(data['X_test']):4}, Total: {len(data['full_X']):4}")

## 3. Model Training Functions

In [None]:
def train_xgboost_model(X_train, y_train, params=None, verbose=False):
    """Fit an ``xgb.XGBClassifier`` on label-encoded targets.

    Args:
        X_train: Training features.
        y_train: Training targets; they are remapped to 0..K-1 with a
            ``LabelEncoder`` before fitting.
        params: Keyword arguments for ``xgb.XGBClassifier``. When ``None``, a
            regularised default configuration is used.
        verbose: Forwarded to ``model.fit``.

    Returns:
        tuple: ``(model, le)`` - the fitted classifier and the fitted encoder
        needed to map between original labels and class ids.
    """
    # Remap the raw target values to consecutive ids starting at 0
    le = LabelEncoder()
    encoded_targets = le.fit_transform(y_train)

    if params is None:
        params = dict(
            objective='multi:softprob',
            num_class=len(le.classes_),
            max_depth=4,           # shallow trees for better generalization
            learning_rate=0.1,
            n_estimators=100,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,         # L1 regularization
            reg_lambda=1.0,        # L2 regularization
            random_state=42,
            verbosity=0,
        )

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, encoded_targets, verbose=verbose)

    return model, le

def evaluate_model(model, label_encoder, X_test, y_test):
    """Evaluate a fitted model and return several classification metrics.

    Metrics are computed in the original label space: predictions are decoded
    with ``label_encoder.inverse_transform`` and compared with the raw
    ``y_test`` values. A test label the encoder never saw during training can
    never be predicted, so it simply counts as a misclassification instead of
    making ``label_encoder.transform`` raise ``ValueError``.

    Args:
        model: Fitted classifier whose ``predict`` returns encoded class ids.
        label_encoder: The ``LabelEncoder`` fitted on the training targets.
        X_test: Test features.
        y_test: Test targets in the original label space.

    Returns:
        dict with keys:
            'accuracy', 'balanced_accuracy', 'f1_macro', 'f1_weighted':
                metric values.
            'predictions': encoded class ids exactly as returned by
                ``model.predict``.
            'true_labels': encoded test labels; labels unseen during training
                are marked with -1.
    """
    y_true = np.asarray(y_test)
    y_pred_encoded = np.asarray(model.predict(X_test))

    # Decode predictions so both sides are compared in the original label space
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    # Encode only the labels the encoder knows; everything else stays at -1
    seen_mask = np.isin(y_true, label_encoder.classes_)
    y_true_encoded = np.full(y_true.shape[0], -1, dtype=np.int64)
    if seen_mask.any():
        y_true_encoded[seen_mask] = label_encoder.transform(y_true[seen_mask])

    accuracy = accuracy_score(y_true, y_pred)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    return {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'predictions': y_pred_encoded,
        'true_labels': y_true_encoded
    }

print("Model training functions defined successfully!")

## 4. In-Domain Performance Analysis

In [None]:
def evaluate_in_domain_performance(domain_data):
    """Train one model per site and score it on that site's own test split.

    Args:
        domain_data: Mapping of site name to a dict providing 'X_train',
            'y_train', 'X_test' and 'y_test'.

    Returns:
        dict: site name -> dict holding the fitted 'model', its
        'label_encoder', and the 'accuracy', 'balanced_accuracy', 'f1_macro',
        'f1_weighted', 'predictions' and 'true_labels' entries copied from
        the ``evaluate_model`` result.
    """
    copied_keys = ('accuracy', 'balanced_accuracy', 'f1_macro',
                   'f1_weighted', 'predictions', 'true_labels')
    in_domain_results = {}

    print("Training in-domain models...")
    print("-" * 60)

    for site, splits in domain_data.items():
        print(f"\nProcessing {site}...")

        # Fit on the site's training split, score on the same site's test split
        model, le = train_xgboost_model(splits['X_train'], splits['y_train'])
        scores = evaluate_model(model, le, splits['X_test'], splits['y_test'])

        site_entry = {'model': model, 'label_encoder': le}
        site_entry.update((key, scores[key]) for key in copied_keys)
        in_domain_results[site] = site_entry

        print(f"  Accuracy: {scores['accuracy']:.4f}")
        print(f"  Balanced Accuracy: {scores['balanced_accuracy']:.4f}")
        print(f"  F1 Macro: {scores['f1_macro']:.4f}")

    return in_domain_results

# Evaluate in-domain performance
in_domain_results = evaluate_in_domain_performance(domain_data)

In [None]:
# Visualize in-domain performance
sites = list(in_domain_results)
metrics = ['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_weighted']

# One row per site, one column per metric
in_domain_df = pd.DataFrame(
    [[in_domain_results[site][metric] for metric in metrics] for site in sites],
    index=sites,
    columns=metrics,
)

# Grouped bar chart: one group per site, one bar per metric
fig, ax = plt.subplots(figsize=(12, 6))
in_domain_df.plot.bar(ax=ax, width=0.8)
ax.set_title('In-Domain Performance by Site', fontsize=14, fontweight='bold')
ax.set_xlabel('Site')
ax.set_ylabel('Score')
# Legend is placed outside the axes so it does not cover the bars
ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

print("\nIn-Domain Performance Summary:")
print(in_domain_df.round(4))

## 5. Out-Domain Performance Analysis

In [None]:
def evaluate_out_domain_performance(domain_data):
    """Train on each single site and score the model on every other site.

    Both training and testing use the sites' complete data ('full_X',
    'full_y'). If ``evaluate_model`` raises ``ValueError`` for a pair, a
    warning is printed and all four metrics of that pair are recorded as 0.0.

    Args:
        domain_data: Mapping of site name to a dict providing 'full_X' and
            'full_y'.

    Returns:
        dict: train site -> {test site -> metrics dict}; the train site
        itself is never included as a test site.
    """
    site_names = list(domain_data.keys())
    out_domain_results = {}

    print("Evaluating out-domain performance...")
    print("(Training on each site and testing on all others)")
    print("-" * 60)

    for train_site in site_names:
        per_target = {}
        out_domain_results[train_site] = per_target
        source = domain_data[train_site]

        # A single model per source site, fitted on all of its rows
        print(f"\nTraining on {train_site}...")
        model, le = train_xgboost_model(source['full_X'], source['full_y'])

        for test_site in site_names:
            if test_site != train_site:  # same-domain evaluation is skipped
                target = domain_data[test_site]

                try:
                    results = evaluate_model(model, le, target['full_X'], target['full_y'])
                except ValueError:
                    # Fallback: the pair is scored as zero on every metric
                    print(f"  Warning: {train_site} -> {test_site}: Unseen labels in test set")
                    results = {'accuracy': 0.0, 'balanced_accuracy': 0.0,
                               'f1_macro': 0.0, 'f1_weighted': 0.0}

                accuracy = results['accuracy']
                per_target[test_site] = results
                print(f"  {train_site} -> {test_site}: {accuracy:.4f}")

    return out_domain_results

# Evaluate out-domain performance
out_domain_results = evaluate_out_domain_performance(domain_data)

In [None]:
# Create out-domain performance matrix
# Rows are train sites, columns are test sites. The diagonal is filled with the
# in-domain accuracy; every other cell with the out-domain accuracy.
sites = list(domain_data.keys())
out_domain_matrix = np.array(
    [
        [
            in_domain_results[train_site]['accuracy']
            if train_site == test_site
            else out_domain_results[train_site][test_site]['accuracy']
            for test_site in sites
        ]
        for train_site in sites
    ],
    dtype=float,
).reshape(len(sites), len(sites))

# Visualize out-domain performance matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(out_domain_matrix, annot=True, fmt='.3f',
            xticklabels=sites, yticklabels=sites,
            cmap='RdYlGn', vmin=0, vmax=1, cbar_kws={'label': 'Accuracy'},
            ax=ax)
ax.set_title('Domain Transfer Performance Matrix\n(Train Site → Test Site)', fontsize=14, fontweight='bold')
ax.set_ylabel('Train Site')
ax.set_xlabel('Test Site')

# Outline each diagonal cell to highlight in-domain performance
for diagonal_index in range(len(sites)):
    outline = plt.Rectangle((diagonal_index, diagonal_index), 1, 1,
                            fill=False, edgecolor='blue', lw=2)
    ax.add_patch(outline)

plt.tight_layout()
plt.show()

print("Note: Blue boxes indicate in-domain performance (diagonal)")

## 6. Cross-Domain Performance Analysis

In [None]:
def evaluate_cross_domain_performance(domain_data):
    """Leave-one-site-out evaluation: train on all other sites, test on the held-out one.

    Training data is the concatenation of 'full_X' / 'full_y' of every site
    except the held-out site; testing uses the held-out site's complete data.
    If ``evaluate_model`` raises ``ValueError``, a warning is printed and all
    four metrics for that site are recorded as 0.0.

    Args:
        domain_data: Mapping of site name to a dict providing 'full_X' and
            'full_y'.

    Returns:
        dict: held-out site -> dict with the fitted 'model', its
        'label_encoder', 'accuracy', 'balanced_accuracy', 'f1_macro',
        'f1_weighted' and the list of 'train_sites'.
    """
    metric_keys = ('accuracy', 'balanced_accuracy', 'f1_macro', 'f1_weighted')
    site_names = list(domain_data.keys())
    cross_domain_results = {}

    print("Evaluating cross-domain performance...")
    print("(Training on all sites except one, testing on held-out site)")
    print("-" * 60)

    for held_out_site in site_names:
        train_sites = [name for name in site_names if name != held_out_site]

        # Stack the complete data of every remaining site
        X_train = pd.concat([domain_data[name]['full_X'] for name in train_sites], axis=0)
        y_train = pd.concat([domain_data[name]['full_y'] for name in train_sites], axis=0)

        print(f"\nTraining on {', '.join(train_sites)}")
        print(f"  Combined training size: {len(X_train)} samples")

        model, le = train_xgboost_model(X_train, y_train)

        held_out = domain_data[held_out_site]
        try:
            results = evaluate_model(model, le, held_out['full_X'], held_out['full_y'])
        except ValueError:
            # Fallback: the held-out site is scored as zero on every metric
            print(f"  Warning: Unseen labels in {held_out_site}")
            results = {'accuracy': 0.0, 'balanced_accuracy': 0.0,
                       'f1_macro': 0.0, 'f1_weighted': 0.0}

        site_entry = {'model': model, 'label_encoder': le}
        site_entry.update((key, results[key]) for key in metric_keys)
        site_entry['train_sites'] = train_sites
        cross_domain_results[held_out_site] = site_entry

        print(f"  Testing on {held_out_site}: Accuracy = {results['accuracy']:.4f}")

    return cross_domain_results

# Evaluate cross-domain performance
cross_domain_results = evaluate_cross_domain_performance(domain_data)

## 7. Comprehensive Performance Comparison

In [None]:
# Create comprehensive performance comparison
# One record per (site, scenario). Note the meaning of 'Site' per scenario:
#   In-Domain        - the site the model was trained and tested on
#   Out-Domain (Avg) - the TRAIN site; metrics are averaged over all other test sites
#   Cross-Domain     - the held-out TEST site
sites = list(domain_data.keys())

# Collect all performance metrics
performance_data = []

# In-domain performance
for site in sites:
    performance_data.append({
        'Site': site,
        'Scenario': 'In-Domain',
        'Accuracy': in_domain_results[site]['accuracy'],
        'Balanced_Accuracy': in_domain_results[site]['balanced_accuracy'],
        'F1_Macro': in_domain_results[site]['f1_macro']
    })

# Out-domain performance (average across all target sites)
for train_site in sites:
    target_results = [out_domain_results[train_site][test_site]
                      for test_site in sites if test_site != train_site]

    performance_data.append({
        'Site': train_site,
        'Scenario': 'Out-Domain (Avg)',
        'Accuracy': np.mean([r['accuracy'] for r in target_results]),
        'Balanced_Accuracy': np.mean([r['balanced_accuracy'] for r in target_results]),
        'F1_Macro': np.mean([r['f1_macro'] for r in target_results])
    })

# Cross-domain performance
for site in sites:
    performance_data.append({
        'Site': site,
        'Scenario': 'Cross-Domain',
        'Accuracy': cross_domain_results[site]['accuracy'],
        'Balanced_Accuracy': cross_domain_results[site]['balanced_accuracy'],
        'F1_Macro': cross_domain_results[site]['f1_macro']
    })

df_performance = pd.DataFrame(performance_data)

# Create interactive comparison plot using plotly
fig = px.bar(df_performance, x='Site', y='Accuracy', color='Scenario',
             title='Performance Comparison: In-Domain vs Out-Domain vs Cross-Domain',
             barmode='group', height=500,
             color_discrete_map={
                 'In-Domain': '#2E7D32',
                 'Out-Domain (Avg)': '#F57C00',
                 'Cross-Domain': '#1976D2'
             })
fig.update_layout(xaxis_tickangle=-45)
# The plotly method is update_yaxes (plural); update_yaxis does not exist
fig.update_yaxes(range=[0, 1])
fig.show()

# Print performance summary table
print("\nPerformance Summary (Accuracy):")
performance_pivot = df_performance.pivot(index='Site', columns='Scenario', values='Accuracy')
print(performance_pivot.round(4))

# Calculate average performance across all sites
print("\n" + "="*60)
print("Average Performance Across All Sites:")
print("="*60)
avg_performance = df_performance.groupby('Scenario')[['Accuracy', 'Balanced_Accuracy', 'F1_Macro']].mean()
print(avg_performance.round(4))

## 8. Domain Shift Analysis

In [None]:
# Analyze domain shift effects
# For every site (as the TEST site) compare in-domain accuracy with the
# cross-domain accuracy and with the mean accuracy of models trained on each
# single other site.
sites = list(domain_data.keys())

performance_drops = {}
for site in sites:
    in_acc = in_domain_results[site]['accuracy']
    cross_acc = cross_domain_results[site]['accuracy']

    # Accuracies obtained on this site by models trained on each other site
    incoming_accs = [out_domain_results[source][site]['accuracy']
                     for source in sites if source != site]
    out_acc = np.mean(incoming_accs) if incoming_accs else 0

    # Relative drop is only defined for a positive in-domain accuracy
    if in_acc > 0:
        relative_drop = (in_acc - cross_acc) / in_acc * 100
    else:
        relative_drop = 0

    performance_drops[site] = {
        'In-Domain': in_acc,
        'Cross-Domain': cross_acc,
        'Out-Domain (Avg)': out_acc,
        'In→Cross Drop': in_acc - cross_acc,
        'In→Out Drop': in_acc - out_acc,
        'Relative Drop (%)': relative_drop
    }

drops_df = pd.DataFrame(performance_drops).T

print("Domain Shift Analysis:")
print("="*80)
print(drops_df.round(4))

# Visualize performance drops
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
drop_ax, level_ax = axes

# Plot 1: side-by-side bars with the two kinds of accuracy drop per site
bar_positions = np.arange(len(sites))
bar_width = 0.35

drop_ax.bar(bar_positions - bar_width/2, drops_df['In→Cross Drop'], bar_width,
            label='In-Domain → Cross-Domain Drop', color='#FF6B6B', alpha=0.7)
drop_ax.bar(bar_positions + bar_width/2, drops_df['In→Out Drop'], bar_width,
            label='In-Domain → Out-Domain Drop', color='#4ECDC4', alpha=0.7)
drop_ax.set_title('Performance Degradation due to Domain Shift', fontsize=14, fontweight='bold')
drop_ax.set_xlabel('Site')
drop_ax.set_ylabel('Accuracy Drop')
drop_ax.set_xticks(bar_positions)
drop_ax.set_xticklabels(sites)
drop_ax.legend()
drop_ax.grid(True, alpha=0.3)
# Zero reference line: bars below it mean the other scenario scored higher
drop_ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# Plot 2: absolute accuracy of the three scenarios per site
line_specs = [
    # (drops_df column, which is also the legend label; line format; colour)
    ('In-Domain', 'o-', '#2E7D32'),
    ('Cross-Domain', 's-', '#1976D2'),
    ('Out-Domain (Avg)', '^-', '#F57C00'),
]
for column, line_format, colour in line_specs:
    level_ax.plot(sites, drops_df[column], line_format, label=column,
                  linewidth=2, markersize=8, color=colour)
level_ax.set_title('Absolute Performance Across Different Scenarios', fontsize=14, fontweight='bold')
level_ax.set_xlabel('Site')
level_ax.set_ylabel('Accuracy')
level_ax.legend()
level_ax.grid(True, alpha=0.3)
level_ax.set_ylim([0, 1])

plt.tight_layout()
plt.show()

# Summary statistics (mean ± standard deviation across sites)
print("\n" + "="*60)
print("Summary Statistics:")
print("="*60)
for scenario_label, column in [('In-Domain', 'In-Domain'),
                               ('Cross-Domain', 'Cross-Domain'),
                               ('Out-Domain', 'Out-Domain (Avg)')]:
    scenario_scores = drops_df[column]
    print(f"Average {scenario_label} Accuracy: {scenario_scores.mean():.4f} ± {scenario_scores.std():.4f}")
print(f"\nAverage Performance Drop:")
print(f"  In→Cross: {drops_df['In→Cross Drop'].mean():.4f} ({drops_df['Relative Drop (%)'].mean():.1f}%)")
print(f"  In→Out: {drops_df['In→Out Drop'].mean():.4f}")

## 9. Feature Importance Analysis

In [None]:
# Analyze feature importance across different models
# Feature names are taken from the first site's full feature frame.
feature_names = list(next(iter(domain_data.values()))['full_X'].columns)

# Collect feature importance from in-domain and cross-domain models.
# Column names are '<site>_in' and '<site>_cross'.
importance_data = {}

# In-domain models
for site in domain_data.keys():
    importance_data[f'{site}_in'] = in_domain_results[site]['model'].feature_importances_

# Cross-domain models
for site in cross_domain_results.keys():
    importance_data[f'{site}_cross'] = cross_domain_results[site]['model'].feature_importances_

# Create importance DataFrame (rows: features, columns: models)
importance_df = pd.DataFrame(importance_data, index=feature_names)

# Calculate average importance per model family.
# Columns are selected by suffix: a substring test would misclassify columns
# for any site whose name itself contains '_in' or '_cross'.
in_columns = [col for col in importance_df.columns if col.endswith('_in')]
cross_columns = [col for col in importance_df.columns if col.endswith('_cross')]
avg_importance_in = importance_df[in_columns].mean(axis=1)
avg_importance_cross = importance_df[cross_columns].mean(axis=1)
overall_avg = importance_df.mean(axis=1)

# Get top features
top_n = 20
top_features = overall_avg.nlargest(top_n)

# Plot feature importance
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Top features overall
axes[0].barh(range(len(top_features)), top_features.values, color='steelblue', alpha=0.7)
axes[0].set_yticks(range(len(top_features)))
axes[0].set_yticklabels(top_features.index)
axes[0].set_xlabel('Average Feature Importance')
axes[0].set_title(f'Top {top_n} Most Important Features (Overall)', fontweight='bold')
axes[0].invert_yaxis()  # most important feature at the top

# Compare in-domain vs cross-domain importance for top features
top_feature_names = top_features.index
in_domain_imp = avg_importance_in[top_feature_names].values
cross_domain_imp = avg_importance_cross[top_feature_names].values

x_pos = np.arange(len(top_feature_names))
width = 0.35

axes[1].barh(x_pos - width/2, in_domain_imp, width, label='In-Domain', color='#2E7D32', alpha=0.7)
axes[1].barh(x_pos + width/2, cross_domain_imp, width, label='Cross-Domain', color='#1976D2', alpha=0.7)
axes[1].set_yticks(x_pos)
axes[1].set_yticklabels(top_feature_names)
axes[1].set_xlabel('Average Feature Importance')
axes[1].set_title('Feature Importance: In-Domain vs Cross-Domain', fontweight='bold')
axes[1].legend()
axes[1].invert_yaxis()

plt.tight_layout()
plt.show()

# Calculate feature stability (standard deviation of importance across models)
feature_stability = importance_df.std(axis=1).sort_values()
print(f"\nMost stable features (lowest variance across models):")
print(feature_stability.head(10).round(6))

print(f"\nMost variable features (highest variance across models):")
print(feature_stability.tail(10).round(6))

## 10. Final Report and Conclusions

In [None]:
# Generate comprehensive final report
# Relies on results computed in earlier cells: df_clean, X, y, sites,
# domain_data, in_domain_results, cross_domain_results, out_domain_results
# and top_features.
print("="*80)
print(" "*20 + "XGBOOST DOMAIN ANALYSIS FINAL REPORT")
print("="*80)

print(f"\n📊 DATASET OVERVIEW:")
print(f"  • Total samples: {len(df_clean):,}")
print(f"  • Number of features: {len(X.columns)}")
print(f"  • Number of classes: {y.nunique()}")
print(f"  • Number of sites: {len(sites)}")

print(f"\n🌍 SITE DISTRIBUTION:")
for site in sites:
    count = len(domain_data[site]['full_X'])
    percentage = count/len(df_clean)*100
    print(f"  • {site:10}: {count:,} samples ({percentage:.1f}%)")

# Calculate overall statistics
in_domain_scores = [in_domain_results[site]['accuracy'] for site in sites]
cross_domain_scores = [cross_domain_results[site]['accuracy'] for site in sites]

# Accuracy of every ordered (train site, test site) pair with distinct sites
transfer_scores = {
    (train_site, test_site): out_domain_results[train_site][test_site]['accuracy']
    for train_site in sites
    for test_site in sites
    if train_site != test_site
}
out_domain_all = list(transfer_scores.values())

print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"  • Average In-Domain Accuracy:    {np.mean(in_domain_scores):.4f} ± {np.std(in_domain_scores):.4f}")
print(f"  • Average Cross-Domain Accuracy: {np.mean(cross_domain_scores):.4f} ± {np.std(cross_domain_scores):.4f}")
print(f"  • Average Out-Domain Accuracy:   {np.mean(out_domain_all):.4f} ± {np.std(out_domain_all):.4f}")

print(f"\n📉 PERFORMANCE DEGRADATION:")
in_cross_drop = np.mean(in_domain_scores) - np.mean(cross_domain_scores)
in_out_drop = np.mean(in_domain_scores) - np.mean(out_domain_all)
print(f"  • In-Domain → Cross-Domain: -{in_cross_drop:.4f} ({in_cross_drop/np.mean(in_domain_scores)*100:.1f}% relative drop)")
print(f"  • In-Domain → Out-Domain:   -{in_out_drop:.4f} ({in_out_drop/np.mean(in_domain_scores)*100:.1f}% relative drop)")

print(f"\n🏆 BEST & WORST PERFORMERS:")
best_in_domain = max(sites, key=lambda x: in_domain_results[x]['accuracy'])
worst_in_domain = min(sites, key=lambda x: in_domain_results[x]['accuracy'])
best_cross_domain = max(sites, key=lambda x: cross_domain_results[x]['accuracy'])
worst_cross_domain = min(sites, key=lambda x: cross_domain_results[x]['accuracy'])

print(f"  In-Domain:")
print(f"    • Best:  {best_in_domain} ({in_domain_results[best_in_domain]['accuracy']:.4f})")
print(f"    • Worst: {worst_in_domain} ({in_domain_results[worst_in_domain]['accuracy']:.4f})")
print(f"  Cross-Domain:")
print(f"    • Best:  {best_cross_domain} ({cross_domain_results[best_cross_domain]['accuracy']:.4f})")
print(f"    • Worst: {worst_cross_domain} ({cross_domain_results[worst_cross_domain]['accuracy']:.4f})")

# Find best and worst domain pairs.
# max/min over all pairs always select a pair when one exists, even if every
# score is 0.0 or 1.0 (threshold-initialised searches would leave None).
# Ties resolve to the first pair in iteration order.
if transfer_scores:
    best_pair = max(transfer_scores, key=transfer_scores.get)
    worst_pair = min(transfer_scores, key=transfer_scores.get)
    best_pair_score = transfer_scores[best_pair]
    worst_pair_score = transfer_scores[worst_pair]
else:
    # Fewer than two sites: there is nothing to transfer between
    best_pair = worst_pair = None
    best_pair_score = worst_pair_score = float('nan')

print(f"\n🔄 DOMAIN TRANSFER PAIRS:")
if best_pair is not None:
    print(f"  • Best transfer:  {best_pair[0]} → {best_pair[1]} ({best_pair_score:.4f})")
    print(f"  • Worst transfer: {worst_pair[0]} → {worst_pair[1]} ({worst_pair_score:.4f})")
else:
    print("  • No domain pairs available (fewer than two sites)")

print(f"\n🔑 TOP 5 MOST IMPORTANT FEATURES:")
for i, (feature, importance) in enumerate(top_features.head().items(), 1):
    print(f"  {i}. {feature}: {importance:.4f}")

print(f"\n💡 KEY INSIGHTS:")
print(f"  1. Cross-domain training (multiple sites) improves generalization by {(np.mean(cross_domain_scores) - np.mean(out_domain_all)):.4f}")
print(f"  2. Domain shift causes an average {in_cross_drop/np.mean(in_domain_scores)*100:.1f}% performance drop")
print(f"  3. Site '{best_cross_domain}' shows best robustness to domain shift")
print(f"  4. Site '{worst_cross_domain}' is most affected by domain shift")

if best_pair is not None and best_pair_score > 0.5:
    print(f"  5. Sites '{best_pair[0]}' and '{best_pair[1]}' have good domain similarity")
else:
    print(f"  5. All domain pairs show significant distribution shift")

print("\n" + "="*80)
print("Analysis Complete!")
print("="*80)

## 11. Save Results (Optional)

In [None]:
# Save results to files
# Ask interactively. When no stdin is available (e.g. a headless nbconvert or
# papermill run), skip saving instead of aborting the whole notebook.
try:
    save_results = input("Do you want to save the results? (y/n): ").strip().lower() == 'y'
except (EOFError, NotImplementedError):
    # ipykernel raises StdinNotImplementedError (a NotImplementedError subclass)
    # when the frontend cannot prompt; a closed stdin raises EOFError.
    save_results = False

if save_results:
    # Create results directory
    import os
    output_dir = 'xgboost_domain_results'
    os.makedirs(output_dir, exist_ok=True)

    # Save performance summary
    df_performance.to_csv(f'{output_dir}/performance_summary.csv', index=False)
    print("✓ Saved performance_summary.csv")

    # Save domain shift analysis
    drops_df.to_csv(f'{output_dir}/domain_shift_analysis.csv')
    print("✓ Saved domain_shift_analysis.csv")

    # Save feature importance
    importance_df.to_csv(f'{output_dir}/feature_importance.csv')
    print("✓ Saved feature_importance.csv")

    # Save out-domain matrix (rows: train site, columns: test site)
    pd.DataFrame(out_domain_matrix, index=sites, columns=sites).to_csv(
        f'{output_dir}/out_domain_matrix.csv'
    )
    print("✓ Saved out_domain_matrix.csv")

    print("\nAll results saved to 'xgboost_domain_results/' directory")
else:
    print("Results not saved.")