# Working Version: XGBoost Domain Analysis

This version uses scikit-learn's RandomForestClassifier as an alternative to XGBoost, because it handles missing classes better.

In [None]:
# Setup: Clone repository and install packages
import os

# Clone repository if not already present
if not os.path.exists('/content/tabicl'):
    print("Cloning repository...")
    !git clone https://github.com/cliu238/tabicl.git
    print("Repository cloned successfully!")
else:
    print("Repository already exists.")

# Change to repository directory
%cd /content/tabicl

# Install required packages
print("\nInstalling packages...")
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn plotly -q
print("Packages installed!")

# Verify data files exist
print("\nChecking data files:")
!ls -lh processed_data/*.csv

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Install a blanket 'ignore' filter: no Python warnings are shown after this
warnings.filterwarnings('ignore')

# Set style and random seed
# (matplotlib is reset to its default style; NumPy's global RNG is seeded)
plt.style.use('default')
np.random.seed(42)

print("✅ All libraries imported successfully!")
print("Note: Using RandomForest instead of XGBoost to handle missing classes better")

In [None]:
# Read the processed dataset and print a short overview of it
df = pd.read_csv('processed_data/adult_numeric_20250729_155457.csv')

print("📊 Dataset Overview:")
print(f"Shape: {df.shape}")

# Number of rows contributed by each site
print(f"\n🏥 Sites distribution:")
site_counts = df['site'].value_counts()
print(site_counts)

# Cardinality of the va34 target and total count of missing cells
n_target_classes = df['va34'].nunique()
print(f"\n🎯 Target (va34): {n_target_classes} unique classes")
n_missing = df.isna().sum().sum()
print(f"Missing values: {n_missing}")

In [None]:
# Preprocessing: drop 'cod5', integer-encode the target, split off X / y / sites
print("🔧 Preprocessing data...")

# Remove the 'cod5' column when the dataset carries it
has_cod5 = 'cod5' in df.columns
if has_cod5:
    df = df.drop(columns='cod5')
    print("✅ Dropped 'cod5' column")

# Integer-encode the va34 labels; the fitted encoder stays available as `le`
le = LabelEncoder()
encoded_target = le.fit_transform(df['va34'])
df['va34_encoded'] = encoded_target

n_classes = len(le.classes_)
print(f"\nEncoded {n_classes} unique classes")
lowest_code = df['va34_encoded'].min()
highest_code = df['va34_encoded'].max()
print(f"Class range: {lowest_code} to {highest_code}")

# Everything except the raw target, the encoded target and the site column
# is treated as a feature
non_feature_columns = ['va34', 'va34_encoded', 'site']
X = df.drop(columns=non_feature_columns)
y = df['va34_encoded']
sites = df['site']

print(f"\n📐 Data shapes:")
print(f"Features: {X.shape}")
print(f"Target: {y.shape}")
print(f"Sites: {sites.unique().tolist()}")

In [None]:
def create_domain_splits(df, test_size=0.2, random_state=42, min_samples=10):
    """Create train/test splits for each site.

    Args:
        df: DataFrame holding the 'site', 'va34' and 'va34_encoded' columns.
            Every other column is treated as a feature.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
        min_samples: Minimum number of rows a site needs in order to be
            split. Defaults to 10.

    Returns:
        dict keyed by site value (in the order ``df['site'].unique()`` yields
        them). Each value is a dict with the keys 'X_train', 'X_test',
        'y_train', 'y_test' (the split) and 'full_X', 'full_y' (all rows of
        that site).

    Note:
        A site with fewer than ``min_samples`` rows is not split: its full
        data is used as both the train and the test set. Metrics computed on
        such a site are measured on training data, so a notice is printed
        whenever this fallback is taken.
    """
    label_columns = ['va34', 'va34_encoded', 'site']
    domain_splits = {}

    for site in df['site'].unique():
        site_data = df[df['site'] == site]
        X_site = site_data.drop(columns=label_columns)
        y_site = site_data['va34_encoded']

        print(f"Processing {site}: {len(site_data)} samples, {site_data['va34'].nunique()} classes")

        if len(site_data) >= min_samples:
            # Simple (non-stratified) random train/test split
            X_train, X_test, y_train, y_test = train_test_split(
                X_site, y_site, test_size=test_size, random_state=random_state
            )
        else:
            # Too few rows to hold anything out: reuse the full site data.
            print(
                f"  Warning: {site} has fewer than {min_samples} samples; "
                "using the same rows for train and test"
            )
            X_train, X_test = X_site, X_site
            y_train, y_test = y_site, y_site

        domain_splits[site] = {
            'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test,
            'full_X': X_site, 'full_y': y_site
        }

    return domain_splits

def train_model(X_train, y_train, model_type='rf'):
    """Fit and return a classifier on the given training data.

    ``model_type == 'rf'`` selects a RandomForestClassifier; any other value
    selects a GradientBoostingClassifier. Both use fixed hyper-parameters and
    ``random_state=42``.
    """
    if model_type == 'rf':
        estimator_cls = RandomForestClassifier
        hyper_params = {
            'n_estimators': 100,
            'max_depth': 10,
            'min_samples_split': 5,
            'min_samples_leaf': 2,
            'random_state': 42,
            'n_jobs': -1,
        }
    else:
        estimator_cls = GradientBoostingClassifier
        hyper_params = {
            'n_estimators': 100,
            'max_depth': 4,
            'learning_rate': 0.1,
            'random_state': 42,
        }

    estimator = estimator_cls(**hyper_params)
    estimator.fit(X_train, y_train)
    return estimator

def evaluate_model(model, X_test, y_test):
    """Score ``model`` on a test set.

    Returns a dict with the keys 'accuracy', 'balanced_accuracy' and
    'f1_macro'. If prediction or any metric raises, the error is printed and
    all three values are reported as 0.0.
    """
    try:
        predictions = model.predict(X_test)

        scores = {'accuracy': accuracy_score(y_test, predictions)}
        scores['balanced_accuracy'] = balanced_accuracy_score(y_test, predictions)

        # Macro F1 is averaged over every label seen in either the truth or
        # the predictions
        observed_labels = np.unique(np.concatenate([y_test, predictions]))
        scores['f1_macro'] = f1_score(
            y_test,
            predictions,
            average='macro',
            zero_division=0,
            labels=observed_labels,
        )
        return scores
    except Exception as err:
        print(f"    Error: {err}")
        return {'accuracy': 0.0, 'balanced_accuracy': 0.0, 'f1_macro': 0.0}

print("✅ Functions defined!")

In [None]:
# Build the per-site train/test splits and report their sizes
print("Creating domain splits...")
print("=" * 60)
domain_data = create_domain_splits(df)

print("\n📁 Domain splits summary:")
for site_name, split in domain_data.items():
    n_train = len(split['X_train'])
    n_test = len(split['X_test'])
    print(f"{site_name:10} - Train: {n_train:4}, Test: {n_test:4}")

In [None]:
# In-domain evaluation: train on a site's train split, test on its test split
print("\n🎯 Evaluating In-Domain Performance with RandomForest...")
print("=" * 50)

in_domain_results = {}

for site_name, split in domain_data.items():
    print(f"\nTraining for {site_name}...")

    fitted = train_model(split['X_train'], split['y_train'], model_type='rf')
    metrics = evaluate_model(fitted, split['X_test'], split['y_test'])

    # Keep the fitted model next to its metrics
    in_domain_results[site_name] = metrics
    metrics['model'] = fitted

    print(f"  ✓ Accuracy: {metrics['accuracy']:.4f}")
    print(f"  ✓ Balanced Acc: {metrics['balanced_accuracy']:.4f}")
    print(f"  ✓ F1 Macro: {metrics['f1_macro']:.4f}")

In [None]:
# Bar chart of in-domain accuracy, one bar per site
sites_list = list(in_domain_results)
accuracies = [in_domain_results[name]['accuracy'] for name in sites_list]

in_domain_fig, in_domain_ax = plt.subplots(figsize=(10, 6))
bars = in_domain_ax.bar(
    sites_list, accuracies, color='skyblue', edgecolor='navy', alpha=0.7
)
in_domain_ax.set_xlabel('Site', fontsize=12)
in_domain_ax.set_ylabel('Accuracy', fontsize=12)
in_domain_ax.set_title(
    'In-Domain Performance by Site (RandomForest)', fontsize=14, fontweight='bold'
)
in_domain_ax.set_ylim([0, 1])
in_domain_ax.grid(True, alpha=0.3, axis='y')

# Write each accuracy just above its bar
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.01
    in_domain_ax.text(
        label_x, label_y, f'{acc:.3f}',
        ha='center', va='bottom', fontweight='bold',
    )

in_domain_fig.tight_layout()
plt.show()

# np.std is called with its default arguments here
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print(f"\n📊 Average In-Domain Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")

In [None]:
# Out-domain evaluation: train on all rows of one site, test on all rows of
# every other site
print("\n🔄 Evaluating Out-Domain Performance...")
print("=" * 50)

out_domain_results = {}

for train_site, train_split in domain_data.items():
    transfer_scores = {}
    out_domain_results[train_site] = transfer_scores

    print(f"\nTraining on {train_site}...")
    site_model = train_model(
        train_split['full_X'], train_split['full_y'], model_type='rf'
    )

    for test_site, test_split in domain_data.items():
        if train_site != test_site:
            metrics = evaluate_model(
                site_model, test_split['full_X'], test_split['full_y']
            )
            transfer_scores[test_site] = metrics
            print(f"  {train_site} → {test_site}: {metrics['accuracy']:.4f}")

In [None]:
# Accuracy matrix: rows are train sites, columns are test sites.
# The diagonal holds the in-domain accuracy, every other cell the
# out-domain accuracy for that (train, test) pair.
sites_list = list(domain_data)
n_sites = len(sites_list)
perf_matrix = np.zeros((n_sites, n_sites))

for row, train_site in enumerate(sites_list):
    for col, test_site in enumerate(sites_list):
        if train_site == test_site:
            cell_metrics = in_domain_results[train_site]
        else:
            cell_metrics = out_domain_results[train_site][test_site]
        perf_matrix[row, col] = cell_metrics['accuracy']

# Draw the matrix as an annotated heatmap on a fixed 0-1 colour scale
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    perf_matrix,
    annot=True,
    fmt='.3f',
    xticklabels=sites_list,
    yticklabels=sites_list,
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Accuracy'},
)

heatmap_ax.set_title(
    'Domain Transfer Performance Matrix\n(RandomForest)',
    fontsize=14, fontweight='bold',
)
heatmap_ax.set_ylabel('Train Site', fontsize=12)
heatmap_ax.set_xlabel('Test Site', fontsize=12)

# Outline the diagonal (train site == test site) cells in blue
for k in range(n_sites):
    outline = plt.Rectangle((k, k), 1, 1, fill=False, edgecolor='blue', lw=3)
    heatmap_ax.add_patch(outline)

plt.tight_layout()
plt.show()

In [None]:
# Cross-domain evaluation: hold one site out, train on all rows of the
# remaining sites pooled together, test on all rows of the held-out site
print("\n🌐 Evaluating Cross-Domain Performance...")
print("=" * 50)

cross_domain_results = {}

for held_out_site, held_out_split in domain_data.items():
    train_sites = [name for name in domain_data if name != held_out_site]

    # Stack the remaining sites' features and targets row-wise
    X_train_combined = pd.concat(
        [domain_data[name]['full_X'] for name in train_sites], axis=0
    )
    y_train_combined = pd.concat(
        [domain_data[name]['full_y'] for name in train_sites], axis=0
    )

    print(f"\nTraining on all except {held_out_site} ({len(X_train_combined)} samples)...")

    pooled_model = train_model(X_train_combined, y_train_combined, model_type='rf')
    metrics = evaluate_model(
        pooled_model, held_out_split['full_X'], held_out_split['full_y']
    )

    cross_domain_results[held_out_site] = metrics
    print(f"  Testing on {held_out_site}: {metrics['accuracy']:.4f}")

In [None]:
# Final comparison of the three evaluation scenarios
banner = "=" * 80
print("\n" + banner)
print(" " * 25 + "FINAL RESULTS SUMMARY")
print(banner)

# One row per (site, scenario). The out-domain value for a site is the mean
# accuracy of the models trained on each other site and tested on this one.
comparison_data = []
for site in domain_data:
    in_acc = in_domain_results[site]['accuracy']
    out_accs = [
        out_domain_results[source][site]['accuracy']
        for source in domain_data
        if source != site
    ]
    out_acc = np.mean(out_accs)
    cross_acc = cross_domain_results[site]['accuracy']

    comparison_data.extend([
        {'Site': site, 'Scenario': 'In-Domain', 'Accuracy': in_acc},
        {'Site': site, 'Scenario': 'Out-Domain', 'Accuracy': out_acc},
        {'Site': site, 'Scenario': 'Cross-Domain', 'Accuracy': cross_acc},
    ])

df_comp = pd.DataFrame(comparison_data)

# Grouped bar chart: one group per site, one bar per scenario
scenario_colors = {
    'In-Domain': '#2E7D32',
    'Out-Domain': '#F57C00',
    'Cross-Domain': '#1976D2',
}
fig = px.bar(
    df_comp,
    x='Site',
    y='Accuracy',
    color='Scenario',
    title='Performance Comparison: In-Domain vs Out-Domain vs Cross-Domain',
    barmode='group',
    height=500,
    color_discrete_map=scenario_colors,
)
fig.update_layout(yaxis_range=[0, 1])
fig.show()

# Per-scenario summary statistics across sites.
# NOTE: Series.std() is the sample standard deviation (ddof=1).
scenario_order = ['In-Domain', 'Out-Domain', 'Cross-Domain']
for scenario in scenario_order:
    scenario_acc = df_comp.loc[df_comp['Scenario'] == scenario, 'Accuracy']
    print(f"\n{scenario}:")
    print(f"  Mean: {scenario_acc.mean():.4f}")
    print(f"  Std:  {scenario_acc.std():.4f}")
    print(f"  Range: [{scenario_acc.min():.4f}, {scenario_acc.max():.4f}]")

# Domain shift analysis: differences between the scenario means, with the
# drops also expressed relative to the in-domain mean
in_mean, out_mean, cross_mean = (
    df_comp.loc[df_comp['Scenario'] == scenario, 'Accuracy'].mean()
    for scenario in scenario_order
)
in_out_drop = in_mean - out_mean
in_cross_drop = in_mean - cross_mean
cross_out_gain = cross_mean - out_mean

print("\n📉 Domain Shift Effects:")
print(f"  In→Out drop: {in_out_drop:.4f} ({in_out_drop/in_mean*100:.1f}%)")
print(f"  In→Cross drop: {in_cross_drop:.4f} ({in_cross_drop/in_mean*100:.1f}%)")
print(f"  Cross vs Out gain: {cross_out_gain:.4f}")

print("\n✅ Analysis Complete!")