# Fixed: XGBoost Domain Analysis for Google Colab

This version is configured specifically for Google Colab: it clones the repository to `/content/tabicl` and loads the data from the `processed_data/` folder inside that directory.

In [None]:
# First, clone the repository if not already done
import os

# Check if repository exists
if not os.path.exists('/content/tabicl'):
    !git clone https://github.com/cliu238/tabicl.git
    %cd /content/tabicl
else:
    %cd /content/tabicl

# Verify we're in the right directory
!pwd
!ls -la processed_data/

In [None]:
# Install required packages: modeling (xgboost, scikit-learn), data handling
# (pandas, numpy) and plotting (matplotlib, seaborn, plotly).
# The trailing -q asks pip for quiet output so the cell log stays short.
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn plotly -q

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence all warnings for the rest of the session to keep cell output readable
warnings.simplefilter('ignore')

# Seed NumPy's global random generator so repeated runs give the same results
np.random.seed(seed=42)

print("Libraries imported successfully!")

## Load Data with Correct Path

In [None]:
# Load the processed CSV from the cloned repository and print a quick summary.
# The CSV path is relative, so the working directory is first set to the repository
# root; this keeps the cell usable even if the clone cell was not run in this session.
import os
os.chdir('/content/tabicl')

df = pd.read_csv('processed_data/adult_numeric_20250729_155457.csv')

# Overall dimensions of the loaded table
print(f"Dataset shape: {df.shape}")

# Only the first ten column names are shown to keep the output short
leading_columns = list(df.columns[:10])
print(f"\nColumn names (first 10): {leading_columns}...")

# Number of rows contributed by each value of the 'site' column
print(f"\nSites distribution:")
site_counts = df['site'].value_counts()
print(site_counts)

# 'va34' is reported as the target; count its distinct values
n_target_classes = df['va34'].nunique()
print(f"\nTarget classes: {n_target_classes} unique classes")

# Total number of missing cells across the whole frame
n_missing = df.isnull().sum().sum()
print(f"Missing values: {n_missing}")

# Must stay the last expression so the notebook renders the preview table
df.head()

In [None]:
# Data preprocessing: discard the 'cod5' column when present, then split the frame
# into a feature matrix, the target labels and the site labels.
if 'cod5' not in df.columns:
    # Nothing to drop; work on a copy so `df` itself is left untouched
    df_clean = df.copy()
    print("'cod5' column not found, proceeding with all columns")
else:
    df_clean = df.drop(columns='cod5')
    print("Dropped 'cod5' column")

# 'va34' is the target; 'site' is set aside in its own series rather than used as a feature
sites = df_clean['site']
y = df_clean['va34']
X = df_clean.drop(columns=['va34', 'site'])

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Show the ten most frequent target values
print(f"\nTarget value distribution (top 10):")
top_target_counts = y.value_counts().head(10)
print(top_target_counts)

## Continue with the rest of the analysis...