In [None]:
!pip install xgboost


📌 Step 1: Import Libraries

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

from xgboost import XGBClassifier

📌 Step 2: Load Dataset

In [None]:
# Location of the raw CSV, relative to this notebook.
# NOTE(review): if this file marks missing values with a placeholder such as
# '?', they are loaded as ordinary strings and the imputers defined later
# will not see them as missing -- confirm against the data.
DATA_PATH = '../adult.csv'  # Replace with your correct path

df = pd.read_csv(DATA_PATH)
df.head()


📌 Step 3: Split Features and Target

In [None]:
# Encode the string income labels as integers. LabelEncoder assigns codes in
# sorted label order; `le` is kept so the codes can be mapped back later.
le = LabelEncoder()
df['income'] = le.fit_transform(df['income'])  # 0: <=50K, 1: >50K

# Target vector and feature matrix (every column except the target).
y = df['income']
X = df.drop(columns=['income'])


📌 Check Dataset Split

In [None]:
# Check class distribution in target (to ensure stratification worked)
print("Train target distribution:")
print(y_train.value_counts(normalize=True))

print("\nTest target distribution:")
print(y_test.value_counts(normalize=True))

# Optional: preview
print("\nSample of X_train:")
print(X_train.head())


📌 Visualize Target Distribution

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Training set distribution
sns.countplot(x=y_train)
plt.title("Income Distribution in Training Set")
plt.show()


📌 Step 4: Train-Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


📌 Inspect Preprocessing

In [None]:
# Just preprocess the training set
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Show shape before and after
print("Original shape:", X_train.shape)
print("After preprocessing:", X_train_preprocessed.shape)

# To see what features look like:
# Convert to DataFrame if you want to inspect (for OneHotEncoded columns)
ohe = preprocessor.named_transformers_['cat']['encoder']
feature_names = ohe.get_feature_names_out(categorical_features)
all_features = np.concatenate([numeric_features, feature_names])

X_df_transformed = pd.DataFrame(X_train_preprocessed.toarray() 
                                if hasattr(X_train_preprocessed, 'toarray') 
                                else X_train_preprocessed,
                                columns=all_features)

print(X_df_transformed.head())


📌 Data Exploration & Preprocessing Visualization

In [None]:
sns.set(style="whitegrid")
fig, axes = plt.subplots(2, 1, figsize=(14, 14))

X_train[numeric_features].hist(bins=20, ax=axes[0], grid=False, edgecolor='black')
axes[0].set_title("🔍 Distribution of Numeric Features", fontsize=16)
axes[0].set_xlabel("Feature Value")
axes[0].set_ylabel("Count")

corr_matrix = X_train[numeric_features].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=axes[1])
axes[1].set_title("📈 Correlation Between Numeric Features", fontsize=16)

plt.tight_layout()
plt.show()

📌 Step 5: Define Preprocessing Pipeline

In [None]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

📌 Basic Pipeline Test Run

In [None]:
# Smoke test: preprocessing + an XGBoost classifier with default
# hyper-parameters, to confirm the pipeline runs end to end before tuning.
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost
# and only produces an "unused parameter" warning on current releases; the
# target is already integer-encoded.
quick_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

quick_pipeline.fit(X_train, y_train)
y_quick_pred = quick_pipeline.predict(X_test)

# Accuracy = fraction of test rows whose prediction matches the label.
print("Quick pipeline accuracy:", (y_quick_pred == y_test).mean())
print(classification_report(y_test, y_quick_pred))

📌 PCA to Visualize Transformed Data

In [None]:
# Fit the preprocessor on the training data and project the result to 2-D
# purely for visualization.
X_train_preprocessed = preprocessor.fit_transform(X_train)

# The one-hot encoded output may be a sparse matrix; convert it to a dense
# array before handing it to PCA.
X_train_dense = (
    X_train_preprocessed.toarray()
    if hasattr(X_train_preprocessed, 'toarray')
    else X_train_preprocessed
)

# random_state pins the result when PCA selects its randomized SVD solver,
# so the plot is reproducible across runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_train_dense)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_train, cmap='coolwarm', alpha=0.6)
plt.title("PCA of Preprocessed Features")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.colorbar(label="Income Class (0 = <=50K, 1 = >50K)")
plt.show()

📌 Step 6: Define Pipeline and GridSearchCV

In [None]:
# Full pipeline tuned with cross-validation. Preprocessing lives inside the
# pipeline, so it is re-fitted on each training fold.
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost
# and only produces an "unused parameter" warning on current releases.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# 2 x 2 x 2 = 8 candidate settings; the `clf__` prefix routes each parameter
# to the 'clf' step of the pipeline.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 6],
    'clf__learning_rate': [0.1, 0.3]
}

# 5-fold CV scored by ROC AUC; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

📌 Grid Search Summary

In [None]:
# Tabulate every candidate evaluated by the search; `results` is reused by
# the plotting cell that follows.
results = pd.DataFrame(grid_search.cv_results_)

# Headline numbers first, then the per-candidate mean CV score.
print("Best Params:", grid_search.best_params_)
print("Best CV Score (AUC):", grid_search.best_score_)
print(results.loc[:, ['params', 'mean_test_score']])

📌 Visualize GridSearchCV Results

In [None]:
# Mean CV ROC AUC against the number of boosting rounds; seaborn aggregates
# over the other tuned parameters that share the same n_estimators value.
sns.lineplot(x='param_clf__n_estimators', y='mean_test_score', data=results)
# The tuned model is XGBoost (not a random forest), so label it accordingly.
plt.title("XGBoost - Estimators vs Mean ROC AUC")
plt.xlabel("n_estimators")
plt.ylabel("Mean ROC AUC")
plt.show()



📌 Step 7: Model Evaluation

In [None]:
# Score the held-out test set with the refitted best estimator:
# hard labels for the report, positive-class probabilities for the ROC curve.
y_proba = grid_search.predict_proba(X_test)[:, 1]
y_pred = grid_search.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC curve points (thresholds are not needed) and the area under the curve.
fpr, tpr = roc_curve(y_test, y_proba)[:2]
roc_auc = roc_auc_score(y_test, y_proba)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'XGBoost (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
roc_ax.set_title("ROC Curve")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()
roc_ax.grid()
plt.show()

📌 Step 8: SHAP Explainability

In [None]:
# Take both the classifier and its preprocessor from the fitted best
# pipeline. GridSearchCV works on clones, so the module-level `preprocessor`
# is only fitted as a side effect of earlier exploratory cells and is not the
# object the tuned model was trained with.
best_pipeline = grid_search.best_estimator_
fitted_preprocessor = best_pipeline.named_steps['preprocessor']
model = best_pipeline.named_steps['clf']

# Explain a small, reproducible sample to keep the plot fast; cap the size so
# a test set with fewer than 100 rows does not raise.
X_sample = X_test.sample(min(100, len(X_test)), random_state=42)

# Keep the transformer output in the same (possibly sparse) format the model
# was trained on, so the explainer sees inputs the way the model does.
X_sample_transformed = fitted_preprocessor.transform(X_sample)

explainer = shap.Explainer(model)
shap_values = explainer(X_sample_transformed)

shap.plots.beeswarm(shap_values)

📌 Optional: Save the model

In [None]:
# Persist the whole fitted pipeline (preprocessing + classifier) so it can be
# applied to raw rows later. Create the target directory first: joblib.dump
# does not create missing parent folders.
model_path = Path('../models/xgboost_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(grid_search.best_estimator_, str(model_path))