# Random Forest, XGBoost, and Boruta Feature Selection in Python

This notebook converts the Random Forest, XGBoost, and Boruta feature selection sections from an R script to Python. It processes the 'santander' dataset, trains a Random Forest model, selects features by importance, and trains an XGBoost model on the reduced feature set; Boruta feature selection is then applied to a separate dataset, `sim_df`. The code uses the scikit-learn, xgboost, and BorutaPy libraries to replicate the R functionality.

## Setup and Imports

Install and import necessary Python libraries.

In [None]:
# Install required packages if running in a fresh environment
!pip install pandas numpy scikit-learn xgboost borutapy matplotlib seaborn sklearn-metrics

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
from borutapy import BorutaPy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve
%matplotlib inline

# Set random seed for reproducibility
np.random.seed(1966)

## Load and Prepare Data

Load the preprocessed 'santander' dataset and split into train and test sets.

In [None]:
# Read the preprocessed Santander data (a CSV stand-in for santander_prepd.RData).
# Point the path at wherever the file actually lives.
santander = pd.read_csv('santander_prepd.csv')  # Update path as needed

# Show how many rows fall into each target class
print(santander['y'].value_counts())

# Store the target column as a pandas categorical
santander['y'] = santander['y'].astype('category')

# Stratified 80/20 split so both parts keep the class proportions of y
train, test = train_test_split(
    santander,
    test_size=0.2,
    stratify=santander['y'],
    random_state=1966,
)

# Drop predictors whose variance on the training rows is zero.
# threshold=0 removes constant columns only; the selector is fitted on the
# training rows and the same column list is reused for the test rows below.
selector = VarianceThreshold(threshold=0)
train_features = selector.fit_transform(train.drop('y', axis=1))
keep_mask = selector.get_support()
selected_features = train.drop('y', axis=1).columns[keep_mask]
train = pd.concat([train[selected_features], train['y']], axis=1)

# Feature matrices and targets for modelling
X_train, y_train = train.drop('y', axis=1), train['y']
X_test, y_test = test[selected_features], test['y']

## Random Forest Model

Train a Random Forest model and evaluate feature importance.

In [None]:
# Seed NumPy's global RNG (the forests themselves are seeded via random_state)
np.random.seed(1999)

# Train Random Forest on all retained features.
# oob_score=True makes the forest keep out-of-bag class probabilities, so the
# training-set evaluation below uses, for every row, only trees that did not
# see that row. In-sample predict_proba on X_train would overstate AUC.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight={0: 1, 1: 3},
    oob_score=True,
    random_state=1999,
)
rf.fit(X_train, y_train)

# Feature importance (impurity-based importances reported by the forest)
feature_importance = pd.DataFrame({
    'var': X_train.columns,
    'MeanDecreaseGini': rf.feature_importances_
})
print(feature_importance.describe())

# Select important features (importance strictly above the mean importance)
threshold = feature_importance['MeanDecreaseGini'].mean()
important_features = feature_importance[feature_importance['MeanDecreaseGini'] > threshold]['var']
X_train_reduced = X_train[important_features]
X_test_reduced = X_test[important_features]

# Train reduced Random Forest on the selected features only
np.random.seed(567)
rf_reduced = RandomForestClassifier(n_estimators=110, class_weight={0: 1, 1: 3}, random_state=567)
rf_reduced.fit(X_train_reduced, y_train)

# Out-of-bag probability of the positive class from the full forest.
# A row that was never out-of-bag would yield NaN; fail loudly in that case.
rf_prob = rf.oob_decision_function_[:, 1]
assert not np.isnan(rf_prob).any(), 'some rows have no out-of-bag prediction; increase n_estimators'
y_num = y_train.astype(int)

# Evaluate
print('AUC:', roc_auc_score(y_num, rf_prob))
print('Log Loss:', log_loss(y_num, rf_prob))

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='MeanDecreaseGini', y='var', data=feature_importance.sort_values('MeanDecreaseGini', ascending=False))
plt.title('Random Forest Feature Importance')
plt.show()

## Corrected Probability Function

Define a function to correct class probabilities based on population and sample fractions.

In [None]:
def corrected_prob(result, population_fraction, sample_fraction):
    """Rescale predicted probabilities from the sample class balance to the population's.

    Evaluates ``1 / (1 + k * (1 / result - 1))`` with
    ``k = (1 / population_fraction - 1) / (1 / sample_fraction - 1)``,
    rearranged as ``result / (result + k * (1 - result))`` so that a predicted
    probability of exactly 0 maps to 0 instead of dividing by zero.

    Parameters
    ----------
    result : float or numpy.ndarray
        Predicted probability (or probabilities) of the positive class.
    population_fraction : float
        Positive-class fraction in the population; expected in (0, 1).
    sample_fraction : float
        Positive-class fraction in the sample; expected in (0, 1).

    Returns
    -------
    float or numpy.ndarray
        Corrected probability, with the same shape as ``result``.
    """
    # Ratio of the population odds term to the sample odds term
    odds_ratio = (1 / population_fraction - 1) / (1 / sample_fraction - 1)
    value = result / (result + odds_ratio * (1 - result))
    return value

# Fractions handed to corrected_prob for every correction in this cell
fractions = {'population_fraction': 0.04, 'sample_fraction': 0.33}

# Correct the training-set probabilities from the Random Forest
y_prob_corrected = corrected_prob(rf_prob, **fractions)

# Score the corrected training probabilities
train_auc = roc_auc_score(y_num, y_prob_corrected)
print('Corrected AUC:', train_auc)
train_logloss = log_loss(y_num, y_prob_corrected)
print('Corrected Log Loss:', train_logloss)

# Predict on the test set with the full forest, then apply the same correction
y_test_num = y_test.astype(int)
rf_test_prob = rf.predict_proba(X_test)[:, 1]
rf_test_corrected = corrected_prob(rf_test_prob, **fractions)

# Score the corrected test probabilities
test_auc = roc_auc_score(y_test_num, rf_test_corrected)
print('Test AUC:', test_auc)
test_logloss = log_loss(y_test_num, rf_test_corrected)
print('Test Log Loss:', test_logloss)

## XGBoost Model

Train an XGBoost model with hyperparameter tuning and evaluate performance.

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid (only learning_rate and gamma are actually varied)
param_grid = {
    'n_estimators': [100],
    'colsample_bytree': [1],
    'min_child_weight': [1],
    'learning_rate': [0.1, 0.3, 0.5],
    'gamma': [0.25, 0.5],
    'subsample': [1],
    'max_depth': [3]
}

# Initialize XGBoost
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='error', random_state=123)

# Grid search: 5-fold cross-validation, candidates ranked by ROC AUC
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='roc_auc', verbose=2)
grid_search.fit(X_train_reduced, y_train)

print('Best parameters:', grid_search.best_params_)

# Train final model with the hyperparameters the grid search selected, so the
# final fit cannot silently disagree with best_params_ when the data changes.
# importance_type is set explicitly because the importances are labelled 'Gain' below.
xgb_final = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='error',
    importance_type='gain',
    random_state=1232,
    **grid_search.best_params_
)
xgb_final.fit(X_train_reduced, y_train)

# Feature importance
imp_matrix = pd.DataFrame({
    'Feature': X_train_reduced.columns,
    'Gain': xgb_final.feature_importances_
})
plt.figure(figsize=(10, 6))
sns.barplot(x='Gain', y='Feature', data=imp_matrix.sort_values('Gain', ascending=False))
plt.title('XGBoost Feature Importance')
plt.show()

# Predictions and evaluation on the training rows (in-sample)
xgb_pred = xgb_final.predict_proba(X_train_reduced)[:, 1]
print('Train AUC:', roc_auc_score(y_num, xgb_pred))
print('Train Log Loss:', log_loss(y_num, xgb_pred))

# Test set predictions
xgb_test_pred = xgb_final.predict_proba(X_test_reduced)[:, 1]
print('Test AUC:', roc_auc_score(y_test_num, xgb_test_pred))
print('Test Log Loss:', log_loss(y_test_num, xgb_test_pred))

## ROC Curve Comparison

Compare ROC curves for Random Forest and XGBoost models.

In [None]:
# Test-set ROC points for each model
fpr_rf, tpr_rf, _ = roc_curve(y_test_num, rf_test_corrected)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test_num, xgb_test_pred)

# Draw both curves on one axis, with the diagonal as the chance reference
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_rf, tpr_rf, color='black', label='Random Forest')
ax.plot(fpr_xgb, tpr_xgb, color='green', label='XGBoost')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()

## Boruta Feature Selection

Apply Boruta feature selection and train a Random Forest on selected features.

In [None]:
# Load sim_df for Boruta (replace with actual file path)
sim_df = pd.read_csv('sim_df.csv')  # Update path as needed
sim_df['y'] = sim_df['y'].astype('category')

# Split data (stratified 70/30).
# NOTE: this rebinds train/test, X_train/X_test, y_train/y_test, selected_features
# and y_test_num, which the Santander sections above also define. Re-run those
# sections from the top before re-running any Santander cell after this one.
train, test = train_test_split(sim_df, test_size=0.3, stratify=sim_df['y'], random_state=1066)
X_train = train.drop('y', axis=1)
y_train = train['y']
X_test = test.drop('y', axis=1)
y_test = test['y']

# Boruta feature selection
np.random.seed(5150)
rf_boruta = RandomForestClassifier(n_jobs=-1, random_state=5150)
boruta = BorutaPy(rf_boruta, n_estimators='auto', random_state=5150)
# BorutaPy expects NumPy arrays. y_train is categorical, so y_train.values would be
# a pandas Categorical rather than an ndarray; pass integer-coded labels instead.
boruta.fit(X_train.to_numpy(), y_train.astype(int).to_numpy())

# Selected features (confirmed features only, as flagged by support_)
selected_features = X_train.columns[boruta.support_]
print('Selected features:', selected_features)

# Train Random Forest on selected features
np.random.seed(999)
boruta_rf = RandomForestClassifier(random_state=999)
boruta_rf.fit(X_train[selected_features], y_train)

# Predictions and evaluation on the held-out rows
boruta_pred = boruta_rf.predict_proba(X_test[selected_features])[:, 1]
y_test_num = y_test.astype(int)
print('Boruta RF AUC:', roc_auc_score(y_test_num, boruta_pred))
print('Boruta RF Log Loss:', log_loss(y_test_num, boruta_pred))