# Gradient Boosting for Maternal Health Dataset 

In [29]:
# Gradient Boosting Classification on Autism Children Dataset
# Includes hold-out evaluation, hyperparameter grid search experiments, and metrics comparison

# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, f1_score,
    classification_report, make_scorer
)

# 2. Load dataset
# Ensure 'autism_children.csv' is in your working directory
df = pd.read_csv('autism_children.csv')

# 3. Drop columns that leak the target and useless index
#    - 'Unnamed: 0' is just the original index
#    - 'result' is a sum of A1–A10 scores and directly correlates with 'class'
df = df.drop(columns=['Unnamed: 0', 'result'])

# 4. Remove exact duplicate rows (optional but recommended)
df = df.drop_duplicates()

# 5. Split features and target
y = df['class']
X = df.drop(columns=['class'])

# 6. One-hot encode categorical features
X = pd.get_dummies(X)

# 7. Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)



# GradientBoost 

In [30]:
# 8. Initialize and train the baseline GradientBoostingClassifier
print("\n[Baseline Model Training]")
baseline_model = GradientBoostingClassifier(random_state=42)
baseline_model.fit(X_train, y_train)
print("✅ Baseline model trained successfully!")

# 9. Evaluate baseline model on the test set
y_pred_baseline = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_precision = precision_score(y_test, y_pred_baseline, average='weighted')
baseline_f1 = f1_score(y_test, y_pred_baseline, average='weighted')

print("\n=== Baseline Evaluation Results on Autism Children Dataset ===")
print(f"Accuracy: {baseline_accuracy:.4f}")
print(f"Precision (weighted): {baseline_precision:.4f}")
print(f"F1 Score (weighted): {baseline_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_baseline))




[Baseline Model Training]
✅ Baseline model trained successfully!

=== Baseline Evaluation Results on Autism Children Dataset ===
Accuracy: 0.8793
Precision (weighted): 0.8799
F1 Score (weighted): 0.8793

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.88        30
           1       0.86      0.89      0.88        28

    accuracy                           0.88        58
   macro avg       0.88      0.88      0.88        58
weighted avg       0.88      0.88      0.88        58



#  Hyperparameter Experiments: Grid Search

In [31]:
# 10. Hyperparameter Experiments: Grid Search
print("\n[Hyperparameter Experiments with GridSearchCV]")

# 10.1. Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0]
}

# 10.2. Ensure scorer is imported and set up base model
scorer = make_scorer(f1_score, average='weighted')
base_model = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# 10.3. Run grid search
grid_search.fit(X_train, y_train)
print("✅ Grid search completed!")

# 10.4. Collect and display top configurations
cv_results = pd.DataFrame(grid_search.cv_results_)
top_results = cv_results.loc[:, [
    'param_n_estimators', 'param_learning_rate', 'param_max_depth',
    'param_subsample', 'mean_test_score', 'std_test_score'
]]
top_results = top_results.sort_values(
    by='mean_test_score', ascending=False
).head(5)
print("\nTop 5 parameter settings by weighted F1:\n", top_results)

# 10.5. Evaluate best estimator on the held-out test set
best_model = grid_search.best_estimator_
print(f"\nBest parameters found: {grid_search.best_params_}")
y_pred_best = best_model.predict(X_test)

# Calculate metrics for best model
test_accuracy = accuracy_score(y_test, y_pred_best)
test_precision = precision_score(y_test, y_pred_best, average='weighted')
test_f1 = f1_score(y_test, y_pred_best, average='weighted')

print("\n=== Test Set Evaluation with Best Model ===")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision (weighted): {test_precision:.4f}")
print(f"F1 Score (weighted): {test_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

# 11. Compare Baseline vs. Best Model
def compare_metrics(metric_name, baseline_val, best_val):
    improvement = best_val - baseline_val
    print(f"{metric_name}: Baseline = {baseline_val:.4f}, Best = {best_val:.4f}, Improvement = {improvement:+.4f}")

print("\n=== Performance Comparison ===")
compare_metrics('Accuracy', baseline_accuracy, test_accuracy)
compare_metrics('Precision (weighted)', baseline_precision, test_precision)
compare_metrics('F1 Score (weighted)', baseline_f1, test_f1)

# 12. Optionally, save the detailed CV results for reporting
top_results.to_csv('gb_experiment_results_top5.csv', index=False)
print("Saved top 5 CV results to 'gb_experiment_results_top5.csv'.")



[Hyperparameter Experiments with GridSearchCV]
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimator