# Gradient Boosting Classification on Autism Children Dataset with Hyperparameter Experiments


In [85]:

#  Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, f1_score, classification_report, make_scorer
)

# Load dataset
# Make sure 'autism_children.csv' is in your working directory
df = pd.read_csv('autism_children.csv')

# Drop columns that leak the target and useless index
#    - 'Unnamed: 0' is just the original index
#    - 'result' is a sum of A1–A10 scores and directly correlates with 'class'
df = df.drop(columns=['Unnamed: 0', 'result'])

# Remove exact duplicate rows (optional but recommended)
df = df.drop_duplicates()



#  Split features and target

In [86]:
y = df['class']
X = df.drop(columns=['class'])

# One-hot encode categorical features
X = pd.get_dummies(X)

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Initialize and train the baseline GradientBoostingClassifier
print("\n[Baseline Model Training]")
baseline_model = GradientBoostingClassifier(random_state=42)
baseline_model.fit(X_train, y_train)
print("✅ Baseline model trained successfully!")

print("\n=== Baseline Evaluation ===")
y_pred_base = baseline_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_base):.4f}")
print(f"Precision (weighted): {precision_score(y_test, y_pred_base, average='weighted'):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_base, average='weighted'):.4f}")
print(classification_report(y_test, y_pred_base))




[Baseline Model Training]
✅ Baseline model trained successfully!

=== Baseline Evaluation ===
Accuracy: 0.8793
Precision (weighted): 0.8799
F1 Score (weighted): 0.8793
              precision    recall  f1-score   support

           0       0.90      0.87      0.88        30
           1       0.86      0.89      0.88        28

    accuracy                           0.88        58
   macro avg       0.88      0.88      0.88        58
weighted avg       0.88      0.88      0.88        58



#  Hyperparameter Experiment: GridSearchCV

In [87]:
print("\n[Hyperparameter Experiments with GridSearchCV]")

# Define parameter grid as required by assignment
grid_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0]
}

# Set up scorer and grid search
scorer = make_scorer(f1_score, average='weighted')
gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=grid_params,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# Run grid search
grid_search.fit(X_train, y_train)
print("✅ Grid search completed!")
print(f"Best parameters found: {grid_search.best_params_}")

# Evaluate best model on the test set
y_pred_best = grid_search.best_estimator_.predict(X_test)
print("\n=== Test Set Evaluation with Best Model ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
print(f"Precision (weighted): {precision_score(y_test, y_pred_best, average='weighted'):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_best, average='weighted'):.4f}")
print(classification_report(y_test, y_pred_best))

# Compare baseline vs. tuned performance
def compare(name, base, tuned):
    print(f"{name}: baseline={base:.4f}, tuned={tuned:.4f}, delta={tuned-base:+.4f}")

print("\n=== Performance Comparison ===")
compare('Accuracy', accuracy_score(y_test, y_pred_base), accuracy_score(y_test, y_pred_best))
compare('Precision', precision_score(y_test, y_pred_base, average='weighted'), precision_score(y_test, y_pred_best, average='weighted'))
compare('F1 Score', f1_score(y_test, y_pred_base, average='weighted'), f1_score(y_test, y_pred_best, average='weighted'))

# Optionally save CV results for your report
import pandas as pd
cv_df = pd.DataFrame(grid_search.cv_results_)
cv_df.to_csv('autism_gb_gridsearch_results.csv', index=False)
print("Saved full CV results to 'autism_gb_gridsearch_results.csv'.")



[Hyperparameter Experiments with GridSearchCV]
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimator