In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
adult_census = pd.read_csv("..//M1.03/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

data_train, data_test, target_train, target_test = train_test_split(data, target, train_size=0.2, random_state=42)

In [5]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

column_processor = make_column_transformer((categorical_encoder, selector(dtype_include=object)),remainder="passthrough",)

ml_pipeline = Pipeline(
    [
        ("preprocessing", column_processor),
        ("gb_classifier", HistGradientBoostingClassifier(random_state=42)),
    ]
)

In [15]:
from sklearn.model_selection import cross_val_score

learning_rate_list = [0.01, 0.1, 1, 10]
leaf_nodes_list = [3, 10, 30]

top_score = 0
top_params = {}

for i in learning_rate_list:
    for leaf_nodes in leaf_nodes_list:
        print(f"Testing lr={lr:.3f}, leaf_nodes={leaf_nodes}...", end=" ")
        
        ml_pipeline.set_params(
            gb_classifier__learning_rate=lr,
            gb_classifier__max_leaf_nodes=leaf_nodes
        )
        
        scores = cross_val_score(ml_pipeline, data_train, target_train, cv=2)
        current_score = scores.mean()
        
        print(f"Score: {current_score:.3f}")
        
        if current_score > top_score:
            top_score = current_score
            top_params = {"learning_rate": lr, "max_leaf_nodes": leaf_nodes}

# Final results
print(f"\nBest Score: {top_score:.3f}")
print(f"Best Parameters: {top_params}")

Testing lr=10.000, leaf_nodes=3... Score: 0.288
Testing lr=10.000, leaf_nodes=10... Score: 0.480
Testing lr=10.000, leaf_nodes=30... Score: 0.639
Testing lr=10.000, leaf_nodes=3... Score: 0.288
Testing lr=10.000, leaf_nodes=10... Score: 0.480
Testing lr=10.000, leaf_nodes=30... Score: 0.639
Testing lr=10.000, leaf_nodes=3... Score: 0.288
Testing lr=10.000, leaf_nodes=10... Score: 0.480
Testing lr=10.000, leaf_nodes=30... Score: 0.639
Testing lr=10.000, leaf_nodes=3... Score: 0.288
Testing lr=10.000, leaf_nodes=10... Score: 0.480
Testing lr=10.000, leaf_nodes=30... Score: 0.639

Best Score: 0.639
Best Parameters: {'learning_rate': 10, 'max_leaf_nodes': 30}


In [20]:
optimal_lr = top_params["learning_rate"]
optimal_nodes = top_params["max_leaf_nodes"]

ml_pipeline.set_params(gb_classifier__learning_rate=optimal_lr,gb_classifier__max_leaf_nodes=optimal_nodes)
ml_pipeline.fit(data_train, target_train)
final_accuracy = ml_pipeline.score(data_test, target_test)

print(f"Final model test accuracy: {final_accuracy:.3f}")

Final model test accuracy: 0.584
