#### Veri setinin yüklenmesi

In [1]:
import pandas as pd

# Column that holds the label to predict.
target_name = "class"

# Read the adult census table from the local CSV file.
adult_census = pd.read_csv("dataset/adult-census.csv")

# Split the table into the label and the features; "education-num" is
# removed from the features together with the label column.
target = adult_census[target_name]
data = adult_census.drop([target_name, "education-num"], axis="columns")

#### Diagram için config ayarlaması

In [2]:
from sklearn import set_config

# Display estimators (such as the pipeline built later in this notebook) as a
# diagram in the notebook output.
set_config(display="diagram")

#### Transformerların ColumnTransformer'a verilmesi

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector as selector

# Select every column stored with the `object` dtype; those are the columns
# handled as categorical features.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

# Encode each category as an integer code. A category that was not seen
# while fitting is encoded as -1 instead of raising an error.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Apply the encoder to the categorical columns and pass every other column
# through unchanged. `sparse_threshold=0` forces a dense output.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_preprocessor", categorical_preprocessor, categorical_columns),
    ],
    sparse_threshold=0,
    remainder="passthrough",
)

#### Pipeline oluşturmak

In [4]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Full model: the column preprocessing followed by a histogram-based gradient
# boosting classifier. The step name "classifier" is the prefix used by the
# `classifier__*` keys of the grid-search parameter grid.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            HistGradientBoostingClassifier(max_leaf_nodes=4, random_state=42),
        ),
    ]
)

# Last expression of the cell: renders the pipeline in the notebook output.
model

#### cross_validate ile çapraz doğrulama yapılması

In [5]:
from sklearn.model_selection import cross_validate

# Evaluate the untuned pipeline with 5-fold cross-validation and store the
# per-fold timings and scores as a DataFrame (one row per fold).
cv_results = pd.DataFrame(cross_validate(model, data, target, cv=5))
cv_results

Unnamed: 0,fit_time,score_time,test_score
0,0.495473,0.080258,0.863241
1,0.596686,0.067241,0.860784
2,0.507114,0.047129,0.86036
3,0.348598,0.052081,0.862408
4,0.34003,0.047698,0.866912


#### Generalization (test) score'unun elde edilmesi

In [6]:
# Summarize the per-fold test scores of the untuned model as mean ± standard
# deviation.
baseline_test_scores = cv_results["test_score"]
print(
    "Generalization score without hyperparameters tuning:\n"
    f"{baseline_test_scores.mean():.3f} ± {baseline_test_scores.std():.3f}"
)

Generalization score without hyperparameters tuning:
0.863 ± 0.003


#### Hiperparametre ayarlaması (hyper-parameter tuning): önceki adımlarda kurulan modelin GridSearchCV ile iyileştirilmesi

In [7]:
from sklearn.model_selection import GridSearchCV

# Candidate values to try. Each key is "<pipeline step>__<parameter>", so
# both entries target the "classifier" step of the pipeline.
param_grid = dict(
    classifier__learning_rate=(0.05, 0.5),
    classifier__max_leaf_nodes=(10, 30),
)

# Exhaustive search over the 2 x 2 combinations, each scored with 2-fold
# cross-validation, using two parallel jobs.
model_grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=2,
    n_jobs=2,
)

# Run the search on the whole dataset; the fitted search object is the cell
# output.
model_grid_search.fit(data, target)

#### Sonuçların görselleştirilmesi

In [8]:
# Store the grid-search table under its own name instead of rebinding
# `cv_results`. `cv_results` still holds the frame from the earlier
# `cross_validate` cell, and overwriting it would make the cell that prints
# the untuned generalization score fail if it were re-run after this one.
grid_search_results = pd.DataFrame(model_grid_search.cv_results_)

# Show only the tried parameter values and their cross-validated scores.
grid_search_results[[
    "param_classifier__learning_rate",
    "param_classifier__max_leaf_nodes",
    "mean_test_score",
    "std_test_score",
    "rank_test_score"
]]

Unnamed: 0,param_classifier__learning_rate,param_classifier__max_leaf_nodes,mean_test_score,std_test_score,rank_test_score
0,0.05,10,0.864195,6.1e-05,4
1,0.05,30,0.87091,6.1e-05,1
2,0.5,10,0.869743,0.000532,2
3,0.5,30,0.866058,0.001515,3


In [9]:
# Parameter combination selected by the grid search (the one ranked first in
# the results table).
model_grid_search.best_params_

{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the seed is fixed so the split
# is reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42, test_size=0.2
)

# Re-run the grid search on the training part only (`fit` returns the search
# object itself), then score the result on the held-out test part.
accuracy = model_grid_search.fit(data_train, target_train).score(
    data_test, target_test
)
print(f"Accuracy on test set: {accuracy:.3f}")

Accuracy on test set: 0.877


In [11]:
# Nested cross-validation: each of the 5 outer folds fits the complete grid
# search on its training part and scores it on its test part.
# `return_estimator=True` keeps the fitted search object of every fold so the
# selected hyperparameters can be inspected afterwards.
cv_results = cross_validate(
    estimator=model_grid_search,
    X=data,
    y=target,
    cv=5,
    n_jobs=2,
    return_estimator=True,
)

In [12]:
# Convert the cross-validation output into a DataFrame (one row per outer
# fold) and summarize the test scores as mean ± standard deviation.
cv_results = pd.DataFrame(cv_results)
cv_test_scores = cv_results["test_score"]
tuned_mean, tuned_std = cv_test_scores.mean(), cv_test_scores.std()
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{tuned_mean:.3f} ± {tuned_std:.3f}"
)

Generalization score with hyperparameters tuning:
0.871 ± 0.003


In [13]:
# Report which hyperparameters the grid search selected in each outer fold
# (folds are numbered from 1 in the printed text).
for fold_number, fitted_search in enumerate(cv_results["estimator"], start=1):
    print(
        f"Best hyperparameters for fold #{fold_number}:\n"
        f"{fitted_search.best_params_}"
    )

Best hyperparameters for fold #1:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #2:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #3:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #4:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #5:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}


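Görüldüğü üzere hiperparametre ayarlaması (hyper-parameter tuning) model performansını iyileştirmiştir: çapraz doğrulama ile ölçülen genelleme skoru 0.863 ± 0.003'ten 0.871 ± 0.003'e yükselmiştir.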