In [1]:
import os

import numpy as np


np.random.seed(0)
import pandas as pd
from sklearn import set_config

  from pandas.core import (


In [2]:
set_config(display="diagram")

In [3]:
DATA_PATH = os.path.abspath(
    "C:/Users/Jan/OneDrive/_Coding/UdemyML/"
    "Chapter13_CaseStudies/CaseStudyIncome/adult.xlsx"
)

### Dataset


In [4]:
df = pd.read_excel(DATA_PATH)

In [5]:
idx = np.where(df["native-country"] == "Holand-Netherlands")[0]

In [6]:
data = df.to_numpy()

x = data[:, :-1]
x = np.delete(x, idx, axis=0)
y = data[:, -1]
y = np.delete(y, idx, axis=0)

categorical_features = [1, 2, 3, 4, 5, 6, 7, 9]
numerical_features = [0, 8]

print(f"x shape: {x.shape}")
print(f"y shape: {y.shape}")

x shape: (48841, 10)
y shape: (48841,)


### y-Data


In [7]:
def one_hot(y: np.ndarray) -> np.ndarray:
    return np.array([0 if val == "<=50K" else 1 for val in y], dtype=np.int32)

In [8]:
y = one_hot(y)

### Helper


In [9]:
def print_grid_cv_results(grid_result) -> None:
    print(
        f"Best model score: {grid_result.best_score_} "
        f"Best model params: {grid_result.best_params_} "
    )
    means = grid_result.cv_results_["mean_test_score"]
    stds = grid_result.cv_results_["std_test_score"]
    params = grid_result.cv_results_["params"]

    for mean, std, param in zip(means, stds, params):
        mean_ = round(mean, 4)
        std_ = round(std, 4)
        print(f"{mean_} (+/- {2.0 * std_}) with: {param}")

### Sklearn Imports


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

### Classifier and Params


In [12]:
params = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__max_depth": [None, 100, 200],
}

clf = RandomForestClassifier()

### Ordinal Features


In [13]:
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

categorical_transformer = Pipeline(steps=[("ordinal", OrdinalEncoder())])

preprocessor_odinal = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numerical_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

In [14]:
preprocessor_odinal

In [15]:
preprocessor_odinal.fit(x_train)

x_train_ordinal = preprocessor_odinal.transform(x_train)
x_test_ordinal = preprocessor_odinal.transform(x_test)

print(f"Shape of odinal data: {x_train_ordinal.shape}")
print(f"Shape of odinal data: {x_test_ordinal.shape}")

Shape of odinal data: (34188, 10)
Shape of odinal data: (14653, 10)


In [16]:
pipe_ordinal = Pipeline(
    steps=[("preprocessor_odinal", preprocessor_odinal), ("classifier", clf)]
)

In [17]:
pipe_ordinal

In [18]:
grid_ordinal = GridSearchCV(pipe_ordinal, params, cv=3)
grid_results_ordinal = grid_ordinal.fit(x_train, y_train)
print_grid_cv_results(grid_results_ordinal)

Best model score: 0.817947817947818 Best model params: {'classifier__max_depth': 200, 'classifier__n_estimators': 200} 
0.8169 (+/- 0.0022) with: {'classifier__max_depth': None, 'classifier__n_estimators': 50}
0.8173 (+/- 0.002) with: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
0.8178 (+/- 0.0016) with: {'classifier__max_depth': None, 'classifier__n_estimators': 200}
0.8174 (+/- 0.0024) with: {'classifier__max_depth': 100, 'classifier__n_estimators': 50}
0.8179 (+/- 0.002) with: {'classifier__max_depth': 100, 'classifier__n_estimators': 100}
0.8179 (+/- 0.0036) with: {'classifier__max_depth': 100, 'classifier__n_estimators': 200}
0.8169 (+/- 0.001) with: {'classifier__max_depth': 200, 'classifier__n_estimators': 50}
0.8174 (+/- 0.0034) with: {'classifier__max_depth': 200, 'classifier__n_estimators': 100}
0.8179 (+/- 0.0012) with: {'classifier__max_depth': 200, 'classifier__n_estimators': 200}


### OneHot Features


In [19]:
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor_onehot = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numerical_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

In [20]:
preprocessor_onehot

In [21]:
preprocessor_onehot.fit(x_train)

x_train_onehot = preprocessor_onehot.transform(x_train)
x_test_onehot = preprocessor_onehot.transform(x_test)

print(f"Shape of onehot data: {x_train_onehot.shape}")
print(f"Shape of onehot data: {x_test_onehot.shape}")

Shape of onehot data: (34188, 103)
Shape of onehot data: (14653, 103)


In [22]:
pipe_onehot = Pipeline(
    steps=[("preprocessor_onehot", preprocessor_odinal), ("classifier", clf)]
)

In [23]:
pipe_onehot

In [24]:
grid_onehot = GridSearchCV(pipe_onehot, params, cv=3)
grid_results_onehot = grid_onehot.fit(x_train, y_train)
print_grid_cv_results(grid_results_onehot)

Best model score: 0.818094068094068 Best model params: {'classifier__max_depth': 100, 'classifier__n_estimators': 200} 
0.8173 (+/- 0.0016) with: {'classifier__max_depth': None, 'classifier__n_estimators': 50}
0.8167 (+/- 0.0016) with: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
0.8173 (+/- 0.0038) with: {'classifier__max_depth': None, 'classifier__n_estimators': 200}
0.818 (+/- 0.0024) with: {'classifier__max_depth': 100, 'classifier__n_estimators': 50}
0.8177 (+/- 0.005) with: {'classifier__max_depth': 100, 'classifier__n_estimators': 100}
0.8181 (+/- 0.0016) with: {'classifier__max_depth': 100, 'classifier__n_estimators': 200}
0.8163 (+/- 0.002) with: {'classifier__max_depth': 200, 'classifier__n_estimators': 50}
0.8179 (+/- 0.003) with: {'classifier__max_depth': 200, 'classifier__n_estimators': 100}
0.8175 (+/- 0.002) with: {'classifier__max_depth': 200, 'classifier__n_estimators': 200}
