In [1]:
import os
import numpy as np
# Seed NumPy's global random generator so code drawing from it is repeatable
# when the notebook is run top to bottom on a fresh kernel.
np.random.seed(0)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import set_config
# Ask scikit-learn to display estimators using its "diagram" representation.
set_config(display="diagram")

In [2]:
# Absolute path of the adult.xlsx spreadsheet loaded below.
# Set the ADULT_DATA_PATH environment variable to point at a local copy of the
# file; when it is not set, the original hard-coded location is used so the
# previous behaviour is unchanged.
DATA_PATH = os.path.abspath(
    os.environ.get(
        "ADULT_DATA_PATH",
        r"C:\Users\jan\Dropbox\_Coding\UdemyML\Chapter13_CaseStudies\CaseStudyIncome\adult.xlsx",
    )
)

### Dataset

In [3]:
# Read the spreadsheet at DATA_PATH into a DataFrame (pandas defaults for sheet and header).
df = pd.read_excel(io=DATA_PATH)

In [4]:
# Column positions (within x) handled by each preprocessing branch.
numerical_features = [0, 8]
categorical_features = [1, 2, 3, 4, 5, 6, 7, 9]

# Work on the raw array: every column except the last is a feature,
# the last column is the target.
data = df.to_numpy()
x, y = data[:, :-1], data[:, -1]

for label, array in (("x", x), ("y", y)):
    print(f"{label} shape: {array.shape}")

x shape: (48842, 10)
y shape: (48842,)


### y-Data

In [5]:
def one_hot(y):
    """Encode income labels as binary integers.

    Despite the name, this returns a flat vector of class indices, not
    one-hot vectors.

    Args:
        y: Iterable of labels.

    Returns:
        1-D ``np.int32`` array with 0 where the label equals ``"<=50K"``
        and 1 for every other value.
    """
    low_income_label = "<=50K"
    flags = (0 if label == low_income_label else 1 for label in y)
    return np.fromiter(flags, dtype=np.int32)

In [6]:
# Encode from the raw label column of `data` rather than from `y` itself.
# `y = one_hot(y)` is not idempotent: running the cell a second time would feed
# the integer labels back in and turn every entry into 1.
y = one_hot(data[:, -1])

### Helper

In [7]:
def print_grid_cv_results(grid_result):
    """Print a summary of a finished grid search.

    The first line shows the best score and the best parameter set. Each
    following line shows one candidate: its mean test score rounded to four
    decimals, twice its (rounded) test-score standard deviation, and its
    parameters.

    Args:
        grid_result: Object exposing ``best_score_``, ``best_params_`` and a
            ``cv_results_`` mapping with the keys ``"mean_test_score"``,
            ``"std_test_score"`` and ``"params"``.

    Returns:
        None. The report is written to stdout.
    """
    headline = (
        f"Best model score: {grid_result.best_score_} "
        f"Best model params: {grid_result.best_params_} "
    )
    print(headline)

    cv = grid_result.cv_results_
    candidates = zip(cv["mean_test_score"], cv["std_test_score"], cv["params"])
    for raw_mean, raw_std, candidate in candidates:
        mean_4dp = round(raw_mean, 4)
        # Spread is reported as two (rounded) standard deviations.
        spread = 2 * round(raw_std, 4)
        print(f"{mean_4dp} (+/- {spread}) with: {candidate}")

### Sklearn Imports

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [9]:
# Hold out 30% of the rows for testing.
# An explicit integer seed (the same value as the global np.random.seed(0))
# makes the split identical every time this cell is executed, instead of
# depending on how much of the global random state has already been consumed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

### Classifier and Params

In [10]:
# Candidate random-forest hyper-parameters
# (presumably consumed by GridSearchCV further down; verify against later cells).
params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 100, 200],
)

# Base estimator with scikit-learn's default settings.
clf = RandomForestClassifier()

### Ordinal Features

In [11]:
# Numerical columns are standardised.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are mapped to integer codes.
# A category that was not present when the encoder was fitted (e.g. a rare
# value that only lands in the test split or in a validation fold) is encoded
# as -1 instead of raising an error in transform().
# Needs a scikit-learn version whose OrdinalEncoder supports `handle_unknown`.
categorical_transformer = Pipeline(
    steps=[
        (
            "ordinal",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        )
    ]
)

# Route each column group to its transformer; output columns follow the order
# of this list (numeric first, then categorical).
# NOTE: the variable name is spelled "odinal"; it is kept because other cells
# reference it.
preprocessor_odinal = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numerical_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

In [12]:
# Bare expression: the notebook displays the ColumnTransformer
# (as a diagram, given set_config(display="diagram") in the first cell).
preprocessor_odinal

In [13]:
# Learn the scaling statistics and category codes from the training rows only,
# then apply that same fitted transformer to both splits.
x_train_ordinal = preprocessor_odinal.fit(x_train).transform(x_train)
x_test_ordinal = preprocessor_odinal.transform(x_test)

# Report the transformed shapes: training split first, then test split.
for transformed in (x_train_ordinal, x_test_ordinal):
    print(f"Shape of odinal data: {transformed.shape}")

Shape of odinal data: (34189, 10)
Shape of odinal data: (14653, 10)


### OneHot Features