In [None]:
import os
import numpy as np
# Seed NumPy's global RNG up front so later steps that draw from it are repeatable.
np.random.seed(0)
import pandas as pd
# print(pd.__version__)
import matplotlib.pyplot as plt
from sklearn import set_config
# Render estimators and pipelines as diagrams when a cell displays them.
set_config(display='diagram')

In [None]:
# Location of the adult_XLS.xls workbook. Set the ADULT_XLS_PATH environment
# variable to point at a local copy; when it is unset, the original hard-coded
# path is used, so existing behavior is unchanged.
DATA_PATH = os.path.abspath(
    os.environ.get(
        'ADULT_XLS_PATH',
        r'C:\Users\christoph.Kempkes\OneDrive - Magna\01_Fortbildungen\04_Udemy\Machine Learning mit Python\Chapter13_CaseStudies\adult_XLS.xls',
    )
)

In [None]:
# Load the workbook at DATA_PATH into a DataFrame.
df = pd.read_excel(DATA_PATH)

In [None]:
# Row positions whose native-country is 'Holand-Netherlands'; these rows are
# removed from both x and y further down.
idx = np.flatnonzero(df['native-country'] == 'Holand-Netherlands')

In [None]:
data = df.to_numpy()
# The last column holds the y values; every column before it is a feature.
# The rows listed in idx are dropped from both arrays so they stay aligned.
x = np.delete(data[:, :-1], idx, axis=0)
y = np.delete(data[:, -1], idx, axis=0)

print(f'x-Shape: {x.shape}\ny-Shape: {y.shape}')

In [None]:
# Prepare the y data
def one_hot(y):
    """Encode labels as integers: "<=50K" becomes 0, every other value 1.

    Despite the name, this yields a single 0/1 value per label rather than a
    one-hot matrix.

    Args:
        y: Iterable of labels.

    Returns:
        1-D np.ndarray of dtype int32 with one entry per label.
    """
    encoded = (int(not (label == "<=50K")) for label in y)
    return np.fromiter(encoded, dtype=np.int32)

In [None]:
print(y)
# one_hot() maps everything that is not the string "<=50K" to 1, so running it
# again on the already encoded int32 array would turn every label into 1.
# Encode only while y still holds the raw labels so this cell can be re-run safely.
if y.dtype != np.int32:
    y = one_hot(y)
print(y)

In [None]:
# GridSearch Helper
def print_grid_cv_results(grid_result):
    """Print a summary of a fitted grid search.

    The first two lines show the best score and the best parameter set. They
    are followed by one line per candidate with its mean test score and twice
    its standard deviation, both based on values rounded to four decimals.

    Args:
        grid_result: Object exposing ``best_score_``, ``best_params_`` and a
            ``cv_results_`` mapping with the keys ``mean_test_score``,
            ``std_test_score`` and ``params``.

    Returns:
        None. The summary is written to stdout.
    """
    # The explicit newline keeps score and params on separate lines; adjacent
    # f-strings are concatenated without any separator.
    print(
        f'Best model score: {grid_result.best_score_}\n'
        f'Best model params: {grid_result.best_params_}'
    )
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']

    for mean, std, param in zip(means, stds, params):
        mean = round(mean, 4)
        std = round(std, 4)
        print(f'{mean} (+/- {2 * std}) with: {param}')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Hold out 30% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [None]:
# Hyper-parameter grid for the grid searches. The 'classifier__' prefix
# addresses the pipeline step that is named 'classifier'.
params = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 100, 200]
}

# Estimator instance used as the 'classifier' step of the pipelines below.
clf = RandomForestClassifier()

# Ordinal Features

In [None]:
# Column positions in x: 0 and 8 are scaled as numbers, the rest are encoded
# as categories.
numerical_features = [0, 8]
categorical_features = [1, 2, 3, 4, 5, 6, 7, 9]

# Single-step pipelines so more steps can be added per column group later.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('oridnal', OrdinalEncoder())])

# Applies each transformer to its own column group.
preprocessor_ordinal = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features),
    ]
)

In [None]:
# Display the ordinal column transformer.
preprocessor_ordinal

In [None]:
# Fit the preprocessing on the training split only, then apply it to both splits.
preprocessor_ordinal.fit(x_train)
x_train_ordinal = preprocessor_ordinal.transform(x_train)
x_test_ordinal = preprocessor_ordinal.transform(x_test)
# The second value is the shape of the transformed test features, so label it as such.
print(f'x_train_ord: {x_train_ordinal.shape}\nx_test_ord: {x_test_ordinal.shape}')

In [None]:
# Ordinal pipeline: column preprocessing followed by the classifier.
pipe_ordinal = Pipeline(
    steps = [
        ('preprocessor_ordinal', preprocessor_ordinal),
        ('classifier', clf) # This step name ("classifier") is the prefix that must be used in the params grid
    ]
)

In [None]:
# Display the ordinal pipeline.
pipe_ordinal

In [None]:
# Grid search over `params` with cv=3 on the ordinal pipeline, using the training split.
grid_ordinal = GridSearchCV(pipe_ordinal, params, cv=3)
grid_results_oridnal = grid_ordinal.fit(x_train, y_train)
print_grid_cv_results(grid_results_oridnal)

# OneHot Features

In [None]:
# Column positions in x: 0 and 8 are scaled as numbers, the rest are one-hot encoded.
categorical_features = [1, 2, 3, 4, 5, 6, 7, 9]
numerical_features = [0, 8]

numeric_transformer = Pipeline(
    steps = [
        ('scaler', StandardScaler())
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back to the
# old one so the notebook runs on both old and new releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer = Pipeline(
    steps = [
        ('onehot', onehot_encoder)
    ]
)

# Applies each transformer to its own column group.
preprocessor_onehot = ColumnTransformer(
    transformers = [
        ('numeric', numeric_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ]
)

In [None]:
# Display the one-hot column transformer.
preprocessor_onehot

In [None]:
# Fit the preprocessing on the training split only, then apply it to both splits.
preprocessor_onehot.fit(x_train)
x_train_onehot = preprocessor_onehot.transform(x_train)
x_test_onehot = preprocessor_onehot.transform(x_test)

# Report the shapes of the transformed train and test features.
print(f'x_train_onehot: {x_train_onehot.shape}\nx_test_onehot: {x_test_onehot.shape}')


In [None]:
# One-hot pipeline: column preprocessing followed by the classifier.
pipe_onehot = Pipeline(
    steps = [
        # This must be the one-hot preprocessor; with preprocessor_ordinal here the
        # pipeline would simply repeat the ordinal experiment.
        ('preprocessor_onehot', preprocessor_onehot),
        ('classifier', clf) # This step name ("classifier") is the prefix that must be used in the params grid
    ]
)

In [None]:
# Display the one-hot pipeline.
pipe_onehot

In [None]:
# Grid search over `params` with cv=3 on the one-hot pipeline, using the training split.
# It is stored under its own name so the ordinal search held in grid_ordinal
# is not overwritten.
grid_onehot = GridSearchCV(pipe_onehot, params, cv=3)
grid_results_onehot = grid_onehot.fit(x_train, y_train)
print_grid_cv_results(grid_results_onehot)

# TensorFlow Model

In [None]:
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

In [None]:
# Turn the label vectors into single-column 2-D arrays for the network below.
# NOTE: y_train / y_test are overwritten, so every later cell sees the 2-D labels.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [None]:
def build_model(input_dim, output_dim):
    """Build an uncompiled feed-forward network.

    Layer stack: Dense(128) -> relu -> Dense(64) -> relu -> Dense(output_dim)
    -> sigmoid.

    Args:
        input_dim: Number of input features, passed to the first Dense layer.
        output_dim: Number of units in the final Dense layer.

    Returns:
        The Sequential model; compiling is left to the caller.
    """
    layers = [
        Dense(units=128, input_dim=input_dim),
        Activation('relu'),
        Dense(units=64),
        Activation('relu'),
        Dense(units=output_dim),
        Activation('sigmoid'),
    ]
    return Sequential(layers)

## NN with ordinal features

In [None]:
# Size the network from the data: one input per ordinal feature column and
# one output per label column.
model = build_model(
    input_dim=x_train_ordinal.shape[1],
    output_dim=y_train.shape[1]
)

# Binary cross-entropy matches the sigmoid output and the 0/1 labels.
model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=0.001),
    metrics=['binary_accuracy']
)

# Train for 20 epochs; the test split is used as validation data, so the
# history contains 'val_binary_accuracy' per epoch.
history_ordinal = model.fit(
    x=x_train_ordinal,
    y=y_train,
    epochs=20,
    validation_data=(x_test_ordinal, y_test)
)

In [None]:
# Validation accuracy per epoch for the network trained on ordinal features.
val_binary_accuracy = history_ordinal.history['val_binary_accuracy']
plt.plot(range(len(val_binary_accuracy)), val_binary_accuracy)
# Label the figure so it can be read on its own and told apart from the one-hot plot.
plt.xlabel('Epoch')
plt.ylabel('Validation binary accuracy')
plt.title('NN with ordinal features')
plt.show()

## NN with onehot features

In [None]:
# Size the network from the data: one input per one-hot feature column and
# one output per label column. This replaces the previous `model`.
model = build_model(
    input_dim=x_train_onehot.shape[1],
    output_dim=y_train.shape[1]
)

# Binary cross-entropy matches the sigmoid output and the 0/1 labels.
model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=0.001),
    metrics=['binary_accuracy']
)

# Train for 20 epochs; the test split is used as validation data, so the
# history contains 'val_binary_accuracy' per epoch.
history_onehot = model.fit(
    x=x_train_onehot,
    y=y_train,
    epochs=20,
    validation_data=(x_test_onehot, y_test)
)

In [None]:
# Validation accuracy per epoch for the network trained on one-hot features.
val_binary_accuracy = history_onehot.history['val_binary_accuracy']
plt.plot(range(len(val_binary_accuracy)), val_binary_accuracy)
# Label the figure so it can be read on its own and told apart from the ordinal plot.
plt.xlabel('Epoch')
plt.ylabel('Validation binary accuracy')
plt.title('NN with onehot features')
plt.show()

## Pass in user-data

In [None]:
# y_train / y_test were reshaped to a single column for the neural network.
# Flatten them again here, since the scikit-learn classifier expects 1-D labels;
# np.ravel is a no-op for labels that are already 1-D.
pipe_ordinal.fit(x_train, np.ravel(y_train))
score = pipe_ordinal.score(x_test, np.ravel(y_test))

print(f'score: {score}')

In [None]:
# One hand-written record with ten values; positions 0 and 8 are the numeric
# ones, matching numerical_features, and the rest are category strings.
# NOTE(review): presumably in the same column order as x - confirm against the workbook.
x_sample = [
    25,
    'Private',
    '11th',
    'Never-married',
    'Machine-op-inspct',
    'Own-child',
    'Black',
    'Male',
    40,
    'United-States'
]
# Expected label for the record; 0 is the encoding of "<=50K".
y_sample = 0

# predict() expects a 2-D input, hence the extra list around the single record.
y_pred_sample = pipe_ordinal.predict([x_sample])
print(f'y_sample: {y_sample}\ny_pred: {y_pred_sample}')