In [1]:
import os

import pandas as pd
import numpy as np

In [2]:
# Directory that holds the data set. The original author's local folder stays
# the default so existing setups behave exactly as before; set the
# ADULT_DATA_DIR environment variable to point somewhere else without having
# to edit the notebook.
PATH = os.path.abspath(
    os.environ.get(
        "ADULT_DATA_DIR",
        "C:/Users/Jan/Dropbox/_Programmieren/UdemyML/Chapter13_CaseStudies/CaseStudyIncome/1_Solution",
    )
)
# Full path of the Excel workbook with the census income data.
DATA_PATH = os.path.join(PATH, "adult.xlsx")

In [3]:
# Load the Excel workbook at DATA_PATH into a DataFrame.
df = pd.read_excel(DATA_PATH)

In [4]:
# Show the column labels of the loaded frame.
print(df.columns)

Index(['age', 'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')


In [5]:
# Show the dtype pandas assigned to each column.
print(df.dtypes)

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
gender            object
hours-per-week     int64
native-country    object
income            object
dtype: object


In [6]:
# Show (rows, columns) and the total number of cells (rows * columns).
print(df.shape, df.size)

(48842, 11) 537262


In [7]:
# Preview the first five rows; kept as the last expression so the notebook
# renders it as a table.
df.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,gender,hours-per-week,native-country,income
0,25,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,40,United-States,<=50K
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,40,United-States,>50K
3,44,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,40,United-States,>50K
4,18,?,Some-college,Never-married,?,Own-child,White,Female,30,United-States,<=50K


In [8]:
# Report how many distinct (non-missing) values each column holds.
for column_name, column_values in df.items():
    print(column_name, " Number of categories: ", column_values.nunique())

age  Number of categories:  74
workclass  Number of categories:  9
education  Number of categories:  16
marital-status  Number of categories:  7
occupation  Number of categories:  15
relationship  Number of categories:  6
race  Number of categories:  5
gender  Number of categories:  2
hours-per-week  Number of categories:  96
native-country  Number of categories:  42
income  Number of categories:  2


In [9]:
data = df.to_numpy()
# Every column except the last one is a feature; the last column is the target.
x, y = data[:, :-1], data[:, -1]

print(x.shape, y.shape)

# Encode the target: "<=50K" becomes class 0, everything else (">50K") class 1.
y = np.where(y == "<=50K", 0, 1).astype(np.int8)

print(y)
# Swap columns 0 (age) and 9 (native-country) so that the two numeric columns
# (hours-per-week, age) end up as the last two columns of x.
age_column = x[:, 0].copy()
x[:, 0] = x[:, 9]
x[:, 9] = age_column
print(x[0])

(48842, 10) (48842,)
[0 0 1 ... 0 0 1]
['United-States' 'Private' '11th' 'Never-married' 'Machine-op-inspct'
 'Own-child' 'Black' 'Male' 40 25]


In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [11]:
# Race: White, Black, Asian
#         0       1     2
#     [1,0,0]  [0,1,0]  [0,0,1]
# Category White = category 0 => [1, 0, 0]

In [12]:
# One-hot encode the categorical columns (everything except the last two
# columns of x) and append the two numeric columns unchanged.
enc = OneHotEncoder(handle_unknown="ignore")
categorical_part = x[:, :-2]
numeric_part = x[:, -2:]

one_hot = enc.fit_transform(categorical_part).toarray()
# numeric_part comes from an object array, so the combined matrix is of
# object dtype as well (visible in the printed row below).
x_cat = np.concatenate((one_hot, numeric_part), axis=1)
print(x_cat.shape)
print(x_cat[0])

(48842, 104)
[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 40 25]


In [13]:
# Hold out 30 % of the samples for testing. The fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_cat, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training portion only and apply the same
# transformation to both portions, so no test-set statistics are used
# during training.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [16]:
# Grid search over forest size and tree depth with 3-fold cross-validation.
parameters = {"n_estimators": [50, 100, 200], "max_depth": [None, 30, 50, 100]}
# A fixed random_state makes the forests, and therefore the reported
# cross-validation scores, reproducible between runs.
forest = RandomForestClassifier(n_jobs=-1, random_state=42)
clf = GridSearchCV(forest, parameters, cv=3)
clf.fit(x_train, y_train)

print("Best parameters set found on developement set:")
print(clf.best_params_, "\n")

# Mean and standard deviation of the validation score per parameter combination.
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

# One line per combination: mean score (+/- two standard deviations).
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std*2, params))

Best parameters set found on developement set:
{'max_depth': 30, 'n_estimators': 200} 

0.815 (+/-0.001) for {'max_depth': None, 'n_estimators': 50}
0.815 (+/-0.001) for {'max_depth': None, 'n_estimators': 100}
0.815 (+/-0.001) for {'max_depth': None, 'n_estimators': 200}
0.823 (+/-0.000) for {'max_depth': 30, 'n_estimators': 50}
0.822 (+/-0.001) for {'max_depth': 30, 'n_estimators': 100}
0.823 (+/-0.001) for {'max_depth': 30, 'n_estimators': 200}
0.815 (+/-0.000) for {'max_depth': 50, 'n_estimators': 50}
0.815 (+/-0.002) for {'max_depth': 50, 'n_estimators': 100}
0.814 (+/-0.002) for {'max_depth': 50, 'n_estimators': 200}
0.815 (+/-0.000) for {'max_depth': 100, 'n_estimators': 50}
0.816 (+/-0.000) for {'max_depth': 100, 'n_estimators': 100}
0.815 (+/-0.001) for {'max_depth': 100, 'n_estimators': 200}


In [18]:
# Cross-validate an RBF-kernel support vector classifier (2 folds).
parameters = {"kernel": ["rbf"]}
clf = GridSearchCV(SVC(), parameters, cv=2)
clf.fit(x_train, y_train)

print("Best parameters set found on developement set:")
print(clf.best_params_, "\n")

# Per-combination mean and standard deviation of the validation score.
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']

# One line per combination: mean score (+/- two standard deviations).
for params, mean, std in zip(cv_results['params'], means, stds):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

Best parameters set found on developement set:
{'kernel': 'rbf'} 

0.832 (+/-0.001) for {'kernel': 'rbf'}


In [19]:
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

In [20]:
# Classification:
# >= 2 classes: one-hot encode the y values, softmax at the end of the network, loss categorical_crossentropy
# = 2 classes: y values hold the binary class label (0, 1), sigmoid at the end of the network ~(0, 1), loss binary_crossentropy

input_shape = x_train.shape[1]  # number of input features (104)
output_shape = 1  # a single sigmoid unit for the binary target

# Fully connected network: 128 -> 64 -> 1 units with ReLU activations in
# the hidden layers.
model = Sequential()
model.add(Dense(128, input_dim=input_shape))  # Input layer
model.add(Activation("relu"))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(output_shape))
# Output layer, sigmoid (0, 1) => probability that the sample belongs to class 1
model.add(Activation("sigmoid"))

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = SGD(learning_rate=0.001)
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"]
)
# Train for 10 epochs; the held-out split is evaluated after every epoch.
model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_data=(x_test, y_test)
)

Epoch 1/10


InternalError:  Blas GEMM launch failed : a.shape=(32, 104), b.shape=(104, 128), m=32, n=128, k=104
	 [[node sequential/dense/MatMul (defined at <ipython-input-20-ab75c9d32c5a>:23) ]] [Op:__inference_train_function_609]

Function call stack:
train_function
