In [1]:
import numpy as np
from numpy.random import multivariate_normal
from numpy.random import uniform
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): 'ignore' without a category silences every warning for the
# whole session, including library deprecation and pandas assignment warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Column labels for the income data set; they are assigned positionally below.
column_names = [
    'Alter', 'Beschaeftigung', 'Gewichtung', 'Bildungsgrad', 'Bildungsdauer',
    'Familie', 'Bereich', 'Partnerschaft', 'Ethnie', 'Geschlecht',
    'Gewinn', 'Verlust', 'Zeit', 'Geburtsland', 'Einkommen',
]
# Read the raw file, then overwrite whatever header pandas produced with the
# labels above.
df = pd.read_csv("./einkommen.train")
df.columns = column_names

In [3]:
# Preprocessing: turn the income label into a binary classification target.
#   ' >50K'  -> 1
#   ' <=50K' -> 0
#   ' ?'     -> left unchanged (unlabeled rows)
# The categorical attributes (Bildungsgrad, Familie, Bereich, Partnerschaft,
# Ethnie, Geschlecht, Geburtsland, ...) still need one-hot encoding.
print('Einkommen:\n',df['Einkommen'].value_counts())
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on df['Einkommen']: an in-place call on a selected
# column is not guaranteed to modify df itself in current pandas versions.
df['Einkommen'] = df['Einkommen'].replace({' >50K' : 1, ' <=50K':0})
df.head()

Einkommen:
  ?        25000
 <=50K     3779
 >50K      1221
Name: Einkommen, dtype: int64


Unnamed: 0,Alter,Beschaeftigung,Gewichtung,Bildungsgrad,Bildungsdauer,Familie,Bereich,Partnerschaft,Ethnie,Geschlecht,Gewinn,Verlust,Zeit,Geburtsland,Einkommen
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [4]:
# Disabled experiments: restrict the frame to a hand-picked feature subset, or
# drop the Geburtsland column.
#df = df[['Alter', 'Familie', 'Gewichtung', 'Gewinn', 'Bildungsdauer', 'Verlust', 'Zeit', 'Partnerschaft', 'Einkommen']]
#df = df.drop(['Geburtsland'], axis=1)
#df.head()
# Shape of the frame as (rows, columns).
df.shape

(30000, 15)

In [5]:
# Rows with a known income label: used for training and testing the model.
# .copy() makes each frame independent of df, so later writes to it are
# unambiguous and do not act on a selection of df.
workdata = df[df.Einkommen != ' ?'].copy()
# Rows whose income is unknown (' ?'): the model is applied to these.
applydata = df[df.Einkommen == ' ?'].copy()
#print(df.info())
# Overview of unknown (' ?') values per remaining attribute (disabled):
#wdNames = workdata.columns.values
#for i in wdNames:
#    print(i)
#    print(workdata[str(i)].loc[workdata[str(i)] == ' ?'].value_counts())

# One-hot encode every column except the last one (Einkommen). This widens the
# frame considerably because each category value becomes its own column.
wd = pd.get_dummies(workdata.iloc[:,:df.shape[1]-1])
wd['Einkommen'] = workdata['Einkommen']
wd

Unnamed: 0,Alter,Gewichtung,Bildungsdauer,Gewinn,Verlust,Zeit,Beschaeftigung_ ?,Beschaeftigung_ Federal-gov,Beschaeftigung_ Local-gov,Beschaeftigung_ Private,...,Geburtsland_ Puerto-Rico,Geburtsland_ Scotland,Geburtsland_ South,Geburtsland_ Taiwan,Geburtsland_ Thailand,Geburtsland_ Trinadad&Tobago,Geburtsland_ United-States,Geburtsland_ Vietnam,Geburtsland_ Yugoslavia,Einkommen
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43,222971,3,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4996,31,259425,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4997,47,212120,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4998,26,245880,9,0,0,60,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [6]:
from scipy.stats import zscore

# Select the features by name instead of by a hard-coded column count, so the
# cell keeps working if the one-hot encoding produces a different width.
# Cast to float because get_dummies may return boolean columns.
feature_frame = wd.drop(columns='Einkommen').astype(float)

# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the statistics are computed over all labeled rows, i.e. before
# the train/test split, so the test rows influence the scaling — consider
# fitting the scaling on the training rows only.
work = zscore(feature_frame.values)

# Keep the original row index and column names so X stays aligned with y.
X = pd.DataFrame(work, index=feature_frame.index, columns=feature_frame.columns)  # all data except Einkommen
y = wd['Einkommen']  # classification target

In [7]:
# The labeled rows are split into a training and a test set; the unlabeled rows
# are only used when the trained model is applied.
y=y.astype('int')
# random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the class ratio of the imbalanced target equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)
y_test.shape

(1650,)

In [8]:
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Activation

Using TensorFlow backend.


In [9]:
model = Sequential()
# Three hidden ReLU layers with 20, 10 and 20 units. The input width is read
# from the training matrix instead of being hard-coded.
model.add(Dense(20, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(20, activation = 'relu'))
# Output layer: one sigmoid unit giving the probability of class 1.
model.add(Dense(1, activation='sigmoid'))
# compile() configures the model in place and returns None, so its result is
# not stored; the training history is what model.fit() returns.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'binary_crossentropy'])

In [10]:
# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 20)                2120      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 20)                220       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 21        
Total params: 2,571
Trainable params: 2,571
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# Keep the History object returned by fit() so the per-epoch loss and metric
# values remain available after training.
history = model.fit(X_train, y_train, epochs=150, batch_size=5)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150

KeyboardInterrupt: 

In [None]:
# Loss and metric values on the training split, reported by metric name.
result = model.evaluate(X_train, y_train, verbose=0)
print({name: value for name, value in zip(model.metrics_names, result)})

In [None]:
# Sequential.predict_classes is not available in newer Keras/TensorFlow
# releases. For a single sigmoid output, thresholding the predicted probability
# at 0.5 yields the same class labels and works on old and new versions.
y_pred = (model.predict(X_test) > 0.5).astype('int32')

In [None]:
print('Metrics on Testdata')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1 Score:',f1_score(y_test, y_pred))

print()
print('Metrics on all data')
# Predict once and reuse the result for all four metrics instead of running the
# network four times. The 0.5 threshold on the sigmoid output replaces
# Sequential.predict_classes, which newer Keras/TensorFlow releases no longer
# provide.
# NOTE(review): "all data" includes the training rows, so these figures are
# optimistic compared with the test metrics above.
y_pred_all = (model.predict(X) > 0.5).astype('int32')
print('Accuracy:', accuracy_score(y, y_pred_all))
print('Recall:', recall_score(y, y_pred_all))
print('Precision:', precision_score(y, y_pred_all))
print('F1 Score:',f1_score(y, y_pred_all))

In [None]:
def plot_roc_curves(fprs, tprs, save_path='NN_ROC.jpg'):
    """Plot one ROC curve per (fpr, tpr) pair, save the figure and show it.

    Relies on the module-level names ``plt`` (matplotlib.pyplot) and
    ``metrics`` (sklearn.metrics) being imported before the first call.

    Parameters
    ----------
    fprs : sequence
        False-positive-rate arrays, one entry per curve.
    tprs : sequence
        True-positive-rate arrays, paired with ``fprs`` by position.
    save_path : str, optional
        File the figure is written to. Defaults to 'NN_ROC.jpg', the name that
        was previously hard-coded.
    """
    fig = plt.figure(figsize=(20,10))

    # One line per curve; the legend entry carries the area under that curve.
    for fpr, tpr in zip(fprs, tprs):
        plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % metrics.auc(fpr, tpr))

    # Dashed diagonal as the reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    # Save before show() so the written file contains the drawn figure.
    fig.savefig(save_path, bbox_inches='tight', dpi=150)
    plt.show()
   

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

# Predicted probabilities for the test rows. They get their own name so the
# label vector `y` is not overwritten; reusing `y` here made the metrics cell
# fail or report nonsense when it was re-run afterwards.
y_score = model.predict(X_test).ravel()
fpr, tpr, _ = metrics.roc_curve(y_test, y_score, pos_label=1)
# plot the curve
plot_roc_curves([fpr], [tpr])


In [None]:
# Work on an independent frame so the replacements below do not write into a
# selection of df.
applydata = applydata.copy()

apply = pd.get_dummies(applydata.iloc[:,:-1])

# One-hot columns that exist in the application data but not in the training
# data (wd); the network has no input for them.
applyName = apply.columns.values
wdName = wd.columns.values
cmp = [name for name in applyName if name not in wdName]
print(cmp)

# Number of application rows per unseen category (trailing numbers are the
# counts noted by the original author).
print("Data Beschäftigung = Never worked:\n",applydata.loc[applydata['Beschaeftigung'] == ' Never-worked'].Einkommen.value_counts()) # 5
print("Data Geburtsland = Holand-Netherlands:\n",applydata.loc[applydata['Geburtsland'] == ' Holand-Netherlands'].Einkommen.value_counts()) # 1
print("Data Geburtsland = Hungary:\n",applydata.loc[applydata['Geburtsland'] == ' Hungary'].Einkommen.value_counts()) # 12

# Treat the unseen categories as unknown by replacing them with ' ?'.
unseen_categories = [
    ('Beschaeftigung', ' Never-worked'),
    ('Geburtsland', ' Holand-Netherlands'),
    ('Geburtsland', ' Hungary'),
]
for column, category in unseen_categories:
    applydata.loc[applydata[column] == category, column] = ' ?'
    # Expected to print an empty Series: no row carries the category any more.
    print(applydata.loc[applydata[column] == category, column])

In [None]:
apply = pd.get_dummies(applydata.iloc[:,:-1])

# The network was trained on z-scored features whose columns come from `wd`.
# Bring the application matrix into exactly that column layout: same columns,
# same order; categories missing here become all-zero columns, and categories
# unknown to the training data are dropped.
train_features = wd.drop(columns='Einkommen').astype(float)
apply = apply.reindex(columns=train_features.columns, fill_value=0).astype(float)

# Apply the same standardization as for the training features. ddof=0 is the
# population standard deviation, which is what scipy.stats.zscore uses by
# default. A zero deviation is replaced by 1 to avoid dividing by zero.
feature_mean = train_features.mean()
feature_std = train_features.std(ddof=0).replace(0, 1)
apply = (apply - feature_mean) / feature_std
apply.head()

In [None]:
# Sequential.predict_classes is not available in newer Keras/TensorFlow
# releases. Thresholding the sigmoid output at 0.5 gives the same class labels
# and works on old and new versions.
y_out = (model.predict(apply) > 0.5).astype('int32')

In [None]:
# Map the predicted class ids back to the original label strings.
# The predictions are flattened to one dimension and given the index of
# applydata so they line up with its rows. assign() builds a new frame instead
# of writing into a selection of df or calling replace(..., inplace=True) on a
# selected column.
label_names = {1: ' >50K', 0 : ' <=50K'}
predicted_labels = pd.Series(np.ravel(y_out), index=applydata.index).map(label_names)
applydata = applydata.assign(Einkommen=predicted_labels)
applydata