# ANN, TF & Keras

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def normalize_feature(train, test):
  """Min-max scale both splits using statistics learned from `train` only.

  The scaler is fitted on `train` and then applied to both inputs, so no
  information from `test` influences the scaling parameters.

  Args:
    train: Training feature matrix used to fit the scaler.
    test: Test feature matrix, transformed with the already-fitted scaler.

  Returns:
    Tuple `(X_train, X_test)` holding the scaled versions of the inputs.
  """
  scaler = MinMaxScaler()
  scaler.fit(train)

  return scaler.transform(train), scaler.transform(test)


def evaluate_model(y_test, y_pred):
  """Print accuracy, confusion matrix and classification report.

  Args:
    y_test: Ground-truth labels.
    y_pred: Predicted labels to compare against `y_test`.

  Returns:
    None. The three metrics are written to stdout.
  """
  # Compute every metric before printing anything, so that a failure in any
  # of them leaves no partial report behind.
  sections = (
      ("Accuracy", accuracy_score(y_test, y_pred)),
      ("Confusion Matrix \n", confusion_matrix(y_test, y_pred)),
      ("Classification Report \n", classification_report(y_test, y_pred)),
  )

  for heading, value in sections:
    print(heading, value)

## Load Dataset

In [None]:
# UCI "Adult" census-income data. The raw file has no header row, so the
# column names are supplied explicitly below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

column_names = ['age',
                'workclass',
                'fnlwgt',
                'education',
                'education-num',
                'marital-status',
                'occupation',
                'relationship',
                'race',
                'sex',
                'capital-gain',
                'capital-loss',
                'hours-per-week',
                'native-country',
                'income']

# Fields are separated by ", ", so every string value keeps a leading space
# (e.g. " <=50K"). The dataset marks unknown values with "?", which therefore
# arrives as " ?"; register it as NaN so the missing-value check is meaningful.
df = pd.read_csv(url, names=column_names, na_values=" ?")
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# Data Preprocessing

In [None]:
# Count missing (NaN) entries in each column
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

(32561, 15)

In [None]:
# Encode the income label as an integer target: 0 for <=50K, 1 for >50K.
income_level = {" <=50K": 0, " >50K": 1}

# Only encode while the column still holds text, so re-running this cell is a
# no-op instead of a KeyError on the already-encoded integers.
if not pd.api.types.is_numeric_dtype(df["income"]):
  # Compare whitespace-stripped labels so the mapping does not depend on
  # whether the CSV was read with or without the leading space.
  stripped_levels = {label.strip(): code for label, code in income_level.items()}
  income_codes = df["income"].str.strip().map(stripped_levels)

  # Unmapped labels become NaN; fail loudly rather than training on them.
  unknown_labels = df.loc[income_codes.isna(), "income"].unique()
  if len(unknown_labels):
    raise KeyError(f"Unexpected income labels: {list(unknown_labels)}")

  df["income"] = income_codes.astype("int64")
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [None]:
# Summary statistics for the numeric columns (includes the encoded income)
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [None]:
# Keep only the numeric columns; the encoded label (income) stays last so it
# can be split off by position later.
df_numeric = df[[
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'income',
]]
df_numeric

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income
0,39,77516,13,2174,0,40,0
1,50,83311,13,0,0,13,0
2,38,215646,9,0,0,40,0
3,53,234721,7,0,0,40,0
4,28,338409,13,0,0,40,0
...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0
32557,40,154374,9,0,0,40,1
32558,58,151910,9,0,0,40,0
32559,22,201490,9,0,0,20,0


In [None]:
# Convert the numeric frame to a NumPy array
data = df_numeric.to_numpy()
data

array([[    39,  77516,     13, ...,      0,     40,      0],
       [    50,  83311,     13, ...,      0,     13,      0],
       [    38, 215646,      9, ...,      0,     40,      0],
       ...,
       [    58, 151910,      9, ...,      0,     40,      0],
       [    22, 201490,      9, ...,      0,     20,      0],
       [    52, 287927,      9, ...,      0,     40,      1]])

In [None]:
# Split into features (every column but the last) and label (last column)
X = data[:, :-1]
y = data[:, -1]
X, y

(array([[    39,  77516,     13,   2174,      0,     40],
        [    50,  83311,     13,      0,      0,     13],
        [    38, 215646,      9,      0,      0,     40],
        ...,
        [    58, 151910,      9,      0,      0,     40],
        [    22, 201490,      9,      0,      0,     20],
        [    52, 287927,      9,  15024,      0,     40]]),
 array([0, 0, 0, ..., 0, 0, 1]))

In [None]:
# Split into training and test sets: 20% held out for testing, fixed seed
X_train_org, X_test_org, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)
X_train_org.shape

(26048, 6)

In [None]:
# Normalization: the scaler is fitted on the training split and reused for test
X_train, X_test = normalize_feature(train=X_train_org, test=X_test_org)
X_train

array([[0.21917808, 0.12625338, 0.8       , 0.        , 0.        ,
        0.5       ],
       [0.26027397, 0.05037557, 0.66666667, 0.        , 0.43319559,
        0.5       ],
       [0.56164384, 0.12955135, 0.26666667, 0.        , 0.        ,
        0.39795918],
       ...,
       [0.01369863, 0.13854675, 0.4       , 0.        , 0.        ,
        0.19387755],
       [0.45205479, 0.02850817, 0.53333333, 0.        , 0.        ,
        0.84693878],
       [0.23287671, 0.07835129, 0.8       , 0.        , 0.        ,
        0.60204082]])

# TF-Keras

In [None]:
# Library
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Number of input features (columns of X_train); passed to the model builders
in_dim = X_train.shape[1]
in_dim

6

In [None]:
# Model Creation
def get_model(in_dim):
  """Build a minimal binary classifier with one 2-unit hidden layer.

  Args:
    in_dim: Number of input features.

  Returns:
    An uncompiled `Sequential` model: Dense(2, relu) -> Dense(1, sigmoid).
  """
  hidden = Dense(2, input_dim=in_dim, activation='relu')
  output = Dense(1, activation='sigmoid')

  return Sequential([hidden, output])

def get_heavy_model(in_dim):
  """Build a larger binary classifier with dropout after the first two layers.

  Args:
    in_dim: Number of input features.

  Returns:
    An uncompiled `Sequential` model:
    Dense(128, relu) -> Dropout(0.4) -> Dense(64, relu) -> Dropout(0.3)
    -> Dense(128, relu) -> Dense(1, sigmoid).
  """
  layers = [
      Dense(128, input_dim=in_dim, activation='relu'),
      Dropout(0.4),
      Dense(64, activation='relu'),
      Dropout(0.3),
      Dense(128, activation='relu'),
      # Single sigmoid unit: outputs a value in (0, 1) for the positive class
      Dense(1, activation='sigmoid'),
  ]

  return Sequential(layers)

In [None]:
# Build the larger network and print its layer / parameter summary
model = get_heavy_model(in_dim=in_dim)
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_30 (Dense)            (None, 128)               896       
                                                                 
 dropout_12 (Dropout)        (None, 128)               0         
                                                                 
 dense_31 (Dense)            (None, 64)                8256      
                                                                 
 dropout_13 (Dropout)        (None, 64)                0         
                                                                 
 dense_32 (Dense)            (None, 128)               8320      
                                                                 
 dense_33 (Dense)            (None, 1)                 129       
                                                                 
Total params: 17,601
Trainable params: 17,601
Non-trai

In [None]:
# Compile Model
# Binary cross-entropy matches the single sigmoid output unit.
# `metrics` is passed as a list: newer Keras versions reject a bare string
# here, while a list is accepted by both old and new releases. The history
# keys stay 'accuracy' / 'val_accuracy'.
model.compile(loss='BinaryCrossentropy',
              optimizer='Adam',
              metrics=['accuracy']
              )

In [None]:
# Both callbacks watch validation accuracy, where higher is better.

# Early stopping: end training after 20 epochs without improvement
es = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=20,
    verbose=2,
)

# Checkpoint: overwrite best_model.h5 only when validation accuracy improves
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=2,
)

# Train for up to 150 epochs; 20% of the training data is held out for
# validation and feeds the two callbacks above.
history = model.fit(
    X_train,
    y_train,
    epochs=150,
    batch_size=64,
    validation_split=0.2,
    callbacks=[es, mc],
    verbose=2,
)

Epoch 1/150

Epoch 1: val_accuracy improved from -inf to 0.83225, saving model to best_model.h5
326/326 - 3s - loss: 0.3809 - accuracy: 0.8257 - val_loss: 0.3731 - val_accuracy: 0.8322 - 3s/epoch - 10ms/step
Epoch 2/150

Epoch 2: val_accuracy improved from 0.83225 to 0.83301, saving model to best_model.h5
326/326 - 2s - loss: 0.3807 - accuracy: 0.8249 - val_loss: 0.3731 - val_accuracy: 0.8330 - 2s/epoch - 5ms/step
Epoch 3/150

Epoch 3: val_accuracy did not improve from 0.83301
326/326 - 1s - loss: 0.3805 - accuracy: 0.8257 - val_loss: 0.3736 - val_accuracy: 0.8324 - 1s/epoch - 4ms/step
Epoch 4/150

Epoch 4: val_accuracy improved from 0.83301 to 0.83512, saving model to best_model.h5
326/326 - 1s - loss: 0.3804 - accuracy: 0.8254 - val_loss: 0.3705 - val_accuracy: 0.8351 - 1s/epoch - 4ms/step
Epoch 5/150

Epoch 5: val_accuracy did not improve from 0.83512
326/326 - 1s - loss: 0.3813 - accuracy: 0.8238 - val_loss: 0.3739 - val_accuracy: 0.8351 - 1000ms/epoch - 3ms/step
Epoch 6/150

Epoch

In [None]:
# Reload the best checkpoint from disk and use it as `model` from here on
model = saved_model = load_model('best_model.h5')

In [None]:
# Highest *training* accuracy recorded over the epochs (the callbacks
# monitored validation accuracy, which is stored under 'val_accuracy')
np.max(history.history['accuracy'])

0.8271427154541016

In [None]:
# 0-based index of the epoch with the highest training accuracy
np.argmax(history.history['accuracy'])

23

In [None]:
# Evaluate with Keras: evaluate() returns [loss, accuracy]; keep the accuracy
train_acc = model.evaluate(X_train, y_train)[1]
test_acc = model.evaluate(X_test, y_test)[1]
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))


Train: 0.829, Test: 0.825


In [None]:
# Predict with the model: the sigmoid output is one score per test row,
# thresholded into class labels in the next cell
y_pred_val = model.predict(X_test)
y_pred_val

array([[5.9014380e-02],
       [1.6244105e-01],
       [3.3621728e-01],
       ...,
       [1.0000000e+00],
       [1.1042735e-01],
       [8.0815807e-05]], dtype=float32)

In [None]:
y_pred = (y_pred_val >= 0.5).astype('int')
y_pred

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]])

In [None]:
# Print accuracy, confusion matrix and classification report on the test split
evaluate_model(y_test, y_pred)

Accuracy 0.8283433133732535
Confusion Matrix 
 [[4744  198]
 [ 920  651]]
Classification Report 
               precision    recall  f1-score   support

           0       0.84      0.96      0.89      4942
           1       0.77      0.41      0.54      1571

    accuracy                           0.83      6513
   macro avg       0.80      0.69      0.72      6513
weighted avg       0.82      0.83      0.81      6513

