In [1]:
import adult_data_functions as af
import numpy as np
import tensorflow as tf
# import os 
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
tf.random.set_seed(0)
np.random.seed(0)

In [3]:
# Load data
categorical_feature_encoder = preprocessing.OneHotEncoder()
sex_encoder = preprocessing.LabelEncoder() 
income_encoder = preprocessing.LabelEncoder()

merge_country_boolean=False
merge_education_boolean=False

train_data_dict = af.preprocess(adult_dt_path="data/adult.data", 
                                categorical_feature_encoder=categorical_feature_encoder, 
                                sex_encoder=sex_encoder,
                                income_encoder=income_encoder, encoder_fit_boolean=True,
                               merge_country_boolean=merge_country_boolean, 
                               merge_education_boolean=merge_education_boolean)
# train_data_dict, excessive_data_dict = \
#     af.preprocess(adult_dt_path="data/adult.data", 
#                   categorical_feature_encoder=categorical_feature_encoder, 
#                   sex_encoder=sex_encoder, race_encoder=race_encoder, 
#                   income_encoder=income_encoder, encoder_fit_boolean=True,
#                   drop_prop_male_poor=0.8)

In [4]:
# split based on income
indices_array = np.arange(train_data_dict["income-label"].shape[0])
train_indices_array, val_indices_array = train_test_split(indices_array, 
                                                           stratify=train_data_dict["income-label"], 
                                                           test_size=0.1, 
                                                           random_state=0)

In [5]:
from sklearn.preprocessing import StandardScaler
train_feature_mat = np.hstack([train_data_dict["categorical-features"][train_indices_array, :].toarray(), 
               train_data_dict["continuous-features"][train_indices_array, :]])
val_feature_mat = np.hstack([train_data_dict["categorical-features"][val_indices_array, :].toarray(), 
               train_data_dict["continuous-features"][val_indices_array, :]])

scaler = StandardScaler()

scaled_train_feature_mat = scaler.fit_transform(train_feature_mat)
scaled_val_feature_mat = scaler.fit_transform(val_feature_mat)

In [6]:
# feature_train_dataset = tf.data.Dataset.zip((continuous_feature_train_dataset, categorical_feature_train_dataset))
# feature_val_dataset = tf.data.Dataset.zip((continuous_feature_val_dataset, categorical_feature_val_dataset))
# feature_train_dataset = tf.data.Dataset.from_tensor_slices(scaled_train_feature_mat)
# feature_val_dataset = tf.data.Dataset.from_tensor_slices(scaled_val_feature_mat)

In [7]:
def _create_connnected_block(n_layers, hidden_dim, output_dim, l2=0):
    if n_layers == 0:
        return None
    layers_list = []
    for i in range(n_layers):
#         if i == 0:
#             layers_list.append(tf.keras.layers.Dense(units=output_dim, activation="elu",
#                                                      kernel_regularizer=regularizer))
        if i == n_layers - 1:
            layers_list.append(tf.keras.layers.Dense(units=output_dim, activation="elu",
                              kernel_regularizer=tf.keras.regularizers.L2(l2)))
        else:
            layers_list.append(tf.keras.layers.Dense(units=hidden_dim, activation="elu",
                              kernel_regularizer=tf.keras.regularizers.L2(l2)))
#         if i == n_layers - 1:
#             layers_list.append(tf.keras.layers.Dense(units=output_dim, activation="elu",
#                                                      kernel_regularizer=regularizer))
#         else:
#             layers_list.append(tf.keras.layers.Dense(units=hidden_dim, activation="elu",
#                                                      kernel_regularizer=regularizer))

    return layers_list

In [8]:
class ModelNetwork(tf.keras.Model):
    # This is the class network we fit on the data.
    def __init__(self, n_layers, hidden_dim, output_dim, l2=0):

        super(ModelNetwork, self).__init__()

        self.hidden_layer = _create_connnected_block(n_layers=n_layers, hidden_dim=hidden_dim, 
                                                        output_dim=output_dim, 
                                                     l2=l2)

        self.final_linear = tf.keras.layers.Dense(
            units=1,
            activation=None,
            kernel_regularizer=tf.keras.regularizers.L2(l2)
        )


    def call(self, inputs):
        output = inputs
        if self.hidden_layer is not None:
            for layer in self.hidden_layer:
                output = layer(output)
        output = self.final_linear(output)

        return output

## Sex

In [9]:
# Prepare x_y_mat (income)
x_y_mat = np.hstack([train_data_dict["sex"].reshape(-1, 1), train_data_dict["income-label"].reshape(-1, 1)])
# x_y_mat = 2 * x_y_mat - 1
# response_train_dataset = tf.data.Dataset.from_tensor_slices(x_y_mat[train_indices_array, 0])
# response_val_dataset = tf.data.Dataset.from_tensor_slices(x_y_mat[val_indices_array, 0])

train_dataset = tf.data.Dataset.from_tensor_slices((scaled_train_feature_mat, x_y_mat[train_indices_array, 0]))
val_dataset = tf.data.Dataset.from_tensor_slices((scaled_val_feature_mat, x_y_mat[val_indices_array, 0]))

buffer_size = 50000
batch_size = 20000
train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
val_dataset = val_dataset.shuffle(buffer_size).batch(batch_size)

2022-05-03 00:27:36.007597: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10061 MB memory:  -> device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:42:00.0, compute capability: 6.1


In [10]:
model = ModelNetwork(n_layers=0, hidden_dim=70, output_dim=70, 
                    l2=0)

learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=[80, 250], values=[0.01, 0.001, 0.0001])
# learning_rate = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [11]:
epochs=180

# tf.keras.callbacks.EarlyStopping(
#         # Stop training when `val_loss` is no longer improving
#         monitor="val_loss",
#         # "no longer improving" being defined as "no better than 1e-2 less"
#         min_delta=1e-2,
#         # "no longer improving" being further defined as "for at least 2 epochs"
#         patience=10,
#         verbose=1,
#     ),
callbacks = [
    
    tf.keras.callbacks.ModelCheckpoint(
        # Path where to save the model
        # The two parameters below mean that we will overwrite
        # the current checkpoint if and only if
        # the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath=f"saved_model/model_{epochs}",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_binary_accuracy",
        verbose=1,
    )
]

In [12]:
model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/180


2022-05-03 00:27:36.532760: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)



Epoch 00001: val_binary_accuracy improved from -inf to 0.41167, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets


2022-05-03 00:27:37.212976: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Epoch 2/180

Epoch 00002: val_binary_accuracy improved from 0.41167 to 0.42526, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 3/180

Epoch 00003: val_binary_accuracy improved from 0.42526 to 0.43918, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 4/180

Epoch 00004: val_binary_accuracy improved from 0.43918 to 0.44581, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 5/180

Epoch 00005: val_binary_accuracy improved from 0.44581 to 0.45774, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 6/180

Epoch 00006: val_binary_accuracy improved from 0.45774 to 0.46337, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 7/180

Epoch 00007: val_binary_accuracy improved from 0.46337 to 0.47332, saving mod

INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 28/180

Epoch 00028: val_binary_accuracy improved from 0.67948 to 0.68147, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 29/180

Epoch 00029: val_binary_accuracy improved from 0.68147 to 0.68545, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 30/180

Epoch 00030: val_binary_accuracy improved from 0.68545 to 0.69175, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 31/180

Epoch 00031: val_binary_accuracy improved from 0.69175 to 0.69572, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 32/180

Epoch 00032: val_binary_accuracy improved from 0.69572 to 0.70103, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 33/180

Epoch 0


Epoch 00057: val_binary_accuracy did not improve from 0.71362
Epoch 58/180

Epoch 00058: val_binary_accuracy did not improve from 0.71362
Epoch 59/180

Epoch 00059: val_binary_accuracy did not improve from 0.71362
Epoch 60/180

Epoch 00060: val_binary_accuracy did not improve from 0.71362
Epoch 61/180

Epoch 00061: val_binary_accuracy did not improve from 0.71362
Epoch 62/180

Epoch 00062: val_binary_accuracy did not improve from 0.71362
Epoch 63/180

Epoch 00063: val_binary_accuracy did not improve from 0.71362
Epoch 64/180

Epoch 00064: val_binary_accuracy did not improve from 0.71362
Epoch 65/180

Epoch 00065: val_binary_accuracy did not improve from 0.71362
Epoch 66/180

Epoch 00066: val_binary_accuracy did not improve from 0.71362
Epoch 67/180

Epoch 00067: val_binary_accuracy did not improve from 0.71362
Epoch 68/180

Epoch 00068: val_binary_accuracy did not improve from 0.71362
Epoch 69/180

Epoch 00069: val_binary_accuracy did not improve from 0.71362
Epoch 70/180

Epoch 00070


Epoch 00093: val_binary_accuracy improved from 0.71495 to 0.71528, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 94/180

Epoch 00094: val_binary_accuracy improved from 0.71528 to 0.71561, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 95/180

Epoch 00095: val_binary_accuracy improved from 0.71561 to 0.71594, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 96/180

Epoch 00096: val_binary_accuracy did not improve from 0.71594
Epoch 97/180

Epoch 00097: val_binary_accuracy did not improve from 0.71594
Epoch 98/180

Epoch 00098: val_binary_accuracy did not improve from 0.71594
Epoch 99/180

Epoch 00099: val_binary_accuracy did not improve from 0.71594
Epoch 100/180

Epoch 00100: val_binary_accuracy improved from 0.71594 to 0.71627, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved


Epoch 00125: val_binary_accuracy did not improve from 0.71893
Epoch 126/180

Epoch 00126: val_binary_accuracy did not improve from 0.71893
Epoch 127/180

Epoch 00127: val_binary_accuracy did not improve from 0.71893
Epoch 128/180

Epoch 00128: val_binary_accuracy did not improve from 0.71893
Epoch 129/180

Epoch 00129: val_binary_accuracy did not improve from 0.71893
Epoch 130/180

Epoch 00130: val_binary_accuracy did not improve from 0.71893
Epoch 131/180

Epoch 00131: val_binary_accuracy did not improve from 0.71893
Epoch 132/180

Epoch 00132: val_binary_accuracy did not improve from 0.71893
Epoch 133/180

Epoch 00133: val_binary_accuracy did not improve from 0.71893
Epoch 134/180

Epoch 00134: val_binary_accuracy did not improve from 0.71893
Epoch 135/180

Epoch 00135: val_binary_accuracy did not improve from 0.71893
Epoch 136/180

Epoch 00136: val_binary_accuracy did not improve from 0.71893
Epoch 137/180

Epoch 00137: val_binary_accuracy did not improve from 0.71893
Epoch 138/180


Epoch 00163: val_binary_accuracy did not improve from 0.71893
Epoch 164/180

Epoch 00164: val_binary_accuracy did not improve from 0.71893
Epoch 165/180

Epoch 00165: val_binary_accuracy did not improve from 0.71893
Epoch 166/180

Epoch 00166: val_binary_accuracy did not improve from 0.71893
Epoch 167/180

Epoch 00167: val_binary_accuracy did not improve from 0.71893
Epoch 168/180

Epoch 00168: val_binary_accuracy did not improve from 0.71893
Epoch 169/180

Epoch 00169: val_binary_accuracy did not improve from 0.71893
Epoch 170/180

Epoch 00170: val_binary_accuracy did not improve from 0.71893
Epoch 171/180

Epoch 00171: val_binary_accuracy did not improve from 0.71893
Epoch 172/180

Epoch 00172: val_binary_accuracy did not improve from 0.71893
Epoch 173/180

Epoch 00173: val_binary_accuracy did not improve from 0.71893
Epoch 174/180

Epoch 00174: val_binary_accuracy did not improve from 0.71893
Epoch 175/180

Epoch 00175: val_binary_accuracy did not improve from 0.71893
Epoch 176/180

<keras.callbacks.History at 0x7f9cfbb333d0>

In [13]:
test_data_dict = af.preprocess(adult_dt_path="data/adult.test", 
                                categorical_feature_encoder=categorical_feature_encoder, 
                                sex_encoder=sex_encoder,
                                income_encoder=income_encoder, encoder_fit_boolean=False,
                               merge_country_boolean=merge_country_boolean, 
                               merge_education_boolean=merge_education_boolean)

test_feature_mat = np.hstack([test_data_dict["categorical-features"].toarray(), 
               test_data_dict["continuous-features"]])

scaled_test_feature_mat = scaler.transform(test_feature_mat)

test_x_y_mat = np.hstack([test_data_dict["sex"].reshape(-1, 1), test_data_dict["income-label"].reshape(-1, 1)])
test_dataset = tf.data.Dataset.from_tensor_slices((scaled_test_feature_mat, test_x_y_mat[:, 0])).batch(batch_size)

In [14]:
best_model = tf.keras.models.load_model(f"saved_model/model_{epochs}")

In [15]:
for input, label in test_dataset:
    prob = best_model(input)
    y_pred = np.ones(input.shape[0])
    y_pred[prob.numpy().squeeze() < 0.5] = 0
    result = af.score_summary(y_true=label, y_pred=y_pred, pos_label=1)
    

In [16]:
result

Unnamed: 0,accuracy,precision,recall,f1
0,0.723108,0.851878,0.713019,0.776288


In [17]:
af.score_summary(y_true=label, y_pred=y_pred, pos_label=0)

Unnamed: 0,accuracy,precision,recall,f1
0,0.723108,0.556571,0.743945,0.63676


## Income

In [18]:
train_dataset = tf.data.Dataset.from_tensor_slices((scaled_train_feature_mat, x_y_mat[train_indices_array, 1]))
val_dataset = tf.data.Dataset.from_tensor_slices((scaled_val_feature_mat, x_y_mat[val_indices_array, 1]))

buffer_size = 50000
batch_size = 20000
train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
val_dataset = val_dataset.shuffle(buffer_size).batch(batch_size)

In [19]:
model = ModelNetwork(n_layers=0, hidden_dim=70, output_dim=70, 
                    l2=0)

learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=[12, 250], values=[0.01, 0.001, 0.0001])
# learning_rate = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [20]:
epochs=42

# tf.keras.callbacks.EarlyStopping(
#         # Stop training when `val_loss` is no longer improving
#         monitor="val_loss",
#         # "no longer improving" being defined as "no better than 1e-2 less"
#         min_delta=1e-2,
#         # "no longer improving" being further defined as "for at least 2 epochs"
#         patience=10,
#         verbose=1,
#     ),
callbacks = [
    
    tf.keras.callbacks.ModelCheckpoint(
        # Path where to save the model
        # The two parameters below mean that we will overwrite
        # the current checkpoint if and only if
        # the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath=f"saved_model/model_{epochs}",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_binary_accuracy",
        verbose=1,
    )
]

In [21]:
model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/42

Epoch 00001: val_binary_accuracy improved from -inf to 0.57938, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 2/42

Epoch 00002: val_binary_accuracy improved from 0.57938 to 0.59828, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 3/42

Epoch 00003: val_binary_accuracy improved from 0.59828 to 0.61850, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 4/42

Epoch 00004: val_binary_accuracy improved from 0.61850 to 0.65462, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 5/42

Epoch 00005: val_binary_accuracy improved from 0.65462 to 0.69539, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 6/42

Epoch 00006: val_binary_accuracy improved from 0.69539 to 0.71826, saving model to saved_model/m


Epoch 00026: val_binary_accuracy improved from 0.76699 to 0.76831, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 27/42

Epoch 00027: val_binary_accuracy improved from 0.76831 to 0.76997, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 28/42

Epoch 00028: val_binary_accuracy improved from 0.76997 to 0.77063, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 29/42

Epoch 00029: val_binary_accuracy improved from 0.77063 to 0.77130, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 30/42

Epoch 00030: val_binary_accuracy improved from 0.77130 to 0.77196, saving model to saved_model/model_42
INFO:tensorflow:Assets written to: saved_model/model_42/assets
Epoch 31/42

Epoch 00031: val_binary_accuracy improved from 0.77196 to 0.77295, saving model to saved_model/mode

<keras.callbacks.History at 0x7f9cfb73e410>

In [22]:
test_x_y_mat = np.hstack([test_data_dict["sex"].reshape(-1, 1), test_data_dict["income-label"].reshape(-1, 1)])
test_dataset = tf.data.Dataset.from_tensor_slices((scaled_test_feature_mat, test_x_y_mat[:, 1])).batch(batch_size)

In [23]:
best_model = tf.keras.models.load_model(f"saved_model/model_{epochs}")

In [24]:
for input, label in test_dataset:
    prob = best_model(input)
    y_pred = np.ones(input.shape[0])
    y_pred[prob.numpy().squeeze() < 0.5] = 0
    result = af.score_summary(y_true=label, y_pred=y_pred, pos_label=1)
    

In [25]:
result

Unnamed: 0,accuracy,precision,recall,f1
0,0.77656,0.56023,0.421081,0.48079


In [26]:
af.score_summary(y_true=label, y_pred=y_pred, pos_label=0)

Unnamed: 0,accuracy,precision,recall,f1
0,0.77656,0.825556,0.892342,0.85765


# PCA

In [27]:
scaled_train_feature_mat.shape

(27145, 84)

In [28]:
from sklearn.decomposition import PCA
pca = PCA(n_components=scaled_train_feature_mat.shape[1] - 5)
train_pca_x_mat = pca.fit_transform(scaled_train_feature_mat)

In [29]:
np.cumsum(pca.explained_variance_ratio_)

array([0.03667647, 0.06211572, 0.08496635, 0.10415067, 0.12146717,
       0.13790808, 0.15357531, 0.16855119, 0.18322544, 0.19775808,
       0.21212666, 0.22603055, 0.23979111, 0.25340694, 0.26675827,
       0.28005203, 0.29325943, 0.30634304, 0.31935954, 0.33226456,
       0.34492027, 0.35755612, 0.37012892, 0.38262821, 0.39500825,
       0.40738547, 0.41971001, 0.4319667 , 0.44420509, 0.4564328 ,
       0.46853473, 0.48057313, 0.49256738, 0.50454624, 0.51650008,
       0.5284399 , 0.54037696, 0.55230951, 0.56423883, 0.57616274,
       0.58808409, 0.60000009, 0.61191543, 0.62382776, 0.63573833,
       0.64764623, 0.6595363 , 0.67141923, 0.68329832, 0.69516042,
       0.70700661, 0.71883326, 0.73064611, 0.74241439, 0.75415581,
       0.76585931, 0.77753   , 0.78917721, 0.80077922, 0.8122942 ,
       0.82375581, 0.83519017, 0.84653284, 0.85783054, 0.86895645,
       0.87995118, 0.8908469 , 0.90165268, 0.91234528, 0.92284802,
       0.93320193, 0.94338669, 0.95332953, 0.96240343, 0.97105

In [30]:
np.linalg.cond(train_pca_x_mat)

2.487508684934461

In [31]:
val_pca_x_mat = pca.transform(scaled_val_feature_mat)
test_pca_x_mat = pca.transform(scaled_test_feature_mat)

## Income

In [32]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_pca_x_mat, x_y_mat[train_indices_array, 1]))
val_dataset = tf.data.Dataset.from_tensor_slices((val_pca_x_mat, x_y_mat[val_indices_array, 1]))

buffer_size = 50000
batch_size = 20000
train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
val_dataset = val_dataset.shuffle(buffer_size).batch(batch_size)

In [33]:
model = ModelNetwork(n_layers=0, hidden_dim=70, output_dim=70, 
                    l2=0)

learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=[20, 74], values=[0.01, 0.001, 0.0001])
# learning_rate = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [34]:
epochs=80

# tf.keras.callbacks.EarlyStopping(
#         # Stop training when `val_loss` is no longer improving
#         monitor="val_loss",
#         # "no longer improving" being defined as "no better than 1e-2 less"
#         min_delta=1e-2,
#         # "no longer improving" being further defined as "for at least 2 epochs"
#         patience=10,
#         verbose=1,
#     ),
callbacks = [
    
    tf.keras.callbacks.ModelCheckpoint(
        # Path where to save the model
        # The two parameters below mean that we will overwrite
        # the current checkpoint if and only if
        # the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath=f"saved_model/model_{epochs}",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_binary_accuracy",
        verbose=1,
    )
]

In [35]:
model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/80

Epoch 00001: val_binary_accuracy improved from -inf to 0.62877, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 2/80

Epoch 00002: val_binary_accuracy improved from 0.62877 to 0.66059, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 3/80

Epoch 00003: val_binary_accuracy improved from 0.66059 to 0.68213, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 4/80

Epoch 00004: val_binary_accuracy improved from 0.68213 to 0.69606, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 5/80

Epoch 00005: val_binary_accuracy improved from 0.69606 to 0.70633, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 6/80

Epoch 00006: val_binary_accuracy improved from 0.70633 to 0.71594, saving model to saved_model/m

INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 28/80

Epoch 00028: val_binary_accuracy improved from 0.76367 to 0.76400, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 29/80

Epoch 00029: val_binary_accuracy improved from 0.76400 to 0.76467, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 30/80

Epoch 00030: val_binary_accuracy did not improve from 0.76467
Epoch 31/80

Epoch 00031: val_binary_accuracy did not improve from 0.76467
Epoch 32/80

Epoch 00032: val_binary_accuracy did not improve from 0.76467
Epoch 33/80

Epoch 00033: val_binary_accuracy improved from 0.76467 to 0.76566, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 34/80

Epoch 00034: val_binary_accuracy improved from 0.76566 to 0.76632, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80


Epoch 00061: val_binary_accuracy did not improve from 0.76864
Epoch 62/80

Epoch 00062: val_binary_accuracy did not improve from 0.76864
Epoch 63/80

Epoch 00063: val_binary_accuracy did not improve from 0.76864
Epoch 64/80

Epoch 00064: val_binary_accuracy did not improve from 0.76864
Epoch 65/80

Epoch 00065: val_binary_accuracy did not improve from 0.76864
Epoch 66/80

Epoch 00066: val_binary_accuracy did not improve from 0.76864
Epoch 67/80

Epoch 00067: val_binary_accuracy improved from 0.76864 to 0.76898, saving model to saved_model/model_80
INFO:tensorflow:Assets written to: saved_model/model_80/assets
Epoch 68/80

Epoch 00068: val_binary_accuracy did not improve from 0.76898
Epoch 69/80

Epoch 00069: val_binary_accuracy did not improve from 0.76898
Epoch 70/80

Epoch 00070: val_binary_accuracy did not improve from 0.76898
Epoch 71/80

Epoch 00071: val_binary_accuracy did not improve from 0.76898
Epoch 72/80

Epoch 00072: val_binary_accuracy improved from 0.76898 to 0.76931, sa

<keras.callbacks.History at 0x7f9cf02a40d0>

In [36]:
test_x_y_mat = np.hstack([test_data_dict["sex"].reshape(-1, 1), test_data_dict["income-label"].reshape(-1, 1)])
test_dataset = tf.data.Dataset.from_tensor_slices((test_pca_x_mat, test_x_y_mat[:, 1])).batch(batch_size)

In [37]:
best_model = tf.keras.models.load_model(f"saved_model/model_{epochs}")

In [38]:
for input, label in test_dataset:
    prob = best_model(input)
    y_pred = np.ones(input.shape[0])
    y_pred[prob.numpy().squeeze() < 0.5] = 0
    result = af.score_summary(y_true=label, y_pred=y_pred, pos_label=1)
    

In [39]:
result

Unnamed: 0,accuracy,precision,recall,f1
0,0.768526,0.593043,0.184324,0.281237


In [40]:
af.score_summary(y_true=label, y_pred=y_pred, pos_label=0)

Unnamed: 0,accuracy,precision,recall,f1
0,0.768526,0.783034,0.958803,0.86205


## Sex

In [41]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_pca_x_mat, x_y_mat[train_indices_array, 0]))
val_dataset = tf.data.Dataset.from_tensor_slices((val_pca_x_mat, x_y_mat[val_indices_array, 0]))

buffer_size = 50000
batch_size = 20000
train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
val_dataset = val_dataset.shuffle(buffer_size).batch(batch_size)

In [42]:
model = ModelNetwork(n_layers=0, hidden_dim=70, output_dim=70, 
                    l2=0)

learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries=[80, 250], values=[0.01, 0.001, 0.0001])
# learning_rate = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [43]:
epochs=180

# tf.keras.callbacks.EarlyStopping(
#         # Stop training when `val_loss` is no longer improving
#         monitor="val_loss",
#         # "no longer improving" being defined as "no better than 1e-2 less"
#         min_delta=1e-2,
#         # "no longer improving" being further defined as "for at least 2 epochs"
#         patience=10,
#         verbose=1,
#     ),
callbacks = [
    
    tf.keras.callbacks.ModelCheckpoint(
        # Path where to save the model
        # The two parameters below mean that we will overwrite
        # the current checkpoint if and only if
        # the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath=f"saved_model/model_{epochs}",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_binary_accuracy",
        verbose=1,
    )
]

In [44]:
model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/180

Epoch 00001: val_binary_accuracy improved from -inf to 0.50779, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 2/180

Epoch 00002: val_binary_accuracy improved from 0.50779 to 0.53928, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 3/180

Epoch 00003: val_binary_accuracy improved from 0.53928 to 0.55883, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 4/180

Epoch 00004: val_binary_accuracy improved from 0.55883 to 0.57275, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 5/180

Epoch 00005: val_binary_accuracy improved from 0.57275 to 0.58237, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 6/180

Epoch 00006: val_binary_accuracy improved from 0.58237 to 0.59463, saving model 


Epoch 00026: val_binary_accuracy did not improve from 0.68611
Epoch 27/180

Epoch 00027: val_binary_accuracy improved from 0.68611 to 0.68810, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 28/180

Epoch 00028: val_binary_accuracy improved from 0.68810 to 0.69175, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 29/180

Epoch 00029: val_binary_accuracy improved from 0.69175 to 0.69506, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 30/180

Epoch 00030: val_binary_accuracy improved from 0.69506 to 0.69572, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 31/180

Epoch 00031: val_binary_accuracy improved from 0.69572 to 0.69738, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 32/180

Epoch 00


Epoch 00055: val_binary_accuracy improved from 0.71362 to 0.71395, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 56/180

Epoch 00056: val_binary_accuracy did not improve from 0.71395
Epoch 57/180

Epoch 00057: val_binary_accuracy did not improve from 0.71395
Epoch 58/180

Epoch 00058: val_binary_accuracy did not improve from 0.71395
Epoch 59/180

Epoch 00059: val_binary_accuracy improved from 0.71395 to 0.71462, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 60/180

Epoch 00060: val_binary_accuracy improved from 0.71462 to 0.71561, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 61/180

Epoch 00061: val_binary_accuracy did not improve from 0.71561
Epoch 62/180

Epoch 00062: val_binary_accuracy improved from 0.71561 to 0.71594, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_


Epoch 00089: val_binary_accuracy did not improve from 0.71661
Epoch 90/180

Epoch 00090: val_binary_accuracy did not improve from 0.71661
Epoch 91/180

Epoch 00091: val_binary_accuracy did not improve from 0.71661
Epoch 92/180

Epoch 00092: val_binary_accuracy did not improve from 0.71661
Epoch 93/180

Epoch 00093: val_binary_accuracy did not improve from 0.71661
Epoch 94/180

Epoch 00094: val_binary_accuracy did not improve from 0.71661
Epoch 95/180

Epoch 00095: val_binary_accuracy did not improve from 0.71661
Epoch 96/180

Epoch 00096: val_binary_accuracy did not improve from 0.71661
Epoch 97/180

Epoch 00097: val_binary_accuracy did not improve from 0.71661
Epoch 98/180

Epoch 00098: val_binary_accuracy did not improve from 0.71661
Epoch 99/180

Epoch 00099: val_binary_accuracy did not improve from 0.71661
Epoch 100/180

Epoch 00100: val_binary_accuracy improved from 0.71661 to 0.71694, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/


Epoch 00123: val_binary_accuracy improved from 0.71926 to 0.71959, saving model to saved_model/model_180
INFO:tensorflow:Assets written to: saved_model/model_180/assets
Epoch 124/180

Epoch 00124: val_binary_accuracy did not improve from 0.71959
Epoch 125/180

Epoch 00125: val_binary_accuracy did not improve from 0.71959
Epoch 126/180

Epoch 00126: val_binary_accuracy did not improve from 0.71959
Epoch 127/180

Epoch 00127: val_binary_accuracy did not improve from 0.71959
Epoch 128/180

Epoch 00128: val_binary_accuracy did not improve from 0.71959
Epoch 129/180

Epoch 00129: val_binary_accuracy did not improve from 0.71959
Epoch 130/180

Epoch 00130: val_binary_accuracy did not improve from 0.71959
Epoch 131/180

Epoch 00131: val_binary_accuracy did not improve from 0.71959
Epoch 132/180

Epoch 00132: val_binary_accuracy did not improve from 0.71959
Epoch 133/180

Epoch 00133: val_binary_accuracy did not improve from 0.71959
Epoch 134/180

Epoch 00134: val_binary_accuracy did not impr


Epoch 00160: val_binary_accuracy did not improve from 0.71959
Epoch 161/180

Epoch 00161: val_binary_accuracy did not improve from 0.71959
Epoch 162/180

Epoch 00162: val_binary_accuracy did not improve from 0.71959
Epoch 163/180

Epoch 00163: val_binary_accuracy did not improve from 0.71959
Epoch 164/180

Epoch 00164: val_binary_accuracy did not improve from 0.71959
Epoch 165/180

Epoch 00165: val_binary_accuracy did not improve from 0.71959
Epoch 166/180

Epoch 00166: val_binary_accuracy did not improve from 0.71959
Epoch 167/180

Epoch 00167: val_binary_accuracy did not improve from 0.71959
Epoch 168/180

Epoch 00168: val_binary_accuracy did not improve from 0.71959
Epoch 169/180

Epoch 00169: val_binary_accuracy did not improve from 0.71959
Epoch 170/180

Epoch 00170: val_binary_accuracy did not improve from 0.71959
Epoch 171/180

Epoch 00171: val_binary_accuracy did not improve from 0.71959
Epoch 172/180

Epoch 00172: val_binary_accuracy did not improve from 0.71959
Epoch 173/180

<keras.callbacks.History at 0x7f9cfba59ad0>

In [45]:
test_x_y_mat = np.hstack([test_data_dict["sex"].reshape(-1, 1), test_data_dict["income-label"].reshape(-1, 1)])
test_dataset = tf.data.Dataset.from_tensor_slices((test_pca_x_mat, test_x_y_mat[:, 0])).batch(batch_size)

In [46]:
best_model = tf.keras.models.load_model(f"saved_model/model_{epochs}")

In [47]:
for input, label in test_dataset:
    prob = best_model(input)
    y_pred = np.ones(input.shape[0])
    y_pred[prob.numpy().squeeze() < 0.5] = 0
    result = af.score_summary(y_true=label, y_pred=y_pred, pos_label=1)
    

In [48]:
result

Unnamed: 0,accuracy,precision,recall,f1
0,0.725299,0.851051,0.717946,0.778853


In [49]:
af.score_summary(y_true=label, y_pred=y_pred, pos_label=0)

Unnamed: 0,accuracy,precision,recall,f1
0,0.725299,0.559692,0.740484,0.637519
