In [23]:
import adult_data_functions as af
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [24]:
categorical_feature_encoder = preprocessing.OneHotEncoder()
sex_encoder = preprocessing.LabelEncoder() 
race_encoder = preprocessing.LabelEncoder()
income_encoder = preprocessing.LabelEncoder()

train_data_dict = af.preprocess(adult_dt_path="data/adult.data", 
                                categorical_feature_encoder=categorical_feature_encoder, 
                                sex_encoder=sex_encoder, race_encoder=race_encoder, 
                                income_encoder=income_encoder, encoder_fit_boolean=True)

In [25]:
indices_array = np.arange(train_data_dict["income-label"].shape[0])
len(indices_array)

32561

In [26]:
# split based on income
train_indices_array, val_indices_array = train_test_split(indices_array, 
                                                           stratify=train_data_dict["income-label"], 
                                                           test_size=0.1, 
                                                           random_state=0)

In [27]:
logistic_x_train_mat = np.hstack([train_data_dict["categorical-features"].toarray(), 
                                  train_data_dict["continuous-features"].to_numpy()])
# Prepare x_y_mat (sex, income)
x_y_mat = np.hstack([train_data_dict["sex"].reshape(-1, 1), train_data_dict["income-label"].reshape(-1, 1)])



In [28]:
buffer_size = 50000
batch_size = 20000

train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

# Model

In [29]:
class ModelNetwork(tf.keras.Model):
    # This is the class network we fit on the data.
    def __init__(self, n_layers, hidden_dim, output_dim, final_layer_regularizer=None):

        super(ModelNetwork, self).__init__()

        self.hidden_layer = af._create_connnected_block(n_layers=n_layers, hidden_dim=hidden_dim, 
                                                        output_dim=output_dim, regularizer=None)

        self.final_linear = tf.keras.layers.Dense(
            units=2,
            activation="softmax",
            kernel_regularizer=final_layer_regularizer
        )


    def call(self, inputs):
        output = inputs
        if self.hidden_layer is not None:
            for layer in self.hidden_layer:
                output = layer(output)
        output = self.final_linear(output)

        return output

## Income

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((logistic_x_train_mat[train_indices_array, :], 
                                                    x_y_mat[train_indices_array, 1]))
val_dataset = tf.data.Dataset.from_tensor_slices((logistic_x_train_mat[val_indices_array, :], 
                                                   x_y_mat[val_indices_array, 1]))

In [74]:
model = ModelNetwork(n_layers=0, hidden_dim=0, output_dim=0, 
                     final_layer_regularizer=tf.keras.regularizers.L2(10000))
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
              run_eagerly=True)

In [79]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        # Stop training when `val_loss` is no longer improving
        monitor="val_loss",
        # "no longer improving" being defined as "no better than 1e-2 less"
        min_delta=1e-2,
        # "no longer improving" being further defined as "for at least 2 epochs"
        patience=1,
        verbose=1,
    )
]

In [80]:
tf.random.set_seed(0)
model.fit(train_dataset, epochs=20, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f0536599110>

In [83]:
af.tf_score_summary(model=model, dataset=val_dataset, pos_label=1, 
                    activation=lambda x: tf.argmax(x, axis=1))

Unnamed: 0,accuracy,precision,recall,f1
0,0.760823,1.0,0.006378,0.012674


In [84]:
af.tf_score_summary(model=model, dataset=val_dataset, pos_label=0, 
                    activation=lambda x: tf.argmax(x, axis=1))

Unnamed: 0,accuracy,precision,recall,f1
0,0.760823,0.760455,1.0,0.86393


## Sex

In [90]:
train_dataset = tf.data.Dataset.from_tensor_slices((logistic_x_train_mat[train_indices_array, :], 
                                                    x_y_mat[train_indices_array, 0]))
val_dataset = tf.data.Dataset.from_tensor_slices((logistic_x_train_mat[val_indices_array, :], 
                                                   x_y_mat[val_indices_array, 0]))

buffer_size = 50000
batch_size = 20000

train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

In [102]:
# tf.keras.regularizers.L2(10000)
model = ModelNetwork(n_layers=0, hidden_dim=0, output_dim=0, 
                     final_layer_regularizer=tf.keras.regularizers.L2(10000))
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
              run_eagerly=True)

In [104]:
tf.random.set_seed(0)
model.fit(train_dataset, epochs=500, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500


Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 00071: early stopping


<keras.callbacks.History at 0x7f0536644750>

In [94]:
af.tf_score_summary(model=model, dataset=val_dataset, pos_label=1, 
                    activation=lambda x: tf.argmax(x, axis=1))

Unnamed: 0,accuracy,precision,recall,f1
0,0.680381,0.68091,0.998647,0.809724


In [95]:
af.tf_score_summary(model=model, dataset=val_dataset, pos_label=0, 
                    activation=lambda x: tf.argmax(x, axis=1))

Unnamed: 0,accuracy,precision,recall,f1
0,0.680381,0.25,0.000962,0.001918


In [106]:
af.tf_score_summary(model=model, dataset=val_dataset, pos_label=1, 
                    activation=lambda x: tf.argmax(x, axis=1))

Unnamed: 0,accuracy,precision,recall,f1
0,0.322997,0.842105,0.007214,0.014305


In [108]:
tf.nn.softmax(np.array([[1, -1], [1, 1]]))

InvalidArgumentError: Value for attr 'T' of int64 is not in the list of allowed values: half, bfloat16, float, double
	; NodeDef: {{node Softmax}}; Op<name=Softmax; signature=logits:T -> softmax:T; attr=T:type,allowed=[DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE]> [Op:Softmax]

In [None]:
class ModelNetwork(tf.keras.Model):
    # This is the class network we fit on the data.
    def __init__(self, final_dim, number_forward_layers=1, hidden_dim=None, education_dim=None,
                 occupation_dim=None):

        super(ModelNetwork, self).__init__()

        self.number_forward_layers = number_forward_layers
        self.categorical_dim = 53
        if education_dim is not None:
            self.education_embedding_layer = tf.keras.layers.Dense(
                units=education_dim,
                input_dim=(16,)
            )
            self.categorical_dim = self.categorical_dim - 16 + education_dim
        self.education_dim = education_dim

        if occupation_dim is not None:
            self.occupation_embedding_layer = tf.keras.layers.Dense(
                units=occupation_dim,
                input_dim=(15,)
            )
            self.categorical_dim = self.categorical_dim - 15 + occupation_dim
        self.occupation_dim = occupation_dim

        initial_units = final_dim
        if number_forward_layers > 1:
            initial_units = hidden_dim
        self.initial_block = tf.keras.layers.Dense(
            units=initial_units,
            activation="elu"
        )

        self.feed_forward_rest_vet = []
        if number_forward_layers > 1:
            hidden_units = hidden_dim
            for i in np.arange(number_forward_layers - 1):
                if i == number_forward_layers - 2:
                    hidden_units = final_dim
                self.feed_forward_rest_vet.append(
                    tf.keras.layers.Dense(
                        units=hidden_units, activation="elu"
                    )
                )

        self.final_linear = tf.keras.layers.Dense(
            units=2,
            activation="softmax"
        )


    def call(self, inputs):
        continuous_tensor, categorical_tensor = inputs
        continuous_tensor = tf.cast(continuous_tensor, tf.float32)
        categorical_tensor = tf.cast(categorical_tensor, tf.float32)
        if len(continuous_tensor.shape) == 1:
            continuous_tensor = tf.reshape(continuous_tensor, (1, -1))
            categorical_tensor = tf.reshape(categorical_tensor, (1, -1))

        # Process Categorical input
        embedding_boolean_edu_array = np.repeat(True, categorical_tensor.shape[1])
        embedding_boolean_occ_array = np.repeat(True, categorical_tensor.shape[1])
        if self.education_dim is not None:
            embedding_boolean_edu_array[9:25] = False
            education_tensor = tf.boolean_mask(categorical_tensor, ~embedding_boolean_edu_array, 1)
            embedded_education_tensor = self.education_embedding_layer(education_tensor)
        if self.occupation_dim is not None:
            embedding_boolean_occ_array[32:47] = False
            occupation_tensor = tf.boolean_mask(categorical_tensor, ~embedding_boolean_occ_array, 1)
            embedded_occupation_tensor = self.occupation_embedding_layer(occupation_tensor)

        not_embedding_boolean_array = embedding_boolean_edu_array & embedding_boolean_occ_array
        categorical_tensor = tf.boolean_mask(categorical_tensor, not_embedding_boolean_array, 1)
        if self.education_dim is not None:
            categorical_tensor = tf.concat([categorical_tensor, embedded_education_tensor], 1)
        if self.occupation_dim is not None:
            categorical_tensor = tf.concat([categorical_tensor, embedded_occupation_tensor], 1)

        input_tensor = tf.concat([continuous_tensor, categorical_tensor], 1)

        output = self.initial_block(input_tensor)
        if self.number_forward_layers != 1:
            for i in np.arange(self.number_forward_layers - 1):
                output = self.feed_forward_rest_vet[i](output)
        output = self.final_linear(output)

        return output