In [1]:
import adult_data_functions as af
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Process Data

In [2]:
# Load data
# Fresh, unfitted encoders. They are created here (rather than inside
# af.preprocess) so the very same objects can be passed again when the test
# file is preprocessed.
categorical_feature_encoder = preprocessing.OneHotEncoder()
sex_encoder, race_encoder, income_encoder = (
    preprocessing.LabelEncoder() for _ in range(3)
)

# encoder_fit_boolean=True: presumably tells af.preprocess to fit the encoders
# on this file before encoding it (TODO confirm against af.preprocess).
train_data_dict = af.preprocess(
    adult_dt_path="data/adult.data",
    categorical_feature_encoder=categorical_feature_encoder,
    sex_encoder=sex_encoder,
    race_encoder=race_encoder,
    income_encoder=income_encoder,
    encoder_fit_boolean=True,
)

## Train Validation Split

In [3]:
# Row positions 0..n-1 of the training file; the split is done on these
# positions so every feature/label container can be sliced consistently.
indices_array = np.arange(len(train_data_dict["income-label"]))
indices_array.size

32561

In [4]:
# split based on income
# Hold out 10% of the row positions for validation. Stratifying on the income
# label keeps its class proportions the same in both parts, and the fixed
# random_state pins the partition across runs.
train_indices_array, val_indices_array = train_test_split(
    indices_array,
    test_size=0.1,
    random_state=0,
    stratify=train_data_dict["income-label"],
)

In [5]:
# Select the rows of each split first and only then densify with .toarray(),
# so just the selected rows are materialised as a dense array.
categorical_feature_train_dataset = tf.data.Dataset.from_tensor_slices(
    train_data_dict["categorical-features"][train_indices_array, :].toarray()
)
categorical_feature_val_dataset = tf.data.Dataset.from_tensor_slices(
    train_data_dict["categorical-features"][val_indices_array, :].toarray()
)

2022-04-24 00:01:21.947754: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10180 MB memory:  -> device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:42:00.0, compute capability: 6.1


In [6]:
# Convert the selected DataFrame rows to an explicit float32 array before
# handing them to TensorFlow. The test pipeline feeds the model
# `.to_numpy().astype("float32")`; doing the same here keeps the dtype of the
# continuous features identical between training, validation and test instead
# of relying on whatever dtype the DataFrame happens to carry.
continuous_feature_train_dataset = tf.data.Dataset.from_tensor_slices(
    train_data_dict["continuous-features"]
    .iloc[train_indices_array, :]
    .to_numpy()
    .astype("float32")
)
continuous_feature_val_dataset = tf.data.Dataset.from_tensor_slices(
    train_data_dict["continuous-features"]
    .iloc[val_indices_array, :]
    .to_numpy()
    .astype("float32")
)

In [7]:
# Each element becomes a (continuous, categorical) pair; the order of the pair
# is the same for the training and validation splits.
feature_train_dataset = tf.data.Dataset.zip(
    (continuous_feature_train_dataset, categorical_feature_train_dataset)
)
feature_val_dataset = tf.data.Dataset.zip(
    (continuous_feature_val_dataset, categorical_feature_val_dataset)
)

In [8]:
# Prepare x_y_mat (sex, income)
# Two-column response matrix: column 0 is sex, column 1 is income. The affine
# map 2*v - 1 recodes the stored labels; assuming they are 0/1, the result is
# -1/+1 (TODO confirm the encoders' output range).
x_y_mat = 2 * np.stack(
    [np.ravel(train_data_dict["sex"]), np.ravel(train_data_dict["income-label"])],
    axis=1,
) - 1

# Slice the recoded responses with the same row positions as the features.
response_train_dataset = tf.data.Dataset.from_tensor_slices(
    x_y_mat[train_indices_array, :]
)
response_val_dataset = tf.data.Dataset.from_tensor_slices(
    x_y_mat[val_indices_array, :]
)

In [9]:
# Pair every feature tuple with its response row, giving elements of the form
# ((continuous, categorical), response) as consumed by model.fit.
train_dataset = tf.data.Dataset.zip(
    (feature_train_dataset, response_train_dataset)
)
val_dataset = tf.data.Dataset.zip(
    (feature_val_dataset, response_val_dataset)
)

In [10]:
# buffer_size is larger than the 32561 rows of the training file, so the
# shuffle buffer holds the whole split and the shuffle is a full permutation.
buffer_size = 50000
batch_size = 20000

# Seed the shuffle so the batch composition is repeatable, in line with the
# fixed random_state used for the train/validation split.
# NOTE: this cell rebinds train_dataset/val_dataset to their batched versions,
# so it must be run exactly once after the cell that builds them.
train_dataset = train_dataset.shuffle(buffer_size, seed=0).batch(batch_size)

# Validation data is only evaluated, never trained on, so shuffling it adds
# work without changing the result; it is batched as-is.
val_dataset = val_dataset.batch(batch_size)

# Training

In [11]:
# Fix TensorFlow's global seed before the model is created so that random
# weight initialisation (and any other unseeded TF random op that follows) is
# repeatable from run to run, matching the seeded train/validation split.
tf.random.set_seed(0)

# Smallest configuration: a single shared layer with a 2-dimensional output,
# no extra x/y branch layers, and L2 regularisation (0.05) on the shared part
# only. The meaning of the individual arguments is defined by af.BranchesModel.
model = af.BranchesModel(
    n_shared_layers=1,
    shared_hidden_dim=0,
    shared_output_dim=2,
    n_x_layers=0,
    x_hidden_dim=0,
    n_y_layers=0,
    y_hidden_dim=0,
    education_dim=5,
    occupation_dim=5,
    shared_regularizer=tf.keras.regularizers.L2(0.05),
    x_regularizer=None,
    y_regularizer=None,
)

# model = af.BranchesModel(n_shared_layers=3, shared_hidden_dim=60, shared_output_dim=60, n_x_layers=2, x_hidden_dim=40,
#                       n_y_layers=2, y_hidden_dim=40, education_dim=5, occupation_dim=5)

# model = af.ModelNetwork(number_forward_layers=1, hidden_dim=80, final_dim=60, output_dim=2,
#                        education_dim=5, occupation_dim=5)
# model.initialize();

In [12]:
# One F1 and one accuracy metric per response ("sex" and "income-label").
f1_sex_metric = af.Metric(
    name="f1", response_name="sex"
)
f1_income_metric = af.Metric(
    name="f1", response_name="income-label"
)
accuracy_sex_metric = af.Metric(
    name="accuracy", response_name="sex"
)
accuracy_income_metric = af.Metric(
    name="accuracy", response_name="income-label"
)

# run_eagerly=True makes Keras execute the training step eagerly instead of
# compiling it into a graph.
model.compile(
    loss=af.ising_likelihood,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    metrics=[
        f1_sex_metric,
        f1_income_metric,
        accuracy_sex_metric,
        accuracy_income_metric,
    ],
    run_eagerly=True,
)

In [13]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        # Stop training when `val_loss` is no longer improving
        monitor="val_loss",
        # An epoch only counts as an improvement if `val_loss` drops by more than 1e-2
        min_delta=1e-2,
        # Stop after 1 epoch without such an improvement
        patience=1,
        verbose=1,
    )
]

#      tf.keras.callbacks.ModelCheckpoint(
#         # Path where to save the model
#         # The two parameters below mean that we will overwrite
#         # the current checkpoint if and only if
#         # the `val_loss` score has improved.
#         # The saved model name will include the current epoch.
#         filepath="saved_model/mymodel_{epoch}",
#         save_best_only=True,  # Only save a model if `val_loss` has improved.
#         monitor="val_loss",
#         verbose=1,
#     )

In [14]:
# Train for at most 20 epochs; the early-stopping callback may end the run
# sooner based on the validation loss.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 00003: early stopping


<keras.callbacks.History at 0x7fa449106f50>

# Check Performance on the Test Set

In [15]:
# Encode the test file with the encoder objects that were already passed to
# the training-set preprocess call. encoder_fit_boolean is False here so the
# encoders are not refitted on the test file: refitting would let test data
# define the category-to-column / label-to-integer mapping, which can differ
# from the mapping the model was trained on.
# NOTE(review): this presumes af.preprocess only transforms when the flag is
# False, and that it normalises the test file's raw labels so the fitted
# encoders recognise them — confirm against af.preprocess.
test_data_dict = af.preprocess(
    adult_dt_path="data/adult.test",
    categorical_feature_encoder=categorical_feature_encoder,
    sex_encoder=sex_encoder,
    race_encoder=race_encoder,
    income_encoder=income_encoder,
    encoder_fit_boolean=False,
)

In [16]:
# Count missing values in the test set's "age" column (0 means none).
test_data_dict["continuous-features"]["age"].isnull().sum()

0

In [17]:
# Build the test inputs in the same (continuous, categorical) order as the
# training inputs. No shuffling: predictions must stay aligned with the rows
# of test_data_dict.
categorical_feature_test_dataset = tf.data.Dataset.from_tensor_slices(
    test_data_dict["categorical-features"].toarray()
)
continuous_feature_test_dataset = tf.data.Dataset.from_tensor_slices(
    test_data_dict["continuous-features"].to_numpy().astype("float32")
)
feature_test_dataset = tf.data.Dataset.zip(
    (continuous_feature_test_dataset, categorical_feature_test_dataset)
).batch(batch_size)

In [18]:
# Run the model batch by batch over the test inputs and keep each batch's
# output. A comprehension is used so no loop variable is left behind in the
# notebook namespace (the previous loop variable was named `input`, which
# shadowed the builtin for the rest of the kernel session).
test_par_list = [model(feature_batch) for feature_batch in feature_test_dataset]

In [19]:
# Stitch the per-batch outputs back together along the example axis.
test_par = tf.concat(values=test_par_list, axis=0)

In [20]:
# Sanity check on the concatenated output: the first dimension should equal
# the number of test rows.
test_par.shape

TensorShape([16281, 2])

In [21]:
# Display the concatenated model outputs for the test set.
test_par

<tf.Tensor: shape=(16281, 2), dtype=float32, numpy=
array([[-1., -1.],
       [-1., -1.],
       [-1., -1.],
       ...,
       [-1., -1.],
       [-1., -1.],
       [-1., -1.]], dtype=float32)>

In [22]:
# Turn the model outputs into predicted responses via af.ising_predict.
# NOTE(review): the columns are presumably (sex, income), mirroring how the
# training response matrix x_y_mat was assembled — confirm against af.ising_predict.
pred_test_x_y_mat = af.ising_predict(test_par)
pred_test_x_y_mat

array([[1, 1],
       [1, 1],
       [1, 1],
       ...,
       [1, 1],
       [1, 1],
       [1, 1]])

In [None]:
# Evaluate the model on the feature part of the first training batch; each
# dataset element is a (features, responses) pair, hence the trailing [0].
model(next(iter(train_dataset.take(1)))[0])

In [None]:
from sklearn.metrics import accuracy_score, f1_score

## Income and Sex

In [None]:
# The true test labels are used as returned by af.preprocess, whereas the
# training targets were recoded with 2*v - 1 before fitting, so the
# predictions may be coded -1/+1 rather than 0/1 (TODO confirm against
# af.ising_predict). Thresholding both sides at 0 maps either coding onto 0/1,
# so truth and prediction are always compared on the same scale; if both are
# already 0/1 this is a no-op.
income_test_accuracy = accuracy_score(
    (np.asarray(test_data_dict["income-label"]) > 0).astype(int),
    (np.asarray(pred_test_x_y_mat[:, 1]) > 0).astype(int),
)
sex_test_accuracy = accuracy_score(
    (np.asarray(test_data_dict["sex"]) > 0).astype(int),
    (np.asarray(pred_test_x_y_mat[:, 0]) > 0).astype(int),
)

print(f"income test accuracy is {income_test_accuracy}")
print(f"sex test accuracy is {sex_test_accuracy}")

In [None]:
# The true test labels are used as returned by af.preprocess, whereas the
# training targets were recoded with 2*v - 1 before fitting, so the
# predictions may be coded -1/+1 rather than 0/1 (TODO confirm against
# af.ising_predict). Mixing the two codings would give f1_score three distinct
# label values, which its default binary averaging rejects. Thresholding both
# sides at 0 maps either coding onto 0/1; if both are already 0/1 this is a
# no-op.
income_test_f1 = f1_score(
    (np.asarray(test_data_dict["income-label"]) > 0).astype(int),
    (np.asarray(pred_test_x_y_mat[:, 1]) > 0).astype(int),
)
sex_test_f1 = f1_score(
    (np.asarray(test_data_dict["sex"]) > 0).astype(int),
    (np.asarray(pred_test_x_y_mat[:, 0]) > 0).astype(int),
)

print(f"income test f1 is {income_test_f1}")
print(f"sex test f1 is {sex_test_f1}")

In [28]:
# Share of rows in x_y_mat (built from the training file) whose income column
# is not -1, i.e. the base rate of the other income class.
1 - (x_y_mat[:, 1] == -1).sum() / len(x_y_mat)

0.2408095574460244