In [39]:
import adult_data_functions as af
import numpy as np
import tensorflow as tf
# import os 
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Process Data

In [40]:
# Load data
categorical_feature_encoder = preprocessing.OneHotEncoder()
sex_encoder = preprocessing.LabelEncoder() 
race_encoder = preprocessing.LabelEncoder()
income_encoder = preprocessing.LabelEncoder()
# train_data_dict = af.preprocess(adult_dt_path="data/adult.data", 
#                                 categorical_feature_encoder=categorical_feature_encoder, 
#                                 sex_encoder=sex_encoder, race_encoder=race_encoder, 
#                                 income_encoder=income_encoder, encoder_fit_boolean=True)
train_data_dict, excessive_data_dict = \
    af.preprocess(adult_dt_path="data/adult.data", 
                  categorical_feature_encoder=categorical_feature_encoder, 
                  sex_encoder=sex_encoder, race_encoder=race_encoder, 
                  income_encoder=income_encoder, encoder_fit_boolean=True,
                  drop_prop_male_poor=0.7)

In [41]:
train_data_dict.keys()

dict_keys(['sex', 'race', 'income-label', 'categorical-features', 'continuous-features'])

## Train Validation Split

In [42]:
indices_array = np.arange(train_data_dict["income-label"].shape[0])
len(indices_array)

21972

In [43]:
# split based on income
train_indices_array, val_indices_array = train_test_split(indices_array, 
                                                           stratify=train_data_dict["income-label"], 
                                                           test_size=0.1, 
                                                           random_state=0)

In [44]:
categorical_feature_train_dataset = \
    tf.data.Dataset.from_tensor_slices(train_data_dict["categorical-features"][train_indices_array, :].toarray())
categorical_feature_val_dataset = \
    tf.data.Dataset.from_tensor_slices(train_data_dict["categorical-features"][val_indices_array, :].toarray())

In [45]:
continuous_feature_train_dataset = \
    tf.data.Dataset.from_tensor_slices(train_data_dict["continuous-features"].iloc[train_indices_array, :])
continuous_feature_val_dataset = \
    tf.data.Dataset.from_tensor_slices(train_data_dict["continuous-features"].iloc[val_indices_array, :])

In [46]:
feature_train_dataset = tf.data.Dataset.zip((continuous_feature_train_dataset, categorical_feature_train_dataset))
feature_val_dataset = tf.data.Dataset.zip((continuous_feature_val_dataset, categorical_feature_val_dataset))

In [47]:
# Prepare x_y_mat (sex, income)
x_y_mat = np.hstack([train_data_dict["sex"].reshape(-1, 1), train_data_dict["income-label"].reshape(-1, 1)])
x_y_mat = 2 * x_y_mat - 1
response_train_dataset = tf.data.Dataset.from_tensor_slices(x_y_mat[train_indices_array, :])
response_val_dataset = tf.data.Dataset.from_tensor_slices(x_y_mat[val_indices_array, :])

In [48]:
train_dataset = tf.data.Dataset.zip((feature_train_dataset, response_train_dataset))
val_dataset = tf.data.Dataset.zip((feature_val_dataset, response_val_dataset))

In [49]:
buffer_size = 50000
batch_size = 20000
train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
val_dataset = val_dataset.shuffle(buffer_size).batch(batch_size)

# Training

In [50]:
f1_sex_metric = af.Metric(name="f1", response_name="sex")
f1_income_metric = af.Metric(name="f1", response_name="income-label")
accuracy_sex_metric = af.Metric(name="accuracy", response_name="sex")
accuracy_income_metric = af.Metric(name="accuracy", response_name="income-label")

In [51]:
# model = af.BranchesModel(n_shared_layers=1, shared_hidden_dim=0, shared_output_dim=2, 
#                          n_x_layers=0, x_hidden_dim=0, n_y_layers=0, y_hidden_dim=0, 
#                          education_dim=5, occupation_dim=5,
#                         shared_regularizer=tf.keras.regularizers.L2(0.05), x_regularizer=None, 
#                         y_regularizer=None)

# model = af.BranchesModel(n_shared_layers=3, shared_hidden_dim=60, shared_output_dim=60, n_x_layers=2, x_hidden_dim=40,
#                       n_y_layers=2, y_hidden_dim=40, education_dim=5, occupation_dim=5)

model = af.ModelNetwork(n_layers=0, hidden_dim=None, final_dim=59)
# model_args_dict = {"n_layers": 1, "final_dim": 0, "hidden_dim": None, 
#                    "education_dim": None, "occupation_dim": None}
# model = af.ModelNetwork(**model_args_dict)
# model.initialize();

In [52]:
learning_rate = 0.01
epochs=20

In [53]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=af.ising_likelihood, 
              metrics=[f1_sex_metric, f1_income_metric, accuracy_sex_metric, accuracy_income_metric],
              run_eagerly=True)

callbacks = [
    tf.keras.callbacks.EarlyStopping(
        # Stop training when `val_loss` is no longer improving
        monitor="val_loss",
        # "no longer improving" being defined as "no better than 1e-2 less"
        min_delta=1e-2,
        # "no longer improving" being further defined as "for at least 2 epochs"
        patience=1,
        verbose=1,
    ),
    tf.keras.callbacks.ModelCheckpoint(
        # Path where to save the model
        # The two parameters below mean that we will overwrite
        # the current checkpoint if and only if
        # the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath=f"saved_model/model_{epoch},",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_loss",
        verbose=1,
    )
]
 
     

In [54]:
tf.random.seed(0)
model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/20

Epoch 00001: val_loss improved from inf to 60976.31250, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets
Epoch 2/20

Epoch 00002: val_loss improved from 60976.31250 to 57742.23828, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets
Epoch 3/20

Epoch 00003: val_loss improved from 57742.23828 to 54508.27344, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets
Epoch 4/20

Epoch 00004: val_loss improved from 54508.27344 to 51274.20312, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets
Epoch 5/20

Epoch 00005: val_loss improved from 51274.20312 to 48040.23828, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets
Epoch 6/20

Epoch 00006: val_loss improved from 48040.23828 to 44806.91797, saving model to saved_model/model_20,
INFO

Epoch 18/20

Epoch 00018: val_loss improved from 9558.75977 to 6416.41113, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets
Epoch 19/20

Epoch 00019: val_loss improved from 6416.41113 to 3347.94727, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets
Epoch 20/20

Epoch 00020: val_loss improved from 3347.94727 to 673.00623, saving model to saved_model/model_20,
INFO:tensorflow:Assets written to: saved_model/model_20,/assets


<keras.callbacks.History at 0x7f918d383890>

# Check Performance on the Test Set

In [23]:
test_data_dict = af.preprocess(adult_dt_path="data/adult.test", 
                                categorical_feature_encoder=categorical_feature_encoder, 
                                sex_encoder=sex_encoder, race_encoder=race_encoder, 
                                income_encoder=income_encoder, encoder_fit_boolean=True)

In [24]:
test_data_dict["continuous-features"]["age"].isna().sum()

0

In [25]:
categorical_feature_test_dataset = \
    tf.data.Dataset.from_tensor_slices(test_data_dict["categorical-features"].toarray())
continuous_feature_test_dataset = \
    tf.data.Dataset.from_tensor_slices(test_data_dict["continuous-features"].to_numpy().astype("float32"))
feature_test_dataset = tf.data.Dataset.zip((continuous_feature_test_dataset, categorical_feature_test_dataset))

x_y_mat = np.hstack([test_data_dict["sex"].reshape(-1, 1), test_data_dict["income-label"].reshape(-1, 1)])
x_y_mat = 2 * x_y_mat - 1

test_dataset = tf.data.Dataset.zip((feature_test_dataset, tf.data.Dataset.from_tensor_slices(x_y_mat)))
test_dataset = test_dataset.batch(batch_size)

## Income

In [27]:
af.tf_score_summary(model=model, dataset=test_dataset)

Unnamed: 0,accuracy,precision,recall,f1
sex-1,0.332965,0.332965,1.0,0.499585
income1,0.763774,0.763774,1.0,0.866068


In [37]:
prob = af.ising_predict(model(list(test_dataset.take(1))[0][0]))

In [38]:
prob

<tf.Tensor: shape=(16281, 2), dtype=float32, numpy=
array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]], dtype=float32)>

In [None]:
income_test_f1 = f1_score(test_data_dict["income-label"], pred_test_x_y_mat[:, 1])
sex_test_f1 = f1_score(test_data_dict["sex"], pred_test_x_y_mat[:, 0])

print(f"income test f1 is {income_test_f1}")
print(f"sex test f1 is {sex_test_f1}")

In [None]:
1-sum(x_y_mat[:, 1]==-1)/x_y_mat.shape[0]