# Predict FLSA code with job descriptions and compensation
***

Import modules

In [None]:
from __future__ import division, print_function, unicode_literals

import os

import bert
import numpy as np
import tensorflow as tf
import tqdm
from bert import BertModelLayer
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, BatchNormalization, Input, Lambda, Dropout, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

## Either major tensorflow version works, but 1.X has to be switched to eager execution first
if tf.__version__.startswith('1'):
    tf.enable_eager_execution()

In [None]:
def build_strat(type='cpu', tpu=None, zone=None, project=None):
    """Return the tf.distribute strategy selected by `type`.

    Args:
        type: one of 'cpu', 'gpu', 'tpu' or 'mirror'.
        tpu, zone, project: forwarded to TPUClusterResolver; only read when
            `type` is 'tpu'.

    Returns:
        A OneDeviceStrategy pinned to '/cpu:0' or '/gpu:0', a MirroredStrategy
        over '/gpu:0' and '/gpu:1', or a TPUStrategy for the resolved cluster.

    Raises:
        ValueError: if `type` is not one of the four supported names.
    """
    if type == 'cpu' or type == 'gpu':
        device = '/cpu:0' if type == 'cpu' else '/gpu:0'
        return tf.distribute.OneDeviceStrategy(device=device)

    if type == 'mirror':
        return tf.distribute.MirroredStrategy(devices=['/gpu:0', '/gpu:1'])

    if type != 'tpu':
        raise ValueError('Available strategy types are cpu, gpu, tpu, and mirror')

    # TPU: resolve the cluster, connect to it, initialise the TPU system,
    # then build the strategy on the same resolver.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=tpu, zone=zone, project=project)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    return tf.distribute.experimental.TPUStrategy(resolver)

class DenseBlock(keras.layers.Layer):
    """Dense -> BatchNormalization -> Activation -> Dropout packaged as one Keras layer.

    Args:
        units: `units` of the Dense sub-layer.
        activation: activation applied after batch normalization (the Dense
            sub-layer itself is created with `activation=None`).
        momentum: `momentum` of the BatchNormalization sub-layer.
        dropout: rate of the Dropout sub-layer.
        **kwargs: forwarded to `keras.layers.Layer` (for example `name`).
    """

    def __init__(self, units, activation='elu', momentum=0.9, dropout=0.5, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.units, self.activation = units, activation
        self.momentum, self.dropout = momentum, dropout
        # Sub-layers, listed in the order call() applies them.
        self.dense_x = Dense(units=units, activation=None, kernel_initializer='he_normal')
        self.bn_x = BatchNormalization(momentum=momentum)
        self.activ_x = Activation(activation)
        self.drop_x = Dropout(dropout)

    def call(self, inputs):
        """Run `inputs` through the four sub-layers in sequence and return the result."""
        hidden = inputs
        for stage in (self.dense_x, self.bn_x, self.activ_x, self.drop_x):
            hidden = stage(hidden)
        return hidden

    def get_config(self):
        """Return the base layer config extended with this block's constructor arguments."""
        config = super().get_config()
        config['units'] = self.units
        config['activation'] = self.activation
        config['momentum'] = self.momentum
        config['dropout'] = self.dropout
        return config

def build_model(strategy, lr, activation, n_layers=3, layer_units=None):
    """Build and compile the BERT + compensation classifier under `strategy.scope()`.

    The model has two inputs: `bert_input` (int32 token ids of length
    `max_seq_len`) and `comp_input` (a single float32 value). The first
    position of the BERT output sequence is taken, passed through dropout,
    concatenated with the compensation value and fed through one fixed
    DenseBlock plus `n_layers` configurable DenseBlocks before a softmax
    layer of `n_classes` units.

    Reads the module-level names `bert_config_file`, `bert_ckpt_file`,
    `max_seq_len` and `n_classes`.

    Args:
        strategy: tf.distribute strategy whose scope the model is created in.
        lr: learning rate for the Adam optimizer.
        activation: activation used by every DenseBlock.
        n_layers: number of DenseBlocks added after the first one.
        layer_units: units for each of those blocks; defaults to 100 each.

    Returns:
        The compiled keras.Model with the stock BERT weights loaded.

    Raises:
        ValueError: if `layer_units` has fewer than `n_layers` entries.
    """
    if layer_units is None:
        layer_units = [100] * n_layers
    if len(layer_units) < n_layers:
        # Fail before any BERT work instead of with an IndexError mid-build.
        raise ValueError(
            'layer_units has {} entries but n_layers is {}'.format(
                len(layer_units), n_layers))

    with strategy.scope():
        # Only the read needs the file handle; close it before building layers.
        with tf.io.gfile.GFile(bert_config_file, "r") as reader:
            bc = bert.loader.StockBertConfig.from_json_string(reader.read())
        bert_params = bert.loader.map_stock_config_to_params(bc)
        # from_params is a constructor on the class; calling it on a throwaway
        # instance would build an extra, unused layer.
        bert_layer = BertModelLayer.from_params(bert_params, name='bert')

        bert_in = Input(shape=(max_seq_len,), dtype='int32', name="bert_input")
        bert_out = bert_layer(bert_in)
        # Keep only the first position of the output sequence.
        bert_out = Lambda(lambda seq: seq[:, 0, :])(bert_out)
        bert_out = Dropout(0.5)(bert_out)

        # `shape` must be a tuple; `(1)` is just the integer 1.
        comp_in = Input(shape=(1,), dtype='float32', name='comp_input')
        x = keras.layers.concatenate([bert_out, comp_in])
        # NOTE(review): 769 presumably equals the BERT hidden size (768 for the
        # base model) plus the one compensation feature - confirm before
        # switching to another bert_type.
        x = DenseBlock(units=769, activation=activation)(x)
        for n in range(n_layers):
            x = DenseBlock(
                units=layer_units[n],
                activation=activation,
                name='dense{}'.format(n))(x)
        out = Dense(units=n_classes, activation="softmax")(x)

        model = keras.Model(inputs=[bert_in, comp_in], outputs=out)
        # The stock checkpoint is loaded once the layer has been built by the model.
        bert.loader.load_stock_weights(bert_layer, bert_ckpt_file)

        model.compile(loss='categorical_crossentropy',
                      optimizer=keras.optimizers.Adam(lr),
                      metrics=['accuracy'])
    return model

def deep_eval(model, threshold, inputs=None, labels=None, class_names=None):
    """Report overall, confidence-filtered and per-class accuracy of `model`.

    A prediction counts as "confident" when the probability of the PREDICTED
    class is strictly greater than `threshold`. The true label is never used
    for filtering, because it is not available when the model is actually
    used, and filtering on it would make the filtered accuracy trivially 1.0
    for any threshold >= 0.5.

    Args:
        model: object whose `predict(inputs)` returns one row of class
            probabilities per example.
        threshold: confidence cut-off applied to both the overall and the
            per-class filtered statistics.
        inputs: model inputs; defaults to the module-level `X_test`.
        labels: per-example label rows whose argmax is the true class index;
            defaults to the module-level `y_test`.
        class_names: class label for each column index; defaults to the
            module-level `classes`.

    Returns:
        pandas.DataFrame sorted by `total_len`, one row per class that was
        predicted at least once, with columns `class`, `filtered_len`,
        `total_len`, `filtered_acc`, `total_acc` and `retain_pct`.
        `filtered_acc` is NaN for a class with no confident prediction.
    """
    import pandas as pd  # local import: pandas is not imported at file level

    # Fall back to the notebook globals only when nothing was passed explicitly.
    if inputs is None:
        inputs = X_test
    if labels is None:
        labels = y_test
    if class_names is None:
        class_names = classes

    nan = float('nan')
    class_names = np.asarray(class_names)
    preds = np.asarray(model.predict(inputs))
    y_vals = class_names[np.argmax(np.asarray(labels), axis=1)]
    pred_vals = class_names[np.argmax(preds, axis=1)]
    correct = pred_vals == y_vals
    confident = preds.max(axis=1) > threshold

    plain_acc = correct.mean()
    print('Plain test accuracy:', plain_acc)
    # Explicit NaN instead of np.mean([]) and its RuntimeWarning.
    thresh_acc = correct[confident].mean() if confident.any() else nan
    print('Test accuracy when probability is above {} threshold: {}'.format(threshold, thresh_acc))

    columns = ['class', 'filtered_len', 'total_len',
               'filtered_acc', 'total_acc', 'retain_pct']
    rows = []
    for label in class_names:
        predicted = pred_vals == label
        total_len = int(predicted.sum())
        if total_len == 0:
            print('Group', label, 'not predicted for any instance')
            continue
        kept = predicted & confident
        filtered_len = int(kept.sum())
        rows.append({
            'class': label,
            'filtered_len': filtered_len,
            'total_len': total_len,
            'filtered_acc': correct[kept].mean() if filtered_len else nan,
            'total_acc': correct[predicted].mean(),
            'retain_pct': filtered_len / total_len * 100,
        })

    # Built once, after the loop, so it exists even when every class was skipped.
    performance_df = pd.DataFrame(rows, columns=columns).sort_values('total_len')
    total = performance_df.total_len.sum()
    use_pct = performance_df.filtered_len.sum() / total if total else nan
    print('Ratio of predictions that would be used:', use_pct)
    return performance_df

## <font color=red>Define Environment Variables and Hyperparameters</font>
***

In [None]:
## Environment Variables
distr_strat = 'cpu'  # passed to build_strat(type=...): cpu, gpu, tpu or mirror
do_train = True  # load the train and validation arrays
do_eval = True  # load the test arrays

## BERT Params
model_id = 2  # two-digit id embedded in the saved model file names
from_scratch = True  # True: build a new model; False: load saved_models/model.<id>.h5
bert_type = 'uncased_base'  # sub-directory of ./data holding the pretrained BERT files
do_lower_case = True  # NOTE(review): not referenced in the visible code - presumably a tokenizer flag; confirm
max_seq_len = 256  # length of the bert_input token-id sequence
bert_epochs = 3  # epochs of the first model.fit call (before the BERT layer is frozen)
activ = 'elu'  # activation passed to build_model for every DenseBlock
epochs = 50  # epoch limit of the second model.fit call, which starts at initial_epoch=bert_epochs
batch_size = 32  # batch size of the train/valid/test datasets
lr = 5e-4  # Adam learning rate passed to build_model
n_layers = 3  # number of DenseBlocks added after the first one
layer_units = [300, 150, 50]  # units of each of those n_layers blocks

## Pre-Processing
***

Set up files, file names, and directories to be referenced

In [None]:
## TPU connection settings are only filled in for the tpu strategy
tpu_name, tpu_zone, tpu_proj = (
    ('node-2', 'us-central1-c', 'cedar-pottery-252818')
    if distr_strat == 'tpu'
    else (None, None, None)
)

## Relevant directories - saved model metadata in model_comparison.xlsx by id
proj_dir = 'gs://eri-ml-bucket-1/flsa_prediction'
data_dir = os.path.join(proj_dir, 'data')
tf.io.gfile.makedirs('saved_models')
model_file = os.path.join('saved_models', 'model.{:02d}.h5'.format(model_id))
## When resuming, fetch the saved model from the bucket unless a local copy already exists
if not (from_scratch or tf.io.gfile.exists(model_file)):
    tf.io.gfile.copy(os.path.join(proj_dir, model_file), model_file)

## BERT pretrained files, all under ./data/<bert_type>
bert_dir = os.path.join(os.getcwd(), 'data', bert_type)
bert_ckpt_file, bert_config_file, bert_vocab_file = (
    os.path.join(bert_dir, fname)
    for fname in ('bert_model.ckpt', 'bert_config.json', 'vocab.txt')
)

Load raw data from GCP

In [None]:
def _load_npy(name):
    """Read `<data_dir>/<name>.npy` through tf.io.gfile and return the array."""
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
    # can execute arbitrary code - keep it only while the bucket is trusted.
    with tf.io.gfile.GFile(os.path.join(data_dir, name + '.npy'), 'rb') as f:
        return np.load(f, allow_pickle=True)


def _load_split(split, shuffle):
    """Return `(dataset, n_examples)` for the 'train', 'valid' or 'test' split.

    The dataset yields `({'bert_input': ..., 'comp_input': ...}, y)` batches of
    `batch_size` and is prefetched. The arrays are local to this function, so
    they are released as soon as the dataset has been built.
    """
    labels = _load_npy('y_' + split)
    n_examples = len(labels)
    ds = tf.data.Dataset.from_tensor_slices(
        ({'bert_input': _load_npy('bert_input_' + split),
          'comp_input': _load_npy('comp_input_' + split)},
         labels))
    if shuffle:
        # A buffer smaller than the dataset only shuffles within a sliding
        # window; use the whole split so every epoch sees a full shuffle.
        ds = ds.shuffle(buffer_size=max(n_examples, 1))
    ds = ds.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    return ds, n_examples


if do_train:
    train_ds, train_size = _load_split('train', shuffle=True)
    # Validation metrics do not depend on example order, so no shuffle.
    valid_ds, valid_size = _load_split('valid', shuffle=False)

if do_eval:
    # Evaluation metrics do not depend on example order, so no shuffle.
    test_ds, test_size = _load_split('test', shuffle=False)

classes = _load_npy('classes')
n_classes = len(classes)

## BERT Fine-Tuning
***

Build the model (pretrained BERT plus the compensation input and dense classification head) from "scratch", or load a previously saved h5 file

In [None]:
strat = build_strat(type=distr_strat, tpu=tpu_name, zone=tpu_zone, project=tpu_proj)
if from_scratch:
    model = build_model(strat, lr, activ, n_layers=n_layers, layer_units=layer_units)
else:
    ## Load inside the strategy scope so the restored model is created under the
    ## same strategy as a freshly built one (build_model opens the scope itself)
    with strat.scope():
        model = keras.models.load_model(
            model_file,
            custom_objects={'BertModelLayer': bert.BertModelLayer,
                            'DenseBlock': DenseBlock})

Train the whole model (BERT layer included) for `bert_epochs` epochs, then save the resulting model locally and copy it to GCP

In [None]:
## First training phase: every layer is trainable; no validation data or callbacks are used
model.fit(train_ds, epochs=bert_epochs)

## Save the model as it stands after the last epoch, then mirror the file under proj_dir
trained_bert_file = os.path.join('saved_models', 'trained_bert_model.{:02d}.h5'.format(model_id))
model.save(trained_bert_file)
tf.io.gfile.copy(trained_bert_file, os.path.join(proj_dir, trained_bert_file), overwrite=True)

In [None]:
## Loss and accuracy on the training dataset after the first training phase
model.evaluate(train_ds)

In [None]:
## Second training phase: freeze BERT and keep training the layers after it.
## The layer is looked up by the name given in build_model rather than by position.
model.get_layer('bert').trainable = False
## A change to `trainable` only takes effect once the model is compiled again;
## without this the BERT weights would keep being updated. Recompiling with a
## fresh Adam also resets the optimizer state from the first phase.
with strat.scope():
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(lr),
                  metrics=['accuracy'])

## Stop after 7 epochs without a val_accuracy gain of at least 0.001 and roll
## back to the best weights seen
early_stop = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.001,
    patience=7,
    restore_best_weights=True)
## Write model_file only when val_accuracy improves
checkpoint = ModelCheckpoint(
    filepath=model_file,
    monitor='val_accuracy',
    save_best_only=True)
## initial_epoch continues the epoch numbering after the first phase, so at
## most (epochs - bert_epochs) further epochs are run
model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=epochs,
    initial_epoch=bert_epochs,
    callbacks=[early_stop, checkpoint])

In [None]:
## Loss and accuracy on the test dataset
model.evaluate(test_ds)

In [None]:
## Copy the locally saved model file to the same relative path under proj_dir, replacing any existing copy
tf.io.gfile.copy(model_file, os.path.join(proj_dir, model_file), overwrite=True)

#### <font color=red>Update GCP with the latest version of this script</font>

In [None]:
## Expects this notebook to be saved as model.ipynb in the current working directory
tf.io.gfile.copy('model.ipynb', os.path.join(proj_dir, 'model.ipynb'), overwrite=True)

In [None]:
serve_model = os.path.join('saved_models', 'model_{%02d}'.format(model_id))
model.save(serve_model, save_format='tf')
!gsutil -m cp -R $serve_model gs://eri-ml-bucket-1/ml_job_match/$serve_model