# Predict FLSA code with job descriptions and compensation
***

Import modules

In [None]:
from __future__ import division, print_function, unicode_literals
import numpy as np
import os
import bert
from bert import BertModelLayer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, BatchNormalization, Input, Lambda, Dropout, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.models import Model
import tqdm
import pandas as pd

## Code will work with either tensorflow version, but needs to be executed eagerly if 1.X
## Compare the full major-version component (not just the first character) so that
## a version such as "10.x" is never mistaken for 1.x.
if tf.__version__.split('.')[0] == '1':
    tf.enable_eager_execution()

In [None]:
def build_strat(type='cpu', tpu=None, zone=None, project=None):
    """Return the tf.distribute strategy selected by `type`.

    Args:
        type: One of 'cpu', 'gpu', 'tpu' or 'mirror'.
        tpu: TPU name forwarded to TPUClusterResolver (only read for 'tpu').
        zone: Zone forwarded to TPUClusterResolver (only read for 'tpu').
        project: Project forwarded to TPUClusterResolver (only read for 'tpu').

    Returns:
        A OneDeviceStrategy pinned to '/cpu:0' or '/gpu:0', a MirroredStrategy
        over '/gpu:0' and '/gpu:1', or a TPUStrategy connected to the
        resolved cluster.

    Raises:
        ValueError: If `type` is not one of the four supported names.
    """
    # Single-device and mirrored strategies need no cluster setup.
    if type == 'cpu':
        return tf.distribute.OneDeviceStrategy(device='/cpu:0')
    if type == 'gpu':
        return tf.distribute.OneDeviceStrategy(device='/gpu:0')
    if type == 'mirror':
        return tf.distribute.MirroredStrategy(devices=['/gpu:0', '/gpu:1'])
    if type != 'tpu':
        raise ValueError('Available strategy types are cpu, gpu, tpu, and mirror')

    # TPU: resolve the cluster, connect to it and initialise the TPU system
    # before the strategy object is created.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=tpu, zone=zone, project=project)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    return tf.distribute.experimental.TPUStrategy(resolver)

def build_model(strategy, lr, activation, n_layers=3, layer_units=None):
    """Build and compile the BERT-based classifier inside `strategy.scope()`.

    The network is: int32 token-id input of length `max_seq_len` -> BERT ->
    first sequence position -> Dropout(0.5) -> Dense(768) -> BatchNorm ->
    `activation` -> Dense(n_classes, sigmoid). Pretrained weights are loaded
    from `bert_ckpt_file` before compiling.

    Reads the module-level names `bert_config_file`, `bert_ckpt_file`,
    `max_seq_len` and `n_classes`, which must be defined before the call.

    Args:
        strategy: tf.distribute strategy whose scope the model is created in.
        lr: Learning rate passed to the Adam optimizer.
        activation: Activation applied after batch normalisation in the head.
        n_layers: Accepted for backward compatibility; the current
            architecture has a single fixed 768-unit hidden layer and does
            not use it.
        layer_units: Accepted for backward compatibility; not used.

    Returns:
        A compiled keras.Model (binary cross-entropy loss, accuracy metric).
    """
    with strategy.scope():
        with tf.io.gfile.GFile(bert_config_file, "r") as reader:
            bc = bert.loader.StockBertConfig.from_json_string(reader.read())
        bert_params = bert.loader.map_stock_config_to_params(bc)
        # from_params is called on the class so no throwaway default-configured
        # layer is instantiated first.
        bert_layer = BertModelLayer.from_params(bert_params, name='bert')

        bert_in = Input(shape=(max_seq_len,), dtype='int32', name="bert_input")
        bert_out = bert_layer(bert_in)
        # Keep only the first position of the sequence output
        # (presumably the [CLS] token - depends on how the inputs were tokenized).
        bert_out = Lambda(lambda seq: seq[:, 0, :])(bert_out)
        bert_out = Dropout(0.5)(bert_out)
        x = Dense(768, activation=None, kernel_initializer='he_normal')(bert_out)
        x = BatchNormalization(momentum=0.9)(x)
        # Use the caller-supplied activation rather than a notebook global.
        x = Activation(activation)(x)
        out = Dense(units=n_classes, activation="sigmoid")(x)

        model = keras.Model(inputs=bert_in, outputs=out)
        # Weights are loaded after the functional model is assembled, i.e.
        # once the BERT layer has been called on an input.
        bert.loader.load_stock_weights(bert_layer, bert_ckpt_file)

        model.compile(loss='binary_crossentropy',
                      optimizer=keras.optimizers.Adam(lr),
                      metrics=['accuracy'])
    return model

## <font color=red>Define Environment Variables and Hyperparameters</font>
***

In [None]:
## Environment Variables
distr_strat = 'gpu'      # strategy name handed to build_strat: 'cpu', 'gpu', 'tpu' or 'mirror'
do_train = True          # load the train/validation splits
do_eval = True           # load the test split

## BERT Params
model_id = 1             # formatted into the saved-model file names
from_scratch = True      # build a new model instead of loading a saved .h5 file
bert_type = 'uncased_base'   # sub-directory of ./data holding the pretrained BERT files
do_lower_case = True
max_seq_len = 256        # length of the token-id input sequence
activ = 'elu'            # activation of the classification head
# Number of epochs for the first (full fine-tuning) fit; the second fit resumes at
# this epoch index and runs up to `epochs`. This name was referenced by the training
# cells but never defined.
# TODO(review): 1 is a placeholder chosen only so that bert_epochs < epochs - tune it.
bert_epochs = 1
epochs = 3               # final epoch index of the second (frozen-BERT) fit
batch_size = 32
lr = 5e-4                # Adam learning rate

## Pre-Processing
***

Set up files, file names, and directories to be referenced

In [None]:
## TPU connection details are only filled in when the 'tpu' strategy is selected
tpu_name, tpu_zone, tpu_proj = (
    ('node-2', 'us-central1-c', 'cedar-pottery-252818')
    if distr_strat == 'tpu'
    else (None, None, None)
)

## Relevant directories - saved model metadata in model_comparison.xlsx by id
proj_dir = 'gs://eri-ml-bucket-1/flsa_prediction'
data_dir = os.path.join(proj_dir, 'data', 'bert_only')

## Local checkpoint location; the same relative path is used under proj_dir
tf.io.gfile.makedirs('saved_models')
model_name = 'model.{:02d}.h5'.format(model_id)
model_file = os.path.join('saved_models', model_name)

## When resuming, pull the checkpoint from the bucket unless a local copy exists
if not (from_scratch or tf.io.gfile.exists(model_file)):
    tf.io.gfile.copy(os.path.join(proj_dir, model_file), model_file)

## BERT pretrained files, expected under ./data/<bert_type>/
bert_dir = os.path.join(os.getcwd(), 'data', bert_type)
bert_ckpt_file = os.path.join(bert_dir, 'bert_model.ckpt')
bert_config_file = os.path.join(bert_dir, 'bert_config.json')
bert_vocab_file = os.path.join(bert_dir, 'vocab.txt')

Load raw data from GCP

In [None]:
def _load_split(split, shuffle=False):
    """Load X_<split>.npy / y_<split>.npy from `data_dir` as a batched dataset.

    Args:
        split: Split name used in the file names ('train', 'valid' or 'test').
        shuffle: If True, shuffle with a buffer covering the whole split so
            every epoch sees a full permutation of the examples.

    Returns:
        (dataset, n_examples): the batched, prefetched tf.data.Dataset and
        the number of examples in the split.
    """
    arrays = []
    for prefix in ('X', 'y'):
        path = os.path.join(data_dir, '{}_{}.npy'.format(prefix, split))
        with tf.io.gfile.GFile(path, 'rb') as f:
            # NOTE(review): allow_pickle=True will unpickle arbitrary objects;
            # only keep it if the bucket contents are trusted and the arrays
            # really have object dtype.
            arrays.append(np.load(f, allow_pickle=True))
    features, labels = arrays
    n_examples = len(labels)

    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # A buffer smaller than the dataset only shuffles locally; use the
        # full split size (at least 1, which shuffle() requires).
        ds = ds.shuffle(buffer_size=max(n_examples, 1))
    ds = ds.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    return ds, n_examples


if do_train:
    train_ds, train_size = _load_split('train', shuffle=True)
    # Validation/test order does not influence the reported metrics, so
    # these splits are not shuffled.
    valid_ds, valid_size = _load_split('valid')

if do_eval:
    test_ds, test_size = _load_split('test')

# Single sigmoid output unit (binary target).
n_classes = 1

## BERT Fine-Tuning
***

Build pure BERT model from "scratch" or load previous h5 file

In [None]:
strat = build_strat(type=distr_strat, tpu=tpu_name, zone=tpu_zone, project=tpu_proj)
if from_scratch:
    # n_layers / layer_units were never defined in this notebook and
    # build_model does not use them, so rely on its defaults instead of
    # raising a NameError here.
    model = build_model(strat, lr, activ)
else:
    custom_objects = {'BertModelLayer': bert.BertModelLayer}
    # DenseBlock is not defined anywhere in this notebook; register it only
    # if an earlier cell/session provided it, instead of failing on the name.
    if 'DenseBlock' in globals():
        custom_objects['DenseBlock'] = globals()['DenseBlock']
    # Load inside the strategy scope so the restored variables are created
    # under the same distribution strategy as a freshly built model.
    with strat.scope():
        model = keras.models.load_model(model_file, custom_objects=custom_objects)

Fine-tune the full model for `bert_epochs` epochs, then save the resulting model locally and copy it to GCP

In [None]:
# First training phase: fit for bert_epochs epochs, validating on valid_ds
model.fit(train_ds, validation_data=valid_ds, epochs=bert_epochs)

# Save the model as it stands after this phase, then mirror the file to the
# same relative path under the project bucket
trained_bert_name = 'trained_bert_model.{:02d}.h5'.format(model_id)
trained_bert_file = os.path.join('saved_models', trained_bert_name)
model.save(trained_bert_file)
remote_trained_bert_file = os.path.join(proj_dir, trained_bert_file)
tf.io.gfile.copy(trained_bert_file, remote_trained_bert_file, overwrite=True)

In [None]:
# Report loss and metrics on the test split (test_ds is only created when do_eval is True)
model.evaluate(test_ds)

In [None]:
# Second training phase: freeze layer index 1 (the BERT layer in the model
# produced by build_model, index 0 being the input layer) and keep training
# the remaining layers.
model.layers[1].trainable = False
# A change to `trainable` only takes effect once the model is compiled again;
# the arguments mirror the compile call in build_model.
with strat.scope():
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(lr),
                  metrics=['accuracy'])

early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=7,
    restore_best_weights=True)
# Overwrite model_file whenever val_loss improves
checkpoint = ModelCheckpoint(
    filepath=model_file,
    monitor='val_loss',
    save_best_only=True)
# Epoch numbering continues from the first phase: runs epochs
# bert_epochs .. epochs-1, so `epochs` must be greater than bert_epochs.
his = model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=epochs,
    initial_epoch=bert_epochs,
    callbacks=[early_stop, checkpoint])

his_file = os.path.join(proj_dir, 'saved_models', 'fit_history.csv')
his = pd.DataFrame(his.history)
# his_file is a gs:// path; write through tf.io.gfile like every other bucket
# access in this notebook, so pandas does not need its own GCS filesystem backend.
with tf.io.gfile.GFile(his_file, 'w') as f:
    his.to_csv(f)

In [None]:
# Report loss and metrics on the training split after the second fit
# (train_ds is only created when do_train is True)
model.evaluate(train_ds)

In [None]:
# Mirror the local checkpoint to the same relative path under the project bucket
remote_model_file = os.path.join(proj_dir, model_file)
tf.io.gfile.copy(model_file, remote_model_file, overwrite=True)

#### <font color=red>Update GCP with the latest version of this script</font>

In [None]:
# Copy this notebook from the working directory into the project bucket,
# replacing any previous copy
remote_notebook = os.path.join(proj_dir, 'model.ipynb')
tf.io.gfile.copy('model.ipynb', remote_notebook, overwrite=True)

In [None]:
serve_model = os.path.join('saved_models', 'model_{%02d}'.format(model_id))
model.save(serve_model, save_format='tf')
!gsutil -m cp -R $serve_model gs://eri-ml-bucket-1/ml_job_match/$serve_model