In this file we present the model code for our ACL 2022 submission.

In this file we load preprocessed data, and train 3 models: 
* a baseline model, 
* the demographic model and 
* the encoded demographic model. 

For the latter model we use an autoencoder to encode the demographic information; the autoencoder is also presented in this notebook.

Please refer to Section 4 of our paper for a more elaborate description of the models.

In [48]:
import os, sys
import json

import random
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras import layers, activations, optimizers, losses
import transformers
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import TFDistilBertModel, DistilBertConfig
from tensorflow.keras import backend as K

# Hugging Face checkpoint name; it is passed to DistilBertTokenizer.from_pretrained further down.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
# Restrict TensorFlow's logger to messages of level ERROR.
tf.get_logger().setLevel('ERROR')


In [30]:
RANDOM_STATE = 42

# Seed the global random number generators as well. RANDOM_STATE is passed
# explicitly to some initializers and to the Dropout layers further down, but
# every layer that is created without an explicit seed draws from the global
# generators, so those have to be seeded for runs to be repeatable.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# MODEL HYPERPARAMETERS
EPOCHS = 25            # upper bound on training epochs (early stopping may end training sooner)
LEARNING_RATE = 5e-5   # Adam learning rate of the three classifiers
LAYER_DROPOUT = 0.2    # dropout rate of every Dropout layer in the demographic models
VECTOR_LENGTH = 49     # length of a raw demographic vector (autoencoder input size)
MAX_LENGTH=54          # number of tokens every text is padded/truncated to
BATCH_SIZE=64          # batch size for tokenization, training and evaluation

#AUTOENCODER HYPERPARAMETERS
ENCODING_DIM = 32             # size of the autoencoder bottleneck, i.e. of an encoded demographic vector
LEARNING_RATE_ENCODER = 5e-3  # Adam learning rate the autoencoder is compiled with

#UPV LABEL SETTINGS
# NOTE(review): these two settings are not referenced in the cells shown in this
# notebook; presumably they were used during preprocessing - confirm.
MIN_T3_OCCURENCECS = 10
EXCLUDED_LABELS = ['functionality', 'performance (personal)']

#DATA PATHS
# The paths are left empty in the published notebook and must be filled in before running.
DATA_PATH=''             # not referenced in the cells shown in this notebook
MODELS_PATH = 'Models'   # root folder for training logs and saved weights
INPUT_DATA_PATH = ''     # JSON file holding the preprocessed train/val/test data

# Load data

In [31]:
# Read the preprocessed, already split dataset from a single JSON file.
with open(INPUT_DATA_PATH) as json_file:
    inputs = json.load(json_file)

# Training split: texts, gold labels and demographic vectors.
train_text, train_gold_labels, train_demographic_vectors = (
    inputs[field]
    for field in ('train_text', 'train_gold_labels', 'train_demographic_vectors')
)

# Validation split.
val_text, val_gold_labels, val_demographic_vectors = (
    inputs[field]
    for field in ('val_text', 'val_gold_labels', 'val_demographic_vectors')
)

# Test split.
test_text, test_gold_labels, test_demographic_vectors = (
    inputs[field]
    for field in ('test_text', 'test_gold_labels', 'test_demographic_vectors')
)

In [32]:
#load DistilBert tokenizer for the checkpoint named in `model_name`
# NOTE(review): the transformer itself is loaded further down from 'distilbert-base-uncased';
# presumably both checkpoints share the same uncased vocabulary - confirm.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [33]:
def batch_encode(tokenizer, texts, batch_size=BATCH_SIZE, max_length=MAX_LENGTH):
    """
    Tokenize `texts` in slices of `batch_size` and return the results as tensors.

    Every slice is tokenized with truncation enabled and padding to exactly
    `max_length`, so all rows of the returned tensors have the same length.

    Input:
        - tokenizer:   callable tokenizer (here: the DistilBERT tokenizer)
        - texts:       sliceable sequence of texts accepted by the tokenizer
        - batch_size:  number of texts passed to the tokenizer per call
        - max_length:  length each tokenized text is padded / truncated to
    Output (a tuple of three tf.Tensor objects, in this order):
        - input_ids:       token ids of the texts
        - attention_mask:  attention masks of the texts
        - token_type_ids:  token type ids of the texts
    """
    all_ids, all_masks, all_type_ids = [], [], []

    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        all_ids.extend(encoded['input_ids'])
        all_masks.extend(encoded['attention_mask'])
        all_type_ids.extend(encoded['token_type_ids'])

    return tuple(
        tf.convert_to_tensor(column)
        for column in (all_ids, all_masks, all_type_ids)
    )

In [34]:
def format_inputs(train_text, train_demographic_vectors, val_text, val_demographic_vectors, test_text, test_demographic_vectors):
    """
    Turn the train / validation / test data into the tensors the models consume.

    For each split the texts are tokenized with `batch_encode` (using the
    module-level `tokenizer`), the raw demographic vectors are converted to a
    tensor, and the autoencoder encoding of every vector is looked up in the
    module-level `vectors_to_encoding` dict and converted to a tensor.

    Input:
        - train/val/test_text:                 texts of the split
        - train/val/test_demographic_vectors:  list of lists, one demographic vector per text
    Output:
        A flat tuple of 15 tf.Tensor objects: for train, then val, then test
        - X_<split>_ids:                          token ids
        - X_<split>_attention:                    attention masks
        - X_<split>_token_type_ids:               token type ids
        - X_<split>_demographic_vectors:          raw demographic vectors
        - X_<split>_demographic_vectors_encoded:  autoencoder-encoded demographic vectors
    """
    splits = (
        (train_text, train_demographic_vectors),
        (val_text, val_demographic_vectors),
        (test_text, test_demographic_vectors),
    )

    formatted = []
    for split_text, split_vectors in splits:
        ids, attention, token_type_ids = batch_encode(tokenizer, split_text)
        # Each vector is a list, so it is turned into a tuple to serve as dict key.
        encoded_vectors = tf.convert_to_tensor(
            [vectors_to_encoding[tuple(vector)] for vector in split_vectors]
        )
        raw_vectors = tf.convert_to_tensor(split_vectors)
        formatted.extend([ids, attention, token_type_ids, raw_vectors, encoded_vectors])

    return tuple(formatted)

# Autoencoder

In [35]:
# Folder that holds the trained autoencoder checkpoint (left empty in the published notebook).
AUTOENCODER_PATH = ''
## build autoencoder

# Define weight initializer with a random seed.
# NOTE: it is not passed to any layer in this cell; the layer weights are
# overwritten by the checkpoint that is loaded at the end of the cell.
weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE) 

# This is the input vector (one raw demographic vector of length VECTOR_LENGTH)
input_vector = tf.keras.Input(shape=(VECTOR_LENGTH,))
# Encoder half: progressively narrower dense layers down to the bottleneck
dense1 = layers.Dense(44, activation='relu', name='dense1')(input_vector)
dense2 = layers.Dense(40, activation='relu', name='dense2')(dense1)
dense3 = layers.Dense(40, activation='relu', name='dense3')(dense2)
dense4 = layers.Dense(36, activation='relu', name='dense4')(dense3)
#encoded representation of the input vector (bottleneck of size ENCODING_DIM)
encoded = layers.Dense(ENCODING_DIM, activation='relu', name='encoder_layer')(dense4)
# the reconstruction of the input: decoder half mirroring the encoder widths
dense5 = layers.Dense(36, activation='relu', name='dense5')(encoded)
dense6 = layers.Dense(40, activation='relu', name='dense6')(dense5)
dense7 = layers.Dense(40, activation='relu', name='dense7')(dense6)
dense8 = layers.Dense(44, activation='relu', name='dense8')(dense7)
decoded = layers.Dense(VECTOR_LENGTH, activation='sigmoid', name = 'decoder_layer')(dense8)

# This model maps an input to its reconstruction
autoencoder = tf.keras.Model(input_vector, decoded)

# This model maps an input to its encoded representation.
# It is built from the same layer objects as `autoencoder`.
encoder = tf.keras.Model(input_vector, encoded)

autoencoder.compile(tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE_ENCODER), loss='binary_crossentropy')
autoencoder.summary()

#load autoencoder weights
# The checkpoint name has to be relative ('weights', without a leading slash):
# os.path.join drops every earlier component as soon as a later one is
# absolute, so '/weights' would ignore AUTOENCODER_PATH and always point at
# the file system root.
AUTOENCODER_WEIGHTS_PATH = os.path.join(AUTOENCODER_PATH, 'weights')
autoencoder.load_weights(AUTOENCODER_WEIGHTS_PATH)
encoder.load_weights(AUTOENCODER_WEIGHTS_PATH)

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 49)]              0         
_________________________________________________________________
dense1 (Dense)               (None, 44)                2200      
_________________________________________________________________
dense2 (Dense)               (None, 40)                1800      
_________________________________________________________________
dense3 (Dense)               (None, 40)                1640      
_________________________________________________________________
dense4 (Dense)               (None, 36)                1476      
_________________________________________________________________
encoder_layer (Dense)        (None, 32)                1184      
_________________________________________________________________
dense5 (Dense)               (None, 36)               

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f1bfaf3f3a0>

# Prepare data

In [36]:
# Gather every distinct demographic vector of the three splits so that each one is encoded only once.
# Lists are not hashable, hence the round trip through tuples for de-duplication.
demographic_vectors = list(map(
    list,
    {tuple(vector) for vector in (train_demographic_vectors + val_demographic_vectors + test_demographic_vectors)},
))
# Run all unique vectors through the encoder half of the autoencoder in one call.
encoded_vectors = encoder.predict(demographic_vectors)
# Lookup table: raw demographic vector (as a tuple) -> its encoding.
vectors_to_encoding = {
    tuple(demvec): encoding
    for demvec, encoding in zip(demographic_vectors, encoded_vectors)
}

In [39]:
# Convert all three splits into model-ready tensors; format_inputs returns
# five tensors per split, in train / val / test order.
(
    X_train_ids, X_train_attention, X_train_token_type_ids,
    X_train_demographic_vectors, X_train_demographic_vectors_encoded,
    X_val_ids, X_val_attention, X_val_token_type_ids,
    X_val_demographic_vectors, X_val_demographic_vectors_encoded,
    X_test_ids, X_test_attention, X_test_token_type_ids,
    X_test_demographic_vectors, X_test_demographic_vectors_encoded,
) = format_inputs(
    train_text, train_demographic_vectors,
    val_text, val_demographic_vectors,
    test_text, test_demographic_vectors,
)

# Load models

In [40]:
# DistilBERT configuration with all settings at their defaults, except that hidden states are requested in the output.
config = DistilBertConfig(output_hidden_states=True)

# Load the pre-trained DistilBERT transformer model outputting raw hidden-states, without any specific head on top.
# This module-level object is the transformer that the train_model functions below hand to build_model.
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config, name='ourDistilBert')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


#### Load evaluation metrics

In [41]:
def recall_m(y_true, y_pred): # recall
    """
    Recall over the given tensors: true positives / (actual positives + epsilon).

    True positives are counted by rounding the clipped product y_true * y_pred,
    actual positives by rounding the clipped labels. K.epsilon() guards against
    a division by zero when there are no positive labels.
    """
    actual_positive_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    return hit_count / (actual_positive_count + K.epsilon())

def precision_m(y_true, y_pred): #precision
    """
    Precision over the given tensors: true positives / (predicted positives + epsilon).

    True positives are counted by rounding the clipped product y_true * y_pred,
    predicted positives by rounding the clipped predictions. K.epsilon() guards
    against a division by zero when nothing is predicted positive.
    """
    predicted_positive_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    return hit_count / (predicted_positive_count + K.epsilon())

def f1_m(y_true, y_pred): #F1-score
    """
    F1 score: harmonic mean of precision_m and recall_m, with K.epsilon() in the
    denominator so that the result is 0 rather than NaN when both are 0.

    NOTE(review): when used as a Keras metric this is presumably computed per
    batch and then averaged - confirm before comparing with a dataset-level F1.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p*r)/(p+r+K.epsilon())
    return 2*harmonic_half

def TP(y_true, y_pred): #true positives
    """
    True-positive rate over all entries: (number of true positives) / (number of entries).

    The number of entries is obtained as positives + negatives of the rounded labels.
    """
    actual_pos = K.round(K.clip(y_true, 0, 1))
    actual_neg = 1 - actual_pos
    total = K.sum(actual_pos) + K.sum(actual_neg)
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    return hit_count/total

def TN(y_true, y_pred): #true negatives
    """
    True-negative rate over all entries: (number of true negatives) / (number of entries).

    An entry is a true negative when both its rounded label and its rounded
    prediction are 0.
    """
    actual_pos = K.round(K.clip(y_true, 0, 1))
    actual_neg = 1 - actual_pos
    total = K.sum(actual_pos) + K.sum(actual_neg)
    predicted_neg = 1 - K.round(K.clip(y_pred, 0, 1))
    correct_rejections = K.sum(K.round(K.clip(actual_neg * predicted_neg, 0, 1)))
    return correct_rejections/total

def FP(y_true, y_pred): #false positives
    """
    False-positive rate over all entries: (number of false positives) / (number of entries).

    False positives are counted by rounding the clipped product of the negated
    rounded labels and the predictions.
    """
    actual_pos = K.round(K.clip(y_true, 0, 1))
    actual_neg = 1 - actual_pos
    total = K.sum(actual_pos) + K.sum(actual_neg)
    false_alarms = K.sum(K.round(K.clip(actual_neg * y_pred, 0, 1)))
    return false_alarms/total

def FN(y_true, y_pred): #false negatives
    """
    False-negative rate over all entries: (number of false negatives) / (number of entries).

    False negatives are counted by rounding the clipped product of the labels
    and the negated rounded predictions.
    """
    actual_pos = K.round(K.clip(y_true, 0, 1))
    actual_neg = 1 - actual_pos
    total = K.sum(actual_pos) + K.sum(actual_neg)
    predicted_neg = 1 - K.round(K.clip(y_pred, 0, 1))
    misses = K.sum(K.round(K.clip(y_true * predicted_neg, 0, 1)))
    return misses/total

## Baseline model

In [None]:
def build_model(transformer, learning_rate, max_length=MAX_LENGTH):
    """
    Assemble and compile the baseline model, which performs the UPV
    classification task on text only (no demographic information).

    Note that later cells of this notebook define other functions with the same
    name; whichever definition was executed last is the one in effect.

    Input:
      - transformer:    DistilBERT transformer model object
                        without a classification head.
      - learning_rate:  learning rate handed to the Adam optimizer
      - max_length:     length of the token id / attention mask / token type id
                        sequences fed to the model.

    Output:
      - model:          compiled tf.keras.Model: transformer followed by a
                        single sigmoid unit applied to the first-position output.
    """
    # Seeded initializer for the classification head.
    head_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE)

    # Three int32 inputs of length `max_length`, created in model-input order.
    ids_input, attention_input, token_type_input = (
        tf.keras.layers.Input(shape=(max_length,), name=layer_name, dtype='int32')
        for layer_name in ('input_ids', 'input_attention', 'token_type_ids')
    )

    transformer_outputs = transformer({
        'input_ids': ids_input,
        'attention_mask': attention_input,
        'token_type_ids': token_type_input,
    })
    # Element 0 of the transformer output is the last hidden state;
    # sequence position 0 of it is the [CLS] token.
    cls_token = transformer_outputs[0][:, 0, :]

    # One sigmoid node as output layer (binary classification).
    classifier_head = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=head_initializer,
        kernel_constraint=None,
        bias_initializer='zeros',
        name='t3_classifier',
    )
    output = classifier_head(cls_token)

    model = tf.keras.Model([ids_input, attention_input, token_type_input], output)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(), f1_m, precision_m, recall_m, TP, TN, FP, FN],
    )
    return model

In [None]:
def train_model(model_folder, learning_rate) :
    """
    Build the baseline model, train it on the training split and save its weights.

    Training runs for at most EPOCHS epochs and stops early when the validation
    F1 (`val_f1_m`) has not improved for 7 epochs; the best weights are restored.
    The per-epoch log and the final weights are written to
    MODELS_PATH/<model_folder>.

    Input:
      - model_folder :      folder (below MODELS_PATH) in which to save the log and the weights
      - learning_rate:      float learning rate handed to the optimizer
    Output:
      - model:        a trained tf.keras.Model
    """
    output_dir = os.path.join(MODELS_PATH, model_folder)
    # Make sure the output folder exists before anything is written into it,
    # so a first run does not depend on the folder having been created by hand.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    NUM_STEPS = len(X_train_ids) // BATCH_SIZE

    #build the model
    # NOTE(review): `distilBERT` is the module-level transformer object; models
    # trained in the same kernel session appear to share (and keep updating)
    # its weights - confirm this is intended.
    model = build_model(distilBERT, learning_rate, MAX_LENGTH)

    model.summary()

    #add early stopping with patience of 7
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor="val_f1_m",
        min_delta=0,
        patience=7,
        verbose=1,
        mode="max",
        baseline=None,
        restore_best_weights=True
    )

    csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(output_dir, 'training.log'))

    #Train the model
    model.fit(
        x = [X_train_ids, X_train_attention, X_train_token_type_ids],
        y = np.array(train_gold_labels),
        epochs = EPOCHS,
        batch_size = BATCH_SIZE,
        steps_per_epoch = NUM_STEPS,
        callbacks= [stop_early, csv_logger],
        validation_data=([X_val_ids, X_val_attention, X_val_token_type_ids],  np.array(val_gold_labels)),
        verbose=1
    )
    #save model weights
    model.save_weights(os.path.join(output_dir, 'weights'))
    return(model)

### Regular baseline

In [None]:
# Sub-folder of MODELS_PATH for the baseline's training log and weights (left empty in the published notebook).
BASELINE_PATH = ''
# Build and train the baseline. This must run while the baseline versions of
# build_model / train_model are the active definitions, because later cells redefine both names.
baseline_model = train_model(BASELINE_PATH, LEARNING_RATE)

In [101]:
# Score the trained baseline on the held-out test split; the result is a
# dict holding the loss and every compiled metric.
results_baseline_model = baseline_model.evaluate(
    x=[X_test_ids, X_test_attention, X_test_token_type_ids],
    y=np.array(test_gold_labels),
    batch_size=BATCH_SIZE,
    return_dict=True,
)
print(results_baseline_model)

{'loss': 0.07448320835828781, 'binary_accuracy': 0.9771679043769836, 'f1_m': 0.6911736130714417, 'precision_m': 0.654262125492096, 'recall_m': 0.8183943033218384, 'TP': 0.025825539603829384, 'TN': 0.9513434171676636, 'FP': 0.016625888645648956, 'FN': 0.00620514340698719}


## Demographic model

In [42]:
def build_model(transformer, learning_rate, max_length=MAX_LENGTH, vector_length=VECTOR_LENGTH):
    """
    Assemble and compile the demographic model, which performs the UPV
    classification task on text plus the raw demographic vector.

    The demographic vector passes through two dense+dropout layers, is
    concatenated with the [CLS] output of the transformer, and then runs through
    two further dense+dropout layers. The [CLS] output is concatenated again
    before each of those layers and before the output unit.

    This definition replaces any `build_model` defined by an earlier cell.

    Input:
      - transformer:    DistilBERT transformer model object
                        without a classification head.
      - learning_rate:  learning rate handed to the Adam optimizer
      - max_length:     length of the token id / attention mask / token type id
                        sequences fed to the model.
      - vector_length:  length of the demographic input vector

    Output:
      - model:          compiled tf.keras.Model whose inputs are, in order,
                        demographic features, token ids, attention mask and
                        token type ids.
    """
    # Seeded initializer for the classification head.
    head_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE)

    # Input layers: the float demographic vector first, then the three int32 text inputs.
    demographic_input = tf.keras.layers.Input(
        shape=(vector_length,), name='demographic_features', dtype='float')
    ids_input, attention_input, token_type_input = (
        tf.keras.layers.Input(shape=(max_length,), name=layer_name, dtype='int32')
        for layer_name in ('input_ids', 'input_attention', 'token_type_ids')
    )

    # Element 0 of the transformer output is the last hidden state, of shape
    # (batch_size, sequence_length, hidden_size=768); position 0 is the [CLS] token.
    transformer_outputs = transformer({
        'input_ids': ids_input,
        'attention_mask': attention_input,
        'token_type_ids': token_type_input,
    })
    cls_token = transformer_outputs[0][:, 0, :]  # dimensionality = 768

    # Demographic tower: Dense(relu) followed by Dropout, twice.
    features = demographic_input
    for units, dense_name, dropout_name in (
        (128, 'demographic_layer_1', 'dropout_dem_layer_1'),
        (256, 'demographic_layer_2', 'dropout_dem_layer_2'),
    ):
        features = tf.keras.layers.Dense(units, activation='relu', name=dense_name)(features)
        features = tf.keras.layers.Dropout(LAYER_DROPOUT, seed=RANDOM_STATE, name=dropout_name)(features)

    # Combined tower: re-attach the [CLS] vector, then Dense(512, relu) + Dropout, twice.
    # Widths after concatenation: 768 + 256 = 1024, then 768 + 512 = 1280.
    for dense_name, dropout_name in (
        ('combined_dense_layer_1', 'dropout_combined_layer_1'),
        ('combined_dense_layer_2', 'dropout_combined_layer_2'),
    ):
        merged = tf.keras.layers.Concatenate()([cls_token, features])
        features = tf.keras.layers.Dense(512, activation='relu', name=dense_name)(merged)
        features = tf.keras.layers.Dropout(LAYER_DROPOUT, seed=RANDOM_STATE, name=dropout_name)(features)

    # Final concatenation with the [CLS] vector: 768 + 512 = 1280.
    classifier_input = tf.keras.layers.Concatenate()([cls_token, features])

    # One sigmoid node as output layer (binary classification).
    classifier_head = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=head_initializer,
        kernel_constraint=None,
        bias_initializer='zeros',
        name='t3_classifier',
    )
    output = classifier_head(classifier_input)

    model = tf.keras.Model(
        [demographic_input, ids_input, attention_input, token_type_input], output)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(), f1_m, precision_m, recall_m, TP, TN, FP, FN],
    )
    return model

In [43]:
def train_model(model_folder, learning_rate=LEARNING_RATE) :
    """
    Build the demographic model, train it on the training split and save its weights.

    The raw demographic vectors are fed as the first model input. Training runs
    for at most EPOCHS epochs and stops early when the validation F1
    (`val_f1_m`) has not improved for 7 epochs; the best weights are restored.
    The per-epoch log and the final weights are written to
    MODELS_PATH/<model_folder>.

    Input:
      - model_folder :      folder (below MODELS_PATH) in which to save the log and the weights
      - learning_rate:      float learning rate handed to the optimizer
    Output:
      - model:        a trained tf.keras.Model
    """
    output_dir = os.path.join(MODELS_PATH, model_folder)
    # Make sure the output folder exists before anything is written into it,
    # so a first run does not depend on the folder having been created by hand.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    NUM_STEPS = len(X_train_ids) // BATCH_SIZE

    #build the model
    # NOTE(review): `distilBERT` is the module-level transformer object; models
    # trained in the same kernel session appear to share (and keep updating)
    # its weights - confirm this is intended.
    model = build_model(distilBERT, learning_rate, MAX_LENGTH)

    model.summary()

    #include early stopping with patience of 7
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor="val_f1_m",
        min_delta=0,
        patience=7,
        verbose=1,
        mode="max",
        baseline=None,
        restore_best_weights=True
    )

    csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(output_dir, 'training.log'))

    #Train the model
    model.fit(
        x = [X_train_demographic_vectors, X_train_ids, X_train_attention, X_train_token_type_ids],
        y = np.array(train_gold_labels),
        epochs = EPOCHS,
        batch_size = BATCH_SIZE,
        steps_per_epoch = NUM_STEPS,
        callbacks= [stop_early, csv_logger],
        validation_data=([X_val_demographic_vectors, X_val_ids, X_val_attention, X_val_token_type_ids],  np.array(val_gold_labels)),
        verbose=1
    )
    #save model weights
    model.save_weights(os.path.join(output_dir, 'weights'))
    return(model)

In [None]:
# Sub-folder of MODELS_PATH for the demographic model's training log and weights (left empty in the published notebook).
DEMOGRAPHIC_MODEL_PATH = ''
# Build and train the demographic model. This must run while the demographic versions of
# build_model / train_model are the active definitions, because other cells define the same names.
demographic_model = train_model(DEMOGRAPHIC_MODEL_PATH, LEARNING_RATE)

In [140]:
# Score the trained demographic model on the held-out test split; the result
# is a dict holding the loss and every compiled metric.
results_demographic_model = demographic_model.evaluate(
    x=[X_test_demographic_vectors, X_test_ids, X_test_attention, X_test_token_type_ids],
    y=np.array(test_gold_labels),
    batch_size=BATCH_SIZE,
    return_dict=True,
)
print(results_demographic_model)

{'loss': 0.09242386370897293, 'binary_accuracy': 0.9781400561332703, 'f1_m': 0.7074137926101685, 'precision_m': 0.6753308773040771, 'recall_m': 0.8340508341789246, 'TP': 0.026270028203725815, 'TN': 0.9518707990646362, 'FP': 0.01609848439693451, 'FN': 0.0057606566697359085}


# Encoded demographic model

In [37]:
def build_model(transformer, learning_rate, max_length=MAX_LENGTH, vector_length=VECTOR_LENGTH):
    """
    Assemble and compile the encoded demographic model, which performs the UPV
    classification task on text plus the autoencoder-encoded demographic vector.

    The encoded vector passes through four dense+dropout layers, is concatenated
    with the [CLS] output of the transformer, and then runs through two further
    dense+dropout layers. The [CLS] output is concatenated again before each of
    those layers and before the output unit.

    This definition replaces any `build_model` defined by an earlier cell.

    Input:
      - transformer:    DistilBERT transformer model object
                        without a classification head.
      - learning_rate:  learning rate handed to the Adam optimizer
      - max_length:     length of the token id / attention mask / token type id
                        sequences fed to the model.
      - vector_length:  accepted for signature compatibility but not used here:
                        the demographic input has length ENCODING_DIM.

    Output:
      - model:          compiled tf.keras.Model whose inputs are, in order,
                        encoded demographic features, token ids, attention mask
                        and token type ids.
    """
    # Seeded initializer for the classification head.
    head_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE)

    # Input layers: the float encoded demographic vector first, then the three int32 text inputs.
    demographic_input = tf.keras.layers.Input(
        shape=(ENCODING_DIM,), name='demographic_features', dtype='float')
    ids_input, attention_input, token_type_input = (
        tf.keras.layers.Input(shape=(max_length,), name=layer_name, dtype='int32')
        for layer_name in ('input_ids', 'input_attention', 'token_type_ids')
    )

    # Element 0 of the transformer output is the last hidden state, of shape
    # (batch_size, sequence_length, hidden_size=768); position 0 is the [CLS] token.
    transformer_outputs = transformer({
        'input_ids': ids_input,
        'attention_mask': attention_input,
        'token_type_ids': token_type_input,
    })
    cls_token = transformer_outputs[0][:, 0, :]  # dimensionality = 768

    # Demographic tower: Dense(relu) followed by Dropout, four times.
    features = demographic_input
    for units, dense_name, dropout_name in (
        (128, 'demographic_layer_1', 'dropout_dem_layer_1'),
        (256, 'demographic_layer_2', 'dropout_dem_layer_2'),
        (256, 'demographic_layer_3', 'dropout_dem_layer_3'),
        (256, 'demographic_layer_4', 'dropout_dem_layer_4'),
    ):
        features = tf.keras.layers.Dense(units, activation='relu', name=dense_name)(features)
        features = tf.keras.layers.Dropout(LAYER_DROPOUT, seed=RANDOM_STATE, name=dropout_name)(features)

    # Combined tower: re-attach the [CLS] vector, then Dense(512, relu) + Dropout, twice.
    # Widths after concatenation: 768 + 256 = 1024, then 768 + 512 = 1280.
    for dense_name, dropout_name in (
        ('combined_dense_layer_1', 'dropout_combined_layer_1'),
        ('combined_dense_layer_2', 'dropout_combined_layer_2'),
    ):
        merged = tf.keras.layers.Concatenate()([cls_token, features])
        features = tf.keras.layers.Dense(512, activation='relu', name=dense_name)(merged)
        features = tf.keras.layers.Dropout(LAYER_DROPOUT, seed=RANDOM_STATE, name=dropout_name)(features)

    # Final concatenation with the [CLS] vector: 768 + 512 = 1280.
    classifier_input = tf.keras.layers.Concatenate()([cls_token, features])

    # One sigmoid node as output layer (binary classification).
    classifier_head = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=head_initializer,
        kernel_constraint=None,
        bias_initializer='zeros',
        name='t3_classifier',
    )
    output = classifier_head(classifier_input)

    model = tf.keras.Model(
        [demographic_input, ids_input, attention_input, token_type_input], output)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(), f1_m, precision_m, recall_m, TP, TN, FP, FN],
    )
    return model

In [38]:
# Re-states the learning rate (same value as in the hyperparameter cell); the default
# argument of the train_model definition that follows is bound to it at definition time.
LEARNING_RATE = 5e-5
def train_model(model_folder, learning_rate=LEARNING_RATE) :
    """
    Build the encoded demographic model, train it on the training split and save its weights.

    The autoencoder-encoded demographic vectors are fed as the first model
    input. Training runs for at most EPOCHS epochs and stops early when the
    validation F1 (`val_f1_m`) has not improved for 7 epochs; the best weights
    are restored. The per-epoch log and the final weights are written to
    MODELS_PATH/<model_folder>.

    Input:
      - model_folder :      folder (below MODELS_PATH) in which to save the log and the weights
      - learning_rate:      float learning rate handed to the optimizer
    Output:
      - model:        a trained tf.keras.Model
    """
    output_dir = os.path.join(MODELS_PATH, model_folder)
    # Make sure the output folder exists before anything is written into it,
    # so a first run does not depend on the folder having been created by hand.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    NUM_STEPS = len(X_train_ids) // BATCH_SIZE

    #build the model
    # NOTE(review): `distilBERT` is the module-level transformer object; models
    # trained in the same kernel session appear to share (and keep updating)
    # its weights - confirm this is intended.
    model = build_model(distilBERT, learning_rate, MAX_LENGTH)

    model.summary()

    #add early stopping with patience of 7
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor="val_f1_m",
        min_delta=0,
        patience=7,
        verbose=1,
        mode="max",
        baseline=None,
        restore_best_weights=True
    )

    csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(output_dir, 'training.log'))

    #Train the model
    model.fit(
        x = [X_train_demographic_vectors_encoded, X_train_ids, X_train_attention, X_train_token_type_ids],
        y = np.array(train_gold_labels),
        epochs = EPOCHS,
        batch_size = BATCH_SIZE,
        steps_per_epoch = NUM_STEPS,
        callbacks= [stop_early, csv_logger],
        validation_data=([X_val_demographic_vectors_encoded, X_val_ids, X_val_attention, X_val_token_type_ids],  np.array(val_gold_labels)),
        verbose=1
    )
    #save model weights
    model.save_weights(os.path.join(output_dir, 'weights'))
    return(model)

### Train regular encoded demographic model

In [24]:
# Sub-folder of MODELS_PATH for the encoded demographic model's training log and weights (left empty in the published notebook).
ENC_DEM_PATH = ''
# Build and train the encoded demographic model. This must run while the encoded versions of
# build_model / train_model are the active definitions, because earlier cells define the same names.
encoded_demographic_model = train_model(ENC_DEM_PATH, LEARNING_RATE)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
demographic_features (InputLaye [(None, 32)]         0                                            
__________________________________________________________________________________________________
demographic_layer_1 (Dense)     (None, 128)          4224        demographic_features[0][0]       
__________________________________________________________________________________________________
dropout_dem_layer_1 (Dropout)   (None, 128)          0           demographic_layer_1[0][0]        
__________________________________________________________________________________________________
demographic_layer_2 (Dense)     (None, 256)          33024       dropout_dem_layer_1[0][0]        
____________________________________________________________________________________________

In [25]:
# Score the trained encoded demographic model on the held-out test split; the
# result is a dict holding the loss and every compiled metric.
results_encoded_demographic_model = encoded_demographic_model.evaluate(
    x=[X_test_demographic_vectors_encoded, X_test_ids, X_test_attention, X_test_token_type_ids],
    y=np.array(test_gold_labels),
    batch_size=BATCH_SIZE,
    return_dict=True,
)
print(results_encoded_demographic_model)

{'loss': 0.05849744752049446, 'binary_accuracy': 0.9805329442024231, 'f1_m': 0.7200949192047119, 'precision_m': 0.6933411955833435, 'recall_m': 0.8251991271972656, 'TP': 0.02592114545404911, 'TN': 0.9546120166778564, 'FP': 0.013357256539165974, 'FN': 0.0061095403507351875}
