In [None]:
import tensorflow as tf
import time
import os
import numpy as np
import json
import pandas as pd
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tqdm import tqdm
import sys

In [None]:
# Read the hyper-parameter dictionary stored in ../params.json.
with open('../params.json', 'r') as f:
    params = json.load(f)

# Expose the individual settings as module-level names (same keys, same order).
(
    max_length,
    padding_type,
    vocab_size,
    embedding_dim,
    trunc_type,
    oov_tok,
) = (
    params[key]
    for key in (
        'max_length',
        'padding_type',
        'vocab_size',
        'embedding_dim',
        'trunc_type',
        'oov_tok',
    )
)

In [None]:
# Presumably the directory of an earlier saved run (TODO confirm); `path` is
# reassigned further down when this run's artefacts are written.
path = '../models/1706293064.2710369'

In [None]:
# Load the JSON object stored in ../processed/word_index.json
# (presumably the tokenizer's token -> index mapping; TODO confirm).
# A context manager is used so the file handle is closed deterministically.
with open('../processed/word_index.json', 'r') as word_index_file:
    word_index = json.load(word_index_file)

In [None]:
# Model inputs and targets for the training split (saved NumPy arrays).
train_x = np.load('../processed/train_padded.npy')
train_y = np.load('../processed/train_y.npy')

# Same pair of arrays for the validation split.
val_x = np.load('../processed/val_padded.npy')
val_y = np.load('../processed/val_y.npy')

# Matrix used below as the initial (frozen) weights of the Embedding layer.
embeddings = np.load('../glove_embeddings_200d.npy')

# Per-example metadata tables; later cells read the 'y' column and the
# demographic indicator columns (e.g. 'black') from these frames.
train_y_meta = pd.read_csv('../processed/train_meta.csv')
val_y_meta = pd.read_csv('../processed/val_meta.csv')

In [None]:
# model=tf.keras.Sequential([ 
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.LSTM(100,return_sequences=True),
#     tf.keras.layers.GlobalAveragePooling1D(),
#     tf.keras.layers.Dropout(0.1),
#     tf.keras.layers.Dense(50,activation='relu'),
#     tf.keras.layers.Dropout(0.1),
#     tf.keras.layers.Dense(50,activation='relu'),
#     tf.keras.layers.Dropout(0.1),
#     tf.keras.layers.Dense(1,activation='sigmoid')
# ])

# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# model.summary()

In [None]:
def calculate_worst_group_accuracy(labels, predictions, categories=None, threshold=0.5):
    """Compute per-group accuracy and the worst (minimum) accuracy across groups.

    A "group" is the set of rows whose value in one category column equals
    0 or 1, so each category contributes two groups named ``<category>_0``
    and ``<category>_1``.

    Args:
        labels: DataFrame with a ``'y'`` column (ground truth) and one column
            per entry in ``categories``. The frame is NOT modified.
        predictions: Sequence/array of scores, one per row of ``labels``
            (matched by position).
        categories: Category column names to evaluate. Defaults to the eight
            demographic columns used throughout this notebook.
        threshold: Scores strictly greater than this value count as the
            positive class.

    Returns:
        Tuple ``(worst_group_accuracy, group_accuracies)`` where
        ``group_accuracies`` maps group name to accuracy. Groups with no rows
        map to NaN and are ignored when taking the minimum; if every group is
        empty the worst accuracy is NaN.

    Raises:
        ValueError: If ``predictions`` and ``labels`` differ in length.
    """
    if categories is None:
        categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']

    # Flatten so both a plain list and an (N, 1) model output are accepted.
    scores = np.asarray(predictions, dtype=float).reshape(-1)
    if len(scores) != len(labels):
        raise ValueError(
            f'predictions has {len(scores)} entries but labels has {len(labels)} rows'
        )

    # Row-wise correctness, computed once on local arrays so the caller's
    # DataFrame is not mutated (no 'pred' column is written back).
    correct = labels['y'].to_numpy() == (scores > threshold)

    group_accuracies = {}
    for category in categories:
        membership = labels[category].to_numpy()
        for label in (0, 1):
            in_group = membership == label
            # An empty group has no defined accuracy; record NaN explicitly
            # rather than letting it poison the minimum below.
            group_accuracies[f'{category}_{label}'] = (
                correct[in_group].mean() if in_group.any() else np.nan
            )

    populated = [acc for acc in group_accuracies.values() if not np.isnan(acc)]
    worst_group_accuracy = np.min(populated) if populated else np.nan

    return worst_group_accuracy, group_accuracies

In [None]:
def batch_and_predict(input_data, meta_data, model, batch_size=32):
    """Run ``model`` over ``input_data`` in mini-batches and collect the outputs.

    Args:
        input_data: NumPy array whose first axis indexes examples; it is
            reshaped to 2-D ``(num_examples, -1)`` before being fed to the model.
        meta_data: Unused. Kept only so existing callers keep working; it is
            no longer pushed through the tf.data pipeline.
        model: Callable invoked as ``model(inputs, training=False)``.
        batch_size: Number of examples per forward pass (default 32).

    Returns:
        A Python list with the squeezed model outputs of all batches, in
        input order.
    """
    # Reshape the input data to 2D
    reshaped_input = input_data.reshape((input_data.shape[0], -1))

    # Only the inputs are needed for inference.
    batched_dataset = tf.data.Dataset.from_tensor_slices(reshaped_input).batch(batch_size)

    # Ceiling division so tqdm can show real progress.
    num_batches = -(-reshaped_input.shape[0] // batch_size)

    predictions = []
    for inputs in tqdm(batched_dataset, total=num_batches, leave=False):
        batch_predictions = model(inputs, training=False)
        # tf.squeeze collapses a single-row batch of shape (1, 1) to a scalar;
        # atleast_1d keeps it iterable so `extend` does not raise TypeError.
        squeezed = np.atleast_1d(tf.squeeze(batch_predictions).numpy())
        predictions.extend(squeezed.tolist())

    return predictions

In [None]:
# Custom Keras callback that reports worst-group accuracy on the validation set
# at the end of every epoch.
class WorstGroupAccuracy(Callback):
    """Print the validation worst-group accuracy after each training epoch.

    Args:
        training_data: Tuple ``(inputs, labels, meta_frame)`` for the training
            split. Stored on the instance; not read by ``on_epoch_end``.
        validation_data: Tuple ``(inputs, labels, meta_frame)`` for the
            validation split. ``inputs`` and ``meta_frame`` are used to compute
            the metric; the middle element is ignored.
    """

    def __init__(self, training_data, validation_data):
        super().__init__()
        self.training_data = training_data
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the group metrics."""
        # Unpack the validation data (labels live inside the meta frame's 'y' column)
        validation_inputs, _, validation_meta_data = self.validation_data

        # Use the model currently being trained to score the validation inputs
        validation_predictions = batch_and_predict(validation_inputs, validation_meta_data, self.model)

        # Worst accuracy over all groups plus the per-group breakdown
        worst_group_accuracy, group_metrics = calculate_worst_group_accuracy(validation_meta_data, validation_predictions)

        # Print the worst group accuracy and the metric for each group
        print(f'{worst_group_accuracy}, Validation WGA: {group_metrics}')


# Instantiate the callback with training and validation data.
# The training metadata frame is loaded as `train_y_meta`.
wga_callback = WorstGroupAccuracy((train_x, train_y, train_y_meta), (val_x, val_y, val_y_meta))

In [None]:
# Builds a loss that neutralises the target for examples carrying a demographic flag.
def demographic_loss_wrapper(demography, loss_fn):
    """Create a Keras-compatible loss that takes a demographic flag into account.

    The returned loss expects ``y_true`` to have two entries along its last
    axis: the ground-truth label followed by a demographic indicator. For
    examples whose indicator is truthy the target is replaced by 0.5;
    all other examples keep their true label.

    Args:
        demography: Name of the demographic attribute. It is not read by the
            returned loss; the caller decides which indicator column is
            stacked next to the label in ``y_true``.
        loss_fn: Callable ``loss_fn(y_true, y_pred)`` applied to the adjusted
            targets (e.g. a ``tf.keras.losses.BinaryCrossentropy`` instance).

    Returns:
        A function ``demographic_loss(y_true, y_pred)``.
    """
    def demographic_loss(y_true, y_pred):
        # Split the packed target into the label and the demographic indicator.
        # A distinct local name avoids shadowing the outer `demography` argument.
        labels, demography_flag = tf.split(y_true, [1, 1], axis=-1)

        # Boolean mask of flagged examples
        demography_flag = tf.cast(demography_flag, tf.bool)

        # Labels as float32 so they can be mixed with the 0.5 targets
        labels = tf.cast(labels, tf.float32)

        # Tensor filled with 0.5, same shape as the labels
        half_tensor = tf.fill(tf.shape(labels), 0.5)

        # Flagged examples get target 0.5; the rest keep their TRUE label.
        # (Falling back to y_pred here would discard the labels entirely.)
        targets = tf.where(demography_flag, half_tensor, labels)

        # Calculate the loss using the provided loss function
        return loss_fn(targets, y_pred)

    # Return the custom loss function
    return demographic_loss

In [None]:
# First architecture: frozen embeddings -> bidirectional GRU -> average pooling -> dense head.
model = tf.keras.Sequential([
    # Embedding layer mapping integer token ids to dense vectors.
    # Weights are initialised from `embeddings` and frozen (trainable=False).
    # Output shape: (batch, max_length, embedding_dim)
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embeddings], trainable=False),

    # Bidirectional GRU with 50 units per direction; the full output sequence is kept
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(50, return_sequences=True)),

    # Average over the sequence axis to get one fixed-length vector per example
    tf.keras.layers.GlobalAveragePooling1D(),

    # Dropout (10%) for regularisation during training
    tf.keras.layers.Dropout(0.1),

    # Dense layer with 50 units and a ReLU activation function
    tf.keras.layers.Dense(50, activation='relu'),

    # Another Dropout layer
    tf.keras.layers.Dropout(0.1),

    # Single sigmoid unit: output in (0, 1) for binary classification
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Base loss wrapped by the demographic-aware loss below
bce_loss = tf.keras.losses.BinaryCrossentropy()

# The model must be compiled before `fit` can be called in the next cell.
# The targets passed to `fit` carry two columns (label, demographic flag), so the
# custom loss is required to unpack them.
# NOTE(review): 'accuracy' also receives the two-column target, so it may be
# computed against both columns rather than the label alone; confirm.
model.compile(loss=[demographic_loss_wrapper('black', bce_loss)], optimizer='adam', metrics=['accuracy'])

# Print a summary of the model
model.summary()

In [None]:
num_epochs = 10
batch_size = 128

# Writes one checkpoint file per epoch (save_best_only=False).
checkpoint = ModelCheckpoint('model-{epoch:03d}.keras', monitor='val_loss', save_best_only=False, mode='auto')
# NOTE(review): this callback is constructed but not passed to fit() below,
# so it has no effect; add it to `callbacks` if early stopping is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

# Targets pack the label and the 'black' indicator side by side, the layout
# demographic_loss_wrapper's loss splits apart again.
train_targets = train_y_meta.loc[:, ['y', 'black']].to_numpy()
val_targets = val_y_meta.loc[:, ['y', 'black']].to_numpy()

history = model.fit(
    train_x, train_targets,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(val_x, val_targets),
    # The worst-group-accuracy callback is defined above as `wga_callback`.
    callbacks=[checkpoint, wga_callback],
)

In [None]:
# Second architecture, assembled layer by layer:
# frozen embeddings -> BiGRU -> Conv1D -> pooling -> batch norm -> dense head.
model = tf.keras.Sequential()

# Token ids -> dense vectors; weights come from `embeddings` and stay frozen.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embeddings], trainable=False))

# Bidirectional GRU, 150 units per direction, returning the whole sequence.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(150, return_sequences=True)))

# 1-D convolution over the GRU outputs: 150 filters, window of 3, 'same' padding.
model.add(tf.keras.layers.Conv1D(filters=150, kernel_size=3, padding='same', kernel_initializer='he_uniform', activation='relu'))

# Down-sample the sequence axis by a factor of 3, then average what remains.
model.add(tf.keras.layers.MaxPooling1D(pool_size=3))
model.add(tf.keras.layers.GlobalAveragePooling1D())

# Normalise the pooled features before the dense head.
model.add(tf.keras.layers.BatchNormalization())

# Dense head: 50 -> 30 ReLU units, each followed by 20% dropout.
model.add(tf.keras.layers.Dense(50, activation='relu', kernel_initializer='he_uniform'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(30, activation='relu', kernel_initializer='he_uniform'))
model.add(tf.keras.layers.Dropout(0.2))

# Single sigmoid output unit for the binary decision.
model.add(tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform'))

# Binary cross-entropy is the base loss handed to the demographic-aware wrapper.
bce_loss = tf.keras.losses.BinaryCrossentropy()

# Compile with the wrapped loss, the Adam optimizer and the accuracy metric.
model.compile(
    loss=[demographic_loss_wrapper('black', bce_loss)],
    optimizer='adam',
    metrics=['accuracy'],
)

# Show the layer table.
model.summary()

In [None]:
num_epochs = 32
batch_size = 64

# Writes one checkpoint file per epoch (save_best_only=False).
checkpoint = ModelCheckpoint('model-{epoch:03d}.keras', monitor='val_loss', save_best_only=False, mode='auto')
# NOTE(review): this callback is constructed but not passed to fit() below,
# so it has no effect; add it to `callbacks` if early stopping is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

# Targets pack the label and the 'black' indicator side by side, the layout
# demographic_loss_wrapper's loss splits apart again.
train_targets = train_y_meta.loc[:, ['y', 'black']].to_numpy()
val_targets = val_y_meta.loc[:, ['y', 'black']].to_numpy()

history = model.fit(
    train_x, train_targets,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(val_x, val_targets),
    # The worst-group-accuracy callback is defined above as `wga_callback`.
    callbacks=[checkpoint, wga_callback],
)

In [None]:
# Create a directory named after the current timestamp for this run's artefacts.
t = str(time.time())
path = os.path.join('../models', t)
os.makedirs(path, exist_ok=True)

# Files are named after the last epoch's validation accuracy, rounded to 2 decimals.
model_path = str(round(history.history["val_accuracy"][-1], 2))
export_path = os.path.join(path, model_path)
model.save(f'{export_path}.keras')

# Persist the training history next to the model; the context manager makes
# sure the file is flushed and closed.
with open(f'{export_path}.json', 'w') as history_file:
    json.dump(history.history, history_file)

In [None]:
# Snapshot of the settings used for this run plus the final validation accuracy
# (rounded to 2 decimals and stored as a string).
params = dict(
    max_length=max_length,
    padding_type=padding_type,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    trunc_type=trunc_type,
    oov_tok=oov_tok,
    model_accuracy=str(round(history.history["val_accuracy"][-1], 2)),
)

# Serialise with 4-space indentation and store it alongside the saved model.
params_json = json.dumps(params, indent=4)
with open(f'{path}/params.json', 'w') as f:
    f.write(params_json)
