In [None]:
import tensorflow as tf
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Directory of the saved run to evaluate; `params.json` is read from it.
path = '../models/1706293064.2710369'
# Number of samples per batch when iterating over the validation set.
batch_size = 128

In [None]:
# Read the parameters stored as JSON in the run directory.
with open(f'{path}/params.json', 'r') as params_file:
    params = json.load(params_file)

# Unpack the individual settings into module-level names (KeyError if one is missing).
(
    max_length,
    padding_type,
    vocab_size,
    embedding_dim,
    trunc_type,
    oov_tok,
    model_accuracy,
) = [
    params[key]
    for key in (
        'max_length',
        'padding_type',
        'vocab_size',
        'embedding_dim',
        'trunc_type',
        'oov_tok',
        'model_accuracy',
    )
]

In [None]:
# Extend the run directory with the model's accuracy; the result is the prefix
# shared by the `.keras`, `.json` and `_wga.json` files used further down.
# The guard keeps this cell idempotent: re-running it must not append the
# accuracy component a second time.
accuracy_name = str(model_accuracy)
if os.path.basename(path) != accuracy_name:
    path = os.path.join(path, accuracy_name)

In [None]:
# using final model
model = tf.keras.models.load_model(f'{path}.keras')

# Training history saved next to the model; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open(f'{path}.json', 'r') as history_file:
    history = json.load(history_file)

# Word -> integer index mapping loaded from the processed data folder.
with open('../processed/word_index.json', 'r') as word_index_file:
    word_index = json.load(word_index_file)

In [None]:
# Validation arrays from the processed data folder.
val_x = np.load('../processed/val_padded.npy')
val_y = np.load('../processed/val_y.npy')

# Validation tables from the Kaggle data folder.
complete_val_x = pd.read_csv('../kaggle_data/val_x.csv')
complete_y = pd.read_csv('../kaggle_data/val_y.csv')

In [None]:
# Show the summary of the loaded model.
model.summary()

In [None]:
# Display the training history dictionary loaded from the model's `.json` file.
history

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot one metric and its ``val_``-prefixed counterpart on the same axes.

    Args:
        history: Mapping indexed by metric name; both ``string`` and
            ``'val_' + string`` must be present.
        string: Name of the metric to plot; also used as the y-axis label.
    """
    series_keys = [string, 'val_' + string]
    for key in series_keys:
        plt.plot(history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_keys)
    plt.show()

# Draw the curves for each metric stored in the history.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# First get the weights of the embedding layer
# NOTE(review): assumes the first layer of the model is the embedding layer —
# confirm against the model summary.
e = model.layers[0]
# get_weights() returns a list; the first entry is taken as the embedding matrix.
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
import io

# Create a dictionary where the keys are the values from word_index and the values are the keys from word_index
index_to_word = {value: key for key, value in word_index.items()}

# Make sure the output directory exists before opening files inside it
os.makedirs('./plot_embeddings', exist_ok=True)

# Never index past the end of the embedding matrix, even if vocab_size is larger
last_index = min(vocab_size, len(weights))

# Open two files for writing: one for the vectors and one for the metadata
with open('./plot_embeddings/vectors.tsv', 'w', encoding='utf-8') as vectors_file, \
     open('./plot_embeddings/metadata.tsv', 'w', encoding='utf-8') as metadata_file:
    # For each word in the vocabulary (index 0 is skipped — presumably the
    # padding slot; TODO confirm)
    for index in range(1, last_index):
        # Get the word and its embedding
        word = index_to_word.get(index)
        if word is None:
            # word_index has no entry for this index (fewer words than
            # vocab_size); skip it in both files so their rows stay aligned
            continue
        embedding = weights[index]
        # Write the word to the metadata file
        metadata_file.write(word + "\n")
        # Write the embedding to the vectors file
        vectors_file.write('\t'.join([str(x) for x in embedding]) + "\n")

In [None]:
def calculate_worst_group_accuracy(predictions, labels):
    """Compute per-group accuracies and the lowest accuracy among the groups.

    Each category column is split into its ``0`` and ``1`` sub-groups. The
    accuracy of a sub-group is the fraction of its rows whose ``y`` equals the
    prediction thresholded at 0.5.

    Args:
        predictions: DataFrame with a ``pred`` column of scores.
        labels: DataFrame with a ``y`` column and one column per category.
            Side effect: a ``pred`` column is written into this frame in place
            (aligned on the index), so the caller's frame is modified.

    Returns:
        Tuple ``(worst_group_accuracy, group_names, accuracies)``.
        ``group_names[i]`` is ``"<category>_<label>"`` and ``accuracies[i]`` is
        that group's accuracy, ``nan`` when the group has no rows. The worst
        group accuracy ignores such empty groups and is ``nan`` only when every
        group is empty.
    """
    # Add predictions to labels DataFrame (in place; see the docstring)
    labels.loc[:, 'pred'] = predictions.pred

    # Define the categories to consider
    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']

    # Row-wise correctness, computed once and then sliced per group
    is_correct = labels['y'] == (labels['pred'] > 0.5)

    group_names = []
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            in_group = labels[category] == label
            group_names.append(f'{category}_{label}')
            # The mean of an empty selection is nan, which marks an empty group
            accuracies.append(is_correct[in_group].mean())

    # An empty group has no defined accuracy; leaving its nan in would turn the
    # minimum itself into nan, so only populated groups are compared.
    populated = [accuracy for accuracy in accuracies if not np.isnan(accuracy)]
    worst_group_accuracy = np.min(populated) if populated else np.nan

    return worst_group_accuracy, group_names, accuracies

In [None]:
def evaluate_model(model, data_loader, loss_function):
    """Run ``model`` over every batch and collect its predictions and mean loss.

    Args:
        model: Callable invoked as ``model(inputs, training=False)``.
        data_loader: Iterable yielding ``(inputs, targets)`` batches.
        loss_function: Callable invoked as ``loss_function(targets, predictions)``
            whose result exposes ``.numpy()``.

    Returns:
        Tuple ``(predictions_df, mean_loss)``. ``predictions_df`` has one row
        per sample with columns ``index`` (the number of the batch the sample
        came from, not a per-sample index) and ``pred`` (the squeezed model
        output). ``mean_loss`` averages the batch losses, each weighted by the
        number of targets in its batch.
    """
    # Initialize lists to store losses, predictions, and indices
    batch_losses, all_predictions, batch_indices = [], [], []

    # tqdm wraps the loader itself (not the enumerate object) so it can pick up
    # the loader's length when one is available.
    for batch_idx, (inputs, targets) in enumerate(tqdm(data_loader, leave=False)):
        # Make predictions using the model
        predictions = tf.squeeze(model(inputs, training=False))
        # A batch holding a single sample squeezes down to a scalar; restore the
        # batch axis so the loss sees matching shapes and `.tolist()` below
        # yields a list that can be passed to `extend`.
        if predictions.shape.rank == 0:
            predictions = tf.reshape(predictions, [1])

        samples_in_batch = len(targets)
        # Calculate the loss
        batch_loss = loss_function(targets, predictions)
        # Repeat the batch loss once per sample so the final mean is weighted
        # by batch size
        batch_losses.extend([batch_loss.numpy()] * samples_in_batch)
        all_predictions.extend(predictions.numpy().tolist())
        batch_indices.extend([batch_idx] * samples_in_batch)

    # Create a DataFrame with the batch indices and predictions
    predictions_df = pd.DataFrame({'index': batch_indices, 'pred': all_predictions})
    # Calculate the mean loss over the entire dataset
    mean_loss = np.mean(batch_losses)

    return predictions_df, mean_loss

In [None]:
# Loss used to score the validation predictions.
criterion = tf.keras.losses.BinaryCrossentropy()

# Flatten every sample to one row, then batch the (input, label) pairs.
val_x = val_x.reshape((val_x.shape[0], -1))
data = tf.data.Dataset.from_tensor_slices((val_x, val_y)).batch(batch_size)

pred_df, loss = evaluate_model(model, data, criterion)
wga, groups, accuracies = calculate_worst_group_accuracy(pred_df, complete_y)

# Per-group accuracies followed by the aggregate figures; loss and wga are
# stored as strings.
wga_dict = {**dict(zip(groups, accuracies)), 'loss': str(loss), 'wga': str(wga)}

In [None]:
# Take a copy so the raw scores in `pred_df` are not modified by later steps.
pred=pred_df.copy()

In [None]:
# Build the export table from `pred_df` instead of mutating `pred` in place, so
# this cell produces the same result every time it is run.
# Hard 0/1 labels obtained by thresholding the scores at 0.5.
pred_labels = (pred_df['pred'] > 0.5).astype(int).to_frame()

# Put the labels side by side with the validation tables.
# NOTE(review): `complete_y` already carries the raw `pred` column written by
# calculate_worst_group_accuracy, so the result has two `pred` columns — confirm
# this is intended.
pred = pd.concat([pred_labels, complete_y], axis=1)
# Drop the `index` column(s), presumably brought in by the CSV tables — TODO confirm.
pred = pd.concat([pred, complete_val_x], axis=1).drop('index', axis=1)

In [None]:
# Save the predictions in the run directory, i.e. the parent of `path`.
# os.path.dirname replaces splitting on '/0', which only found the parent when
# the accuracy component started with "0" and the separator was "/".
temp = os.path.dirname(path)
pred.to_csv(f'{temp}/validation_pred.csv', index=False)

In [None]:
# Persist the group accuracies, loss and worst-group accuracy as JSON under the
# same prefix as the model files.
with open(f'{path}_wga.json', 'w') as wga_file:
    wga_file.write(json.dumps(wga_dict))