<a href="https://colab.research.google.com/github/gyannetics/llm-evaluations/blob/main/Evaluation_of_LLMs_1_WEAT_WordEmbeddingAssociationTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of LLMs : 1. WEAT - WordEmbeddingAssociationTest

# We are trying to find the association between professions and genders and identify whether there is any bias

In [16]:
# Target word sets: the two groups of professions being compared
X = [  # target set 1
    'doctor',
    'engineer',
    'scientist',
]
Y = [  # target set 2
    'nurse',
    'teacher',
    'receptionist',
]

# Attribute word sets: the gendered terms the professions are compared against
A = [  # attribute set 1
    'man',
    'male',
]
B = [  # attribute set 2
    'woman',
    'female',
]

In [17]:
from transformers import BertTokenizer, BertModel
import torch

# Initialize the tokenizer and model from a single checkpoint name so the two
# can never be pointed at different checkpoints by accident
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)
# from_pretrained is documented to return the model in evaluation mode; calling
# eval() makes the inference-only intent (dropout disabled) explicit
model.eval()

# Function to generate embeddings
def get_embeddings(word_list):
    """Embed each word with the notebook-level BERT `tokenizer` and `model`.

    Every word is tokenized and encoded on its own, and the model's
    `last_hidden_state` is averaged over the sequence dimension (dim=1) so
    each word ends up with a single vector.

    Args:
        word_list: iterable of strings to embed.

    Returns:
        dict mapping each word to a tensor with the batch dimension kept
        (the notebook's recorded output shows torch.Size([1, 768]) for
        'doctor'). A repeated word keeps the vector from its last occurrence.
    """
    def pooled_vector(text):
        encoded = tokenizer(text, return_tensors="pt")
        # last_hidden_state is [batch_size, sequence_length, hidden_size];
        # averaging over dim=1 collapses the token positions.
        # NOTE(review): with the tokenizer's default settings the sequence
        # presumably also holds BERT's special tokens, so they would be
        # averaged in as well - confirm that this is intended.
        hidden_states = model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1)

    # Inference only, so gradient tracking is switched off for every forward pass
    with torch.no_grad():
        return {entry: pooled_vector(entry) for entry in word_list}

# Combine the target and attribute sets; deriving the list from X, Y, A and B
# (instead of retyping the words) keeps it in sync if a set is edited
words = X + Y + A + B

# Get embeddings
embeddings = get_embeddings(words)

# Print the shape of the embedding for the first word to verify
print(f"Shape of '{words[0]}' embedding:", embeddings[words[0]].shape)


Shape of 'doctor' embedding: torch.Size([1, 768])


In [12]:
# prompt: create word embeddings to the above items in the lists and return them as a dictionary for each item in the list

import numpy as np

def create_word_embeddings(X, Y, A, B, dim=10, seed=None):
  """Build a placeholder embedding table of random vectors, one per word.

  Args:
    X: first target word list.
    Y: second target word list.
    A: first attribute word list.
    B: second attribute word list.
    dim: length of every random vector. Defaults to 10, the size that was
      previously hard-coded.
    seed: optional seed. When given, vectors are drawn from a dedicated
      `np.random.RandomState(seed)` so the table is reproducible. When None,
      NumPy's global random state is used, exactly as before.

  Returns:
    dict mapping each word to a 1-D NumPy array of `dim` values produced by
    `rand` (uniform over [0, 1)). A word that appears more than once keeps
    its last draw.
  """
  # np.random (module) and RandomState both expose rand(), so the draw below
  # is identical in either mode
  rng = np.random if seed is None else np.random.RandomState(seed)

  # Words are visited in X, Y, A, B order, one draw per occurrence
  return {word: rng.rand(dim) for word in X + Y + A + B}

# Build the random placeholder table covering every target and attribute word.
# NOTE: `word_embeddings` is reassigned further down from the BERT embeddings,
# so these random vectors only serve as a stand-in here.
word_embeddings = create_word_embeddings(X=X, Y=Y, A=A, B=B)

# Show the resulting word -> vector mapping
print(word_embeddings)


{'doctor': array([0.20775136, 0.84769692, 0.23346669, 0.77797114, 0.64938575,
       0.62109598, 0.02598373, 0.78171709, 0.06262078, 0.62742006]), 'engineer': array([0.05914125, 0.49459845, 0.08427656, 0.58388544, 0.09980845,
       0.59715969, 0.36476647, 0.0372791 , 0.4569011 , 0.22430984]), 'scientist': array([0.51448205, 0.47207151, 0.5166217 , 0.85510807, 0.45523899,
       0.72229528, 0.11633141, 0.81926343, 0.10807944, 0.37482172]), 'nurse': array([0.66064657, 0.98340902, 0.76074813, 0.6823608 , 0.01934922,
       0.6473569 , 0.79191913, 0.96181932, 0.36293627, 0.87350419]), 'teacher': array([0.20243003, 0.78632863, 0.49297467, 0.07464676, 0.85823365,
       0.07124516, 0.34884318, 0.51810275, 0.48282992, 0.20317198]), 'receptionist': array([0.54442437, 0.99420296, 0.5121149 , 0.84854754, 0.565648  ,
       0.96696614, 0.90646459, 0.68113484, 0.13411828, 0.08918251]), 'man': array([0.47451315, 0.12558826, 0.17568515, 0.13052301, 0.06955689,
       0.27641262, 0.88921514, 0.50854

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Shallow-copy the BERT embedding table under the name that s() reads from
word_embeddings = dict(embeddings)

In [19]:
def s(w, X, Y):
  """Differential association of word `w` with the sets `X` and `Y`.

  Computes the mean cosine similarity between `w` and every word in `X`,
  minus the mean cosine similarity between `w` and every word in `Y`.
  Vectors are looked up in the notebook-level `word_embeddings` dict.

  Args:
    w: the word being scored.
    X: words whose similarity to `w` counts positively.
    Y: words whose similarity to `w` counts negatively.

  Returns:
    The difference of the two means, as produced by `np.mean`.
  """
  def as_row(word):
    # cosine_similarity works on 2-D inputs, so present each vector as one row
    return word_embeddings[word].reshape(1, -1)

  def mean_similarity(group):
    scores = []
    for member in group:
      scores.append(cosine_similarity(as_row(w), as_row(member)))
    return np.mean(scores)

  toward_X = mean_similarity(X)
  toward_Y = mean_similarity(Y)
  return toward_X - toward_Y

In [20]:
# Total differential association of the A words minus that of the B words
weat_score = (
    sum(s(attr, X, Y) for attr in A)
    - sum(s(attr, X, Y) for attr in B)
)

In [21]:
# Display the score. A positive value means the summed s(., X, Y) over the
# words in A exceeds that over the words in B, i.e. the A words lean further
# toward X (relative to Y) than the B words do.
weat_score

0.017807185649871826