### Necessary libraries

In [None]:
import logging as log
import math
import itertools as it
import numpy as np
import scipy.special
import scipy.stats

import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *
%matplotlib inline
import sys
import os

### Mount Google Drive and copy data to `content`

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the copy cells below can read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Copy WEAT Data

In [None]:
# Recursively copy the WEAT data folder from the mounted Drive into /content.
! cp -R /content/drive/MyDrive/Contextual_Bias_Data/weat_bn_data /content/

Copy SEAT Data

In [None]:
# Recursively copy the SEAT data folder from the mounted Drive into /content.
! cp -R /content/drive/MyDrive/Contextual_Bias_Data/seat_bn_data /content/

Copy **GloVe** model

In [None]:
# Copy the GloVe vectors file (bn_glove.39M.300d.txt) from the mounted Drive into /content;
# the `glove_path` variable defined later points at this copy.
! cp -R /content/drive/MyDrive/Contextual_Bias_Data/bn_glove.39M.300d.txt /content/

Copy **word2vec** model

In [None]:
# Copy the w2v_512 model folder from the mounted Drive into /content (loaded later from /content/w2v_512/w2v_512).
! cp -R /content/drive/MyDrive/Contextual_Bias_Data/bangla_embeddings/full_data/w2v_512 /content/

Install `bnlp_toolkit` library for GloVe

In [None]:
# Unpinned install; the recorded output below shows it resolved to bnlp_toolkit-4.0.0.
! pip install bnlp_toolkit

Installing collected packages: sentencepiece, python-crfsuite, emoji, sklearn-crfsuite, ftfy, bnlp_toolkit
Successfully installed bnlp_toolkit-4.0.0 emoji-1.7.0 ftfy-6.1.1 python-crfsuite-0.9.9 sentencepiece-0.1.99 sklearn-crfsuite-0.3.6


Install `gensim` for **w2v** and **fasttext**

In [None]:
# Unpinned install of gensim (used below for Word2Vec.load).
! pip install gensim
# For loading "bangla_embeddings/clean_subset/ft_512/ft_512", gensim==3.8.3 is required

Install dependencies for BERT

In [None]:
# `normalizer` provides the normalize() function applied to text before tokenization;
# `transformers` provides the tokenizer and model classes used for BanglaBERT.
! pip install git+https://github.com/csebuetnlp/normalizer
! pip install transformers

Import BengaliGlove from bnlp_toolkit

In [None]:
from bnlp import BengaliGlove
# Location of the GloVe vectors file copied into /content above.
# encode() reads this module-level name in its 'glove' branch.
glove_path = '/content/bn_glove.39M.300d.txt'

punkt not found. downloading...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
If you are using version <=3.3.2 please specify bnlp_toolkit with exact version, otherwise it will raises error in the upcoming version. 
To migrate feel free to checkout the newer version (4.0.0). It will release soon as beta.


In [None]:
# test_model = BengaliGlove()
# words =["মেয়ে"]
# for word in words:
#     result = test_model.closest_word(glove_path, word)
#     print(f'{word} --> {result}')

মেয়ে --> ['মেয়ে', 'মেয়ের', 'ঘসেটি', 'জোসনা', 'বিবাহযোগ্য', 'ছেল', 'কন্যা', 'মায়ের', 'মনোয়ারা', 'বড়লোকের']


Import **Word2Vec** from `gensim`

In [None]:
from gensim.models import Word2Vec

### Fetch model from transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from normalizer import normalize

# Tokenizer and masked-LM weights for the "csebuetnlp/banglabert_large" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert_large")
# output_hidden_states=True is required: get_word_vector_broken / get_word_vector_normal
# read the hidden states of the forward pass to build word vectors.
# NOTE(review): the "Load intended Model" cell further down rebinds the global `model`
# to a GloVe/word2vec object; the BERT helpers read this global, so this cell must be
# re-run before the SEAT cells.
model = AutoModelForMaskedLM.from_pretrained("csebuetnlp/banglabert_large", output_hidden_states = True)
# Put the module in evaluation mode before it is used for inference.
model.eval()

### Implement WEAT

Load Json Data

In [None]:
import json
# WEAT_SETS = ["targ1", "targ2", "attr1", "attr2"]
# CATEGORY = "category"

def load_json(sent_file):
    """Load one WEAT/SEAT test definition from a JSON file.

    The file must contain a single JSON object. It needs the keys ``targ1``,
    ``targ2``, ``attr1`` and ``attr2``, each holding a ``category`` name, and
    every top-level entry must provide an ``examples`` list.

    Args:
        sent_file: path of the JSON file to read.

    Returns:
        A tuple ``(all_data, targets, attributes)`` where ``all_data`` is the
        parsed JSON object, ``targets`` is ``[targ1 category, targ2 category]``
        and ``attributes`` is ``[attr1 category, attr2 category]``.

    Raises:
        KeyError: if a required key (set name, ``category`` or ``examples``)
            is missing.
    """
    print(f"Loading {sent_file}...")
    # Context manager closes the handle; explicit UTF-8 because the data is Bangla text.
    with open(sent_file, 'r', encoding='utf-8') as f:
        all_data = json.load(f)

    targets = [all_data['targ1']['category'], all_data['targ2']['category']]
    attributes = [all_data['attr1']['category'], all_data['attr2']['category']]

    # Fail early (as the original lookup did) when an entry has no examples.
    for set_name, entry in all_data.items():
        if "examples" not in entry:
            raise KeyError(f"'{set_name}' has no 'examples' entry in {sent_file}")

    return all_data, targets, attributes

In [None]:
# Smoke test: load one WEAT file and show its target / attribute category names.
data, targets, attributes = load_json('/content/weat_bn_data/weat6b.jsonl')
print(targets)
print(attributes)
# vector = model.word2vec(glove_path, data.targ1.examples[0])
# len(vector)

The following function encodes the words into a dictionary where -
- `keys`: words
- `values`: vector representations of words

In [None]:
def encode(model, data, MODEL_NAME):
    """Map every word of ``data`` to its embedding vector.

    Args:
        model: embedding backend. ``'glove'`` needs an object exposing
            ``word2vec(path, word)``; ``'w2v'`` needs an object exposing
            ``wv[word]``. The ``'bert'`` branch does not use this argument;
            it goes through ``get_word_vector_normal``.
        data: for ``'glove'`` / ``'w2v'`` an iterable of words. For ``'bert'``
            a dict with ``'words'``, ``'count'`` and ``'examples'``, where the
            sentences of word ``i`` are ``examples[i*count : (i+1)*count]``.
        MODEL_NAME: one of ``'glove'``, ``'w2v'`` or ``'bert'``.

    Returns:
        dict mapping each word to its vector. For ``'bert'`` the vector is the
        float64 mean of the word's vectors over its ``count`` sentences.

    Raises:
        ValueError: if ``MODEL_NAME`` is not supported, or ``count`` is not
            positive in the ``'bert'`` branch.
    """
    dict_word2vec = {}
    if MODEL_NAME == 'glove':
        for word in data:
            dict_word2vec[word] = model.word2vec(glove_path, word)
    elif MODEL_NAME == 'w2v':
        for word in data:
            dict_word2vec[word] = model.wv[word]
    elif MODEL_NAME == 'bert':
        count = data['count']
        if count <= 0:
            raise ValueError(f"'count' must be positive, got {count}")
        for i, word in enumerate(data['words']):
            # The accumulator takes its size from the first vector instead of a
            # hard-coded 1024, so any hidden size works.
            vector_sum = None
            for itr in range(count):
                sentence = data['examples'][i * count + itr]
                word_vector, _ = get_word_vector_normal(sentence, word)
                word_vector = np.asarray(word_vector, dtype=float)
                vector_sum = word_vector if vector_sum is None else vector_sum + word_vector
            # Store the average vector in the dictionary
            dict_word2vec[word] = vector_sum / count
    else:
        # Previously an unknown name silently produced an empty dictionary.
        raise ValueError(f"Unsupported MODEL_NAME: {MODEL_NAME!r} (expected 'glove', 'w2v' or 'bert')")

    return dict_word2vec

In [None]:
# NOTE(review): `all_seat_data` is only built much further down (the cell that reads
# /content/seat_bn_data/data and the cell that adds 'words'/'count'), so this smoke
# test raises NameError on a fresh top-to-bottom run.
dict_word2vec = encode(model, all_seat_data[0]['targ1'], 'bert')
type(dict_word2vec['গোলাপ'])

`encs_targ1` is a python dictionary having the word as **key** and the vector representation as **value**

In [None]:
def encode_data(model, data, MODEL_NAME, suppress_printables = False):
    """Encode the four word sets (targ1, targ2, attr1, attr2) of one test.

    For ``MODEL_NAME == 'bert'`` the whole per-set entry is handed to
    ``encode``; for every other model only the entry's ``"examples"`` list is.

    Returns:
        Tuple ``(encs_targ1, encs_targ2, encs_attr1, encs_attr2)`` holding the
        values returned by ``encode``, in that order.
    """
    if suppress_printables == False:
        print('encoding data...')

    pass_whole_entry = MODEL_NAME == 'bert'
    encoded_sets = []
    for set_name in ("targ1", "targ2", "attr1", "attr2"):
        entry = data[set_name]
        payload = entry if pass_whole_entry else entry["examples"]
        encoded_sets.append(encode(model, payload, MODEL_NAME))

    return tuple(encoded_sets)

This function adds the vector encodings of the words to the **data dictionary**

In [None]:
def add_encodings_to_dict(data, encs_targ1, encs_targ2, encs_attr1, encs_attr2, suppress_printables = False):
    """Attach each encoding dictionary to ``data`` under its set's ``"encs"`` key.

    ``data`` is modified in place; nothing is returned.
    """
    if suppress_printables == False:
        print('adding encoded vectors to data dict...')

    set_names = ("targ1", "targ2", "attr1", "attr2")
    encodings = (encs_targ1, encs_targ2, encs_attr1, encs_attr2)
    for set_name, encs in zip(set_names, encodings):
        data[set_name]["encs"] = encs

Save the GloVe encodings to a file for caching (GloVe takes a long time to encode a word).

In [None]:
CATEGORIES = ['targ1', 'targ2', 'attr1', 'attr2']
def save_encodings(data, filename, output_dir='/content/weat_bn_encoded_data'):
    """Cache the encodings stored in ``data`` as a JSON file.

    The cache is written to ``<output_dir>/<stem of filename>_enc.jsonl``. For
    ``weat8b.jsonl`` and the default directory this is the same
    ``/content/weat_bn_encoded_data/weat8b_enc.jsonl`` path that used to be
    hard-coded, but other test files now get their own cache file.

    Args:
        data: test dictionary whose ``targ1`` / ``targ2`` / ``attr1`` / ``attr2``
            entries each carry an ``'encs'`` mapping of word -> vector.
        filename: path of the source test file; only its stem is used to name
            the cache file, the source file itself is never written.
        output_dir: directory for the cache file; created when missing.

    ``data`` is left untouched: vectors are converted to lists on a copy, so
    the function can be called repeatedly and the caller keeps its arrays.
    """
    serializable = dict(data)
    for category in CATEGORIES:
        entry = dict(data[category])
        # np.asarray accepts both arrays and already-converted lists.
        entry['encs'] = {word: np.asarray(vec).tolist() for word, vec in entry['encs'].items()}
        serializable[category] = entry

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{Path(filename).stem}_enc.jsonl"
    with open(out_path, 'w') as f:
        json.dump(serializable, f)

> `X` and `Y` are two sets of target words of equal size

> `A` and `B` are two sets of attribute words

$s(X, Y, A, B) = \sum_{x \in X} s(x, A, B) - \sum_{y \in Y} s(y, A, B)$

$s(w, A, B) = \mathrm{mean}_{a \in A} \cos(w, a) - \mathrm{mean}_{b \in B} \cos(w, b)$

$d = \frac{\mathrm{mean}_{x \in X} s(x, A, B) - \mathrm{mean}_{y \in Y} s(y, A, B)}{\mathrm{std\_dev}_{w \in X \cup Y} s(w, A, B)}$





Calculate similarity scores -

In [None]:
def cosine_similarity(x, y):
    """Return the cosine of the angle between vectors ``x`` and ``y``."""
    dot_product = np.dot(x, y)
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return dot_product / norm_product

In [None]:
def construct_cossim_lookup(XY, AB):
    """
    Build the table of cosine similarities between targets and attributes.

    XY: mapping from integer target index to target vector (either in X or Y)
    AB: mapping from integer attribute index to attribute vector (either in A or B)
    Returns an array of shape (len(XY), len(AB)) whose entry [xy, ab] is the
    cosine similarity between XY[xy] and AB[ab]. The keys are used directly as
    row / column positions.
    """
    lookup = np.zeros((len(XY), len(AB)))
    for row, target_vec in XY.items():
        for col, attr_vec in AB.items():
            lookup[row, col] = cosine_similarity(target_vec, attr_vec)
    return lookup

In [None]:
def s_wAB(A, B, cossims):
    """
    Return the vector of s(w, A, B), one entry per row w of ``cossims``, where
        s(w, A, B) = mean_{a in A} cos(w, a) - mean_{b in B} cos(w, b).
    A and B select columns of ``cossims``.
    """
    mean_to_A = cossims[:, A].mean(axis=1)
    mean_to_B = cossims[:, B].mean(axis=1)
    return mean_to_A - mean_to_B

In [None]:
def s_XAB(X, s_wAB_memo):
    """Sum the precomputed s(w, A, B) values over the indices in ``X``."""
    selected = s_wAB_memo[X]
    return selected.sum()

def s_XYAB(X, Y, s_wAB_memo):
    """Test statistic s(X, Y, A, B): the sum over ``X`` minus the sum over ``Y``."""
    total_X = s_XAB(X, s_wAB_memo)
    total_Y = s_XAB(Y, s_wAB_memo)
    return total_X - total_Y

Permutation Test

In [None]:
def p_val_permutation_test(X, Y, A, B, n_samples, cossims, parametric=False, suppress_printables = False):
    ''' Compute the p-val for the permutation test, which is defined as
        the probability that a random even partition X_i, Y_i of X u Y
        satisfies P[s(X_i, Y_i, A, B) > s(X, Y, A, B)]

    args:
        - X, Y : iterables of integer target indices (rows of cossims); must
            have equal length
        - A, B : iterables of integer attribute indices (columns of cossims)
        - n_samples (int): number of random partitions to draw
        - cossims : 2-D array of cosine similarities, indexed [target, attribute]
        - parametric (bool): if True, fit a normal distribution to the sampled
            statistics and return its survival function at the observed
            statistic; otherwise return the fraction of partitions whose
            statistic is >= the observed one
        - suppress_printables (bool): when True, the progress prints are skipped

    returns the p-value as a float.

    Partitions are drawn with np.random.shuffle, i.e. from NumPy's global
    random state, so results are only reproducible if the caller seeds it.
    '''
    X = np.array(list(X), dtype=int)
    Y = np.array(list(Y), dtype=int)
    A = np.array(list(A), dtype=int)
    B = np.array(list(B), dtype=int)

    assert len(X) == len(Y)
    size = len(X)
    # s(w, A, B) for every target row, computed once and reused for every partition.
    s_wAB_memo = s_wAB(A, B, cossims=cossims)
    XY = np.concatenate((X, Y))

    if parametric:
        if suppress_printables == False:
            print('Using parametric test')
        # Observed statistic: sum over X minus sum over Y.
        s = s_XYAB(X, Y, s_wAB_memo)

        if suppress_printables == False:
            print('Drawing {} samples'.format(n_samples))
        samples = []
        for _ in range(n_samples):
            # Shuffle in place, then split into two equal halves.
            np.random.shuffle(XY)
            Xi = XY[:size]
            Yi = XY[size:]
            assert len(Xi) == len(Yi)
            si = s_XYAB(Xi, Yi, s_wAB_memo)
            samples.append(si)

        # Compute sample standard deviation and compute p-value by
        # assuming normality of null distribution
        if suppress_printables == False:
            print('Inferring p-value based on normal distribution')
        # The Shapiro-Wilk result is only reported; it does not affect the returned p-value.
        (shapiro_test_stat, shapiro_p_val) = scipy.stats.shapiro(samples)
        if suppress_printables == False:
            print('Shapiro-Wilk normality test statistic: {:.2g}, p-value: {:.2g}'.format(
            shapiro_test_stat, shapiro_p_val))
        sample_mean = np.mean(samples)
        sample_std = np.std(samples, ddof=1)
        if suppress_printables == False:
            print('Sample mean: {:.2g}, sample standard deviation: {:.2g}'.format(
            sample_mean, sample_std))
        # Upper-tail probability of the observed statistic under the fitted normal.
        p_val = scipy.stats.norm.sf(s, loc=sample_mean, scale=sample_std)
        return p_val

    else:
        if suppress_printables == False:
            print('Using non-parametric test')
        # Only the X-half sum is compared here. Since every partition splits the same
        # XY, the Y-half sum is the fixed total minus the X-half sum, so this orders
        # partitions the same way as the full difference.
        s = s_XAB(X, s_wAB_memo)
        total_true = 0
        total_equal = 0
        total = 0

        # Number of ways to choose the X half out of X u Y.
        num_partitions = int(scipy.special.binom(2 * len(X), len(X)))
        if num_partitions > n_samples:
            # We only have as much precision as the number of samples drawn;
            # bias the p-value (hallucinate a positive observation) to
            # reflect that.
            total_true += 1
            total += 1
            if suppress_printables == False:
                print('Drawing {} samples (and biasing by 1)'.format(n_samples - total))
            for _ in range(n_samples - 1):
                np.random.shuffle(XY)
                Xi = XY[:size]
                assert 2 * len(Xi) == len(XY)
                si = s_XAB(Xi, s_wAB_memo)
                if si > s:
                    total_true += 1
                elif si == s:  # use conservative test
                    total_true += 1
                    total_equal += 1
                total += 1

        else:
            if suppress_printables == False:
                print('Using exact test ({} partitions)'.format(num_partitions))
            # Enumerate every possible X half instead of sampling.
            for Xi in it.combinations(XY, len(X)):
                Xi = np.array(Xi, dtype=
                              int)
                assert 2 * len(Xi) == len(XY)
                si = s_XAB(Xi, s_wAB_memo)
                if si > s:
                    total_true += 1
                elif si == s:  # use conservative test
                    total_true += 1
                    total_equal += 1
                total += 1

        if total_equal:
            if suppress_printables == False:
                print('Equalities contributed {}/{} to p-value'.format(total_equal, total))

        # Fraction of partitions at least as extreme as the observed one.
        return total_true / total



Calculate `mean` and `standard deviation`

In [None]:
def mean_s_wAB(X, A, B, cossims):
    """Mean of s(w, A, B) over the rows of ``cossims`` selected by ``X``."""
    scores = s_wAB(A, B, cossims[X])
    return np.mean(scores)

def stdev_s_wAB(X, A, B, cossims):
    """Sample standard deviation (ddof=1) of s(w, A, B) over the rows selected by ``X``."""
    scores = s_wAB(A, B, cossims[X])
    return np.std(scores, ddof=1)

Calculate **effect size**

In [None]:
def effect_size(X, Y, A, B, cossims):
    """
    Compute the WEAT effect size
        [mean_{x in X} s(x, A, B) - mean_{y in Y} s(y, A, B)] /
            [ stddev_{w in X u Y} s(w, A, B) ]
    args:
        - X, Y, A, B : sets of target (X, Y) and attribute (A, B) indices
        - cossims : cosine-similarity table indexed [target, attribute]
    """
    x_idx, y_idx = list(X), list(Y)
    a_idx, b_idx = list(A), list(B)

    mean_gap = (
        mean_s_wAB(x_idx, a_idx, b_idx, cossims=cossims)
        - mean_s_wAB(y_idx, a_idx, b_idx, cossims=cossims)
    )
    pooled_std = stdev_s_wAB(x_idx + y_idx, a_idx, b_idx, cossims=cossims)
    return mean_gap / pooled_std

In [None]:
def convert_keys_to_ints(X, Y):
    """Re-key two mappings with consecutive integers.

    The values of ``X`` get keys 0 .. len(X)-1 and the values of ``Y`` continue
    from len(X), both in their mappings' iteration order.
    """
    offset = len(X)
    X_by_index = {position: value for position, value in enumerate(X.values())}
    Y_by_index = {offset + position: value for position, value in enumerate(Y.values())}
    return (X_by_index, Y_by_index)

In [None]:
def run_test(encs, n_samples, parametric=False, suppress_printables = False):
    ''' Run a WEAT.
    args:
        - encs (Dict[str: Dict]): dictionary mapping targ1, targ2, attr1, attr2
            to dictionaries containing the category and the encodings
        - n_samples (int): number of samples to draw to estimate p-value
            (use exact test if number of permutations is less than or
            equal to n_samples)
        - parametric (bool): forwarded to p_val_permutation_test; selects the
            normal-approximation p-value instead of the permutation fraction
        - suppress_printables (bool): when True only the final "pval" and
            "Effect size" lines are printed
    returns:
        (effect size, p-value)
    '''
    X, Y = encs["targ1"]["encs"], encs["targ2"]["encs"]
    A, B = encs["attr1"]["encs"], encs["attr2"]["encs"]

    # First convert all keys to ints to facilitate array lookups
    (X, Y) = convert_keys_to_ints(X, Y)
    (A, B) = convert_keys_to_ints(A, B)

    XY = X.copy()
    XY.update(Y)
    AB = A.copy()
    AB.update(B)

    verbose = not suppress_printables

    if verbose:
        print("Computing cosine similarities...")
    cossims = construct_cossim_lookup(XY, AB)

    if verbose:
        # Adjacent f-strings instead of a backslash continuation, which used to
        # embed the next line's indentation in the printed message.
        print(
            f"Null hypothesis: no difference between {encs['targ1']['category']} and {encs['targ2']['category']} "
            f"in association to attributes {encs['attr1']['category']} and {encs['attr2']['category']}"
        )
        print("Computing pval...")
    pval = p_val_permutation_test(X, Y, A, B, n_samples, cossims=cossims, parametric=parametric, suppress_printables=suppress_printables)
    # The two result lines are printed even in quiet mode; the whole-dataset loop relies on them.
    print(f"pval: {pval}")

    if verbose:
        print("computing effect size...")
    esize = effect_size(X, Y, A, B, cossims=cossims)
    print(f"Effect size: {esize}")
    return esize, pval


Demo cell to test functionality

In [None]:
# Synthetic smoke test of the WEAT pipeline: 25 random 10-d vectors per set.
X = {"x" + str(i): 2 * np.random.rand(10) - 1 for i in range(25)}
Y = {"y" + str(i): 2 * np.random.rand(10) - 1 for i in range(25)}
A = {"a" + str(i): 2 * np.random.rand(10) - 1 for i in range(25)}
B = {"b" + str(i): 2 * np.random.rand(10) - 1 for i in range(25)}
# NOTE(review): the random attribute sets drawn above are immediately replaced by the
# target sets, so the test below measures X / Y against themselves — confirm this is intended.
A = X
B = Y
print(f'X: {X}')
print(f'Y: {Y}')
(X, Y) = convert_keys_to_ints(X, Y)
print(f'X: {X}')
print(f'Y: {Y}')
(A, B) = convert_keys_to_ints(A, B) # converts the keys (string) to keys (integer)

XY = X.copy()
XY.update(Y)
AB = A.copy()
AB.update(B)

cossims = construct_cossim_lookup(XY, AB)
print("computing pval...")
pval = p_val_permutation_test(X, Y, A, B, cossims=cossims, n_samples=10000)
# Apply the format; the old call passed pval as a second print argument and printed a literal "%g".
print("pval: %g" % pval)

print("computing effect size...")
esize = effect_size(X, Y, A, B, cossims=cossims)
print(f"Effect size: {esize}")

Load intended Model

In [None]:
# Select the static embedding backend used by the WEAT cells below.
MODEL_NAME = "glove"

# NOTE(review): this rebinds the global `model`, replacing the BanglaBERT model loaded in
# the "Fetch model from transformers" cell. get_word_vector_broken / get_word_vector_normal
# call the global `model`, so the BERT cells further down only work if that cell is re-run.
if MODEL_NAME == 'w2v':
    print('Loading w2v model...')
    model = Word2Vec.load("/content/w2v_512/w2v_512")
elif MODEL_NAME == 'glove':
    print('Loading glove model...')
    model = BengaliGlove()

Loading glove model...


WEAT on single data

In [None]:
# End-to-end WEAT on one file: load -> encode -> attach encodings -> cache -> test.
filename = '/content/weat_bn_data/weat8b.jsonl'
data, targets, attributes = load_json(filename)

et1, et2, ea1, ea2 = encode_data(model, data, MODEL_NAME, suppress_printables = False)
add_encodings_to_dict(data, et1, et2, ea1, ea2, suppress_printables = False)
save_encodings(data, filename)
run_test(data, 10000, suppress_printables = False) # X, Y need to have same length for p_val permutation test

# Label the result with "<targ1> vs <targ2> (<attr1> / <attr2>)".
print(f"{targets[0]} vs {targets[1]} ({attributes[0]} / {attributes[1]})")

Loading /content/weat_bn_data/weat8b.jsonl...
encoding data...
adding encoded vectors to data dict...
Using non-parametric test
Drawing 9999 samples (and biasing by 1)
pval: 0.9991
Effect size: -1.0394699306880417
Science vs Arts (MaleNames / FemaleNames)


Load saved encoding and re-run GloVe Test

In [None]:
# Re-run the test from the cached encodings instead of re-encoding with GloVe.
with open('/content/weat_bn_encoded_data/weat8b_enc.jsonl', 'r') as f:
    data_enc = json.load(f)
# Test the loaded cache; the old call passed `data` and left `data_enc` unused.
run_test(data_enc, 10000, parametric = True, suppress_printables = False)

Using parametric test
Drawing 10000 samples
Inferring p-value based on normal distribution
Shapiro-Wilk normality test statistic: 1, p-value: 0.49
Sample mean: 0.0077, sample standard deviation: 0.38
pval: 0.998898652803399
Effect size: -1.0394699306880417




(-1.0394699306880417, 0.998898652803399)

In [None]:
# from google.colab import files
# ! zip -r /content/wn_bn_data_encoded.zip /content/weat_bn_encoded_data/
# files.download('/content/wn_bn_data_encoded.zip')

WEAT on whole dataset

In [None]:
folder_path = '/content/weat_bn_data'
# os.listdir returns entries in arbitrary order; sort so the results are reported
# in a stable order, as the other loader cells in this notebook do.
files = sorted(os.listdir(folder_path))

for file in files:
    file_path = os.path.join(folder_path, file)
    if os.path.isfile(file_path):
        # print(f"filepath -> {file_path}")
        data, targets, attributes = load_json(file_path)
        et1, et2, ea1, ea2 = encode_data(model, data, MODEL_NAME, suppress_printables = True)
        add_encodings_to_dict(data, et1, et2, ea1, ea2, suppress_printables = True)

        # Header line first; run_test then prints the pval / effect size for this test.
        print(f"{targets[0]} vs {targets[1]} ({attributes[0]} / {attributes[1]})")
        run_test(data, 10000, suppress_printables = True)
        print()


### Implement WEAT for BERT

In [None]:
def get_word_vector_broken(sentence, word):
  """Earlier word-vector lookup, kept as the known-broken variant.

  Runs the normalized sentence through the global `tokenizer` / `model` and
  returns, as a NumPy array, the last hidden-state row at position
  (whitespace index of `word` in `sentence`) + 1.

  Raises ValueError (from list.index) when `word` is not exactly one of the
  space-separated pieces of `sentence`, e.g. when punctuation is attached.

  NOTE(review): the position is a word index computed on the un-normalized
  sentence but is used to index the model's token positions for the
  normalized sentence; presumably these disagree whenever the tokenizer emits
  more than one token per word — confirm.
  """
  normalized_sent = normalize(sentence)
#   print(normalized_sent)
  input_token = tokenizer(normalized_sent, return_tensors="pt")
#   print(input_token)
  sent_list = sentence.split(' ')
  idx = sent_list.index(word) + 1 # for [CLS]
#   print(f'{sentence} \n {word} -- {idx}')
  with torch.no_grad():
    outputs = model(**input_token)
    # print(outputs.hidden_states[-1].shape) (1, 8, 1024)
    # Side effect: prints the selected row on every call.
    print(outputs.hidden_states[-1][0][idx])
    # return outputs[1][24][0][idx]
    # outputs[1] is presumably the same hidden_states tuple printed above — TODO confirm.
    return outputs[1][-1][0].detach().cpu().numpy()[idx]# + outputs[1][-1][0].detach().cpu().numpy()[idx + 1] + outputs[1][-1][0].detach().cpu().numpy()[idx + 2]

In [None]:
# NOTE(review): `get_word_vector` is not defined anywhere in the visible notebook (only
# get_word_vector_broken above and get_word_vector_normal below), so this cell raises
# NameError on a fresh kernel — presumably a leftover from before the helpers were renamed.
sentence = "ছেলেরা বিকেলে মাঠে ফুটবল খেলে।"
get_word_vector(sentence, 'খেলে')

Define `get_word_vector_normal` (tested in the next cell)

In [None]:
import re

def get_word_vector_normal(sentence, word):
    """Return the contextual vector of ``word`` inside ``sentence``.

    Both inputs are normalized, the sentence is run through the global
    ``tokenizer`` / ``model``, and the last-layer hidden state of the first
    token belonging to the word is returned.

    The token is located through the tokenizer's character offsets rather than
    by counting whitespace-separated words. The word-count index is only right
    when every preceding word is exactly one token; the offset lookup gives
    the same token in that case and the correct one otherwise.

    Args:
        sentence: sentence containing the word.
        word: word to look up. It must be one of the space-separated pieces of
            the normalized sentence, or occur in one at a word boundary
            (e.g. with punctuation attached).

    Returns:
        Tuple ``(vector, encoding)``: the word's hidden-state row as a NumPy
        array, and the tokenizer output including ``offset_mapping``.

    Raises:
        ValueError: if the word cannot be found in the sentence or no token
            covers it.
    """
    normalized_sent = normalize(sentence)
    word = normalize(word)

    # Character position at which every space-separated piece starts.
    pieces = normalized_sent.split(' ')
    piece_starts = []
    cursor = 0
    for piece in pieces:
        piece_starts.append(cursor)
        cursor += len(piece) + 1  # +1 for the separating space

    char_start = None
    if word in pieces:
        char_start = piece_starts[pieces.index(word)]
    else:
        # Fallback for pieces that carry punctuation; the word is escaped so it
        # is matched literally.
        pattern = re.compile(r'\b' + re.escape(word) + r'\W*')
        for start, piece in zip(piece_starts, pieces):
            found = pattern.search(piece)
            if found:
                # No break: like the original loop, the last matching piece wins.
                char_start = start + found.start()
    if char_start is None:
        raise ValueError(f"word {word!r} not found in sentence {normalized_sent!r}")

    # Tokenize once; the offsets tell which token covers the word's first character.
    encoding = tokenizer(normalized_sent, return_tensors="pt", return_offsets_mapping=True)
    offsets = encoding["offset_mapping"][0].tolist()
    # Special tokens have empty (0, 0) spans and can never match.
    idx = next(
        (i for i, (tok_start, tok_end) in enumerate(offsets) if tok_start <= char_start < tok_end),
        None,
    )
    if idx is None:
        raise ValueError(f"no token covers word {word!r} in sentence {normalized_sent!r}")

    # The offsets are metadata only and must not be passed to the model.
    model_inputs = {key: value for key, value in encoding.items() if key != "offset_mapping"}
    with torch.no_grad():
        outputs = model(**model_inputs)
    return outputs.hidden_states[-1][0].detach().cpu().numpy()[idx], encoding

In [None]:
# Smoke test: prints the (vector, tokenizer output) tuple returned for the last word,
# which has the sentence-final punctuation attached in the raw sentence.
sentence = "এটি একটি কৃষ্ণচূড়া।"
print(get_word_vector_normal(sentence, 'কৃষ্ণচূড়া'))
# len(get_word_vector(sentence, 'ফুটবল'))

Check [this](https://colab.research.google.com/drive/1RZgGPBSIdnMnDhr2r9Ov3W_6exbL_wOQ#scrollTo=EjZvVx7YKseh) notebook for problematic tokens

In [None]:
# Run both lookup helpers on two parallel sentences and check the vector size.
sentence = "বাগানে রজনীগন্ধা ফুল ফুটেছে।"
sentence2 = "বাগানে গোলাপ ফুল ফুটেছে।"
# get_word_vector_broken also prints the selected hidden-state row (the tensor in the output below).
vector1 = get_word_vector_broken(sentence, 'রজনীগন্ধা')
vector2, _ = get_word_vector_normal(sentence2, 'গোলাপ')
vector2.shape
# cosine_similarity(vector1, vector2)

tensor([ 0.1118,  0.3700,  0.7958,  ...,  0.4475,  1.1334, -0.2145])


(1024,)

In [None]:
import json
# WEAT_SETS = ["targ1", "targ2", "attr1", "attr2"]
# CATEGORY = "category"

def load_json_templates(sent_file):
    """Load one SEAT template definition from a JSON file.

    The file must contain a single JSON object. It needs the keys ``targ1``,
    ``targ2``, ``attr1`` and ``attr2``, each holding a ``category`` name, and
    every top-level entry must provide a ``templates`` list.

    Args:
        sent_file: path of the JSON file to read.

    Returns:
        A tuple ``(all_data, targets, attributes)`` where ``all_data`` is the
        parsed JSON object, ``targets`` is ``[targ1 category, targ2 category]``
        and ``attributes`` is ``[attr1 category, attr2 category]``.

    Raises:
        KeyError: if a required key (set name, ``category`` or ``templates``)
            is missing.
    """
    print(f"Loading {sent_file}...")
    # Context manager closes the handle; explicit UTF-8 because the data is Bangla text.
    with open(sent_file, 'r', encoding='utf-8') as f:
        all_data = json.load(f)

    targets = [all_data['targ1']['category'], all_data['targ2']['category']]
    attributes = [all_data['attr1']['category'], all_data['attr2']['category']]

    # Fail early (as the original lookup did) when an entry has no templates.
    for set_name, entry in all_data.items():
        if "templates" not in entry:
            raise KeyError(f"'{set_name}' has no 'templates' entry in {sent_file}")

    return all_data, targets, attributes

In [None]:
# Load every SEAT template file (in sorted filename order) and report how many
# template sentences each target / attribute set has.
folder_path = '/content/seat_bn_data/templates'
files = sorted(os.listdir(folder_path))
all_seat_template_data = []
for file in files:
    file_path = os.path.join(folder_path, file)
    if not os.path.isfile(file_path):
        continue
    seat_template_data, targets, attributes = load_json_templates(file_path)
    all_seat_template_data.append(seat_template_data)
    for set_name in ('targ1', 'targ2', 'attr1', 'attr2'):
        entry = seat_template_data[set_name]
        print(f"{entry['category']} has {len(entry['templates'])} sentences")

In [None]:
# Load every WEAT file (in sorted filename order) and report how many words
# each target / attribute set has.
weat_folder_path = '/content/weat_bn_data'
weat_files = sorted(os.listdir(weat_folder_path))
all_weat_data = []

for file in weat_files:
    weat_file_path = os.path.join(weat_folder_path, file)
    if not os.path.isfile(weat_file_path):
        continue
    weat_data, targets, attributes = load_json(weat_file_path)
    all_weat_data.append(weat_data)
    for set_name in ('targ1', 'targ2', 'attr1', 'attr2'):
        entry = weat_data[set_name]
        print(f"{entry['category']} has {len(entry['examples'])} words")

In [None]:
# Record, for every set of every WEAT test, how many template sentences the
# SEAT template file at the same position provides for it.
for i, data in enumerate(all_weat_data):
    template_sets = all_seat_template_data[i]
    for set_name in ('targ1', 'targ2', 'attr1', 'attr2'):
        data[set_name]['count'] = len(template_sets[set_name]['templates'])

`all_weat_data` now has an additional key-value pair which contains the number of template sentences for each target/attribute

Print `all_weat_data[0]` for clarification

In [None]:
# Show the first WEAT test, which now carries the per-set 'count' added above.
# (`all_seat_data` is not defined until the next cell; the text above asks for `all_weat_data[0]`.)
all_weat_data[0]

In [None]:
# Load every SEAT sentence file (in sorted filename order) and report how many
# sentences each target / attribute set has.
folder_path = '/content/seat_bn_data/data'
files = sorted(os.listdir(folder_path))
all_seat_data = []
for file in files:
    file_path = os.path.join(folder_path, file)
    if not os.path.isfile(file_path):
        continue
    seat_data, targets, attributes = load_json(file_path)
    all_seat_data.append(seat_data)
    for set_name in ('targ1', 'targ2', 'attr1', 'attr2'):
        entry = seat_data[set_name]
        print(f"{entry['category']} has {len(entry['examples'])} sentences")

`all_weat_data` and `all_seat_data` are combined to identify which word's vector should be fetched from each sentence

In [None]:
# Copy each set's word list and template count from the WEAT test at the same
# position onto the SEAT test, so the sentences can be matched to their words.
for i, seat_data in enumerate(all_seat_data):
    weat_sets = all_weat_data[i]
    for set_name in ('targ1', 'targ2', 'attr1', 'attr2'):
        seat_data[set_name]['words'] = weat_sets[set_name]['examples']
        seat_data[set_name]['count'] = weat_sets[set_name]['count']

Do a sanity check ✔

In [None]:
# Every set must hold exactly (number of words) x (templates per word) sentences;
# print "Issue" once for each set where that does not hold.
for i, seat_data in enumerate(all_seat_data):
    for set_name in ('targ1', 'targ2', 'attr1', 'attr2'):
        entry = seat_data[set_name]
        expected_sentences = len(entry['words']) * entry['count']
        if expected_sentences != len(entry['examples']):
            print("Issue")

SEAT on single file

In [None]:
# SEAT on the first test: encode with BERT, attach the encodings, run the test.
MODEL_NAME = 'bert'

# NOTE(review): encode()'s 'bert' branch does not use the `model` argument; the vectors
# come from get_word_vector_normal, which calls the global `model`. That global must be
# the BanglaBERT model here, not the GloVe / word2vec object bound in "Load intended Model".
et1, et2, ea1, ea2 = encode_data(model, all_seat_data[0], MODEL_NAME, suppress_printables = False)
add_encodings_to_dict(all_seat_data[0], et1, et2, ea1, ea2, suppress_printables = False)
run_test(all_seat_data[0], 10000, suppress_printables = False) # X, Y need to have same length for p_val permutation test

SEAT on whole dataset

In [None]:
# SEAT on every loaded test; the label line is printed after each test's results.
MODEL_NAME = 'bert'
for seat_data in all_seat_data:
    # NOTE(review): as in the single-file cell, the 'bert' path relies on the global
    # `model` being the BanglaBERT model.
    et1, et2, ea1, ea2 = encode_data(model, seat_data, MODEL_NAME, suppress_printables = False)
    add_encodings_to_dict(seat_data, et1, et2, ea1, ea2, suppress_printables = False)
    run_test(seat_data, 10000, suppress_printables = False) # X, Y need to have same length for p_val permutation test
    print(f"{seat_data['targ1']['category']} vs {seat_data['targ2']['category']} ({seat_data['attr1']['category']} / {seat_data['attr2']['category']})")