<a href="https://colab.research.google.com/github/chetan-parthiban/Conceptors/blob/master/Coding_Up_WEAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations, filterfalse
import random

# Preparing WEAT Code

**Cosine Similarity** <br>
Define helper functions that compute the cosine-similarity-based association scores used by WEAT.

In [0]:
def similarity_difference(W, A, B):
    """Per-row difference in mean cosine similarity to two attribute sets.

    For each row w of W this is mean_a cos(w, a) - mean_b cos(w, b), where
    a ranges over the rows of A and b over the rows of B.

    Parameters
    ----------
    W, A, B : 2-D array-likes of row vectors, passed straight to
        ``cosine_similarity``.

    Returns
    -------
    1-D array with one entry per row of W.
    """
    # cosine_similarity(W, X) has one row per row of W and one column per
    # row of X, so averaging over axis 1 averages over the attribute set.
    mean_to_A = cosine_similarity(W, A).mean(axis=1)
    mean_to_B = cosine_similarity(W, B).mean(axis=1)
    return mean_to_A - mean_to_B

def association_difference(W, Z,A,B):
    """WEAT test statistic for target sets W and Z against attributes A and B.

    Sums ``similarity_difference`` over the rows of W and over the rows of Z
    and returns the first total minus the second.
    """
    total_w = sum(similarity_difference(W, A, B))
    total_z = sum(similarity_difference(Z, A, B))
    return total_w - total_z

**Calculating p-value**

In [0]:
def random_permutation(iterable, r=None):
    """Return a tuple of r distinct items drawn from iterable in random order.

    The iterable is materialised into a tuple first. When r is None the
    whole pool is used, i.e. the result is a full random permutation.
    ``random.sample`` raises ValueError if r exceeds the pool size.
    """
    items = tuple(iterable)
    if r is None:
        r = len(items)
    return tuple(random.sample(items, r))


def WEAT_pval(W,Z,A,B,test_statistic, sample):
    """One-sided permutation-test p-value for the WEAT test statistic.

    The rows of W and Z are pooled and repeatedly re-split into a group Wi
    of ``W.shape[0]`` rows and a group Zi holding the remaining rows. The
    p-value is the fraction of re-splits whose
    ``association_difference(Wi, Zi, A, B)`` is strictly greater than
    ``test_statistic``.

    Parameters
    ----------
    W, Z : 2-D numpy arrays of target embeddings (one row per word).
    A, B : 2-D array-likes of attribute embeddings.
    test_statistic : scalar, the observed statistic to compare against.
    sample : number of random re-splits to draw. A falsy value (0, None,
        False) enumerates every possible re-split instead.

    Returns
    -------
    Fraction (numpy float) of re-splits exceeding ``test_statistic``.
    """
    perm_size = W.shape[0]
    WuZ = np.vstack((W,Z))
    n_total = WuZ.shape[0]

    # Work with row *indices* rather than row values: the complement of a
    # draw is then exact even if W u Z contains identical embedding rows,
    # and no row-by-row equality scan is needed.
    if not sample:
        # Exhaustive: every way of choosing perm_size rows out of W u Z.
        index_sets = combinations(range(n_total), perm_size)
    else:
        # Monte Carlo: `sample` random draws of perm_size distinct indices.
        index_sets = (random.sample(range(n_total), perm_size)
                      for _ in range(sample))

    distribution = []
    for chosen in index_sets:
        chosen = list(chosen)
        in_Wi = np.zeros(n_total, dtype=bool)
        in_Wi[chosen] = True
        Wi = WuZ[chosen]      # drawn rows, in draw order
        Zi = WuZ[~in_Wi]      # all remaining rows, in pooled order
        distribution.append(association_difference(Wi, Zi, A, B))

    greaterthan = np.array([o > test_statistic for o in distribution])

    return greaterthan.sum()/greaterthan.size

**WEAT Final Form** <br>
Final function – takes the embeddings of two sets of target words (W, Z) and two sets of attribute words (A, B), performs the Word Embedding Association Test, and returns the effect size and the permutation-test p-value.

In [0]:
def WEAT(W, Z, A, B, sample = 2000):
    """Run the Word Embedding Association Test.

    Parameters
    ----------
    W, Z : 2-D numpy arrays of target-word embeddings (one row per word).
    A, B : 2-D array-likes of attribute-word embeddings.
    sample : number of random permutations used for the p-value; a falsy
        value makes ``WEAT_pval`` enumerate all partitions instead.

    Returns
    -------
    (effect_size, p) where
      * effect_size is the difference between the mean association of the
        W rows and the mean association of the Z rows, divided by the
        (population) standard deviation of the association over all rows
        of W and Z;
      * p is the one-sided permutation p-value from ``WEAT_pval``.
    """
    test_statistic = association_difference(W,Z,A,B)
    p = WEAT_pval(W,Z,A,B,test_statistic, sample)

    # Association score of every target row; the first n_w rows belong to W.
    n_w = W.shape[0]
    s_all = similarity_difference(np.vstack((W,Z)), A, B)
    std = np.std(s_all)

    # Use the two group means explicitly. Dividing the summed statistic by
    # |W| only equals this when W and Z have the same number of rows.
    effect_size = (np.mean(s_all[:n_w]) - np.mean(s_all[n_w:]))/std

    return effect_size, p

# Importing Embeddings

In [0]:
# Install flair, which provides the WordEmbeddings / ELMoEmbeddings / Sentence
# classes imported in the next cell.
# NOTE(review): the version is not pinned; the recorded output below shows
# flair-0.4.1 being downloaded -- consider pinning for reproducibility.
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/44/54/76374f9a448ca765446502e7f2bb53c976e9c055102290fe6f8b0b038b37/flair-0.4.1.tar.gz (78kB)
[K     |████████████████████████████████| 81kB 2.2MB/s 
Collecting segtok>=1.5.7 (from flair)
  Downloading https://files.pythonhosted.org/packages/1d/59/6ed78856ab99d2da04084b59e7da797972baa0efecb71546b16d48e49d9b/segtok-1.5.7.tar.gz
Collecting mpld3>=0.3 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/91/95/a52d3a83d0a29ba0d6898f6727e9858fe7a43f6c2ce81a5fe7e05f0f4912/mpld3-0.3.tar.gz (788kB)
[K     |████████████████████████████████| 798kB 5.8MB/s 
Collecting sqlitedict>=1.6.0 (from flair)
  Downloading https://files.pythonhosted.org/packages/0f/1c/c757b93147a219cf1e25cef7e1ad9b595b7f802159493c45ce116521caff/sqlitedict-1.6.0.tar.gz
Collecting deprecated>=1.2.4 (from flair)
  Downloading https://files.pythonhosted.org/packages/9f/7a/003fa432f1e45625626549726c2fbb7a29baa764e9d1fdb2323a5d779f8a

In [0]:
from flair.embeddings import WordEmbeddings
from flair.embeddings import ELMoEmbeddings
from flair.data import Sentence
# Word embeddings loaded through flair by identifier; the first use downloads
# the vectors into the local flair cache (see the log output below).
# 'glove' -> GloVe vectors.
glove = WordEmbeddings('glove')
# 'crawl' -> fastText Common Crawl vectors (en-fasttext-crawl-300d-1M in the log).
fast = WordEmbeddings('crawl')

2019-05-22 23:08:50,213 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmp_b6u7pdm


100%|██████████| 160000128/160000128 [00:08<00:00, 19756692.80B/s]

2019-05-22 23:08:58,863 copying /tmp/tmp_b6u7pdm to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2019-05-22 23:08:59,308 removing temp file /tmp/tmp_b6u7pdm
2019-05-22 23:08:59,820 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmp_1cj3rgb


100%|██████████| 21494764/21494764 [00:01<00:00, 12022193.96B/s]

2019-05-22 23:09:02,141 copying /tmp/tmp_1cj3rgb to cache at /root/.flair/embeddings/glove.gensim
2019-05-22 23:09:02,181 removing temp file /tmp/tmp_1cj3rgb
2019-05-22 23:09:02,183 this function is deprecated, use smart_open.open instead





2019-05-22 23:09:04,382 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/en-fasttext-crawl-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmp0k8wtypz


100%|██████████| 1200000128/1200000128 [00:56<00:00, 21322455.20B/s]

2019-05-22 23:10:01,218 copying /tmp/tmp0k8wtypz to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M.vectors.npy





2019-05-22 23:10:08,137 removing temp file /tmp/tmp0k8wtypz
2019-05-22 23:10:08,617 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/en-fasttext-crawl-300d-1M not found in cache, downloading to /tmp/tmp2ug9smfw


100%|██████████| 39323680/39323680 [00:02<00:00, 14700195.06B/s]

2019-05-22 23:10:11,879 copying /tmp/tmp2ug9smfw to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M





2019-05-22 23:10:11,945 removing temp file /tmp/tmp2ug9smfw
2019-05-22 23:10:11,946 this function is deprecated, use smart_open.open instead


# Obtaining Brown Corpus and WordLists

In [0]:
# numpy is already imported in the first cell; repeating it here is harmless.
import numpy as np
import torch
import matplotlib.pyplot as plt
# Short alias for np.transpose.
t = np.transpose
%matplotlib inline
from tqdm import tqdm
import nltk
# Download the Brown corpus package into the local nltk data directory.
nltk.download('brown')

from nltk.corpus import brown
# Sentences of the Brown corpus.
brown_corpus = brown.sents()
# Keep only the first 10,000 sentences.
brown_corpus = brown_corpus[:10000]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [0]:
# Gender word lists: gn_glove (male/female word files) and corefBias
# (WinoBias extra gendered words), both read in the next cell.
!git clone https://github.com/uclanlp/gn_glove
!git clone https://github.com/uclanlp/corefBias
# CMU female / male name lists, saved as female.txt and male.txt.
!wget https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt
!wget https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt
    
# Our code for debiasing -- also includes the WEAT word lists.
!git clone https://github.com/jsedoc/ConceptorDebias

Cloning into 'gn_glove'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 162 (delta 18), reused 25 (delta 9), pack-reused 114[K
Receiving objects: 100% (162/162), 73.36 KiB | 5.24 MiB/s, done.
Resolving deltas: 100% (64/64), done.
Cloning into 'corefBias'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 471 (delta 3), reused 0 (delta 0), pack-reused 457[K
Receiving objects: 100% (471/471), 84.18 MiB | 37.77 MiB/s, done.
Resolving deltas: 100% (273/273), done.
--2019-05-22 23:11:37--  https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35751 (35K) [text/plain]
Savin

In [0]:
# Helper shared by the CMU and gnGlove loaders below.
def _read_word_list(path, skip_comments=False):
    """Read a one-entry-per-line text file and return the stripped lines.

    Parameters
    ----------
    path : path of the text file to read.
    skip_comments : when True, empty lines and lines whose first character
        is '#' are dropped; when False every line is kept (stripped).

    Returns
    -------
    list of str, in file order.
    """
    words = []
    # Read-only mode: the files are never written, so "r+" is not needed
    # and would fail on read-only files.
    with open(path, "r") as f_in:
        for line in f_in:
            w = line.strip()
            if skip_comments and (len(w) == 0 or w[0] == '#'):
                continue
            words.append(w)
    return words


# Get Winobias word lists
# Each line is "<male word>\t<female word>".
winoWordsPath = './' + 'corefBias/WinoBias/wino/extra_gendered_words.txt'
male_vino_extra = []
female_vino_extra = []
with open(winoWordsPath, "r") as f_in:
    for line in f_in:
        if not line.strip():
            # A blank line has no second column and would raise IndexError.
            continue
        male_vino_extra.append(line.split('\t')[0])
        female_vino_extra.append(line.strip().split('\t')[1])

# Get CMU word lists (name files; '#' lines and blank lines are skipped)
cmuMaleWordPath = './' + 'male.txt'
male_cmu = _read_word_list(cmuMaleWordPath, skip_comments=True)
cmuFemaleWordPath = './' + 'female.txt'
female_cmu = _read_word_list(cmuFemaleWordPath, skip_comments=True)

# Get gnGlove word lists (every line kept, stripped)
gnGloveFemaleWordPath = './' + 'gn_glove/wordlist/female_word_file.txt'
female_gnGlove = _read_word_list(gnGloveFemaleWordPath)
gnGloveMaleWordPath = './' + 'gn_glove/wordlist/male_word_file.txt'
male_gnGlove = _read_word_list(gnGloveMaleWordPath)
    
# Get WEAT lists and conceptor functionality
# NOTE(review): the star import brings an unknown set of names into the
# notebook namespace; later cells may rely on some of them -- confirm and
# import the needed names explicitly.
from ConceptorDebias.Conceptors.conceptor_fxns import *
from ConceptorDebias.lists import WEAT_lists
# Object exposing the WEAT word lists as attributes (e.g. W_6_Career,
# W_7_Male_terms), used in the list-building cell below.
WEATLists = WEAT_lists.WEATLists()

# Adding Some Useful Functions

In [0]:
def pick_embeddings(embedding, word_list):
    """Embed a list of words and return the vectors plus their tokens.

    The words are joined with single spaces into one ``Sentence``, which is
    then embedded in place by ``embedding.embed``.

    Parameters
    ----------
    embedding : object with an ``embed(sentence)`` method (e.g. the flair
        embeddings loaded earlier in the notebook).
    word_list : iterable of str.

    Returns
    -------
    X : numpy array stacking ``token.embedding`` for each token, one row
        per token of the sentence.
    labels : list of the token objects yielded by iterating the sentence
        (the objects themselves, not plain strings).
    """
    sentence = Sentence(' '.join(word_list))
    embedding.embed(sentence)
    vectors = [token.embedding for token in sentence]
    X = torch.stack(vectors).numpy()
    labels = [token for token in sentence]
    return X, labels

def do_plot(X_fit, title=None, labels = ['']):
    """Scatter-plot 2-D or 3-D points, coloured by label.

    Parameters
    ----------
    X_fit : array with one point per row and 2 or 3 columns.
    title : optional figure title.
    labels : sequence with one label per row of X_fit. With a single
        distinct label (the default) all points are drawn in one colour
        and no legend is shown.

    Raises
    ------
    Exception('Bad Dimensions') if X_fit has neither 2 nor 3 columns.

    NOTE(review): ``cm`` and ``plt_style`` are not defined in the visible
    notebook cells; presumably they come from the star import of
    ConceptorDebias -- confirm before relying on this function.
    """
    dimension = X_fit.shape[1]
    label_types = sorted(list(set(labels)))
    num_labels = len(label_types)
    colors = cm.Accent(np.linspace(0,1,num_labels))
    with plt.style.context(plt_style):
        fig = plt.figure()
        if dimension == 2:
            ax = fig.add_subplot(111)
        elif dimension == 3:
            ax = fig.add_subplot(111, projection ='3d')
        else:
            raise Exception('Bad Dimensions')

        # Same selection logic for 2-D and 3-D. Rows are picked by index so
        # that `labels` may be a plain Python list; `labels == lab` on a
        # list is just False and would select nothing.
        for lab, col in zip(label_types, colors):
            if num_labels > 1:
                idxs = [i for i, v in enumerate(labels) if v == lab]
                coords = [[X_fit[i, d] for i in idxs]
                          for d in range(dimension)]
                ax.scatter(*coords, c = [col], label = lab)
            else:
                coords = [X_fit[:, d] for d in range(dimension)]
                ax.scatter(*coords, c = [col])
        plt.title(title)
        if num_labels >1:
            ax.legend()
        plt.show()
        
def _unique_sorted(*word_lists):
    """Merge the given word lists, drop duplicates and return a sorted list.

    Sorting makes the order independent of set iteration order, which can
    differ between kernel sessions for strings.
    """
    merged = set()
    for words in word_lists:
        merged.update(words)
    return sorted(merged)


# Gendered terms from WEAT tests 7 and 8.
gender_list_pronouns = _unique_sorted(WEATLists.W_7_Male_terms,
                                      WEATLists.W_7_Female_terms,
                                      WEATLists.W_8_Male_terms,
                                      WEATLists.W_8_Female_terms)

pronouns_male = _unique_sorted(WEATLists.W_7_Male_terms,
                               WEATLists.W_8_Male_terms)
# W_7 + W_8, mirroring the male list (previously W_8 was added to itself
# and the W_7 female terms were dropped).
pronouns_female = _unique_sorted(WEATLists.W_7_Female_terms,
                                 WEATLists.W_8_Female_terms)


# Extended gendered words: WinoBias extras plus the gnGlove word files.
gender_list_extended = _unique_sorted(male_vino_extra, female_vino_extra,
                                      male_gnGlove, female_gnGlove)

extended_male = _unique_sorted(male_vino_extra, male_gnGlove)
extended_female = _unique_sorted(female_vino_extra, female_gnGlove)

# Proper nouns: CMU name lists.
gender_list_propernouns = _unique_sorted(male_cmu, female_cmu)

propernouns_male = _unique_sorted(male_cmu)
propernouns_female = _unique_sorted(female_cmu)

# Union of all three families.
gender_list_all = _unique_sorted(gender_list_pronouns, gender_list_extended,
                                 gender_list_propernouns)

all_male = _unique_sorted(pronouns_male, extended_male, propernouns_male)
all_female = _unique_sorted(pronouns_female, extended_female,
                            propernouns_female)

# Word sets of WEAT test 6 (career/family attributes, male/female names).
career = _unique_sorted(WEATLists.W_6_Career)
family = _unique_sorted(WEATLists.W_6_Family)
malename = _unique_sorted(WEATLists.W_6_Male_names)
femalename = _unique_sorted(WEATLists.W_6_Female_names)

# Testing WEAT

In [0]:
# Seed the RNG used for permutation sampling so the p-value is reproducible.
random.seed(0)

# Hand-picked target lists for this test; these replace the
# pronouns_male / pronouns_female lists built in the previous cell.
pronouns_male = ['him', 'his', 'uncle', 'grandfather', 'son', 'he', 'boy', 'father', 'brother']
pronouns_female = ['her', 'grandmother', 'aunt', 'hers', 'daughter', 'sister', 'mother', 'she', 'girl']
# Embedding under test; change only this line to test another embedding.
emb = fast

male_e, _ = pick_embeddings(emb,pronouns_male)
female_e, _ = pick_embeddings(emb,pronouns_female)
career_e, _ = pick_embeddings(emb,career)
family_e, _ = pick_embeddings(emb,family)

effect_size, p = WEAT(male_e, female_e, career_e, family_e, sample = 10000)

print('Effect Size: ', effect_size, '\n', 'p-value: ', p)

Effect Size:  0.399042785590551 
 p-value:  0.2105
