In [1]:
# Load IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Replication of the Experiments in the WEAT Paper

In [None]:
from wefe.metrics import WEAT
from wefe.datasets import load_weat
from wefe.query import Query
# Fixed: this import was left incomplete (a SyntaxError). The module path and
# class name match the import used by the later replication cell.
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.utils import run_queries

import gensim.downloader as api

# Load the wordset
weat_wordset = load_weat()

# Define the 10 Queries.
# Each Query lists: the two word sets being compared, the two word sets they
# are compared with respect to, and display names for each pair.
queries = [
    # Flowers vs Insects wrt Pleasant (5) and Unpleasant (5)
    Query([weat_wordset['flowers'], weat_wordset['insects']],
          [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['Flowers', 'Insects'], ['Pleasant(5)', 'Unpleasant(5)']),

    # Instruments vs Weapons wrt Pleasant (5) and Unpleasant (5)
    Query([weat_wordset['instruments'], weat_wordset['weapons']],
          [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['Instruments', 'Weapons'], ['Pleasant(5)', 'Unpleasant(5)']),

    # European american names(5) vs African american names(5)
    # wrt Pleasant (5) and Unpleasant (5)
    Query([
        weat_wordset['european_american_names_5'],
        weat_wordset['african_american_names_5']
    ], [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['European american names(5)', 'African american names(5)'],
          ['Pleasant(5)', 'Unpleasant(5)']),

    # European american names(7) vs African american names(7)
    # wrt Pleasant (5) and Unpleasant (5)
    Query([
        weat_wordset['european_american_names_7'],
        weat_wordset['african_american_names_7']
    ], [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['European american names(7)', 'African american names(7)'],
          ['Pleasant(5)', 'Unpleasant(5)']),

    # European american names(7) vs African american names(7)
    # wrt Pleasant (9) and Unpleasant (9)
    Query([
        weat_wordset['european_american_names_7'],
        weat_wordset['african_american_names_7']
    ], [weat_wordset['pleasant_9'], weat_wordset['unpleasant_9']],
          ['European american names(7)', 'African american names(7)'],
          ['Pleasant(9)', 'Unpleasant(9)']),

    # Male and female names wrt Career and family
    Query([weat_wordset['male_names'], weat_wordset['female_names']],
          [weat_wordset['career'], weat_wordset['family']],
          ['Male names', 'Female names'], ['Career', 'Family']),

    # Math and arts wrt male and female terms
    Query([weat_wordset['math'], weat_wordset['arts']],
          [weat_wordset['male_terms'], weat_wordset['female_terms']],
          ['Math', 'Arts'], ['Male terms', 'Female terms']),

    # Science and arts wrt male and female terms
    Query([weat_wordset['science'], weat_wordset['arts_2']],
          [weat_wordset['male_terms'], weat_wordset['female_terms']],
          ['Science', 'Arts 2'], ['Male terms', 'Female terms']),

    # Mental and Physical disease wrt Temporary and Permanent
    Query([weat_wordset['mental_disease'], weat_wordset['physical_disease']],
          [weat_wordset['temporary'], weat_wordset['permanent']],
          ['Mental disease', 'Physical disease'], ['Temporary', 'Permanent']),

    # Young people names and Old people names wrt Pleasant(9) and Unpleasant(9)
    Query(
        [weat_wordset['young_people_names'], weat_wordset['old_people_names']],
        [weat_wordset['pleasant_9'], weat_wordset['unpleasant_9']],
        ['Young peoples names', 'Old peoples names'],
        ['Pleasant(9)', 'Unpleasant(9)'])
]

# Load the embedding models.
# Fixed: the models were previously bound as plain (model, name) tuples. They
# are now wrapped in WordEmbeddingModel together with their name, exactly as
# the later replication cell in this notebook does.
w2v = WordEmbeddingModel(api.load('word2vec-google-news-300'),
                         'word2vec-google-news-300')
glove = WordEmbeddingModel(api.load('glove-wiki-gigaword-300'),
                           'glove-wiki-gigaword-300')

# Execute the queries with the models and WEAT
results = run_queries(WEAT,
                      queries, [w2v, glove],
                      aggregate_results=True,
                      aggregation_function='abs_avg',
                      warn_filtered_words=True,
                      metric_params={
                          'return_effect_size': True
                      },
                      lost_vocabulary_threshold=.25)

# Display the results transposed and rounded to two decimals.
results.T.round(2)

In [None]:
# Plot the `results` returned by run_queries in an earlier cell.
from wefe.utils import plot_queries_results

fig = plot_queries_results(results)
# `fig` stays a notebook-level name: the next cell exports it with fig.write_image.
fig.show()

In [None]:
# Export the figure created by the plotting cell to a PNG file.
fig.write_image(
    './doc/images/WEAT_replication.png',
    width=1200,
    height=600,
    scale=3,
)

In [2]:
from wefe.metrics import WEAT
from wefe.datasets import load_weat
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.utils import run_queries

import gensim.downloader as api

# Fetch the WEAT word sets through wefe.datasets.load_weat.
weat_wordset = load_weat()

# The ten queries evaluated in this notebook.
# Argument order for every Query:
#   1. the two word sets being compared,
#   2. the two word sets they are compared with respect to,
#   3. display names for the first pair,
#   4. display names for the second pair.
queries = [
    # 1. Flowers vs. insects -- pleasant(5) / unpleasant(5)
    Query(
        [weat_wordset['flowers'], weat_wordset['insects']],
        [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
        ['Flowers', 'Insects'],
        ['Pleasant(5)', 'Unpleasant(5)'],
    ),
    # 2. Instruments vs. weapons -- pleasant(5) / unpleasant(5)
    Query(
        [weat_wordset['instruments'], weat_wordset['weapons']],
        [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
        ['Instruments', 'Weapons'],
        ['Pleasant(5)', 'Unpleasant(5)'],
    ),
    # 3. European american names(5) vs. African american names(5)
    #    -- pleasant(5) / unpleasant(5)
    Query(
        [
            weat_wordset['european_american_names_5'],
            weat_wordset['african_american_names_5'],
        ],
        [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
        ['European american names(5)', 'African american names(5)'],
        ['Pleasant(5)', 'Unpleasant(5)'],
    ),
    # 4. European american names(7) vs. African american names(7)
    #    -- pleasant(5) / unpleasant(5)
    Query(
        [
            weat_wordset['european_american_names_7'],
            weat_wordset['african_american_names_7'],
        ],
        [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
        ['European american names(7)', 'African american names(7)'],
        ['Pleasant(5)', 'Unpleasant(5)'],
    ),
    # 5. European american names(7) vs. African american names(7)
    #    -- pleasant(9) / unpleasant(9)
    Query(
        [
            weat_wordset['european_american_names_7'],
            weat_wordset['african_american_names_7'],
        ],
        [weat_wordset['pleasant_9'], weat_wordset['unpleasant_9']],
        ['European american names(7)', 'African american names(7)'],
        ['Pleasant(9)', 'Unpleasant(9)'],
    ),
    # 6. Male vs. female names -- career / family
    Query(
        [weat_wordset['male_names'], weat_wordset['female_names']],
        [weat_wordset['career'], weat_wordset['family']],
        ['Male names', 'Female names'],
        ['Career', 'Family'],
    ),
    # 7. Math vs. arts -- male / female terms
    Query(
        [weat_wordset['math'], weat_wordset['arts']],
        [weat_wordset['male_terms'], weat_wordset['female_terms']],
        ['Math', 'Arts'],
        ['Male terms', 'Female terms'],
    ),
    # 8. Science vs. arts (second arts set) -- male / female terms
    Query(
        [weat_wordset['science'], weat_wordset['arts_2']],
        [weat_wordset['male_terms'], weat_wordset['female_terms']],
        ['Science', 'Arts 2'],
        ['Male terms', 'Female terms'],
    ),
    # 9. Mental vs. physical disease -- temporary / permanent
    Query(
        [weat_wordset['mental_disease'], weat_wordset['physical_disease']],
        [weat_wordset['temporary'], weat_wordset['permanent']],
        ['Mental disease', 'Physical disease'],
        ['Temporary', 'Permanent'],
    ),
    # 10. Young vs. old people's names -- pleasant(9) / unpleasant(9)
    Query(
        [weat_wordset['young_people_names'], weat_wordset['old_people_names']],
        [weat_wordset['pleasant_9'], weat_wordset['unpleasant_9']],
        ['Young peoples names', 'Old peoples names'],
        ['Pleasant(9)', 'Unpleasant(9)'],
    ),
]

# Fetch both pretrained embeddings through gensim's downloader and wrap each
# one, together with its name, in a WordEmbeddingModel.
w2v = WordEmbeddingModel(
    api.load('word2vec-google-news-300'),
    'word2vec-google-news-300',
)
glove = WordEmbeddingModel(
    api.load('glove-wiki-gigaword-300'),
    'glove-wiki-gigaword-300',
)



INFO:gensim.models.keyedvectors:loading projection weights from C:\Users\pablo/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
DEBUG:smart_open.smart_open_lib:{'uri': 'C:\\Users\\pablo/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
INFO:gensim.models.keyedvectors:loaded (3000000, 300) matrix from C:\Users\pablo/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
INFO:gensim.models.keyedvectors:loading projection weights from C:\Users\pablo/gensim-data\glove-wiki-gigaword-300\glove-wiki-gigaword-300.gz
DEBUG:smart_open.smart_open_lib:{'uri': 'C:\\Users\\pablo/gensim-data\\glove-wiki-gigaword-300\\glove-wiki-gigaword-300.gz', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params'

In [None]:
# Run every query with WEAT against both embedding models.
results = run_queries(
    WEAT,
    queries,
    [w2v, glove],
    aggregate_results=True,
    aggregation_function='abs_avg',
    warn_filtered_words=True,
    metric_params={'return_effect_size': True},
    lost_vocabulary_threshold=0.25,
)

# Cell output: the results transposed and rounded to two decimals.
results.T.round(2)

In [24]:
from wefe.metrics import WEAT

# Run only the first query on the word2vec model, requesting a p-value.
WEAT().run_query(
    queries[0],
    w2v,
    calc_p_value=True,
    log_p_value_status=False,
)

INFO:root:weat_original_result 1.4078285694122314
INFO:root:Number of possible total permutations: 30414093201713378043612608166064768844377641568960512000000000000. Maximum iterations allowed: 10000
INFO:root:WEAT p-value: 0 / 10000 runs
INFO:root:WEAT p-value: 500 / 10000 runs
INFO:root:WEAT p-value: 1000 / 10000 runs
INFO:root:WEAT p-value: 1500 / 10000 runs
INFO:root:WEAT p-value: 2000 / 10000 runs
INFO:root:WEAT p-value: 2500 / 10000 runs
INFO:root:WEAT p-value: 3000 / 10000 runs
INFO:root:WEAT p-value: 3500 / 10000 runs
INFO:root:WEAT p-value: 4000 / 10000 runs
INFO:root:WEAT p-value: 4500 / 10000 runs
INFO:root:WEAT p-value: 5000 / 10000 runs
INFO:root:WEAT p-value: 5500 / 10000 runs
INFO:root:WEAT p-value: 6000 / 10000 runs
INFO:root:WEAT p-value: 6500 / 10000 runs
INFO:root:WEAT p-value: 7000 / 10000 runs
INFO:root:WEAT p-value: 7500 / 10000 runs
INFO:root:WEAT p-value: 8000 / 10000 runs
INFO:root:WEAT p-value: 8500 / 10000 runs
INFO:root:WEAT p-value: 9000 / 10000 runs
INFO:r

{'query_name': 'Flowers and Insects wrt Pleasant(5) and Unpleasant(5)',
 'result': 1.4078286,
 'weat': 1.4078286,
 'effect_size': 1.5549757,
 'p-value': 0.0}