In this notebook, we evaluate gender and racial bias in the AWS and Google sentiment APIs, broken down by emotion, using the emotion-word sentences from the corpus defined in https://saifmohammad.com/WebPages/Biases-SA.html.
For each emotion and each API, we compare the paired sentence scores with a paired Wilcoxon signed-rank test.

In [1]:
import pandas as pds

In [2]:
# Relative path of the CSV that is loaded into `emotion_df` in the next cell.
_AWS_GOOGLE_SCORE_DF_FILE = './complete_set_sentiment_scores.csv'

In [3]:
# Load the scored sentences; the preview, row count and bias tests below all read this frame.
emotion_df = pds.read_csv(_AWS_GOOGLE_SCORE_DF_FILE)

In [4]:
# Preview the first five rows to check the column layout.
emotion_df.head()

Unnamed: 0,idx,ID,Sentence,Template,Person,Gender,Race,Emotion,Emotion word,goog_scores,aws_neg_scores,aws_pos_scores,aws_neu_scores,aws_mix_scores,aws_combined
0,0,2018-En-mystery-05498,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry,0.0,0.881573,0.011329,0.095531,0.011568,-1
1,1,2018-En-mystery-11722,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious,0.1,0.798154,0.028035,0.159113,0.014699,-1
2,2,2018-En-mystery-11364,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated,-0.5,0.910978,0.00918,0.065377,0.014465,-1
3,3,2018-En-mystery-14320,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged,0.0,0.877249,0.010306,0.102405,0.01004,-1
4,4,2018-En-mystery-14114,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed,-0.6,0.922501,0.006321,0.06035,0.010828,-1


In [5]:
# Number of scored sentences (rows) loaded from the CSV.
emotion_df.shape[0]

8640

In [6]:
import numpy as np
from scipy.stats import wilcoxon

In [7]:
def combine_aws_scores(df, suffix='x'):
    """Collapse the AWS Comprehend class probabilities into one signed score per row.

    AWS Comprehend reports separate probabilities for negative, neutral and
    positive sentiment. Each probability is multiplied by a fixed weight
    (-1 for negative, 0 for neutral, +1 for positive) and the three products
    are summed, so larger values mean more positive sentiment.

    Args:
        df: frame holding the columns ``aws_neg_scores_<suffix>``,
            ``aws_neu_scores_<suffix>`` and ``aws_pos_scores_<suffix>``.
        suffix: column-name suffix selecting which set of score columns to read.

    Returns:
        1-D NumPy array with one combined score per row of ``df``.
    """
    weight_by_column = (
        ('aws_neg_scores', -1.0),
        ('aws_neu_scores', 0.0),
        ('aws_pos_scores', 1.0),
    )
    column_names = ['{}_{}'.format(base, suffix) for base, _ in weight_by_column]
    weights = np.array([weight for _, weight in weight_by_column])
    # Broadcasting applies one weight per column before the row-wise sum.
    weighted = df[column_names].values * weights
    return weighted.sum(axis=1)

In [8]:
# Values of the 'Emotion' column that the bias tests below iterate over, in reporting order.
_EMOTIONS = ['anger', 'sadness', 'joy', 'fear']

In [9]:
# Columns shared by both comparisons: sentence identity and pairing keys first,
# then the raw AWS and Google scores.
_MAIN_COLUMNS = [
    'ID', 'Template', 'Emotion word', 'Emotion', 'Sentence',
    'aws_neg_scores', 'aws_pos_scores', 'aws_neu_scores', 'goog_scores',
]
# Each comparison additionally carries the demographic attribute it splits on.
_GENDER_COLUMNS = _MAIN_COLUMNS + ['Gender']
_RACE_COLUMNS = _MAIN_COLUMNS + ['Race']

In [31]:
def test_gender_bias(df=None, alpha=0.05):
    """Run paired Wilcoxon signed-rank tests on male vs. female sentence scores.

    Male and female rows are joined on (Template, Emotion word, Emotion). That
    join is many-to-many, so the result is de-duplicated first on the male
    ``ID`` and then on the female ``ID`` (keeping the first occurrence each
    time); afterwards no sentence appears in more than one pair. For every
    emotion in ``_EMOTIONS`` the paired scores are tested once for AWS (signed
    score from ``combine_aws_scores``) and once for Google (``goog_scores``),
    and each p-value is printed.

    NOTE(review): the keep-first de-duplication decides which male sentence is
    paired with which female sentence, so the pairs depend on the row order of
    the input and most joined rows are discarded. Confirm this is the intended
    pairing before drawing conclusions from the p-values.

    Args:
        df: scored sentences containing the columns in ``_GENDER_COLUMNS``.
            Defaults to the notebook-level ``emotion_df``.
        alpha: significance level. The mean difference is reported only when
            ``p <= alpha``; otherwise it is reported as 0.

    Returns:
        DataFrame with one row per (emotion, platform) and the fields
        ``Emotion``, ``Platform``, ``p-value`` and ``F-M`` (mean female score
        minus male score over the pairs, or 0 when ``p > alpha``).
    """
    if df is None:
        df = emotion_df
    merge_cols = ['Template', 'Emotion word', 'Emotion']
    male_probs_df = df.loc[df['Gender'] == 'male', _GENDER_COLUMNS]
    fem_probs_df = df.loc[df['Gender'] == 'female', _GENDER_COLUMNS]
    # After the merge the `_x` columns hold the male row and `_y` the female row.
    paired_df = (
        pds.merge(male_probs_df, fem_probs_df, on=merge_cols, how='inner')
        .drop_duplicates(subset=['ID_x'], keep='first')
        .drop_duplicates(subset=['ID_y'], keep='first')
    )
    deltas = []
    for emotion in _EMOTIONS:
        em = paired_df[paired_df['Emotion'] == emotion]
        # No extra gender mask is needed: every `_x` row is male and every `_y`
        # row is female by construction of the merge above.
        platform_scores = [
            ('AWS', combine_aws_scores(em, 'x'), combine_aws_scores(em, 'y')),
            ('Google', em['goog_scores_x'], em['goog_scores_y']),
        ]
        for platform, male_scores, fem_scores in platform_scores:
            _, p = wilcoxon(male_scores, fem_scores)
            print('{} emotion: {}, p-value : {}'.format(platform, emotion, p))
            delta = 0 if p > alpha else (fem_scores - male_scores).mean()
            deltas.append({'Emotion': emotion, 'Platform': platform, 'p-value': p, 'F-M': delta})
    return pds.DataFrame(deltas)

In [32]:
# Prints one p-value line per emotion and platform, then displays the returned summary table.
test_gender_bias()

AWS emotion: anger, p-value : 2.703123552440088e-07
Google emotion: anger, p-value : 0.5961126657233238
AWS emotion: sadness, p-value : 2.3807341815615703e-05
Google emotion: sadness, p-value : 0.10247043485974937
AWS emotion: joy, p-value : 2.4770276393652366e-07
Google emotion: joy, p-value : 0.31731050786291415
AWS emotion: fear, p-value : 9.762472038205343e-06
Google emotion: fear, p-value : 0.0169473483375683




Unnamed: 0,Emotion,F-M,Platform,p-value
0,anger,0.105363,AWS,2.703124e-07
1,anger,0.0,Google,0.5961127
2,sadness,0.076079,AWS,2.380734e-05
3,sadness,0.0,Google,0.1024704
4,joy,-0.102942,AWS,2.477028e-07
5,joy,0.0,Google,0.3173105
6,fear,0.101574,AWS,9.762472e-06
7,fear,-0.02,Google,0.01694735


In [33]:
def test_racial_bias(df=None, alpha=0.05):
    """Run paired Wilcoxon signed-rank tests on African-American vs. European sentence scores.

    African-American and European rows are joined on (Template, Emotion word,
    Emotion). That join is many-to-many, so the result is de-duplicated first
    on the African-American ``ID`` and then on the European ``ID`` (keeping the
    first occurrence each time); afterwards no sentence appears in more than
    one pair. For every emotion in ``_EMOTIONS`` the paired scores are tested
    once for AWS (signed score from ``combine_aws_scores``) and once for Google
    (``goog_scores``), and each p-value is printed.

    NOTE(review): the keep-first de-duplication decides which African-American
    sentence is paired with which European sentence, so the pairs depend on the
    row order of the input and most joined rows are discarded. Confirm this is
    the intended pairing before drawing conclusions from the p-values.

    Args:
        df: scored sentences containing the columns in ``_RACE_COLUMNS``.
            Defaults to the notebook-level ``emotion_df``.
        alpha: significance level. The mean difference is reported only when
            ``p <= alpha``; otherwise it is reported as 0.

    Returns:
        DataFrame with one row per (emotion, platform) and the fields
        ``Emotion``, ``Platform``, ``p-value`` and ``B-W`` (mean
        African-American score minus European score over the pairs, or 0 when
        ``p > alpha``).
    """
    if df is None:
        df = emotion_df
    merge_cols = ['Template', 'Emotion word', 'Emotion']
    aa_probs_df = df.loc[df['Race'] == 'African-American', _RACE_COLUMNS]
    eu_probs_df = df.loc[df['Race'] == 'European', _RACE_COLUMNS]
    # After the merge the `_x` columns hold the African-American row and `_y` the European row.
    paired_df = (
        pds.merge(aa_probs_df, eu_probs_df, on=merge_cols, how='inner')
        .drop_duplicates(subset=['ID_x'], keep='first')
        .drop_duplicates(subset=['ID_y'], keep='first')
    )
    deltas = []
    for emotion in _EMOTIONS:
        em = paired_df[paired_df['Emotion'] == emotion]
        # No extra race mask is needed: every `_x` row is African-American and
        # every `_y` row is European by construction of the merge above.
        platform_scores = [
            ('AWS', combine_aws_scores(em, 'x'), combine_aws_scores(em, 'y')),
            ('Google', em['goog_scores_x'], em['goog_scores_y']),
        ]
        for platform, aa_scores, eu_scores in platform_scores:
            _, p = wilcoxon(aa_scores, eu_scores)
            print('{} emotion: {}, p-value : {}'.format(platform, emotion, p))
            delta = 0 if p > alpha else (aa_scores - eu_scores).mean()
            deltas.append({'Emotion': emotion, 'Platform': platform, 'p-value': p, 'B-W': delta})
    return pds.DataFrame(deltas)

In [34]:
# Prints one p-value line per emotion and platform, then displays the returned summary table.
test_racial_bias()

AWS emotion: anger, p-value : 2.703123552440088e-07
Google emotion: anger, p-value : 7.957971565233185e-06
AWS emotion: sadness, p-value : 6.660326758646149e-06
Google emotion: sadness, p-value : 3.787249253316128e-06
AWS emotion: joy, p-value : 2.4770276393652366e-07
Google emotion: joy, p-value : 0.0003390212969694532
AWS emotion: fear, p-value : 3.2924713208836836e-06
Google emotion: fear, p-value : 0.026943676888349928


Unnamed: 0,B-W,Emotion,Platform,p-value
0,-0.256898,anger,AWS,2.703124e-07
1,-0.131429,anger,Google,7.957972e-06
2,-0.173752,sadness,AWS,6.660327e-06
3,-0.142857,sadness,Google,3.787249e-06
4,0.210713,joy,AWS,2.477028e-07
5,0.051429,joy,Google,0.0003390213
6,-0.223511,fear,AWS,3.292471e-06
7,-0.062857,fear,Google,0.02694368
