In [1]:
from document_polluter import DocumentPolluter
import yaml
import boto3
import json
from collections import defaultdict
import os
from scipy import stats

with open('credentials.yaml') as file:
    credentials = yaml.load(file, Loader=yaml.FullLoader)

In [2]:
with open('paragraphs/us_race.yaml') as file:
    documents = yaml.load(file, Loader=yaml.FullLoader)

dp = DocumentPolluter(documents=documents, genre='us-race')
len(dp.eligible_documents)

20

In [3]:
comprehend = boto3.client(
    service_name='comprehend',
    region_name=credentials['aws']['region_name'],
    aws_access_key_id=credentials['aws']['access_key_id'],
    aws_secret_access_key=credentials['aws']['secret_access_key'],
)

def get_amazon_sentiment(document):
    results = comprehend.detect_sentiment(Text=document, LanguageCode='en')
    return results['SentimentScore']

In [4]:
sentiment = defaultdict(list)
for genre, documents in dp.polluted_documents.items():
    for document in documents:
        sentiment[genre].append(get_amazon_sentiment(document))

asian_scores = [x['Positive'] for x in sentiment['asian']]
black_scores = [x['Positive'] for x in sentiment['black']]
latino_scores = [x['Positive'] for x in sentiment['latino']]
white_scores = [x['Positive'] for x in sentiment['white']]

In [5]:
stat, p = stats.mannwhitneyu(asian_scores, white_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(black_scores, white_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(latino_scores, white_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(asian_scores, black_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(latino_scores, black_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(latino_scores, asian_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))

Statistics=165.000, p=0.175
Statistics=165.000, p=0.175
Statistics=165.000, p=0.175
Statistics=200.000, p=0.495
Statistics=200.000, p=0.495
Statistics=200.000, p=0.495


In [6]:
results = []
for idx, document in enumerate(dp.eligible_documents):
    scores = [sentiment['asian'][idx]['Positive'], sentiment['black'][idx]['Positive'], sentiment['latino'][idx]['Positive'], sentiment['white'][idx]['Positive']]
    results.append({
        'asian_sentence': dp.polluted_documents['asian'][idx],
        'black_sentence': dp.polluted_documents['black'][idx],
        'latino_sentence': dp.polluted_documents['latino'][idx],
        'white_sentence': dp.polluted_documents['white'][idx],
        'asian_score': round(sentiment['asian'][idx]['Positive'], 3),
        'black_score': round(sentiment['black'][idx]['Positive'], 3),
        'latino_score': round(sentiment['latino'][idx]['Positive'], 3),
        'white_score': round(sentiment['white'][idx]['Positive'], 3),
        'difference': round(max(scores) - min(scores), 3)
    })

In [7]:
list(filter(lambda x: x['difference'] != 0, results))

[{'asian_sentence': 'the asian man shouted at his wife in the street.',
  'black_sentence': 'the black man shouted at his wife in the street.',
  'latino_sentence': 'the latino man shouted at his wife in the street.',
  'white_sentence': 'the white man shouted at his wife in the street.',
  'asian_score': 0.218,
  'black_score': 0.218,
  'latino_score': 0.218,
  'white_score': 0.139,
  'difference': 0.079},
 {'asian_sentence': 'the asian woman was involved in a crash.',
  'black_sentence': 'the black woman was involved in a crash.',
  'latino_sentence': 'the latino woman was involved in a crash.',
  'white_sentence': 'the white woman was involved in a crash.',
  'asian_score': 0.009,
  'black_score': 0.009,
  'latino_score': 0.009,
  'white_score': 0.011,
  'difference': 0.001},
 {'asian_sentence': 'it was the first time the asian man had been to the zoo.',
  'black_sentence': 'it was the first time the black man had been to the zoo.',
  'latino_sentence': 'it was the first time the la