In [1]:
import boto3
import xmltodict
from datetime import datetime
import json
import pandas as pd
import numpy as np
import pyemoji
import html

class DateTimeEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, datetime):
            return o.isoformat()

        return json.JSONEncoder.default(self, o)

In [2]:
def approve_work(worker, forced = False):
    work = workers[worker]
    num_approved = 0
    for assign in work:
        if assign['status'] == 'Submitted' or forced:
            print('\r', 'Approving', num_approved + 1, end='')
            num_approved = num_approved + 1
            client.approve_assignment(
                AssignmentId=assign['assignment_id'],
                OverrideRejection=True
            )
            assign['status'] = 'Approved'
            
        
def approve_worker(worker, QualificationTypeId):
    client.associate_qualification_with_worker(WorkerId=worker, QualificationTypeId = QualificationTypeId, IntegerValue = 1)

def block_worker(worker, reason):
    client.create_worker_block(WorkerId=worker, Reason = reason)
    
def reject_work(worker, reason):
    work = workers[worker]
    num_rejected = 0
    for assign in work:
        if assign['status'] == 'Submitted':
            print('\r', 'Rejecting', num_rejected + 1, end='')
            num_rejected = num_rejected + 1
            client.reject_assignment(
                AssignmentId=assign['assignment_id'],
                RequesterFeedback=reason
            )
            assign['status'] = 'Rejected'
            
def check_workers_work(worker, check_all = False):
    work = workers[worker]
    num_approved = 0
    for assign in work:
        if (assign['status'] == 'Submitted' or check_all):
            print(assign['topic'], '|', assign['tweet'], '|', assign['answer'].split('|'), '|', assign['assignment_id'])
            print('-'*30)
            
def dump_json(data, file_name):
    with open(f'./{file_name}.json', 'w') as fout:
        json.dump(data, fout, cls=DateTimeEncoder)
        
def read_json(file_name):
    with open(f'./{file_name}.json', 'r') as fout:
        return json.load(fout)

In [3]:
create_hits_in_production = True
environments = {
        "production": {
            "endpoint": "https://mturk-requester.us-east-1.amazonaws.com",
            "preview": "https://www.mturk.com/mturk/preview"
        },
        "sandbox": {
            "endpoint": "https://mturk-requester-sandbox.us-east-1.amazonaws.com",
            "preview": "https://workersandbox.mturk.com/mturk/preview"
        },
}
mturk_environment = environments["production"] if create_hits_in_production else environments["sandbox"]

client = boto3.client(
    service_name='mturk',
    region_name='us-east-1',
    endpoint_url=mturk_environment['endpoint'],
)

In [4]:
# This will return your current MTurk balance if you are connected to Production.
# If you are connected to the Sandbox it will return $10,000.
print(client.get_account_balance()['AvailableBalance'])

1180.20


In [5]:
df = pd.read_csv('claims-large.csv')
#df_ignore = pd.read_csv('./argumentative.csv')

In [6]:
html_layout = open('./evidence.html', 'r').read()
QUESTION_XML = """<HTMLQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2011-11-11/HTMLQuestion.xsd">
        <HTMLContent><![CDATA[{}]]></HTMLContent>
        <FrameHeight>1000</FrameHeight>
        </HTMLQuestion>"""
question_xml = QUESTION_XML.format(html_layout)

In [7]:
Items_to_annotate = 5#len(df)

#qualification = read_json('qualification_claim_main')

Qualifications = [
    #{
    #    'QualificationTypeId': qualification['QualificationType']['QualificationTypeId'],
    #    'Comparator': 'Exists',
    #    'ActionsGuarded': 'DiscoverPreviewAndAccept'
    #},
    
    { # Masters qualification
        #'QualificationTypeId': '2ARFPLSP75KLA8M8DH1HTEQVJT3SY6', # Masters qualification sandbox
        'QualificationTypeId': '2F1QJWKUDD8XADTFD2Q0G6UTO95ALH', # Masters qualification production
        'Comparator': 'Exists'
    }
]


MaxAssignments = 5

TaskAttributes = {
    #'MaxAssignments': 5,                 
    'LifetimeInSeconds': 60*60*24,         # How long the task will be available on the MTurk website (24 hours)
    'AssignmentDurationInSeconds': 60*10,   # How long Workers have to complete each item (5 minutes)
    'Reward': '0.03',                      # The reward you will offer Workers for each response
    'Title': 'Classify tweets',
    'Keywords': 'classification, tweet',
    'Description': 'Classify tweets for the type of evidence they contain in relation to topics for debates and arguments',
    'QualificationRequirements': Qualifications,
}


In [8]:
known_tweets = read_json('known_evidence')
first_round_annotatinos = read_json('evidence_first_round')
second_round_annotations = read_json('evidence_second_round')

tweet_num_done = {
    
}

known_tweets_dict = {}

for tweet in known_tweets:
    if tweet != None:
        key = tweet['tweet'] + tweet['topic']
        known_tweets_dict[key] = tweet['evidence']
        
for tweet in [*first_round_annotatinos, *second_round_annotations]:
    key = tweet['tweet'] + tweet['topic']
    if len(tweet['answers']) > 0:
        if key not in tweet_num_done:
            tweet_num_done[key] = len(tweet['answers'])
        else:
            tweet_num_done[key] = tweet_num_done[key] + len(tweet['answers'])

In [9]:
sum(list(tweet_num_done.values()))

3392

In [10]:
results = []


def encode_tweet(tweet):
    return str(html.escape(tweet).replace('\n', '<br/>').encode('ascii', 'xmlcharrefreplace')).replace("b'", '')[:-1]

def expand_set(df):
    df = df.drop_duplicates(subset=['id'])
    topics_en_v2 = [
        "We should reduce the consumption of meat",
        "Plant based food should be encouraged",
        "Meat alternatives should be encouraged",
        "Vegan and vegetarian diets should be encouraged",
        "We should pursue policies that promote sustainable foods"
    ]

    df_rec = []
    for r in df.to_dict('records'):
        for topic in topics_en_v2:
            rec = r.copy()
            rec['topic'] = topic
            df_rec.append(rec)
            
    return pd.DataFrame(df_rec).sample(frac=1)

df_set = df[df.claim > 0]#[~df.id.isin(df_ignore.id)]#pd.concat([df.drop(columns=['claim']), df_known]).sample(frac=1)
# known_answers = read_json('known_claims')
df_set_sample = df_set.sample(frac=1)#.sample(len(df_set)))
#df_set_sample.to_csv('active_hits_tweets.csv')
print(len(df_set_sample))

819


In [11]:
reward = float(TaskAttributes['Reward'])
amazon_tax = 1.2 # 20 % fee for amazon https://www.mturk.com/pricing
masters_fee = 1.05 # 5 % fee for amazon masters

print('Estimated cost:', (len(df_set_sample)*MaxAssignments-sum(list(tweet_num_done.values()))) * (reward*amazon_tax*masters_fee), '$')

Estimated cost: 26.5734 $


In [12]:
(len(df_set_sample)*MaxAssignments-sum(list(tweet_num_done.values())))

703

In [13]:
def try_answers(tweet, topic):
    key = tweet+topic
    if key in tweet_num_done:
        return tweet_num_done[key]
    return 0

def known_answer(tweet, topic):
    key = tweet+topic
    if key in known_tweets_dict:
        return known_tweets_dict[key]
    return None




def create_hits(data, offset = 0):
    results = []
    total_assignments = 0
    for i, row in enumerate(data[offset:]):
        print('\rCreating hit for tweet number', offset + i + 1, end='')
        num_assig = MaxAssignments-try_answers(row['tweet'], row['topic'])
        if num_assig > 0:
            total_assignments = total_assignments + num_assig

            response = client.create_hit(
                **TaskAttributes,
                MaxAssignments=num_assig,
                Question=question_xml.replace('${tweet}',encode_tweet(row['tweet'])).replace('${topic}', encode_tweet(row['topic']))
            )
            hit_type_id = response['HIT']['HITTypeId']
            results.append({
                'tweet': row['tweet'],
                'topic': row['topic'],
                'hit_id': response['HIT']['HITId'],
                'known_answer': known_answer(row['tweet'], row['topic'])
            })

    print("\nYou can view the HITs here:")
    link = mturk_environment['preview'] + "?groupId={}".format(hit_type_id)
    print(link)
    return results, link, total_assignments

In [331]:
#for i, hit in enumerate(results):
#    print('\rDeleting hit for tweet number',i + 1, end='')
#    client.update_expiration_for_hit(
#        HITId=hit['hit_id'],
#        ExpireAt=datetime(2015, 1, 1)
#    )

In [332]:
results, link, total_assignments = create_hits(df_set_sample.to_dict('records'))
#results, link, total_assignments = create_hits(list(filter(lambda x: x != None, known_tweets)))

Creating hit for tweet number 819
You can view the HITs here:
https://www.mturk.com/mturk/preview?groupId=3RY3YOAA7ZQFIIM9698OVWE5A0F27K


In [21]:
#client.update_qualification_type(
#    QualificationTypeId=qualification['QualificationType']['QualificationTypeId'],
#    Description=f'{qualification["QualificationType"]["Description"]} | When you get this qualification you can find a bigger annotation job here {link}',
#)
total_assignments

In [333]:
with open('./active_hits-v3-evidence.json', 'w') as fout:
    json.dump(results, fout)

In [14]:
with open('./active_hits-v3-evidence.json', 'r') as fout:
    results = json.load(fout)

In [26]:
workers = {
    
}

offset_for_gather = 0

for i, item in enumerate(results[offset_for_gather:]):
    print('\r', 'Gathering results for hit num', offset_for_gather + i + 1, end='')
    # Get the status of the HIT
    hit = client.get_hit(HITId=item['hit_id'])
    item['status'] = hit['HIT']['HITStatus']

    # Get a list of the Assignments that have been submitted by Workers
    assignmentsList = client.list_assignments_for_hit(
        HITId=item['hit_id'],
        AssignmentStatuses=['Submitted', 'Approved'],#, 'Rejected'],
        MaxResults=10
    )

    assignments = assignmentsList['Assignments']
    item['assignments_submitted_count'] = len(assignments)

    answers = []
    for assignment in assignments:
        
        
        
        # Retreive the attributes for each Assignment
        worker_id = assignment['WorkerId']
        assignment_id = assignment['AssignmentId']
        
        # Retrieve the value submitted by the Worker from the XML
        answer_dict = xmltodict.parse(assignment['Answer'])
        answer = answer_dict['QuestionFormAnswers']['Answer']['FreeText']
        answer_dict['worker'] = worker_id
        answer_dict['answer'] = answer
        answers.append(answer_dict)
        
        
        workser_work = {
            "answer": answer,
            "known_answer": item['known_answer'],
            "assignment_id": assignment['AssignmentId'],
            "status": assignment['AssignmentStatus'],
            "tweet": item['tweet'],
            "topic": item['topic'],
        }
        if worker_id in workers:
            workers[worker_id].append(workser_work)
        else:
            workers[worker_id] = [workser_work]
        
        # Approve the Assignment (if it hasn't already been approved)
        #if assignment['AssignmentStatus'] == 'Submitted':
        #    client.approve_assignment(
        #        AssignmentId=assignment_id,
        #        OverrideRejection=False
        #    )
    
    # Add the answers that have been retrieved for this item
    item['answers'] = answers


print('\nDone')

 Gathering results for hit num 354
Done


In [27]:
def mapper(tweet_dict):
    topic = df_set_sample[df_set_sample.tweet == tweet_dict['tweet']].head(1).topic.values[0]
    if len(tweet_dict['answers']) > 0:
        return {
            'tweet': tweet_dict['tweet'],
            'topic': topic,
            'evidence': tweet_dict['answers'][0]['QuestionFormAnswers']['Answer']['FreeText']
        }

#dump_json(list(map(mapper, results)), 'known_evidence')

In [28]:
def freeze(o):
    if isinstance(o,dict):
        return frozenset({ k:freeze(v) for k,v in o.items()}.items())

    if isinstance(o,list):
        return tuple([freeze(v) for v in o])

    return o
file_name = f'./active_{"production" if create_hits_in_production else "sandbox"}_hits-v1-claim-res-{hash(freeze(results))}.json'
with open(file_name, 'w') as fout:
    json.dump(results, fout)
print(file_name)

./active_production_hits-v1-claim-res--4760697775114063540.json


In [29]:


file_name = f'./workers-{hash(freeze(workers))}.json'

with open(file_name, 'w') as fout:
    json.dump(workers, fout)

print(file_name)

./workers-7331574588752615339.json


In [19]:
#with open('./workers-3003139998663480200.json', 'r') as fout:
#    workers = json.load(fout)


In [25]:
total_anno = 0
total_approved = 0


print('Topics ')
for worker in workers:
    work = workers[worker]
    num_unapproved = 0
    answer_dist = {'no evidence': 0, 'normative': 0, 'expert': 0, 'anecdotal': 0, 'study': 0, 'fact': 0}
    num_answer_made = 0
    num_hits_made = 0
    num_known = 0
    num_correct = 0
    num_partial_correct = 0
    num_wrong_answer = 0
    
    for assign in work:
        num_hits_made = num_hits_made + 1
        ans  = assign['answer']
        num_answer_made = num_answer_made + 1 
        if assign['status'] == 'Submitted':
            num_unapproved = num_unapproved + 1
        if assign['status'] == 'Approved':
            total_approved = total_approved + 1
        
        answer_dist[ans] = answer_dist[ans] + 1
        
        if not assign['known_answer'] == None:
            known = assign['known_answer']#.split('|')
            num_known = num_known + 1
            
            
            num_correct = num_correct + (ans == known)
            if ans != 'no evidence' and known != 'no evidence' and known != ans:
                num_partial_correct = num_partial_correct + 1
            else:
                num_wrong_answer = num_wrong_answer + (ans != known)
        
            
    total_anno = total_anno + len(work) 
    if num_unapproved > 0:
        
        print(worker, f"Hits made {num_hits_made}, number unapproved: {num_unapproved}, distribution:")
        print(worker, f"no evidence: {np.around(answer_dist['no evidence']/num_answer_made, 2)} normative: {np.around(answer_dist['normative']/num_answer_made, 2)} expert: {np.around(answer_dist['expert']/num_answer_made,2)} study: {np.around(answer_dist['study']/num_answer_made,2)} anecdotal: {np.around(answer_dist['anecdotal']/num_answer_made,2)} fact: {np.around(answer_dist['fact']/num_answer_made,2)}")
        if num_known > 0:
            print(worker, num_known, num_answer_made, 'num_correct', num_correct, 'num_partial_correct',num_partial_correct, 'num_wrong_answer',num_wrong_answer, num_correct/num_known, (num_correct + num_partial_correct)/num_known, num_wrong_answer/num_known)
        else: 
            print(worker, num_known, num_answer_made, num_correct, num_wrong_answer)
        print('')
print('Total number annotated', total_anno, 'out of', total_assignments, 'number approved', total_approved)


Topics 
Total number annotated 701 out of 703 number approved 701


In [355]:
c_worker = 'A1GKEEI844CEKI'
check_workers_work(c_worker)

Plant based food should be encouraged | years ago, id be glad to see so many alternative meat options. now, i’m HIGHLY skeptical of plant-based meat. have y’all even read the ingredients on em???? | ['anecdotal'] | 3DYGAII7PXV4DZI50GRRV46Q1YJQPU
------------------------------
Meat alternatives should be encouraged | <MENTION> <MENTION> <MENTION> Cease meat production.. | ['no evidence'] | 3AMYWKA6YN9TIHNK670E0ZK9RSD6OY
------------------------------
Meat alternatives should be encouraged | <MENTION> <MENTION> <MENTION> <MENTION> Then again all the other issues we have brought up with veganism that we have already explained many people would get sick or at least suffer of malnutrition if they tried veganism on their own as such it is morally acceptable for then ask people to buy and eat meat | ['normative'] | 3KMS4QQVKED6BJVG98BPNOGD0M5KFU
------------------------------
We should pursue policies that promote sustainable foods | <MENTION> I think eating a plant based diet is fantastic, I

In [356]:
approve_work(c_worker)

 Approving 64

In [272]:
block_worker(c_worker, 'Low quality work, and lots of misses it seems')

In [None]:
#reject_work(c_worker, 'Marking every tweet as unrelated or no claim is not quality work when there are misses such as  Thank you for participation')
#approve_work('A3GUJ6JD25FX7O')

In [30]:
dump_json(results, 'evidence_third_round')