In [1]:
import boto3
import xmltodict
from datetime import datetime
import json
import pandas as pd
import numpy as np
import pyemoji
import html

class DateTimeEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, datetime):
            return o.isoformat()

        return json.JSONEncoder.default(self, o)

In [2]:
def approve_work(worker, forced = False):
    work = workers[worker]
    num_approved = 0
    for assign in work:
        if assign['status'] == 'Submitted' or forced:
            print('\r', 'Approving', num_approved + 1, end='')
            num_approved = num_approved + 1
            client.approve_assignment(
                AssignmentId=assign['assignment_id'],
                OverrideRejection=True
            )
            assign['status'] = 'Approved'
            
        
def approve_worker(worker, qualification):
    client.associate_qualification_with_worker(WorkerId=worker, QualificationTypeId = qualification['QualificationType']['QualificationTypeId'], IntegerValue = 1)
    
def reject_worker(worker, qualification):
    client.disassociate_qualification_from_worker(
        WorkerId=worker,
        QualificationTypeId = qualification['QualificationType']['QualificationTypeId'], 
        reason = 'Test disqualification'
    )
  
def reject_work(worker, reason):
    work = workers[worker]
    num_rejected = 0
    for assign in work:
        if assign['status'] == 'Submitted':
            print('\r', 'Rejecting', num_rejected + 1, end='')
            num_rejected = num_rejected + 1
            client.reject_assignment(
                AssignmentId=assign['assignment_id'],
                RequesterFeedback=reason
            )
            assign['status'] = 'Rejected'
            
def check_workers_work(worker, check_all = False):
    work = workers[worker]
    num_approved = 0
    for assign in work:
        if (assign['status'] == 'Submitted' or check_all):
            print(assign['tweet'], '|', assign['answer'], '|', assign['assignment_id'])
            print('-'*30)
            
def dump_json(data, file_name):
    with open(f'./{file_name}.json', 'w') as fout:
        json.dump(data, fout, cls=DateTimeEncoder)
        
def read_json(file_name):
    with open(f'./{file_name}.json', 'r') as fout:
        return json.load(fout)
    
def get_requested_qualification(qualification): 
    return client.list_qualification_requests(
        QualificationTypeId=qualification['QualificationType']['QualificationTypeId'],
    )

In [3]:
create_hits_in_production = True
environments = {
        "production": {
            "endpoint": "https://mturk-requester.us-east-1.amazonaws.com",
            "preview": "https://www.mturk.com/mturk/preview"
        },
        "sandbox": {
            "endpoint": "https://mturk-requester-sandbox.us-east-1.amazonaws.com",
            "preview": "https://workersandbox.mturk.com/mturk/preview"
        },
}
mturk_environment = environments["production"] if create_hits_in_production else environments["sandbox"]

client = boto3.client(
    service_name='mturk',
    region_name='us-east-1',
    endpoint_url=mturk_environment['endpoint'],
)

In [4]:
# This will return your current MTurk balance if you are connected to Production.
# If you are connected to the Sandbox it will return $10,000.
print(client.get_account_balance()['AvailableBalance'])

1412.04


In [5]:
html_layout = open('./claim.html', 'r').read()
QUESTION_XML = """<HTMLQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2011-11-11/HTMLQuestion.xsd">
        <HTMLContent><![CDATA[{}]]></HTMLContent>
        <FrameHeight>650</FrameHeight>
        </HTMLQuestion>"""
question_xml = QUESTION_XML.format(html_layout)

In [6]:
qualification = client.create_qualification_type(
    Name='Good at labeling tweets for claims',
    Keywords='tweet, tweets, labeling, classefication, twitter',
    Description='The ability to accurately label tweets to relevant topics, for tweets containing potential claims.',
    QualificationTypeStatus='Active',
    AutoGranted=True
)

dump_json(qualification, 'qualification_claim_main')



RequestError: An error occurred (RequestError) when calling the CreateQualificationType operation: You have already created a QualificationType with this name. A QualificationType's name must be unique among all of the QualificationTypes created by the same user. (1651833031855)

In [7]:

approved_workers = []#['A1B4AS107BV1ML']



qualification = read_json('qualification_claim_main')

for worker in approved_workers:
    approve_worker(worker, qualification['QualificationType']['QualificationTypeId'])

In [9]:
qualification["QualificationType"]["QualificationTypeId"] == '3KPNUZIAJ8TEJIOEXY6O9LL21XKH19'

True

In [10]:
Items_to_annotate = 20


Qualifications = [
    {
        'QualificationTypeId': qualification['QualificationType']['QualificationTypeId'],
        'Comparator': 'DoesNotExist',
        'ActionsGuarded': 'DiscoverPreviewAndAccept'
    },
    { # Masters qualification
      # 'QualificationTypeId': '2ARFPLSP75KLA8M8DH1HTEQVJT3SY6', # Masters qualification sandbox
        'QualificationTypeId': '2F1QJWKUDD8XADTFD2Q0G6UTO95ALH', # Masters qualification production
        'Comparator': 'Exists'
    }
]




TaskAttributes = {
    'MaxAssignments': 20,                 
    'LifetimeInSeconds': 60*60*24,         # How long the task will be available on the MTurk website (24 hours)
    'AssignmentDurationInSeconds': 60*10,   # How long Workers have to complete each item (5 minutes)
    'Reward': '0.02',                      # The reward you will offer Workers for each response
    'Title': 'Label tweets',
    'Keywords': 'label, tweet',
    'Description': 'Label tweets for related topics in this small set and if done well gain qualification for a bigger set',
    'QualificationRequirements': Qualifications,
}


In [11]:
def encode_tweet(tweet):
    return str(html.escape(tweet).replace('\n', '<br/>').encode('ascii', 'xmlcharrefreplace')).replace("b'", '')[:-1]

known_answers = read_json('known_claims')


In [12]:
def create_hits(data, offset = 0):
    results = []
    for i, row in enumerate(data[offset:]):
        print('\rCreating hit for tweet number', offset + i + 1, end='')
        response = client.create_hit(
            **TaskAttributes,
            Question=question_xml.replace('${tweet}',encode_tweet(row['tweet'])),#.replace('${topic}', encode_tweet(row['topic']))
        )
        hit_type_id = response['HIT']['HITTypeId']
        results.append({
            'tweet': row['tweet'],
            'hit_id': response['HIT']['HITId'],
            'known_answer': row['claim']
        })

    print("\nYou can view the HITs here:")
    link = mturk_environment['preview'] + "?groupId={}".format(hit_type_id)
    print(link)
    return results, link

In [13]:
results, _ = create_hits(known_answers)

Creating hit for tweet number 20
You can view the HITs here:
https://www.mturk.com/mturk/preview?groupId=3OSXIFYM5H4NGESECR5X7RAWLROGA6


In [14]:
with open('./active_hits-v1-qualification-claim.json', 'w') as fout:
    json.dump(results, fout)

In [None]:
with open('./active_hits-v1-qualification-claim.json', 'r') as fout:
    results = json.load(fout)

In [181]:
workers = {
    
}

for i, item in enumerate(results):
    print('\r', 'Gathering results for hit num', i + 1, end='')
    # Get the status of the HIT
    hit = client.get_hit(HITId=item['hit_id'])
    item['status'] = hit['HIT']['HITStatus']

    # Get a list of the Assignments that have been submitted by Workers
    assignmentsList = client.list_assignments_for_hit(
        HITId=item['hit_id'],
        AssignmentStatuses=['Submitted', 'Approved'],#, 'Rejected'],
        MaxResults=10
    )

    assignments = assignmentsList['Assignments']
    item['assignments_submitted_count'] = len(assignments)

    answers = []
    for assignment in assignments:
        
        
        
        # Retreive the attributes for each Assignment
        worker_id = assignment['WorkerId']
        assignment_id = assignment['AssignmentId']
        
        # Retrieve the value submitted by the Worker from the XML
        answer_dict = xmltodict.parse(assignment['Answer'])
        answer = answer_dict['QuestionFormAnswers']['Answer']['FreeText']
        answer_dict['worker'] = worker_id
        answer_dict['answer'] = answer
        answers.append(answer_dict)
        
        
        workser_work = {
            "answer": answer,
            "known_answer": item['known_answer'],
            "assignment_id": assignment['AssignmentId'],
            "status": assignment['AssignmentStatus'],
            "tweet": item['tweet'],
        }
        if worker_id in workers:
            workers[worker_id].append(workser_work)
        else:
            workers[worker_id] = [workser_work]
        
        # Approve the Assignment (if it hasn't already been approved)
        #if assignment['AssignmentStatus'] == 'Submitted':
        #    client.approve_assignment(
        #        AssignmentId=assignment_id,
        #        OverrideRejection=False
        #    )
    
    # Add the answers that have been retrieved for this item
    item['answers'] = answers


print('\nDone')

 Gathering results for hit num 20
Done


In [182]:
def freeze(o):
    if isinstance(o,dict):
        return frozenset({ k:freeze(v) for k,v in o.items()}.items())

    if isinstance(o,list):
        return tuple([freeze(v) for v in o])

    return o
file_name = f'./active_hits-v1-claim-qualification-res-{hash(freeze(results))}.json'
with open(file_name, 'w') as fout:
    json.dump(results, fout)
print(file_name)

./active_hits-v1-claim-qualification-res-2529877282265771281.json


In [183]:
file_name = f'./workers-qualification-{hash(freeze(workers))}.json'

with open(file_name, 'w') as fout:
    json.dump(workers, fout)

print(file_name)

./workers-qualification--5553671175255113413.json


In [184]:
#with open('./workers-3003139998663480200.json', 'r') as fout:
#    workers = json.load(fout)



In [190]:
total_anno = 0
total_approved = 0
for worker in workers:
    work = workers[worker]
    num_known = 0
    num_correct = 0
    num_unapproved = 0
    num_partial_correct = 0
    num_wrong_answer = 0
    answer_dist = {}
    num_answer_made = 0
    
    for assign in work:
        
        answ  = assign['answer'].split('|')
        num_answer_made = num_answer_made + len(answ)
        if assign['status'] == 'Submitted':
            num_unapproved = num_unapproved + 1
        if assign['status'] == 'Approved':
            total_approved = total_approved + 1
        
        for ans in answ:
            if ans in answer_dist:
                answer_dist[ans] = answer_dist[ans] + 1
            else:
                answer_dist[ans] = 1
        
        if not pd.isnull(assign['known_answer']):
            known = assign['known_answer'].split('|')
            num_known = num_known + len(known)
            inter = len(np.intersect1d(known, answ))
            num_correct = num_correct + inter
            num_partial_correct = num_partial_correct + ((len(known) - inter) if (inter < len(known) or inter < len(answ)) and inter > 0 else 0)
            num_wrong_answer = num_wrong_answer + (len(answ) - inter)
        
            
            
    total_anno = total_anno + len(work) 
    if num_unapproved > 0:
        print(worker, len(work), num_unapproved)
        print(worker, 'answers', answer_dist)
        if num_known > 0:
            print(worker, num_known, num_answer_made, num_correct, num_wrong_answer, num_correct/num_known, (num_correct + num_partial_correct)/num_known, num_wrong_answer/num_known)
        print('')
print('Total number annotated', total_anno, 'out of', len(results)*20, 'number approved', total_approved)


APJJ1HHD2GGFS 3 3
APJJ1HHD2GGFS answers {'We should reduce the consumption of meat': 1, 'no claim': 2}
APJJ1HHD2GGFS 5 3 1 2 0.2 0.2 0.4

A1PAR1TMKFPKQE 4 4
A1PAR1TMKFPKQE answers {'no claim': 2, 'Vegan and vegetarian diets should be encouraged': 1, 'We should pursue policies that promote sustainable foods': 1}
A1PAR1TMKFPKQE 6 4 4 0 0.6666666666666666 1.0 0.0

Total number annotated 130 out of 400 number approved 118


In [192]:
#check_workers_work('A2F0X4LN9N4O4C')
#approve_work('A1PAR1TMKFPKQE')
#approve_worker('A2F0X4LN9N4O4C', qualification)
reject_work('APJJ1HHD2GGFS', 'Wrong annotations for too few hits')

 Rejecting 3

In [None]:
#reject_work('AUGRDUEDEUXUS', 'You failed to achive at least 40% agreement with known annotations. Thank you for participation')
#approve_work('A3GUJ6JD25FX7O')

In [None]:
if num_known > 4 and False:
        for assign in work:
            if num_correct/num_known > 0.65:
                client.approve_assignment(
                    AssignmentId=assign['assignment_id'],
                    OverrideRejection=False
                )
            else:
                client.reject_assignment(
                    AssignmentId=assign['assignment_id'],
                    RequesterFeedback="You failed to achive at least 70% agreement with known annotations"
                )