In [1]:
import boto3
import xmltodict
import json
import pandas as pd
import pyemoji
import html

In [2]:
create_hits_in_production = True
environments = {
        "production": {
            "endpoint": "https://mturk-requester.us-east-1.amazonaws.com",
            "preview": "https://www.mturk.com/mturk/preview"
        },
        "sandbox": {
            "endpoint": "https://mturk-requester-sandbox.us-east-1.amazonaws.com",
            "preview": "https://workersandbox.mturk.com/mturk/preview"
        },
}
mturk_environment = environments["production"] if create_hits_in_production else environments["sandbox"]

client = boto3.client(
    service_name='mturk',
    region_name='us-east-1',
    endpoint_url=mturk_environment['endpoint'],
)

In [3]:
# This will return your current MTurk balance if you are connected to Production.
# If you are connected to the Sandbox it will return $10,000.
print(client.get_account_balance()['AvailableBalance'])

1500.00


In [4]:
df = pd.read_csv('sample_mturk.csv')
df_known = pd.read_csv('../full-sample-v4.csv')

In [5]:
html_layout = open('./argumentative.html', 'r').read()
QUESTION_XML = """<HTMLQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2011-11-11/HTMLQuestion.xsd">
        <HTMLContent><![CDATA[{}]]></HTMLContent>
        <FrameHeight>650</FrameHeight>
        </HTMLQuestion>"""
question_xml = QUESTION_XML.format(html_layout)

In [6]:
Items_to_annotate = 600

TaskAttributes = {
    'MaxAssignments': 5,                 
    'LifetimeInSeconds': 60*60*24,         # How long the task will be available on the MTurk website (24 hours)
    'AssignmentDurationInSeconds': 60*5,   # How long Workers have to complete each item (5 minutes)
    'Reward': '0.02',                      # The reward you will offer Workers for each response
    'Title': 'Classify tweets',
    'Keywords': 'classify, tweet',
    'Description': 'Classify if tweets are argumentative',
    'QualificationRequirements': [
        {
            #'QualificationTypeId': '2ARFPLSP75KLA8M8DH1HTEQVJT3SY6', # Masters qualification sandbox
            'QualificationTypeId': '2F1QJWKUDD8XADTFD2Q0G6UTO95ALH', # Masters qualification production
            'Comparator': 'Exists'
        }
    ]
}


In [7]:
results = []
hit_type_id = ''


def encode_tweet(tweet):
    return str(html.escape(tweet).replace('\n', '<br/>').encode('ascii', 'xmlcharrefreplace')).replace("b'", '')[:-1]

df_set = pd.concat([df.drop(columns=['argumentative']), df_known]).sample(frac=1)
df_set_sample = df_set.head(Items_to_annotate)
#df_set_sample.to_csv('active_hits_tweets.csv')


In [10]:
offset = 69
for i, row in enumerate(df_set_sample.to_dict('records')[offset:]):
    print('\r', 'Creating hit for tweet number', offset + i + 1, end='')
    response = client.create_hit(
        **TaskAttributes,
        Question=question_xml.replace('${tweet}',encode_tweet(row['tweet'])),#.replace('${topic}', encode_tweet(row['topic']))
    )
    hit_type_id = response['HIT']['HITTypeId']
    results.append({
        'tweet': row['tweet'],
        'hit_id': response['HIT']['HITId'],
        'known_answer': row['argumentative']
    })
    
print("You can view the HITs here:")
print(mturk_environment['preview'] + "?groupId={}".format(hit_type_id))

 Creating hit for tweet number 600You can view the HITs here:
https://www.mturk.com/mturk/preview?groupId=373MH2DFE9O6ZKSH2BNBN263I9LHB0


In [11]:
with open('./active_hits-v1-argumentative.json', 'w') as fout:
    json.dump(results, fout)

In [None]:
with open('./active_hits-v1-argumentative.json', 'r') as fout:
    results = json.load(fout)

In [306]:
workers = {
    
}

for i, item in enumerate(results):
    print('\r', 'Gathering results for hit num', i + 1, end='')
    # Get the status of the HIT
    hit = client.get_hit(HITId=item['hit_id'])
    item['status'] = hit['HIT']['HITStatus']

    # Get a list of the Assignments that have been submitted by Workers
    assignmentsList = client.list_assignments_for_hit(
        HITId=item['hit_id'],
        AssignmentStatuses=['Submitted', 'Approved'],#, 'Rejected'],
        MaxResults=10
    )

    assignments = assignmentsList['Assignments']
    item['assignments_submitted_count'] = len(assignments)

    answers = []
    for assignment in assignments:
        
        
        
        # Retreive the attributes for each Assignment
        worker_id = assignment['WorkerId']
        assignment_id = assignment['AssignmentId']
        
        # Retrieve the value submitted by the Worker from the XML
        answer_dict = xmltodict.parse(assignment['Answer'])
        answer = answer_dict['QuestionFormAnswers']['Answer']['FreeText']
        answer_dict['worker'] = worker_id
        answer_dict['answer'] = answer
        answers.append(answer_dict)
        
        
        workser_work = {
            "answer": answer,
            "known_answer": item['known_answer'],
            "assignment_id": assignment['AssignmentId'],
            "status": assignment['AssignmentStatus'],
            "tweet": item['tweet'],
        }
        if worker_id in workers:
            workers[worker_id].append(workser_work)
        else:
            workers[worker_id] = [workser_work]
        
        # Approve the Assignment (if it hasn't already been approved)
        #if assignment['AssignmentStatus'] == 'Submitted':
        #    client.approve_assignment(
        #        AssignmentId=assignment_id,
        #        OverrideRejection=False
        #    )
    
    # Add the answers that have been retrieved for this item
    item['answers'] = answers


print('\nDone')

 Gathering results for hit num 600
Done


In [307]:
def freeze(o):
    if isinstance(o,dict):
        return frozenset({ k:freeze(v) for k,v in o.items()}.items())

    if isinstance(o,list):
        return tuple([freeze(v) for v in o])

    return o
file_name = f'./active_hits-v1-argumentative-res-{hash(freeze(results))}.json'
with open(file_name, 'w') as fout:
    json.dump(results, fout)
print(file_name)

./active_hits-v1-argumentative-res-2379498895443566127.json


In [260]:


file_name = f'./workers-{hash(freeze(workers))}.json'

with open(file_name, 'w') as fout:
    json.dump(workers, fout)

print(file_name)

./workers-3003139998663480200.json


In [268]:
with open('./workers-3003139998663480200.json', 'r') as fout:
    workers = json.load(fout)



In [269]:
total_anno = 0
total_approved = 0
for worker in workers:
    work = workers[worker]
    num_known = 0
    num_correct = 0
    num_lacks_ctx = 0
    num_unapproved = 0
    
    for assign in work:
        if assign['status'] == 'Submitted':
            num_unapproved = num_unapproved + 1
        if assign['status'] == 'Approved':
            total_approved = total_approved + 1
            
        if not pd.isnull(assign['known_answer']):
            num_known = num_known + 1
            if assign['known_answer'] == '0' and assign['answer'] == 'not argumentative':
                num_correct = num_correct + 1
            if assign['known_answer'] == '1' and assign['answer'] == 'argumentative':
                num_correct = num_correct + 1
            if assign['known_answer'] == 'lacks context':
                num_lacks_ctx = num_lacks_ctx + 1
    total_anno = total_anno +len(work) 
    if num_unapproved > 0 or True:
        print(worker, len(work), num_unapproved)
        if num_known > 0:
            print(worker, num_known, num_correct, num_lacks_ctx, num_correct/num_known, (num_correct+num_lacks_ctx)/num_known)
print('Total number annotated', total_anno, 'out of', len(results)*5, 'number approved', total_approved)

A1FPCIKO68OQ63 597 0
A1FPCIKO68OQ63 91 58 13 0.6373626373626373 0.7802197802197802
A16DOX0X2E9XBI 578 0
A16DOX0X2E9XBI 89 34 13 0.38202247191011235 0.5280898876404494
A3GUJ6JD25FX7O 366 0
A3GUJ6JD25FX7O 58 42 8 0.7241379310344828 0.8620689655172413
AMWIBULQ4S8K3 502 0
AMWIBULQ4S8K3 71 27 13 0.38028169014084506 0.5633802816901409
A3QZMGTVA4VO44 61 0
A3QZMGTVA4VO44 8 5 2 0.625 0.875
A1IU5OP7BBZHZ7 201 0
A1IU5OP7BBZHZ7 30 16 5 0.5333333333333333 0.7
A3760PCQD2MPIO 46 0
A3760PCQD2MPIO 8 1 1 0.125 0.25
A37WXDYYT7RCZ0 168 0
A37WXDYYT7RCZ0 22 20 1 0.9090909090909091 0.9545454545454546
A2LN42YO5UY41W 52 0
A2LN42YO5UY41W 9 8 1 0.8888888888888888 1.0
AUGRDUEDEUXUS 68 0
AUGRDUEDEUXUS 13 4 3 0.3076923076923077 0.5384615384615384
ABY1RK8H6HC8C 14 0
ABY1RK8H6HC8C 3 1 1 0.3333333333333333 0.6666666666666666
A1NF6PELRKACS9 85 0
A1NF6PELRKACS9 15 10 1 0.6666666666666666 0.7333333333333333
A3B9OZQTE4US3T 2 0
A31Z5TPD8QKE26 77 0
A31Z5TPD8QKE26 7 4 1 0.5714285714285714 0.7142857142857143
A25UMSG5I4A82E 55

In [262]:
#results[0]

In [305]:
# A3QZMGTVA4VO44 rejection, way too many obious misses
# A34HSCZ1PZCDY0 rejection, not wrong answer on single tweet check
# A1G96GPSRSLPC0 rejection, Wrong answers

check_workers_work('A3760PCQD2MPIO', True)
# Example The UK meat industry is sustainable, however, the problem lies with other countries, clearing vast areas of forest and other vegitation, which naturally help the environment, to make way for unsustainable farming methods and ending up unable to raise livestock or grow crops.
#approve_work('A3760PCQD2MPIO', True)
#reject_work('A1G96GPSRSLPC0', 'Wrong annotations for too few hits, after manual look over')

<MENTION> <MENTION> She seem to be working 16hrs a day, 7days a week. What food does she eat? | not argumentative | 33UKMF931L7FX734Y1U65JIA75TTTX
------------------------------
<MENTION> <MENTION> <MENTION> My ancestors have been making vegan meat alternatives for thousands of years. Miss me with that “standard” bullshit when you really mean “white” | not argumentative | 3BF51CHDT7XOCPFI05P77PDHZ02H00
------------------------------
Vegetarian maybe but not vegan | not argumentative | 3U8YCDAGX131PIELJVC1G35IQNU0QB
------------------------------
<MENTION> I gen need a good coffee recipe for my fucking sweet tooth 😭 I absolutely CANNOT have black coffee | not argumentative | 3C44YUNSIDCAOQE0VUL7V5ZRFGGPD1
------------------------------
<MENTION> I thought you were going to say in light of food shortage Biden says we would all be getting a ration of Soylent Green.. | not argumentative | 3EQHHY4HQ4F97068D5D13KX71805GA
------------------------------
<MENTION> <MENTION> <MENTION> <MENTION> 

In [304]:
def approve_work(worker, forced = False):
    work = workers[worker]
    num_approved = 0
    for assign in work:
        if assign['status'] == 'Submitted' or forced:
            print('\r', 'Approving', num_approved + 1, end='')
            num_approved = num_approved + 1
            client.approve_assignment(
                AssignmentId=assign['assignment_id'],
                OverrideRejection=True
            )
            assign['status'] = 'Approved'
            
def reject_work(worker, reason):
    work = workers[worker]
    num_rejected = 0
    for assign in work:
        if assign['status'] == 'Submitted':
            print('\r', 'Rejecting', num_rejected + 1, end='')
            num_rejected = num_rejected + 1
            client.reject_assignment(
                AssignmentId=assign['assignment_id'],
                RequesterFeedback=reason
            )
            assign['status'] = 'Rejected'
            
def check_workers_work(worker, check_all = False):
    work = workers[worker]
    num_approved = 0
    for assign in work:
        if (assign['status'] == 'Submitted' or check_all) and assign['answer'] == 'not argumentative':
            print(assign['tweet'], '|', assign['answer'], '|', assign['assignment_id'])
            print('-'*30)

In [275]:
#reject_work('AUGRDUEDEUXUS', 'You failed to achive at least 40% agreement with known annotations. Thank you for participation')
#approve_work('A3GUJ6JD25FX7O')

In [None]:
if num_known > 4 and False:
        for assign in work:
            if num_correct/num_known > 0.65:
                client.approve_assignment(
                    AssignmentId=assign['assignment_id'],
                    OverrideRejection=False
                )
            else:
                client.reject_assignment(
                    AssignmentId=assign['assignment_id'],
                    RequesterFeedback="You failed to achive at least 70% agreement with known annotations"
                )