In [1]:
import copy
import json
import random
import time
import uuid
from collections import OrderedDict
from datetime import datetime
from pprint import pprint

import botocore
import numpy as np
import pandas as pd
import pymongo

import mturk

In [3]:
# Languages to create HITs for, keyed by the language code that is formatted
# into the task file name and used to look up the language test questions.
# Uncomment an entry to include that language in the batch.
target_languages = {
    #'en':'English',
    #'es':'Spanish',
    #'nl':'Dutch',
    #'sv':'Swedish',
    #'pt':'Portuguese',
    'ja':'Japanese'
}

data_folder = './data/final/'
config_folder = './config/final/'

# Decode the JSON files as UTF-8 explicitly so that non-ASCII content is read
# the same way on every platform instead of depending on the locale default.
# language_tests is indexed by language code when the HITs are created.
with open('./data/language_data/language_tests.json', 'r', encoding='utf-8') as f:
    language_tests = json.load(f)
# Only the "connection_string" entry of the credentials is read by this notebook.
with open('./config/mongodb_credentials.json', 'r', encoding='utf-8') as f:
    mongodb_credentials = json.load(f)

In [4]:
""" Connect to MTurk and to the Mongodb database. Set the boolean below to TRUE to use the marketplace and to FALSE to use the sandbox (testing the HITs)"""
# Truthy -> real marketplace, falsy -> sandbox. The same flag picks the MongoDB
# collection below and is handed to the MTurk client.
create_hits_in_production = 1

db_client = pymongo.MongoClient(mongodb_credentials["connection_string"])
db = db_client['wikidata']

# Production and sandbox results are kept in separate collections.
if create_hits_in_production:
    hit_result_collection = db.hit_results
else:
    hit_result_collection = db.hit_results_sandbox

mt = mturk.MTurk()
# Kept as the last expression so its return value is shown as the cell output
# (NOTE(review): presumably the account balance - confirm in the mturk module).
mt.launch_client(create_hits_in_production)

1517.44


In [4]:
""" Ban the spammers! """
# banlist.json is iterated as a collection of worker ids.
with open('./config/banlist.json', 'r', encoding='utf-8') as f:
    banlist = json.load(f)

# Blocking stays best-effort (one failing worker must not stop the rest), but
# failures are collected and reported instead of being silently discarded.
failed_blocks = {}
for worker_id in banlist:
    try:
        response = mt.client.create_worker_block(
            WorkerId=worker_id,
            Reason='You are copy and pasting text'
        )
    except botocore.exceptions.ClientError as e:
        failed_blocks[worker_id] = str(e)
        continue
    # A non-200 answer that did not raise is unexpected: stop, as before.
    assert response['ResponseMetadata']['HTTPStatusCode'] == 200, \
        'Unexpected response while blocking {}: {}'.format(worker_id, response['ResponseMetadata'])

if failed_blocks:
    print('Could not block {} of {} workers:'.format(len(failed_blocks), len(banlist)))
    for worker_id, reason in failed_blocks.items():
        print(' ', worker_id, '-', reason)

In [4]:
""" Create the tasks by populating the HTML templates using the config file """

task_type = 'authorit'

# Shared task configuration; every language below works on its own deep copy.
with open(config_folder + 'task_config_{}.json'.format(task_type), 'r', encoding='utf-8') as f:
    task_content = json.load(f)

# Per-language bundle consumed by the following cells:
# {lan: {'html_layout', 'taskSets', 'TaskAttributes', 'task_content'}}
taskSets_all_lan = {}

for lan in target_languages.keys():
    language_name = target_languages[lan]

    task_content_lan = copy.deepcopy(task_content)
    task_content_lan['language'] = lan
    # 'tasks' is a file-name template filled with the language code.
    task_content_lan['tasks'] = task_content_lan['tasks'].format(lan)

    # These HIT attributes are templates filled with the language name.
    TaskAttributes = task_content_lan['task_attributes']
    for attribute in ('Title', 'Keywords', 'Description'):
        TaskAttributes[attribute] = TaskAttributes[attribute].format(language_name)

    # Instruction texts are flattened to one line before being injected
    # into the HTML template.
    for section in ('project', 'intro', 'rules'):
        text_key = 'instructions_{}_text'.format(section)
        with open('./config/' + task_content_lan[text_key + '_file'], 'r', encoding='utf-8') as f:
            task_content_lan[text_key] = f.read().replace('\n', ' ')

    # Read the layout through a context manager so the file handle is closed.
    with open(task_content_lan['html_layout'], 'r', encoding='utf-8') as f:
        html_layout = f.read()

    # Fill the placeholders that are constant for the language; the per-HIT
    # ones (${references}$, ${lan_test_questions}$) are filled when the HITs
    # are created. str() keeps replace() working if time_thr is stored as a
    # number in the config.
    html_layout = html_layout.\
        replace('${instructions_project_text}$', task_content_lan['instructions_project_text']).\
        replace('${instructions_intro_text}$', task_content_lan['instructions_intro_text']).\
        replace('${instructions_rules_text}$', task_content_lan['instructions_rules_text']).\
        replace('${lan_test_language}$', '"' + language_name + '"').\
        replace('${time_thr}$', str(task_content_lan['time_thr']))

    with open(data_folder + task_content_lan['tasks'], 'r', encoding='utf-8') as f:
        taskSets = json.load(f)

    # If you're only testing, just pick one hit and run it once, with no qualification barriers
    if not create_hits_in_production:
        # Tolerate configs that define no qualification requirements.
        TaskAttributes.pop('QualificationRequirements', None)
        TaskAttributes['MaxAssignments'] = 1
        random.seed(42)
        #taskSets = random.sample(taskSets,1)

    # Reward override for these languages.
    # NOTE(review): the reason for the different reward is not recorded here - confirm.
    if lan in ['sv','nl','ja']:
        TaskAttributes['Reward'] = "1"

    taskSets_all_lan[lan] = {
        'html_layout' : html_layout,
        'taskSets' : taskSets,
        'TaskAttributes' : TaskAttributes,
        'task_content':task_content_lan
    }

    print('Generated {} tasks in {} with the following configs:'.format(len(taskSets),language_name))
    pprint(TaskAttributes,indent=1) #verify the properties before running the HITs
    print('')

Generated 18 tasks in Japanese with the following configs:
{'AssignmentDurationInSeconds': 1800,
 'Description': 'Help us by clicking on weblinks and telling us which kind of '
                'Author and Publisher they are. You should have reading '
                'proficiency in Japanese.',
 'Keywords': 'Weblinks, Author type, Publisher type, Information Retrieval, '
             'Japanese',
 'LifetimeInSeconds': 604800,
 'MaxAssignments': 5,
 'QualificationRequirements': [{'ActionsGuarded': 'DiscoverPreviewAndAccept',
                                'Comparator': 'GreaterThanOrEqualTo',
                                'IntegerValues': [80],
                                'QualificationTypeId': '000000000000000000L0'},
                               {'ActionsGuarded': 'DiscoverPreviewAndAccept',
                                'Comparator': 'GreaterThan',
                                'IntegerValues': [1000],
                                'QualificationTypeId': '000000000000000

In [5]:
""" See how many HITs this will generate, already multiplied by the expected number of assignments.
Multiply the resulting number by the payment to see how much money this batch will consume. """
for lan in target_languages.keys():

    taskSets = taskSets_all_lan[lan]['taskSets']
    TaskAttributes = taskSets_all_lan[lan]['TaskAttributes']
    task_content = taskSets_all_lan[lan]['task_content']

    target_assignments = TaskAttributes['MaxAssignments']
    remaining_total = 0
    for taskSet in taskSets:
        # Assignments already completed for this task set in earlier batches.
        completed = sum(
            hit['hit']['NumberOfAssignmentsCompleted']
            for hit in hit_result_collection.find({
                'taskSet_id':taskSet['_id'],
                'type': task_content['type'],
                'language': target_languages[lan]
            })
        )
        # The creation cell launches a HIT only when the remainder is positive,
        # so a task set that is already over target must count as 0 here rather
        # than cancel out the remainder of other task sets.
        remaining_total += max(0, target_assignments - completed)
    print('Remaining HITs for {}: {}'.format(target_languages[lan],remaining_total))

Remaining HITs for Japanese: 90


In [6]:
""" Create the batch of HITs """

# Every HIT created by this run is stored with the same batch id.
results = []
batch_id = str(uuid.uuid4())

for lan in target_languages.keys():

    html_layout = taskSets_all_lan[lan]['html_layout']
    taskSets = taskSets_all_lan[lan]['taskSets']
    TaskAttributes = taskSets_all_lan[lan]['TaskAttributes']
    task_content = taskSets_all_lan[lan]['task_content']

    hit_type_id = ''
    launched = 0
    target_assignments = TaskAttributes['MaxAssignments']
    for taskSet in taskSets:
        # Adjust based on how many were already done in other batches
        completed = sum(
            hit['hit']['NumberOfAssignmentsCompleted']
            for hit in hit_result_collection.find({
                'taskSet_id':taskSet['_id'],
                'type': task_content['type'],
                'language': target_languages[lan]
            })
        )
        remaining = target_assignments - completed
        if remaining <= 0:
            continue

        TaskAttributes_hit = copy.deepcopy(TaskAttributes)
        TaskAttributes_hit['MaxAssignments'] = remaining

        # Drop any fixed seed set earlier (the sandbox branch seeds with 42) so
        # each HIT gets an independent draw of language test questions.
        random.seed(None)
        language_questions = random.sample(language_tests[task_content['language']],k=4)

        # NOTE(review): the references are injected with str(), i.e. as a Python
        # repr rather than JSON - confirm this is what the template expects.
        question_html = html_layout.\
            replace('${references}$', str(taskSet['taskSet'])).\
            replace('${lan_test_questions}$', json.dumps(language_questions))
        response = mt.create_hit(question_html, **TaskAttributes_hit)

        hit_type_id = response['HIT']['HITTypeId']
        result = {
            '_id': response['HIT']['HITId'],
            'batch_id': batch_id,
            'type': task_content['type'],
            'references': taskSet['taskSet'],
            'language_test': language_questions,
            'language': target_languages[lan],
            'taskSet_id':taskSet['_id'],
            'hit': response['HIT'],
            'timestamp': datetime.now()
        }
        results.append(result)
        launched += 1
        try:
            hit_result_collection.insert_one(result)
        except Exception:
            # The HIT already exists on MTurk at this point; print the record
            # so it is not lost, then stop.
            print(result)
            raise

    if launched == 0:
        # Nothing was created, so there is no HIT type to preview and no
        # launch to announce.
        print('No HITs launched for {}: every task set already reached its target'.format(lan))
        continue

    # For you to go to the HITs you just created and test them
    print('Launched tasks for {}'.format(lan))
    if not create_hits_in_production:
        print('You can view the HITs here:')
        print(mt.mturk_environment['preview']+"?groupId={}".format(hit_type_id))
    else:
        print('Launched! Good Luck!')

Launched tasks for ja
Launched! Good Luck!


In [18]:
""" If you set 'force' to TRUE, it will abort mission and force an expiry in all HITs and then delete them.
If you only want to remove the completed ones (make them Disposed so the update routine won't loop through tons of
HITs), keep it as FALSE."""
force = True

# Expiry date used to end a HIT before deleting it in force mode.
# NOTE(review): this relies on the date lying in the past.
FORCED_EXPIRY = datetime(2017, 1, 1)
# Pause between passes so the loop does not hammer MongoDB and the MTurk API
# while it waits for HITs to become deletable.
POLL_INTERVAL_SECONDS = 5
# (hit id, error text) pairs already printed, so a HIT that keeps failing on
# every pass is reported once instead of flooding the output.
reported_failures = set()

while True:
    # Select the stored HITs that are not yet disposed and have no pending
    # assignments; without force, also require that none are still available.
    # NOTE(review): this loop never writes to the database, so the non-force
    # exit condition depends on another process (see update_db.py mentioned in
    # the update cell) refreshing hit.HITStatus.
    query = {
        'hit.HITStatus': {'$not': {'$eq': 'Disposed'}},
        'hit.NumberOfAssignmentsPending': 0,
    }
    #query['type'] = 'relevance'
    if not force:
        query['hit.NumberOfAssignmentsAvailable'] = 0
    hit_result_collection_list = list(hit_result_collection.find(query))

    if force:
        # Force mode only stops once MTurk itself lists no HITs.
        finished = mt.client.list_hits()['NumResults'] == 0
    else:
        finished = len(hit_result_collection_list) == 0
    if finished:
        print('Finished')
        break

    for hit in hit_result_collection_list:
        hit_id = hit['_id']
        try:
            mt.client.delete_hit(HITId = hit_id)
        except Exception as delete_error:
            failure = delete_error
            if force:
                # Expire the HIT first, then retry the deletion.
                try:
                    mt.client.update_expiration_for_hit(HITId = hit_id, ExpireAt=FORCED_EXPIRY)
                    mt.client.delete_hit(HITId = hit_id)
                except Exception as retry_error:
                    failure = retry_error
                else:
                    failure = None
            if failure is not None:
                failure_key = (hit_id, str(failure))
                if failure_key not in reported_failures:
                    reported_failures.add(failure_key)
                    print('Could not remove', hit_id, '-', failure)
                continue
        print('Removed',hit_id)

    time.sleep(POLL_INTERVAL_SECONDS)

Removed 3TY2U1TECAMJCWQ3FEKPBYP4O7YJJJ
Finished


In [16]:
# Scratch cell: set the expiration of one hard-coded HIT and display its
# current state.
# NOTE(review): the list_hits result is neither stored nor displayed (it is
# not the last expression), so this call has no visible effect here.
mt.client.list_hits(MaxResults=100)
hitid = '3TY2U1TECAMJCWQ3FEKPBYP4O7YJJJ'
# The expiry is set to a fixed date that lies before the 2020 CreationTime
# shown in this cell's recorded output.
mt.client.update_expiration_for_hit(HITId = hitid, ExpireAt=datetime(2018, 1, 1))
# Last expression of the cell: the get_hit response is shown as the output.
mt.client.get_hit(HITId = hitid)
#mt.client.delete_hit(HITId = hitid)

# Disabled draft of a retry loop for deleting HITs; it refers to names
# (to_delete, force) that this cell does not define.
#while True:
#    for hit in to_delete:
#    try:
#        mt.client.delete_hit(HITId = hitid)
#        print('Deleted',hitid)
#        break
#    except Exception as e:
#        #print(hit['_id'], e)
#        if force:
#            try:
#                mt.client.update_expiration_for_hit(HITId = hitid, ExpireAt=datetime(2017, 1, 1))
#                mt.client.delete_hit(HITId = hitid)
#                print('Deleted',hitid)
#                break
#            except Exception as e:
#                pass
#                #print(hit['_id'],e)
#        continue

{'HIT': {'HITId': '3TY2U1TECAMJCWQ3FEKPBYP4O7YJJJ',
  'HITTypeId': '3UY9ZAUDMG4YWKJIILP6ITS3M3PALL',
  'HITGroupId': '3JG2O8WU0KJHH4B5I6M1LCROS52W94',
  'CreationTime': datetime.datetime(2020, 8, 12, 19, 39, 48, tzinfo=tzlocal()),
  'Title': 'Checking the relevance of websites [Dutch]',
  'Description': 'Help us by clicking on weblinks and making sure that they have the correct information. You should have reading proficiency in Dutch.',
  'Keywords': 'Weblinks, Statements, Information Retrieval, Dutch',
  'HITStatus': 'Unassignable',
  'MaxAssignments': 1,
  'Reward': '1.00',
  'AutoApprovalDelayInSeconds': 2592000,
  'Expiration': datetime.datetime(2020, 8, 12, 19, 42, 37, tzinfo=tzlocal()),
  'AssignmentDurationInSeconds': 1800,
  'QualificationRequirements': [],
  'HITReviewStatus': 'NotReviewed',
  'NumberOfAssignmentsPending': 1,
  'NumberOfAssignmentsAvailable': 0,
  'NumberOfAssignmentsCompleted': 0},
 'ResponseMetadata': {'RequestId': 'b0db16bf-1591-4812-b552-a984db818fd2',
  

# MISC

In [None]:
''' Update all non-disposed hits in the database with correct results '''
# It is much better to run the 'update_db.py' script, which runs this routine in loop and keeps the DB updated
# Refresh every stored HIT that is not marked as disposed: re-fetch the HIT
# from MTurk, collect its answers and write both back to the same document.
not_disposed = {'hit.HITStatus': {'$not': {'$eq': 'Disposed'}}}
for stored_hit in hit_result_collection.find(not_disposed):
    hit_id = stored_hit['_id']
    # '\r' keeps the progress message on a single, overwritten line.
    print('Updating',hit_id,end='\r')
    try:
        fresh_hit = mt.client.get_hit(HITId = hit_id)['HIT']
        answers = mt.approve_and_get_hit_answers(hit_id)
        hit_result_collection.update_one(
            {'_id': hit_id},
            {"$set": {"hit": fresh_hit, 'answers': answers}}
        )
    except Exception as e:
        # Best effort: report the problem and move on to the next HIT.
        print(e,end='\n\n')
# The padding overwrites what is left of the last progress message.
print('Done'+(' '*100))

In [5]:
from bs4 import BeautifulSoup
# Fetch one stored production HIT by id and parse its question markup so it
# can be inspected (and dumped to a file by the next cell).
i = db.hit_results.find({'_id':'32TZXEA1POWMEXREMIS0T79QRA014J'})[0]
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and warns about the guess, so the resulting tree can
# differ between machines.
# NOTE(review): 'lxml' is chosen because the recorded output (html/body
# wrapper, no head element) looks like lxml's - confirm lxml is installed.
y=BeautifulSoup(i['hit']['Question'], 'lxml')
y

<html><body><htmlquestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2011-11-11/HTMLQuestion.xsd"><htmlcontent>
<title>Wikidata Reference Verification</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" rel="stylesheet"/>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js"></script>
<script src="https://s3.amazonaws.com/mturk-public/externalHIT_v1.js" type="text/javascript"></script>
<link crossorigin="anonymous" href="https://use.fontawesome.com/releases/v5.7.0/css/all.css" integrity="sha384-lZN37f5QGtY3VHgisS14W3ExzMWZxybE1SJSEsQp9S+oqd12jhcu+A56Ebc1zFSJ" rel="stylesheet"/>
<style>
        input[type="range"].un

In [6]:
# Dump the parsed question markup to a local file for viewing in a browser.
# The page declares charset utf-8, so write it as UTF-8 instead of the
# platform default; plain 'w' is enough because the file is never read back.
with open('test.html', 'w', encoding='utf-8') as f:
    f.write(str(y))

In [4]:
# In-memory snapshot of every document currently in the results collection.
backup = [document for document in hit_result_collection.find()]

In [13]:
# Scratch arithmetic, evaluates to 2910.
# NOTE(review): the operands are undocumented - presumably task-set counts
# multiplied by assignments per HIT and another factor; confirm or remove.
((61+36)*5*6)

2910

In [22]:
# Scratch arithmetic, evaluates to 1830.
# NOTE(review): the operands are undocumented - presumably a task-set count
# multiplied by assignments per HIT and another factor; confirm or remove.
61*5*6

1830