# Table of Contents
* [Submitting HITs](#Submitting-HITs)
	* [Building URLs for images on s3](#Building-URLs-for-images-on-s3)
	* [submitting HITs in groups](#submitting-HITs-in-groups)
* [Reviewing HITs](#Reviewing-HITs)
* [Writing annotation results](#Writing-annotation-results)
* [Ignore](#Ignore)


In [59]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict
%load_ext autoreload
%autoreload 2

In [2]:
import boto
import boto.mturk.connection as tc
import boto.mturk.question as tq
from keysTkingdom import mturk_ai2
import pickle

# Submitting HITs

## Building URLs for images on s3

In [16]:
def load_book_info():
    """
    loads the book-group breakdowns and the page range of every book

    Reads 'breakdowns.pkl' (unpickled as-is) and 'pdfs/page_ranges.csv', both
    relative to the current working directory. Every non-blank line of the
    ranges file is expected to look like '<book name> <int> <int> ...' with
    single spaces between the fields.

    returns a tuple (book_breakdowns, range_lookup); range_lookup maps the
    first field of each line to the list of integers that follow it
    """
    # NOTE(review): unpickling can run arbitrary code -- only load a
    # breakdowns.pkl that comes from a trusted source.
    with open('breakdowns.pkl', 'rb') as f:
        # a pickle written by Python 2 may need
        # pickle.load(f, encoding='latin1') when read under Python 3
        book_breakdowns = pickle.load(f)

    range_lookup = {}
    with open('pdfs/page_ranges.csv') as f:
        for line in f:
            # strip first so the key never keeps a trailing newline
            fields = line.strip().split(' ')
            if not fields[0]:
                # blank line (e.g. at the end of the file): nothing to record
                continue
            range_lookup[fields[0]] = [int(num) for num in fields[1:]]
    return book_breakdowns, range_lookup

def form_hit_url(book_name, page_n):
    """
    builds the url of the HIT page for one textbook page

    The image name in the query string is the book name with '.pdf' swapped
    for '_', followed by the page number and '.jpeg'; the page number is also
    passed as the id.
    """
    # earlier target, kept for reference:
    # 'https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/build/index.html'
    instructions_page = 'https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/textbook_hit_instructions/instructions.html'
    image_name = '{}{}.jpeg'.format(book_name.replace('.pdf', '_'), page_n)
    return '{}?url={}&id={}'.format(instructions_page, image_name, page_n)

def make_book_group_urls(book_groups, book_group, ranges):
    """
    builds the HIT urls for every page of every book in one book group

    book_groups: mapping of group name -> iterable of book names
    book_group: name of the group to build urls for
    ranges: mapping of book name -> two page numbers (start, end)

    returns a flat list of urls, one per page in range(start, end), keeping
    the order of the books in the group
    """
    group_urls = []
    for tb in book_groups[book_group]:
        # every entry must hold exactly two numbers
        start, end = ranges[tb]
        # NOTE(review): range() stops before `end`, so the page numbered
        # `end` gets no url -- confirm the ranges file stores an exclusive end
        group_urls.extend(form_hit_url(tb, page_n) for page_n in range(start, end))
    return group_urls

In [4]:
# Load the book-group breakdowns and the per-book page ranges from disk.
book_groups,ranges = load_book_info()

In [24]:
# Build the full list of HIT urls for the 'daily_sci' and 'spectrum_sci' book groups.
daily_sci_urls = make_book_group_urls(book_groups, 'daily_sci', ranges)
spectrum_sci_urls = make_book_group_urls(book_groups, 'spectrum_sci', ranges)

In [78]:
# daily_sci_urls[500:600]

## submitting HITs in groups

In [17]:
def creat_single_hit(url):
    """
    creates a single HIT from a provided url

    The HIT is an ExternalQuestion pointing at `url` and is created through
    the module-level `mturk` connection, which must exist before this is
    called.

    returns whatever mturk.create_hit returns, so the caller can inspect the
    newly created HIT
    """
    title = "Annotate Science Textbook"
    description = "Choose which category a text entry best belongs to"
    keywords = ['image', 'science']
    frame_height = 1000 # the height of the iframe holding the external hit
    amount = .05
#     duration = 3600

    questionform = tq.ExternalQuestion(url, frame_height)

    # hand the result back instead of dropping it in an unused local
    return mturk.create_hit(
        title = title,
        description = description,
        keywords = keywords,
        question = questionform,
        reward = boto.mturk.price.Price(amount=amount),
#         max_assignments=3,
        max_assignments=1,
#         duration = duration
    )
def create_hits_from_pages(page_links):
    """
    creates one HIT per url in page_links, in the order given
    """
    for page_url in page_links:
        creat_single_hit(page_url)

In [7]:
def delete_all_hits(mturk):
    """
    disables every HIT that mturk.get_all_hits() reports
    """
    # take the full snapshot before disabling anything
    hit_ids = [hit.HITId for hit in mturk.get_all_hits()]
    for hit_id in hit_ids:
        mturk.disable_hit(hit_id)

In [18]:
# Open the Mechanical Turk connection used by the cells in this notebook,
# pointed at the sandbox host and authenticated with the mturk_ai2 keys.
sandbox_host = 'mechanicalturk.sandbox.amazonaws.com' 
mturk = tc.MTurkConnection(
    aws_access_key_id = mturk_ai2.access_key,
    aws_secret_access_key = mturk_ai2.access_secret_key,
    host = sandbox_host,
    debug = 1 # debug = 2 prints out all requests.
)

In [35]:
# Disable every HIT reported by the connection before submitting new ones.
delete_all_hits(mturk)

In [36]:
# Submit a single HIT for one Daily Science page url.
creat_single_hit(daily_sci_urls[86])

In [27]:
# Show the url stored at index 85 of the Daily Science list.
daily_sci_urls[85]

'https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/textbook_hit_instructions/instructions.html?url=Daily_Science_Grade_1_Evan_Moor_93.jpeg&id=93'

In [203]:
# Show the urls at indices 502 through 510.
daily_sci_urls[502:511]

['https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/textbook_hit_instructions/instructions.html?url=Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_147.jpeg&id=147',
 'https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/textbook_hit_instructions/instructions.html?url=Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_148.jpeg&id=148',
 'https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/textbook_hit_instructions/instructions.html?url=Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_149.jpeg&id=149',
 'https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/textbook_hit_instructions/instructions.html?url=Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_150.jpeg&id=150',
 'https://s3-us-west-2.amazonaws.com/ai2-vision-turk-data/textbook-annotation-test/textbook_hit_instructions/instructions.html?url=Daily_Science_Grade_3_(Daily_Practice_Books)_Evan

In [37]:
# Submit one HIT for each of the 40 urls at indices 502 through 541.
create_hits_from_pages(daily_sci_urls[502:542])

There are 1100 pages from Daily Science.

# Reviewing HITs

In [None]:
def most_common_strict(turk_responses_single_page):
    """
    returns the consensus response among the raw responses for a given page

    turk_responses_single_page is expected to be one (key, rows) pair as
    produced by iterating a pandas groupby; only element [1] is used, and its
    'Answer.NumberOfItems' column holds the responses.

    returns a pandas Series with the most frequent response(s), or a Series
    holding the single string 'NO AGREEMENT' when no response was given at
    least twice (or there were no responses at all)
    """
    answers = turk_responses_single_page[1]['Answer.NumberOfItems']
    most_common = answers.mode()
    # Do not rely on mode() coming back empty when nothing repeats: newer
    # pandas releases return every value in that case, so count explicitly.
    if most_common.empty or answers.value_counts().max() < 2:
        return pd.Series(['NO AGREEMENT'])
    return most_common

In [None]:
# Print the raw 'Answer.NumberOfItems' responses of each page, grouped by image url.
# batch_results_df has to be loaded before this cell is run.
grouped_results_df = batch_results_df.groupby('Input.image_url')
for turk_response in grouped_results_df:
    # each item is an (image url, rows for that url) pair
    print(turk_response[1]['Answer.NumberOfItems'])

In [212]:
# Ask for the reviewable HITs, requesting a page size of 50.
# NOTE(review): presumably this returns only one page of results -- confirm
# whether more than 50 reviewable HITs need explicit paging.
r_hits = mturk.get_reviewable_hits(page_size=50)

In [215]:
# Collect the answers of every assignment of every reviewable HIT into a dict.
# The key is the first field of each answer set's first entry; the value is
# the field list of its second entry.
annotation_results = {}
for hit in r_hits:
    assignments = mturk.get_assignments(hit.HITId)
    for assigment in assignments:
        for answers in assigment.answers:
            # a later answer set with the same key replaces the earlier one
            annotation_results[answers[0].fields[0]] = answers[1].fields
            

In [289]:
# Show the keys of the collected results.
annotation_results.keys()

[u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_165.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_151.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_175.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_156.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_180.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_168.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_179.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_173.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_159.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_149.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_186.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_152.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_176.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_148.jpeg',
 u'Daily_Science_Gra

# Writing annotation results

In [280]:
import json
import jsonschema
import requests 
from pdfextraction.annotation_schema import page_schema
from flask import request

In [196]:
# Display the page schema imported from pdfextraction.annotation_schema.
page_schema

{'$schema': 'http://json-schema.org/draft-04/schema',
 'additionalProperties': False,
 'properties': {'figure': {'type': 'object'},
  'relationship': {'type': 'object'},
  'text': {'additionalProperties': False,
   'patternProperties': {'^T[0-9]+$': {'additionalProperties': False,
     'properties': {'box_id': {'type': 'string'},
      'category': {'enum': ['header/topic',
        'definition',
        'discussion',
        'question',
        'answer',
        'figure_label',
        'unlabeled']},
      'contents': {'type': 'string'},
      'rectangle': {'items': {'items': {'type': 'integer'},
        'maxItems': 2,
        'minItems': 2,
        'type': 'array'},
       'maxItems': 2,
       'minItems': 2,
       'type': 'array'},
      'score': {},
      'source': {'items': {'$schema': 'http://json-schema.org/draft-04/schema#',
        'additionalProperties': False,
        'properties': {'book_source': {'type': 'string'},
         'page_n': {'type': 'int'}},
        'required': ['

In [293]:
# POST the names of the annotated pages to the review API running on localhost.
review_api_endpoint = 'http://localhost:8080/api/review'
# NOTE(review): the keys are sent as the string form of the Python key list,
# not as a JSON array -- confirm this is what the review endpoint expects.
payload = {'pages_to_review': str(annotation_results.keys())}
headers = {'content-type': 'application/json'}
requests.post(review_api_endpoint, data=json.dumps(payload), headers=headers)

<Response [200]>

In [216]:
def form_annoation_url(page_name):
    """
    builds the local path of the merged-annotation file for a page image name

    Every occurrence of 'jpeg' in page_name is replaced by 'json' and the
    result is appended to the merged-annotations directory.
    """
    annotation_file = page_name.replace('jpeg', 'json')
    return '/Users/schwenk/wrk/notebooks/stb/ai2-vision-turk-data/textbook-annotation-test/merged-annotations/' + annotation_file

In [217]:
def load_local_annotation(page_name, base_path='/Users/schwenk/wrk/notebooks/stb/ai2-vision-turk-data/textbook-annotation-test/merged-annotations/'):
    """
    loads the local annotation json that belongs to a page image name

    page_name: image file name; every 'jpeg' in it is replaced by 'json' to
        get the annotation file name
    base_path: directory holding the annotation files; defaults to the
        merged-annotations directory this notebook was written against

    returns the parsed json content of the annotation file
    raises IOError/OSError if the file cannot be opened
    """
    # accept directories given with or without a trailing slash
    if base_path and not base_path.endswith('/'):
        base_path += '/'
    file_path = base_path + page_name.replace('jpeg', 'json')
    with open(file_path, 'r') as f:
        return json.load(f)

In [218]:
def process_annotation_results(anno_page_name, turk_consensus_result, unannotated_page, annotations_folder, page_schema):
    """
    applies the turk categories to a page annotation and writes it to disk

    anno_page_name: page image name; 'jpeg' is replaced by 'json' and any
        backslashes are dropped to get the output file name
    turk_consensus_result: sequence whose first element is a json string
        holding a list of {'id': ..., 'category': ...} entries
    unannotated_page: page annotation dict; the 'category' of each referenced
        entry under 'text' is overwritten in place
    annotations_folder: output directory prefix, concatenated as-is with the
        file name (so it should end with a path separator)
    page_schema: schema for the page annotation; currently unused because
        validation is switched off

    returns None; raises KeyError if a result refers to a text id that the
    page does not contain
    """
    turk_results_json = json.loads(turk_consensus_result[0])
    for result in turk_results_json:
        unannotated_page['text'][result['id']]['category'] = result['category']

    # validation stays disabled, as before; to turn it back on:
    #     jsonschema.Draft4Validator(page_schema).validate(json.loads(json.dumps(unannotated_page)))

    file_path = annotations_folder + anno_page_name.replace('jpeg', 'json').replace("\\", "")
    # text mode: json.dump writes str, which a binary handle rejects on Python 3
    with open(file_path, 'w') as rf:
        json.dump(unannotated_page, rf)

In [219]:
# Merge the turk answers of every page into its local annotation and write
# the result under ./test_write/.
# .items() works on both Python 2 and Python 3 (.iteritems() exists only on 2).
for page_name, results in annotation_results.items():
    unaltered_annotations = load_local_annotation(page_name)
    process_annotation_results(page_name, results, unaltered_annotations, './test_write/', page_schema)

In [84]:
# batch_results_df = pd.read_csv(data_dir+results_csv)
# print(batch_results_df.shape)
# batch_results_df.head(2)

# Ignore

In [6]:
# grouped_results_df = batch_results_df.groupby('Input.image_url')
# for image_response in grouped_results_df:
#     print(image_response[1]['Answer.NumberOfItems'])

In [11]:
# hit_type_1 = (
#     "Annotate Science Textbook",
#     "Choose which category a text entry best belongs to",
#     boto.mturk.price.Price(amount=0.05),
#     3600,
#     ['image', 'science']

# )

# Collect every HIT returned by get_all_hits() into a list.
my_hits = list(mturk.get_all_hits())

# for hit in my_hits:
#     mturk.disable_hit(hit.HITId)

# my_hit = list(mturk.get_all_hits())[0]

# hitidr = mturk.register_hit_type(*hit_type_1)

In [12]:
# Disable each HIT captured in my_hits.
for hit in my_hits:
    mturk.disable_hit(hit.HITId)

Choosing the right price for your HITs is crucial, and it can be tricky to figure out when you’re first starting. It’s here that those using Mechanical Turk as a digital sweatshop are separated from those using Mechanical Turk as a fair and equitable way to employ other people. Many turkers consider it unethical to pay under $0.10 per minute. This amount works out to a $6.00 hourly wage, or roughly the minimum wage in the US (though many states pay higher). Turkers specifically pay attention to price when determining whether or not a HIT is worth their time. As one turker said in a survey, “…I figure a good task is one I can make 10 to 12 cents a minute on.” If you’re looking to get your HITs done quickly and have high-quality turkers work on them (and trust me, you are!), then you should make sure you pay your turkers fairly. If you want a quick rule of thumb, it’s:

Fair Pay = $0.10 x (Average Number Of Minutes Per Assignment)