# MTURK: Create MTurk task for classification
This creates an MTurk task to have crowd workers classify the messages, and stores the result for further analysis.

The code includes attention checks. Only US-based workers who hold the MTurk Masters qualification are allowed to participate.



In [1]:
import pandas as pd

import boto3
from random import sample
import random

# Mechanical Turk API client. No endpoint_url is passed, so boto3 picks its
# default endpoint for the region.
# NOTE: the two credential strings are placeholders and must be replaced
# before running; never commit real keys together with the notebook.
mturk = boto3.client(
    'mturk',
    region_name='us-east-1',
    aws_access_key_id="XXXXX",
    aws_secret_access_key="XXXXX",
)

# Qualification 1: the worker's locale must be the United States.
# Workers who do not qualify cannot even preview the HIT.
qualification_req1 = dict(
    QualificationTypeId='00000000000000000071',
    Comparator='EqualTo',
    LocaleValues=[dict(Country='US')],
    RequiredToPreview=True,
)

# Qualification 2: the worker must hold the Masters qualification
# ('Exists' only checks that the qualification has been granted).
qualification_req2 = dict(
    QualificationTypeId='2F1QJWKUDD8XADTFD2Q0G6UTO95ALH',
    Comparator='Exists',
    RequiredToPreview=True,
)

# Load the tweets to be labelled and shuffle the rows (frac=1 keeps every row);
# the index is rebuilt as 0..n-1 so positional batching follows the shuffled order.
df = (
    pd.read_csv('selected_sample_tweets_mturk.csv')
    .sample(frac=1)
    .reset_index(drop=True)
)

# HIT properties: the listing text shown to workers plus payment and timing.
title = 'Tweet classification: Democrat or Republican poster?'
keywords = 'classification, politics, democrat, republican'
description = (
    'You will be given list of 50 Twitter posts from different United States '
    'politicians, sent during the two months preceding the 2020 US presidential '
    'election, that is, between September 3rd, 2020, and November 3rd, 2020. '
    'Your task is to use your knowledge of US politics to make an educated guess '
    'on whether the poster is a Democrat or Republican. If you cannot make an '
    'educated guess, just make your best guess. Note that if you misclassify an '
    'obvious tweet, your work will be rejected.'
)
reward = '1.0'  # passed as the HIT's Reward (kept as a string)

# Both time spans are expressed in seconds.
duration = 15 * 60  # 15 minutes allowed per assignment
lifetime = 48 * 60 * 60  # the HIT stays available for 2 days

# Define the question form: the outer QuestionForm XML document that is sent
# as the HIT's `Question`. It is filled in with str.format using three named
# placeholders:
#   {title}       - heading shown in the Overview section
#   {description} - instructions, wrapped in a <p> inside the Overview
#   {questions}   - the already-rendered <Question> elements, joined together
question_form = '''<?xml version="1.0" encoding="UTF-8"?>
<QuestionForm xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionForm.xsd">
  <Overview>
    <Title>{title}</Title>
    <FormattedContent><![CDATA[<p>{description}</p>]]></FormattedContent>
  </Overview>

{questions}
  
</QuestionForm>'''

# Template for a single required radio-button question. It has two positional
# `{}` slots, filled in this order by str.format:
#   1. the suffix of the QuestionIdentifier (the identifier becomes "q_<suffix>")
#   2. the text displayed after "Tweet content:" (inserted inside single quotes)
# The SelectionIdentifier of the chosen option ('republican' or 'democrat') is
# the value that comes back in the worker's answer.
question = '''  <Question>
    <QuestionIdentifier>q_{}</QuestionIdentifier>
    <DisplayName>Party Classification</DisplayName>
    <IsRequired>true</IsRequired>
    <QuestionContent>
      <FormattedContent><![CDATA[<b>Tweet content:</b> '{}']]></FormattedContent>
    </QuestionContent>
    <AnswerSpecification>
      <SelectionAnswer>
        <StyleSuggestion>radiobutton</StyleSuggestion>
        <Selections>
          <Selection>
            <SelectionIdentifier>republican</SelectionIdentifier>
            <Text>Republican</Text>
          </Selection>
          <Selection>
            <SelectionIdentifier>democrat</SelectionIdentifier>
            <Text>Democrat</Text>
          </Selection>
        </Selections>
      </SelectionAnswer>
    </AnswerSpecification>
  </Question>'''

# Attention checks: made-up statements that spell out the poster's party.
# Each entry pairs the statement ('question') with the option a careful
# worker has to pick ('answer').
known_questions = [
    {'question': statement, 'answer': party}
    for party, statement in (
        ('republican', 'If one thing is certain, it is that I am Republican, not a Democrat.'),
        ('republican', 'There are many good point being raised here, but I remain a Republican.'),
        ('republican', 'My vote and my allegiance is and will always be with the Republican party.'),
        ('republican', 'I am a Republican! That is one thing that is for sure.'),
        ('republican', 'At least no one can doubt that I am a card-carrying republican!'),
        ('republican', 'As a member of the republican party I fully support the president.'),
        ('democrat', 'I am a democrat, so you should answer that.'),
        ('democrat', 'I am not a republican. I am in other words a Democrat.'),
        ('democrat', 'I support the Democratic party.'),
        ('democrat', 'How could I support this bill and still call myself a Democrat?'),
        ('democrat', 'As a proud Democrat, I completely reject this proposition!'),
        ('democrat', 'We are Democrats, and that means something.'),
    )
]

# Create one HIT per batch of `question_per_hit` tweets. Every HIT also gets
# four attention-check questions mixed in at random positions.
hitids = []  # ids of the created HITs
question_per_hit = 50
hits = []  # full create_hit responses, in creation order
for i in range(0, len(df), question_per_hit):
    sentences_batch = df.iloc[i:i+question_per_hit]

    # Render one <Question> per tweet; line breaks become <br /> so they show
    # up in the rendered content.
    # NOTE(review): the tweet text is inserted as-is into the FormattedContent
    # CDATA section. Text containing ']]>' or markup that is not well-formed
    # may make the HIT invalid -- confirm the CSV text is already HTML-escaped.
    questions = [
        question.format(row['id'], row['text'].replace("\n", "<br />\n"))
        for _, row in sentences_batch.iterrows()
    ]

    # Add known questions at random spots. Their identifiers become
    # "q_KNOWN<n>_<expected answer>", which lets the expected answer be read
    # back from the identifier when the submissions are checked.
    known = sample(known_questions, 4)
    for n, check in enumerate(known, start=1):
        known_id = f'KNOWN{n}_' + check['answer']
        kq = question.format(known_id, check['question'])
        # randint is inclusive on both ends, so the check may also land at the
        # very end of the list.
        random_index = random.randint(0, len(questions))
        questions.insert(random_index, kq)

    questions_joined = '\n '.join(questions)
    post = question_form.format(title=title, description=description, questions=questions_joined)

    hit = mturk.create_hit(
        MaxAssignments=5,  # number of assignments requested per HIT
        AutoApprovalDelayInSeconds=60 * 60 * 24 * 2,  # 2 days
        LifetimeInSeconds=lifetime,
        AssignmentDurationInSeconds=duration,
        Reward=reward,
        Title=title,
        Description=description,
        Keywords=keywords,
        Question=post,
        # Both requirements apply: US locale and Masters qualification.
        QualificationRequirements=[qualification_req1, qualification_req2],
    )
    print('Created HIT with id: ' + hit['HIT']['HITId'])
    hitids.append(hit['HIT']['HITId'])
    hits.append(hit)

# Print the preview link built from the group id of the most recent HIT.
# Guarded so an empty input does not end in a NameError.
if hits:
    print('https://worker.mturk.com/mturk/preview?groupId='+hits[-1]['HIT']['HITGroupId'])
else:
    print('No HITs were created (the input dataframe is empty).')

# Persist the HIT ids to disk so they are not lost.
import json
import datetime

# Minute-resolution timestamp of the current local time, used in the file name.
now = datetime.datetime.now()
timestamp = f"{now:%Y-%m-%d_%H-%M}"
filename = "hitsids_run_" + timestamp + ".json"

# Serialise the list of ids as JSON and write it out in one go.
with open(filename, "w") as outfile:
    outfile.write(json.dumps(hitids))

In [2]:
# VERIFY THE ANSWERS AND REJECT THE FAILED ONES
# For every HIT, inspect the assignments whose status is 'Submitted' and
# reject those in which at least one attention-check question (identifier
# starting with "q_KNOWN") was answered incorrectly. Assignments that pass are
# only reported; approval is left as a manual step.
import xmltodict

# Download results from all
datas = []
for hit_id in hitids:
    worker_results = mturk.list_assignments_for_hit(HITId=hit_id)

    print(f'\nChecking {hit_id}... {worker_results["NumResults"]} results...')
    if not worker_results['NumResults'] > 0:
        print("No results yet")
        continue

    for assignment in worker_results['Assignments']:
        workerid = assignment['WorkerId']
        status = assignment['AssignmentStatus']
        assignmentid = assignment['AssignmentId']

        print(workerid, status)
        if status != 'Submitted':
            # Already approved or rejected: nothing to evaluate.
            continue

        print("New submission. Evaluating answers...")
        xml_doc = xmltodict.parse(assignment['Answer'])
        answers = xml_doc['QuestionFormAnswers']['Answer']
        # xmltodict yields a single mapping instead of a list when there is
        # only one <Answer> element; normalise so the loop below always works.
        if not isinstance(answers, list):
            answers = [answers]

        # Reset per assignment so a count from a previous worker can never
        # leak into this decision.
        errors = 0
        for answer_field in answers:
            # Named question_id (not `question`) to avoid overwriting the
            # question template used when creating the HITs.
            question_id = answer_field['QuestionIdentifier']
            answer = answer_field['SelectionIdentifier']
            if question_id.startswith('q_KNOWN'):
                # Identifier format: q_KNOWN<n>_<expected answer>
                expected = question_id.rsplit('_', 1)[-1]
                print(expected, answer)
                if expected != answer:
                    errors += 1
                    print('Wrong answer.')
        print(f"Worker {workerid}")

        if errors > 0:
            print("Rejecting answers...")
            mturk.reject_assignment(AssignmentId=assignmentid, RequesterFeedback=f'You failed {errors} question(s) where the message explicitly stated the political affiliation of the poster. We have to assume you are a bot.')
        else:
            print("It seems this should be accepted... but needs to be checked with data.")
            # Approval is intentionally not automated here; approve manually
            # with mturk.approve_assignment(AssignmentId=...) after review.


In [127]:
# PARSE AND SAVE THE RESULTS TO A DATAFRAME
# Collects one row per answered question (question identifier, selected
# option, worker id, assignment id, assignment status) across all HITs and
# writes the table to MTURKDATA.csv.
import xmltodict
datas = []
# Iterate over the HIT ids (strings); `hits` holds the full create_hit
# responses and cannot be passed as HITId.
for hit_id in hitids:
    worker_results = mturk.list_assignments_for_hit(HITId=hit_id)
    if not worker_results['NumResults'] > 0:
        continue
    for assignment in worker_results['Assignments']:
        xml_doc = xmltodict.parse(assignment['Answer'])
        workerid = assignment['WorkerId']
        status = assignment['AssignmentStatus']
        assignment_id = assignment['AssignmentId']
        answers = xml_doc['QuestionFormAnswers']['Answer']
        # xmltodict yields a single mapping instead of a list when there is
        # only one <Answer> element; normalise so that answer is not dropped.
        if not isinstance(answers, list):
            answers = [answers]
        for answer_field in answers:
            datas.append({
                'question': answer_field['QuestionIdentifier'],
                'answer': answer_field['SelectionIdentifier'],
                'workerid': workerid,
                'assignment_id': assignment_id,
                'status': status,
            })
resultdata = pd.DataFrame(datas)

resultdata.to_csv('MTURKDATA.csv')