
Adding option for file with annotation indices per conversation and other features to turn annotations static #3154

Merged
merged 3 commits on Oct 12, 2020
Changes from 1 commit
@@ -11,6 +11,8 @@ class LaunchConfig:
command line as params)
"""

BLUEPRINT_TYPE = 'turn_annotations_static_inflight_qa_blueprint'

# Run the below to register the requester if needed (first time only)
# mephisto register mturk_sandbox --name:XXXX
# --access-key-id:XXXX --secret-access-key:XXXX
@@ -47,5 +49,5 @@ class LaunchConfig:
WORKER_BLOCK_LIST = []

# Blueprint specific params
ANNOTATE_LAST_UTTERANCE_ONLY = False
ASK_REASON = False
ANNOTATION_INDICES_JSONL = None
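
For reference, a LaunchConfig exercising these changes might now look like the sketch below (field values are illustrative, not from this PR; the optional fields ANNOTATION_QUESTION, CONVERSATION_COUNT, and ONBOARDING_DATA can also be omitted entirely, which run_task.py below tolerates):

class LaunchConfig:
    BLUEPRINT_TYPE = 'turn_annotations_static_inflight_qa_blueprint'
    TASK_NAME = 'turn-ann-s'
    TASK_TITLE = 'Annotate a conversation'  # illustrative
    TASK_DESCRIPTION = 'Check every problem type that applies to each reply.'  # illustrative
    TASK_REWARD = 0.5  # illustrative
    SUBTASKS_PER_UNIT = 7  # illustrative
    ANNOTATION_BUCKETS = 'task_config/annotation_buckets.json'  # illustrative path
    WORKER_BLOCK_LIST = []

    # Blueprint specific params
    ASK_REASON = False
    ANNOTATION_INDICES_JSONL = 'task_config/annotation_indices_example.jsonl'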
34 changes: 30 additions & 4 deletions parlai/crowdsourcing/tasks/turn_annotations_static/run_task.py
@@ -9,6 +9,7 @@
import shutil
import subprocess
import random
import logging

from parlai.core.params import ParlaiParser
from parlai.core.script import ParlaiScript
@@ -59,16 +60,16 @@ def setup_mephisto(launch_config):
architect_type, requester_name, db, args = parser.parse_launch_arguments()

arg_string = (
"--blueprint-type turn_annotations_static_inflight_qa_blueprint "
f"--blueprint-type {launch_config.BLUEPRINT_TYPE} "
f"--architect-type {architect_type} "
f"--requester-name {requester_name} "
f'--task-title "\\"{launch_config.TASK_TITLE}\\"" '
f'--task-description "\\"{launch_config.TASK_DESCRIPTION}\\"" '
f"--task-name turn-ann-s "
f'--task-name {launch_config.TASK_NAME} '
f'--task-source "{TASK_DIRECTORY}/webapp/build/bundle.js" '
f'--task-reward {launch_config.TASK_REWARD} '
f'--subtasks-per-unit {launch_config.SUBTASKS_PER_UNIT} '
f'--annotate-last-utterance-only {launch_config.ANNOTATE_LAST_UTTERANCE_ONLY} '
f'--annotation-buckets {launch_config.ANNOTATION_BUCKETS} '
f'--ask-reason {launch_config.ASK_REASON} '
f'--task-tags chat,conversation,dialog,partner '
# How many workers to do each assignment
@@ -82,6 +83,32 @@
f'--onboarding-qualification turn-ann-s-onb '
f"-use-onboarding True "
)
# Optional flags:
try:
arg_string += (
f'--annotation-question "\\"{launch_config.ANNOTATION_QUESTION}\\"" '
)
except Exception as exc:
logging.info(f'Launch config {launch_config} had no ANNOTATION_QUESTION field')

try:
arg_string += (
f'--annotation-indices-jsonl {launch_config.ANNOTATION_INDICES_JSONL} '
)
except Exception as exc:
logging.info(f'Launch config {launch_config} had no ANNOTATION_INDICES_JSONL')

try:
arg_string += f'--conversation-count {launch_config.CONVERSATION_COUNT} '
except Exception as exc:
logging.info(f'Launch config {launch_config} had no CONVERSATION_COUNT')

try:
arg_string += f'--onboarding-data {launch_config.ONBOARDING_DATA} '
except Exception as exc:
logging.info(f'Launch config {launch_config} had no ONBOARDING_DATA')

print(arg_string)
return db, arg_string


@@ -145,7 +172,6 @@ def run_task(opt):

build_task()
operator = Operator(db)
print(f'ARG_STRING: {arg_string}')
operator.parse_and_launch_run_wrapper(shlex.split(arg_string), extra_args={})
operator.wait_for_runs_then_shutdown()

@@ -0,0 +1,4 @@
{"data": [1, 3, 5]}
{"data": [1]}
{"data": [7]}
{"data": [9, 11]}
@@ -5,6 +5,8 @@
# LICENSE file in the root directory of this source tree.

import os
import random
import numpy as np
import math
import json
import logging
@@ -38,22 +40,55 @@ class TurnAnnotationsStaticBlueprint(StaticReactBlueprint):

def __init__(self, task_run: "TaskRun", opts: Any):
super().__init__(task_run, opts)
self.subtasks_per_unit = opts['subtasks_per_unit']
print(f'Running {self.__class__.__name__} with opts: {self.opts}')
random.seed(self.opts["random_seed"])
np.random.seed(self.opts["random_seed"])
self.subtasks_per_unit = self.opts['subtasks_per_unit']
self.conversation_count = self.opts['conversation_count']

if self.subtasks_per_unit <= 0:
raise Exception(
f'subtasks-per-unit must be greater than zero but was {self.subtasks_per_unit}'
)
grouped_data = []

self.raw_data = self._initialization_data_dicts

# Load from file if needed specifying which utterances within each
# conversation to annotate
self.annotation_indices = None
if self.opts['annotation_indices_jsonl']:
self.annotation_indices = []
with open(
self.opts['annotation_indices_jsonl'], "r", encoding="utf-8-sig"
) as f:
line = f.readline()
while line:
conversation_indices = json.loads(line)
self.annotation_indices.append(conversation_indices)
line = f.readline()
if len(self.annotation_indices) != len(self.raw_data):
raise Exception(
f'Cannot specify a different length of annotation indices ({len(self.annotation_indices)}) than conversations ({len(self.raw_data)}).'
)
# TODO: should check that utterances specified are all bot
# utterances (agent_idx == 1)

if self.conversation_count:
self.raw_data = self.raw_data[: self.conversation_count]
if self.annotation_indices:
self.annotation_indices = self.annotation_indices[
: self.conversation_count
]

# Reorganize the self-chat data
self._initialization_data_dicts = self.process_data(
self._initialization_data_dicts
self.raw_data, annotation_indices=self.annotation_indices
)

# Now chunk the data into groups of <num_subtasks>
grouped_data = []
logging.info(
f'Raw data length: {len(self._initialization_data_dicts)}. self.subtasks_per_unit: {self.subtasks_per_unit}'
f'Raw data length: {len(self.raw_data)}. self.subtasks_per_unit: {self.subtasks_per_unit}'
)
for i in range(0, len(self._initialization_data_dicts), self.subtasks_per_unit):
chunk = self._initialization_data_dicts[i : i + self.subtasks_per_unit]
@@ -70,6 +105,20 @@ def add_args_to_group(cls, group: "ArgumentGroup") -> None:
Adds required options for TurnAnnotationStaticBlueprint.
"""
super().add_args_to_group(group)
group.add_argument(
"--random-seed",
dest="random_seed",
type=int,
default=42,
help="seed for random",
)
group.add_argument(
"--annotation-question",
dest="annotation_question",
type=str,
default='Does this comment require any annotations? (Check all that apply)',
help="The string displayed above the checkboxes for each annotation in the task.",
)
group.add_argument(
"--subtasks-per-unit",
dest="subtasks_per_unit",
@@ -78,11 +127,11 @@ def add_args_to_group(cls, group: "ArgumentGroup") -> None:
help="number of subtasks/comparisons to do per unit",
)
group.add_argument(
"--annotate-last-utterance-only",
dest="annotate_last_utterance_only",
type=str2bool, # Need to handle it being 'False' in arg_string
default=False,
help="If we only want the crowdworker to annotate the last utterance in the conversation",
"--annotation-indices-jsonl",
dest="annotation_indices_jsonl",
type=str,
default=None,
help="Specify which utterance indices to annotate per conversation in a JSONL file. Must be same length as conversations data-jsonl file. See example file in task_config/annotation_indices_example.jsonl",
)
group.add_argument(
"--ask-reason",
@@ -91,6 +140,13 @@ def add_args_to_group(cls, group: "ArgumentGroup") -> None:
default=False,
help="If we want to ask the crowdworker for a reason for each of their annotations in a text field",
)
group.add_argument(
"--conversation-count",
dest="conversation_count",
type=int,
default=None,
help="Specify a positive integer if you want to use only the first N conversations in the data file",
)
group.add_argument(
"--onboarding-data",
dest="onboarding_data",
@@ -123,33 +179,93 @@ def get_frontend_args(self) -> Dict[str, Any]:
return {
"task_description": self.opts['task_description'],
"task_title": self.opts['task_title'],
"annotation_question": self.opts['annotation_question'],
"onboarding_data": onboarding_data,
"annotation_buckets": annotation_buckets,
"annotate_last_utterance_only": self.opts['annotate_last_utterance_only'],
"ask_reason": self.opts['ask_reason'],
"frame_height": '100%',
"num_subtasks": self.opts["subtasks_per_unit"],
"block_mobile": True,
}

def process_data(self, data_dicts):
def process_data(self, data_dicts, annotation_indices=None):
"""
Override this in a subclass if you want to change how data is processed from
input file before being sent to the frontend.
Override this in a subclass if you want to change how data is processed
from input file before being sent to the frontend.
"""
output = []
for d in data_dicts:
new_dialogue = []
for utt in d['dialog']:
# If there is a persona, which is context as in the ConvAI2
# task, we don't want to display the persona utterances
if 'persona' not in utt[0]['text']:
new_dialogue.append({'text': utt[0]['text'], 'agent_idx': 0})
if 'persona' not in utt[1]['text']:
new_dialogue.append({'text': utt[1]['text'], 'agent_idx': 1})
output.append(new_dialogue)
for conv_idx, d in enumerate(data_dicts):
max_turn_to_show = len(d['dialog']) - 1
if annotation_indices:
# We only want to show the conversation up to the last
# utterance we need annotations on, b/c otherwise may confuse
# or bias the turkers
if len(annotation_indices[conv_idx]) > 1:
logging.info(
f'Splitting {len(annotation_indices[conv_idx])} separate problematic utterance annotations in the same conversation into separate conversations for this task. This avoids biasing the turkers with utterances that may come after one of the annotations.'
)
for a in annotation_indices[conv_idx]:
processed_dialog = self._process_conversation(d, [a])
output.append(processed_dialog)
else:
processed_dialog = self._process_conversation(d, [max_turn_to_show])
output.append(processed_dialog)
print(
f'Processed {len(data_dicts)} total conversations into {len(output)} conversations to be used in crowdsourcing task.'
)
np.random.shuffle(output)
return output
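
# Worked example (not part of this diff): with the example indices file
# above, the conversation paired with {"data": [1, 3, 5]} fans out into
# three separate crowdsourcing conversations, truncated after utterance 1,
# 3, and 5 respectively, so workers never see turns that come after the
# utterance they are annotating. Index rows [1, 3, 5], [1], [7], [9, 11]
# over four input conversations thus yield 3 + 1 + 1 + 2 = 7 output
# conversations, shuffled with the seeded np.random for reproducibility.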

def _process_conversation(self, d, annotation_indices):
"""
:param annotation_indices: Array of turn indices to annotate within the
actual conversation, not including the context (so 0 is the "Hi!" if
that's the first non-context utterance of the conversation).
"""
new_dialogue = []
max_turn_to_show = max(annotation_indices)
adjusted_turn_idx = 0
for full_turn in d['dialog']:
if len(full_turn) != 2:
print(
f'Warning! Skipping incomplete conversation! full_turn was: {full_turn}'
)
continue
if adjusted_turn_idx > max_turn_to_show:
logging.info(
f'Skipping {adjusted_turn_idx}th utterance, b/c max_turn_to_show was {max_turn_to_show}.'
)
continue
# If there is a persona, which is context as in the ConvAI2
# task, we don't want to display the persona utterances
if 'persona' not in full_turn[0]['text']:
do_annotate = False
new_dialogue.append(
{
'text': full_turn[0]['text'],
'agent_idx': 0,
'do_annotate': do_annotate,
}
)
adjusted_turn_idx += 1
if 'persona' not in full_turn[1]['text']:
do_annotate = True
if annotation_indices:
do_annotate = adjusted_turn_idx in annotation_indices
new_dialogue.append(
{
'text': full_turn[1]['text'],
'agent_idx': 1,
'do_annotate': do_annotate,
}
)
adjusted_turn_idx += 1
if adjusted_turn_idx < max_turn_to_show:
raise Exception(
f'Conversation had {adjusted_turn_idx} utterances but max_turn_to_show was {max_turn_to_show}'
)
return new_dialogue
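
# Worked example for _process_conversation (not part of this diff):
# adjusted_turn_idx counts non-persona utterances across both speakers, so
# once any persona context is filtered from a human/bot dialog, bot replies
# land on the odd indices 1, 3, 5, ... Passing annotation_indices=[3]
# keeps utterances 0 through 3, sets do_annotate=True only on utterance 3
# (a bot turn), and drops every later turn pair.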


@register_mephisto_abstraction()
class TurnAnnotationsStaticInFlightQABlueprint(TurnAnnotationsStaticBlueprint):
@@ -171,10 +287,17 @@ def __init__(self, task_run: "TaskRun", opts: Any):
qc_convo = json.loads(line)
raw_qc_convos.append(qc_convo)
line = f.readline()
self.quality_control_convos = self.process_data(raw_qc_convos)
# Annotate all the utterances in the quality controls
# So annotation_indices=None here
self.quality_control_convos = self.process_data(
raw_qc_convos, annotation_indices=None
)

# Re-chunk the data to add a quality control convo as the last subtask
# No shuffling of the data for reproducibility's sake
# (quality control will always be last subtask)
# TODO: I don't think we need to re-chunk this actually; just iterate
# over the data and add the quality control task
all_data = []
for grp in self._initialization_data_dicts:
all_data.extend(grp)
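
The TODO above suggests skipping the flatten-and-re-chunk step entirely. One possible shape for that (a sketch, assuming the parent class has already grouped self._initialization_data_dicts into units, as the loop above implies):

def add_quality_control_subtasks(grouped_units, qc_convos):
    # Append one quality-control convo as the last subtask of each unit,
    # cycling through the QC pool if there are fewer QC convos than units.
    for unit_idx, unit in enumerate(grouped_units):
        unit.append(qc_convos[unit_idx % len(qc_convos)])
    return grouped_units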
@@ -19,10 +19,10 @@ function TaskFrontend({ taskData, taskConfig, isOnboarding, onSubmit }) {
return <LoadingScreen />;
}
if (isOnboarding) {
return <OnboardingComponent onboardingData={taskConfig.onboarding_data} annotationBuckets={taskConfig.annotation_buckets} onSubmit={onSubmit} />;
return <OnboardingComponent onboardingData={taskConfig.onboarding_data} annotationBuckets={taskConfig.annotation_buckets} annotationQuestion={taskConfig.annotation_question} onSubmit={onSubmit} />;
}
return (
<MainTaskComponent taskData={taskData} annotationBuckets={taskConfig.annotation_buckets} taskTitle={taskConfig.task_title} taskDescription={taskConfig.task_description} taskConfig={taskConfig} onSubmit={onSubmit}></MainTaskComponent>
<MainTaskComponent taskData={taskData} taskTitle={taskConfig.task_title} taskDescription={taskConfig.task_description} taskConfig={taskConfig} onSubmit={onSubmit}></MainTaskComponent>
);
}

@@ -78,12 +78,13 @@ function OnboardingDirections({ children }) {
);
}

function OnboardingUtterance({ annotationBuckets, turnIdx, text }) {
function OnboardingUtterance({ annotationBuckets, annotationQuestion, turnIdx, text }) {
var annotationQuestionWithoutQuotes = annotationQuestion.replace(/['"]+/g, '');
var extraElements = '';
if (turnIdx % 2 == 1) {
extraElements = '';
extraElements = (<span key={'extra_' + turnIdx}><br /><br />
<span style={{ fontStyle: 'italic' }}>Does this comment from your partner have any annotations? (Check all that apply)<br />
<span style={{ fontStyle: 'italic' }}><span dangerouslySetInnerHTML={{ __html: annotationQuestionWithoutQuotes }}></span><br />
<Checkboxes annotationBuckets={annotationBuckets} turnIdx={turnIdx} askReason={false} />
</span>
</span>)
@@ -99,7 +100,7 @@ function OnboardingUtterance({ annotationBuckets, turnIdx, text }) {
)
}

function OnboardingComponent({ onboardingData, annotationBuckets, onSubmit }) {
function OnboardingComponent({ onboardingData, annotationBuckets, annotationQuestion, onSubmit }) {
return (
<div id="onboarding-main-pane">
<OnboardingDirections>
@@ -116,11 +117,13 @@ function OnboardingComponent({ onboardingData, annotationBuckets, onSubmit }) {
<OnboardingUtterance
key={idx * 2}
annotationBuckets={annotationBuckets}
annotationQuestion={annotationQuestion}
turnIdx={idx * 2}
text={turn[0].text} />
<OnboardingUtterance
key={idx * 2 + 1}
annotationBuckets={annotationBuckets}
annotationQuestion={annotationQuestion}
turnIdx={idx * 2 + 1}
text={turn[1].text} />
</div>
@@ -134,7 +137,7 @@
<div style={{ textAlign: 'center' }}>
<button id="onboarding-submit-button"
className="button is-link btn-lg"
onClick={() => handleOnboardingSubmit({ onboardingData, annotationBuckets, onSubmit } )}
onClick={() => handleOnboardingSubmit({ onboardingData, annotationBuckets, onSubmit })}
>
Submit Answers
</button>