ConversationTeacher parent class changed (#4256)

* ConversationTeacher parent class changed * reformat * [Crowdsourcing] can call blueprints from command line (#4254) * remove unnecessary run scripts * udpate readmes * BB2 handling increase in batch size during dynamic batching (#4238) * ConversationTeacher parent class changed * reformat * changed the teacher, tests are passing * removing 'episode_done' if it exists in the example Co-authored-by: Megan Ung <20617868+meganung@users.noreply.github.com>
facebookresearch · Dec 16, 2021 · 2c1bcb0 · 2c1bcb0
1 parent cd58114
commit 2c1bcb0
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 64 deletions.
diff --git a/parlai/core/teachers.py b/parlai/core/teachers.py
@@ -1611,11 +1611,11 @@ def setup_data(self, datafile):
                 yield act, next_episode_new
 
 
-class ConversationTeacher(FixedDialogTeacher):
+class ConversationTeacher(DialogTeacher):
     """
     This module provides access to data in the Conversations format.
 
-    Subclasses ``FixedDialogTeacher`` for functionality and provides an
+    Subclasses ``DialogTeacher`` for functionality and provides an
     implementation of ``setup_data()`` which iterates over datasets in the
     "Conversations" format. If your data is in the format below, use this class to
     handle file parsing for you.
@@ -1649,61 +1649,46 @@ class ConversationTeacher(FixedDialogTeacher):
     A set of examples X1 => Y1, X2 => Y2, and X3 => Y3 will be generated,
     forming one episode. However, Y1 => X2 and Y2 => X3 are not created as
     separate examples by default.
-    To change this behavior, you can set opt['label_turns']. The default
-    value is 'secondspeaker' (i.e., the second speaker's utterances are
+    To change this behavior, you can set ``opt['label_turns']`` or ``--label-turns flag``.
+    The default value is 'secondspeaker' (i.e., the second speaker's utterances are
     used as labels), but 'firstspeaker' and 'both' are also options. In the
     case of 'both', two episodes are generated for each conversation.
     """
 
+    @classmethod
+    def add_cmdline_args(
+        cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
+    ) -> ParlaiParser:
+        agent = super().add_cmdline_args(parser, partial_opt)
+        agent.add_argument(
+            '--label-turns',
+            type=str,
+            help='which speaker to use as label',
+            choices=['firstspeaker', 'secondspeaker', 'both'],
+            default='secondspeaker',
+        )
+        return parser
+
     def __init__(self, opt, shared=None):
-        super().__init__(opt, shared)
-        if not shared:
-            self.episodes = []
-            self.num_exs = 0
-            self.label_turns = opt.get('label_turns')
-            if opt.get('conversationteacher_datafile') is not None:
-                self._setup_data(opt.get('conversationteacher_datafile'))
-        else:
-            self.episodes = shared['episodes']
-            self.num_exs = sum(len(e) for e in self.episodes)
+        if not opt.get('conversationteacher_datafile'):
+            raise RuntimeError('conversationteacher_datafile not specified')
 
+        opt = copy.deepcopy(opt)
+        opt['datafile'] = opt.get('conversationteacher_datafile')
+        self.label_turns = opt.get('label_turns')
+        super().__init__(opt, shared)
         self.id = opt['task']
 
-        self.reset()
+    def _return_episode_examples(self, episode):
+        for idx, example in enumerate(episode):
+            episode_begin = idx == 0
+            if 'episode_done' in example:
+                example.pop('episode_done')
+            yield example, episode_begin
 
-    def share(self):
-        """
-        Share the episodes.
-        """
-        shared = super().share()
-        shared['episodes'] = self.episodes
-        return shared
-
-    def num_examples(self):
-        """
-        Return the number of examples from the data.
-        """
-        return self.num_exs
-
-    def num_episodes(self):
-        """
-        Return the number of episodes from the data.
-        """
-        return len(self.episodes)
-
-    def get(self, episode_idx, entry_idx=None):
-        """
-        Get a specific example from the dataset.
-        """
-        return Message(self.episodes[episode_idx][entry_idx])
-
-    def _setup_data(self, path):
-        logging.info("[loading data from json file into task:" + path + "]")
-        self.episodes = []
-        self.num_exs = 0
-        eps = []
+    def setup_data(self, path):
+        logging.info(f"[loading data from json file into task: {path} ]")
         conversations = Conversations(path)
-        self.num_exs = 0
         for conv in conversations:
             if conv.context:
                 warn_once(
@@ -1719,27 +1704,24 @@ def _setup_data(self, path):
             if self.label_turns in ['firstspeaker', 'both']:
                 eps = self._get_ep_from_turns(turns[::2], turns[1::2])
                 if eps:
-                    self.episodes.append(eps)
-                    self.num_exs += len(eps)
+                    for example, example_begins in self._return_episode_examples(eps):
+                        yield example, example_begins
 
             # train on even turns as labels (turns w/ second speaker)
             if self.label_turns in ['secondspeaker', 'both']:
                 eps = self._get_ep_from_turns(turns[1::2], turns[2::2])
                 if eps:
-                    self.episodes.append(eps)
-                    self.num_exs += len(eps)
+                    for example, example_begins in self._return_episode_examples(eps):
+                        yield example, example_begins
 
     def _get_ep_from_turns(self, xturns, yturns):
         eps = []
         for xturn, yturn in zip(xturns, yturns):
             turn = {}
             turn['text'] = xturn.get('text').strip()
             turn['labels'] = [yturn.get('text').strip()]
-            turn['episode_done'] = False
             eps.append(turn)
-        if eps:
-            eps[-1]['episode_done'] = True
-            return eps
+        return eps
 
 
 class AbstractImageTeacher(FixedDialogTeacher):
@@ -1930,9 +1912,9 @@ def get_image_features_path(self, task, image_model_name, dt):
         """
         Image features for the dataset images are stored here.
 
-        Can be overridden in subclass to use custom paths. Image features can be manually
-        copied into this directory or in the case of ImageLoader eligible models, they
-        will be built and stored here if not already there.
+        Can be overridden in subclass to use custom paths. Image features can be
+        manually copied into this directory or in the case of ImageLoader eligible
+        models, they will be built and stored here if not already there.
         """
         # In default implementation, self.data_path already has task name added
         image_features_path = os.path.join(self.data_path, 'image_features')

diff --git a/parlai/tasks/jsonfile/agents.py b/parlai/tasks/jsonfile/agents.py
@@ -47,19 +47,18 @@ def add_cmdline_args(
         return parser
 
     def __init__(self, opt, shared=None):
-        super().__init__(opt, shared)
         opt = copy.deepcopy(opt)
         if not opt.get('jsonfile_datapath'):
             raise RuntimeError('jsonfile_datapath not specified')
         datafile = opt['jsonfile_datapath']
-        if self.opt['jsonfile_datatype_extension']:
-            datafile += "_" + self.opt['datatype'].split(':')[0] + '.jsonl'
-        if shared is None:
-            self._setup_data(datafile)
+        if opt['jsonfile_datatype_extension']:
+            datafile += "_" + opt['datatype'].split(':')[0] + '.jsonl'
+        opt['conversationteacher_datafile'] = datafile
+        super().__init__(opt, shared)
+
         # Truncate datafile to just the immediate enclosing folder name and file name
         dirname, basename = os.path.split(datafile)
         self.id = os.path.join(os.path.split(dirname)[1], basename)
-        self.reset()
 
 
 class DefaultTeacher(JsonTeacher):