Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

Commit

Permalink
ConversationTeacher parent class changed (#4256)
Browse files Browse the repository at this point in the history
* ConversationTeacher parent class changed

* reformat

* [Crowdsourcing] can call blueprints from command line  (#4254)

* remove unnecessary run scripts

* update readmes

* BB2 handling increase in batch size during dynamic batching (#4238)

* ConversationTeacher parent class changed

* reformat

* changed the teacher, tests are passing

* removing 'episode_done' if it exists in the example

Co-authored-by: Megan Ung <20617868+meganung@users.noreply.github.com>
  • Loading branch information
mojtaba-komeili and meganung committed Dec 16, 2021
1 parent cd58114 commit 2c1bcb0
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 64 deletions.
98 changes: 40 additions & 58 deletions parlai/core/teachers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1611,11 +1611,11 @@ def setup_data(self, datafile):
yield act, next_episode_new


class ConversationTeacher(FixedDialogTeacher):
class ConversationTeacher(DialogTeacher):
"""
This module provides access to data in the Conversations format.
Subclasses ``FixedDialogTeacher`` for functionality and provides an
Subclasses ``DialogTeacher`` for functionality and provides an
implementation of ``setup_data()`` which iterates over datasets in the
"Conversations" format. If your data is in the format below, use this class to
handle file parsing for you.
Expand Down Expand Up @@ -1649,61 +1649,46 @@ class ConversationTeacher(FixedDialogTeacher):
A set of examples X1 => Y1, X2 => Y2, and X3 => Y3 will be generated,
forming one episode. However, Y1 => X2 and Y2 => X3 are not created as
separate examples by default.
To change this behavior, you can set opt['label_turns']. The default
value is 'secondspeaker' (i.e., the second speaker's utterances are
To change this behavior, you can set ``opt['label_turns']`` or pass the ``--label-turns`` flag.
The default value is 'secondspeaker' (i.e., the second speaker's utterances are
used as labels), but 'firstspeaker' and 'both' are also options. In the
case of 'both', two episodes are generated for each conversation.
"""

@classmethod
def add_cmdline_args(
    cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
) -> ParlaiParser:
    """
    Register the ``--label-turns`` command line option.

    The parent class registers its own arguments first; the option is then
    added to whatever object the parent call returns.

    :param parser:
        parser the arguments are added to.
    :param partial_opt:
        partially parsed options, forwarded unchanged to the parent class.

    :return:
        the ``parser`` that was passed in.
    """
    arg_target = super().add_cmdline_args(parser, partial_opt)
    # Selects which speaker's utterances become the labels.
    arg_target.add_argument(
        '--label-turns',
        type=str,
        default='secondspeaker',
        choices=['firstspeaker', 'secondspeaker', 'both'],
        help='which speaker to use as label',
    )
    return parser

def __init__(self, opt, shared=None):
super().__init__(opt, shared)
if not shared:
self.episodes = []
self.num_exs = 0
self.label_turns = opt.get('label_turns')
if opt.get('conversationteacher_datafile') is not None:
self._setup_data(opt.get('conversationteacher_datafile'))
else:
self.episodes = shared['episodes']
self.num_exs = sum(len(e) for e in self.episodes)
if not opt.get('conversationteacher_datafile'):
raise RuntimeError('conversationteacher_datafile not specified')

opt = copy.deepcopy(opt)
opt['datafile'] = opt.get('conversationteacher_datafile')
self.label_turns = opt.get('label_turns')
super().__init__(opt, shared)
self.id = opt['task']

self.reset()
def _return_episode_examples(self, episode):
    """
    Yield ``(example, episode_begin)`` pairs for each turn of one episode.

    ``episode_begin`` is ``True`` only for the first turn. Any
    ``'episode_done'`` key is removed from the example (in place) before it
    is yielded.

    :param episode:
        iterable of example dicts that make up a single episode.
    """
    is_first_turn = True
    for turn in episode:
        # NOTE(review): presumably the consumer of ``setup_data`` derives
        # episode boundaries from the flag yielded here rather than from an
        # 'episode_done' field -- confirm against the parent teacher.
        turn.pop('episode_done', None)
        yield turn, is_first_turn
        is_first_turn = False

def share(self):
"""
Share the episodes.
"""
shared = super().share()
shared['episodes'] = self.episodes
return shared

def num_examples(self):
"""
Return the number of examples from the data.
"""
return self.num_exs

def num_episodes(self):
"""
Return the number of episodes from the data.
"""
return len(self.episodes)

def get(self, episode_idx, entry_idx=None):
"""
Get a specific example from the dataset.
"""
return Message(self.episodes[episode_idx][entry_idx])

def _setup_data(self, path):
logging.info("[loading data from json file into task:" + path + "]")
self.episodes = []
self.num_exs = 0
eps = []
def setup_data(self, path):
logging.info(f"[loading data from json file into task: {path} ]")
conversations = Conversations(path)
self.num_exs = 0
for conv in conversations:
if conv.context:
warn_once(
Expand All @@ -1719,27 +1704,24 @@ def _setup_data(self, path):
if self.label_turns in ['firstspeaker', 'both']:
eps = self._get_ep_from_turns(turns[::2], turns[1::2])
if eps:
self.episodes.append(eps)
self.num_exs += len(eps)
for example, example_begins in self._return_episode_examples(eps):
yield example, example_begins

# train on even turns as labels (turns w/ second speaker)
if self.label_turns in ['secondspeaker', 'both']:
eps = self._get_ep_from_turns(turns[1::2], turns[2::2])
if eps:
self.episodes.append(eps)
self.num_exs += len(eps)
for example, example_begins in self._return_episode_examples(eps):
yield example, example_begins

def _get_ep_from_turns(self, xturns, yturns):
eps = []
for xturn, yturn in zip(xturns, yturns):
turn = {}
turn['text'] = xturn.get('text').strip()
turn['labels'] = [yturn.get('text').strip()]
turn['episode_done'] = False
eps.append(turn)
if eps:
eps[-1]['episode_done'] = True
return eps
return eps


class AbstractImageTeacher(FixedDialogTeacher):
Expand Down Expand Up @@ -1930,9 +1912,9 @@ def get_image_features_path(self, task, image_model_name, dt):
"""
Image features for the dataset images are stored here.
Can be overridden in subclass to use custom paths. Image features can be manually
copied into this directory or in the case of ImageLoader eligible models, they
will be built and stored here if not already there.
Can be overridden in subclass to use custom paths. Image features can be
manually copied into this directory or in the case of ImageLoader eligible
models, they will be built and stored here if not already there.
"""
# In default implementation, self.data_path already has task name added
image_features_path = os.path.join(self.data_path, 'image_features')
Expand Down
11 changes: 5 additions & 6 deletions parlai/tasks/jsonfile/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,18 @@ def add_cmdline_args(
return parser

def __init__(self, opt, shared=None):
super().__init__(opt, shared)
opt = copy.deepcopy(opt)
if not opt.get('jsonfile_datapath'):
raise RuntimeError('jsonfile_datapath not specified')
datafile = opt['jsonfile_datapath']
if self.opt['jsonfile_datatype_extension']:
datafile += "_" + self.opt['datatype'].split(':')[0] + '.jsonl'
if shared is None:
self._setup_data(datafile)
if opt['jsonfile_datatype_extension']:
datafile += "_" + opt['datatype'].split(':')[0] + '.jsonl'
opt['conversationteacher_datafile'] = datafile
super().__init__(opt, shared)

# Truncate datafile to just the immediate enclosing folder name and file name
dirname, basename = os.path.split(datafile)
self.id = os.path.join(os.path.split(dirname)[1], basename)
self.reset()


class DefaultTeacher(JsonTeacher):
Expand Down

0 comments on commit 2c1bcb0

Please sign in to comment.