Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

Friends dataset #4568

Merged
merged 13 commits into from Jun 10, 2022
5 changes: 5 additions & 0 deletions parlai/tasks/friends/__init__.py
@@ -0,0 +1,5 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
133 changes: 133 additions & 0 deletions parlai/tasks/friends/agents.py
@@ -0,0 +1,133 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional
from parlai.core.opt import Opt
from parlai.core.teachers import DialogTeacher
from parlai.core.params import ParlaiParser
from .build import build

import copy
import os
import json


def _path(opt, *additions):
    """
    Return the path of the Friends utterances file under ``opt['datapath']``.

    :param opt:
        mapping that provides the ``'datapath'`` root directory.
    :param additions:
        optional extra path components; they are joined *after* the
        ``utterances.jsonl`` component.

    :return:
        ``<datapath>/Friends/friends-corpus/utterances.jsonl`` followed by any
        ``additions``, joined with the platform's path separator.
    """
    # Each directory level is passed as its own component so os.path.join
    # picks the separator, instead of embedding a literal '/' in one component.
    return os.path.join(
        opt['datapath'], 'Friends', 'friends-corpus', 'utterances.jsonl', *additions
    )


class DefaultTeacher(DialogTeacher):
    """
    Teacher that turns the Friends utterances file into next-line examples.

    Utterances are grouped by ``conversation_id``. Within a conversation the
    ``text`` of each example is the transcript so far (``"<speaker>: <text>"``
    lines joined by newlines, optionally preceded by ``START_TOKEN``) and the
    ``label`` is the next transcript line, or ``SILENCE_TOKEN`` when the next
    speaker is not one this teacher generates labels for.
    """

    # Context used for the first utterance when --use_start_token is set.
    START_TOKEN = '<START>'
    # Label emitted for utterances of speakers that are not being modelled.
    SILENCE_TOKEN = '<SILENCE>'
    # Speakers that receive real labels when --character is 'All'.
    MAIN_CHARACTERS = [
        'Rachel Green',
        'Monica Geller',
        'Phoebe Buffay',
        'Joey Tribbiani',
        'Chandler Bing',
        'Ross Geller',
    ]

    def __init__(self, opt, shared=None):
        """
        Build the dataset if needed and configure the teacher from ``opt``.

        :param opt:
            options; must contain ``datapath``, ``character``,
            ``use_silence_token`` and ``use_start_token``.
        :param shared:
            forwarded unchanged to the base class constructor.
        """
        # Work on a copy so the caller's opt is not mutated by 'datafile'.
        opt = copy.deepcopy(opt)
        build(opt)
        opt['datafile'] = _path(opt)
        # Set before super().__init__ -- presumably the base constructor can
        # trigger setup_data(), which reads these attributes; confirm.
        self.character = opt['character']
        self.use_silence_token = opt['use_silence_token']
        self.use_start_token = opt['use_start_token']
        super().__init__(opt, shared)

    def setup_data(self, datafile):
        """
        Yield ``(message, flag)`` pairs built from the utterances file.

        :param datafile:
            path to a JSON-lines file; every non-blank line must be an object
            with ``text``, ``speaker`` and ``conversation_id`` keys.

        :return:
            a generator of ``({'text': ..., 'label': ...}, flag)`` tuples, where
            ``flag`` is True only for the last utterance of a conversation.
        """
        conversations = {}

        # JSON text is UTF-8; an explicit encoding avoids depending on the
        # platform's locale default.
        with open(datafile, 'r', encoding='utf-8') as json_file:
            for json_str in json_file:
                if not json_str.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                utterance = json.loads(json_str)
                # Group utterances per conversation, preserving file order.
                conversations.setdefault(utterance['conversation_id'], []).append(
                    {"text": utterance['text'], "speaker": utterance['speaker']}
                )

        for utterances in conversations.values():
            last_utterance_index = len(utterances) - 1

            for index, utterance in enumerate(utterances):
                speaker = utterance['speaker']
                text = utterance['text']

                if index == 0:
                    if self.use_start_token:
                        # The first utterance becomes an example whose context
                        # is just the start token.
                        context = self.START_TOKEN
                    else:
                        # Skip the first utterance since there's no context;
                        # it only seeds the running transcript.
                        context = f'{speaker}: {text}'
                        continue

                prev_context = context
                context += '\n' + f'{speaker}: {text}'

                # NOTE(review): this flag marks the END of a conversation, while
                # DialogTeacher.setup_data appears to document the second tuple
                # element as a "new episode" flag -- confirm the intended
                # episode boundaries before changing (fixtures depend on it).
                is_conversation_done = index == last_utterance_index

                # By default, generate training examples for all 6 main
                # characters. Otherwise only generate training examples for
                # the chosen character.
                if (
                    self.character == 'All' and speaker in self.MAIN_CHARACTERS
                ) or speaker == self.character:
                    yield {
                        "text": prev_context,
                        "label": f'{speaker}: {text}',
                    }, is_conversation_done
                elif self.use_silence_token:
                    yield {
                        "text": prev_context,
                        "label": self.SILENCE_TOKEN,
                    }, is_conversation_done

    @classmethod
    def add_cmdline_args(
        cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
    ) -> ParlaiParser:
        """
        Register the Friends-specific command line options on ``parser``.

        :param parser:
            parser to extend; it is also the return value.
        :param partial_opt:
            forwarded unchanged to the base class implementation.
        """
        super().add_cmdline_args(parser, partial_opt)
        agent = parser.add_argument_group('Friends Corpus Arguments')
        agent.add_argument(
            '--character',
            type=str,
            default='All',
            # Derived from MAIN_CHARACTERS so the two lists cannot drift apart.
            choices=['All'] + cls.MAIN_CHARACTERS,
            help='Which speaker labels to train on',
        )
        agent.add_argument(
            '--use_silence_token',
            type='bool',
            default=True,
            help='Use silence token <SILENCE> to generate training example for sentences where the chosen speaker is not speaking',
        )
        agent.add_argument(
            '--use_start_token',
            type='bool',
            default=False,
            help='Use start token <START> at the beginning of each conversation, and include the first sentence as a training example',
        )
        return parser
40 changes: 40 additions & 0 deletions parlai/tasks/friends/build.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os
from convokit import download

# Downloadable archive of the Friends corpus: source URL, local file name and
# a 64-hex-digit hash string (presumably the archive's SHA-256 -- confirm).
# NOTE(review): build() in this file fetches the corpus through convokit's
# `download` rather than iterating over this list.
RESOURCES = [
    DownloadableFile(
        'http://zissou.infosci.cornell.edu/convokit/datasets/friends-corpus/friends-corpus.zip',
        'friends-corpus.zip',
        '51ae80ce345212839d256b59b4982e9b40229ff6049115bd54d885a285d2b921',
        zipped=True,
    )
]


def build(opt):
    """
    Download the Friends corpus into ``<datapath>/Friends`` unless already built.

    :param opt:
        mapping that provides the ``'datapath'`` root directory.
    """
    target_dir = os.path.join(opt['datapath'], 'Friends')
    version_tag = '1.00'

    # Nothing to do when this exact version has already been marked as built.
    if build_data.built(target_dir, version_string=version_tag):
        return

    print('[building data: ' + target_dir + ']')

    # A build of some other version is present: start from a clean directory.
    if build_data.built(target_dir):
        build_data.remove_dir(target_dir)
    build_data.make_dir(target_dir)

    # The corpus is fetched with convokit's downloader; the RESOURCES list is
    # not consulted here.
    download('friends-corpus', data_dir=target_dir)

    # Record the version so later calls can skip the download.
    build_data.mark_done(target_dir, version_string=version_tag)
11 changes: 11 additions & 0 deletions parlai/tasks/friends/test.py
@@ -0,0 +1,11 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from parlai.utils.testing import AutoTeacherTest # noqa: F401


class TestDefaultTeacher(AutoTeacherTest):
    """
    Applies the shared ``AutoTeacherTest`` checks to the ``friends`` task.

    NOTE(review): presumably compares the teacher's output against the
    friends_{train,valid,test}.yml fixtures -- confirm against AutoTeacherTest.
    """

    # Task name handed to AutoTeacherTest.
    task = 'friends'
58 changes: 58 additions & 0 deletions parlai/tasks/friends/test/friends_test.yml
@@ -0,0 +1,58 @@
acts:
- - episode_done: false
eval_labels:
- 'Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!'
- - episode_done: false
eval_labels:
- 'Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and
a hairpiece?'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!'
- - episode_done: false
eval_labels:
- 'Phoebe Buffay: Wait, does he eat chalk?'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?'
- - episode_done: false
eval_labels:
- <SILENCE>
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?

Phoebe Buffay: Wait, does he eat chalk?'
- - episode_done: false
eval_labels:
- 'Phoebe Buffay: Just, ''cause, I don''t want her to go through what I went through
with Carl- oh!'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?

Phoebe Buffay: Wait, does he eat chalk?

TRANSCRIPT_NOTE: '
num_episodes: 3093
num_examples: 64266
58 changes: 58 additions & 0 deletions parlai/tasks/friends/test/friends_train.yml
@@ -0,0 +1,58 @@
acts:
- - episode_done: false
id: friends
labels:
- 'Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!'
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!'
- - episode_done: false
id: friends
labels:
- 'Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and
a hairpiece?'
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!'
- - episode_done: false
id: friends
labels:
- 'Phoebe Buffay: Wait, does he eat chalk?'
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?'
- - episode_done: false
id: friends
labels:
- <SILENCE>
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?

Phoebe Buffay: Wait, does he eat chalk?'
- - episode_done: false
id: friends
labels:
- 'Phoebe Buffay: Just, ''cause, I don''t want her to go through what I went through
with Carl- oh!'
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?

Phoebe Buffay: Wait, does he eat chalk?

TRANSCRIPT_NOTE: '
num_episodes: 3093
num_examples: 64266
58 changes: 58 additions & 0 deletions parlai/tasks/friends/test/friends_valid.yml
@@ -0,0 +1,58 @@
acts:
- - episode_done: false
eval_labels:
- 'Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!'
- - episode_done: false
eval_labels:
- 'Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and
a hairpiece?'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!'
- - episode_done: false
eval_labels:
- 'Phoebe Buffay: Wait, does he eat chalk?'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?'
- - episode_done: false
eval_labels:
- <SILENCE>
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?

Phoebe Buffay: Wait, does he eat chalk?'
- - episode_done: false
eval_labels:
- 'Phoebe Buffay: Just, ''cause, I don''t want her to go through what I went through
with Carl- oh!'
id: friends
text: 'Monica Geller: There''s nothing to tell! He''s just some guy I work with!

Joey Tribbiani: C''mon, you''re going out with the guy! There''s gotta be something
wrong with him!

Chandler Bing: All right Joey, be nice. So does he have a hump? A hump and a
hairpiece?

Phoebe Buffay: Wait, does he eat chalk?

TRANSCRIPT_NOTE: '
num_episodes: 3093
num_examples: 64266
11 changes: 11 additions & 0 deletions parlai/tasks/task_list.py
Expand Up @@ -288,6 +288,17 @@
),
"links": {"arXiv": "https://arxiv.org/abs/1706.05125"},
},
{
"id": "Friends",
"display_name": "Friends",
"task": "friends",
"tags": ["MultiPartyConvo"],
"description": (
"Multi-party conversation dataset modified from the 10 seasons "
"of the popular American sitcom that ran in the 90s, Friends."
),
"links": {"website": "https://convokit.cornell.edu/documentation/friends.html"},
},
{
"id": "Glue",
"display_name": "Glue",
Expand Down