In [1]:
import json
from collections import Counter

from utils.data2seq import Dial2seq, SequencePreprocessor

In [2]:
# One sequencer per annotated corpus; both receive the same second positional
# argument (3).
# NOTE(review): 3 is presumably the sequence/context length in utterances --
# confirm against utils.data2seq.Dial2seq.
topical_sequencer = Dial2seq('data/topical_chat_annotated.json', 3)
daily_sequencer = Dial2seq('data/daily_dialogue_annotated.json', 3)

## Daily Dialogue

In [3]:
# Run the Daily Dialogue sequencer; the result is a sized collection whose
# length is inspected in the next cell and which is fed to SequencePreprocessor.
daily = daily_sequencer.transform()

In [4]:
# Number of items produced by the sequencer, before SequencePreprocessor runs.
len(daily)

61087

In [5]:
# Dedicated preprocessor instance for Daily Dialogue; its label attributes
# (midas_target, entity_target, midas_and_entity_target, ...) are read below.
daily_preproc = SequencePreprocessor()

In [6]:
# Preprocess the sequencer output; this result is what gets written to
# data/daily_dataset.json further down.
daily_dataset = daily_preproc.transform(daily)
# Size after preprocessing -- compare with len(daily) above.
len(daily_dataset)

2352

In [7]:
# (label, count) pairs for the midas_target labels, most frequent first.
daily_preproc.midas_target.most_common()

[('statement', 1441),
 ('opinion', 415),
 ('yes_no_question', 173),
 ('open_question_factual', 104),
 ('open_question_opinion', 82),
 ('command', 61),
 ('neg_answer', 25),
 ('pos_answer', 17),
 ('comment', 13),
 ('complaint', 11),
 ('dev_command', 9),
 ('other_answers', 1)]

In [8]:
# (label, count) pairs for the entity_target labels, most frequent first.
daily_preproc.entity_target.most_common()

[('duration', 545),
 ('location', 348),
 ('number', 344),
 ('device', 231),
 ('person', 204),
 ('videoname', 188),
 ('date', 129),
 ('vehicle', 105),
 ('organization', 74),
 ('sport', 61),
 ('genre', 60),
 ('event', 21),
 ('sportteam', 16),
 ('year', 10),
 ('position', 9),
 ('gamename', 4),
 ('softwareapplication', 2),
 ('songname', 1)]

In [9]:
# (label, count) pairs for the combined "<midas>_<entity>" target labels,
# most frequent first. No limit is passed, so every label is listed.
daily_preproc.midas_and_entity_target.most_common()

[('statement_duration', 447),
 ('statement_number', 289),
 ('statement_location', 205),
 ('statement_person', 109),
 ('statement_device', 102),
 ('opinion_videoname', 87),
 ('statement_videoname', 79),
 ('statement_date', 69),
 ('statement_vehicle', 57),
 ('opinion_location', 48),
 ('opinion_person', 43),
 ('statement_organization', 41),
 ('yes_no_question_device', 40),
 ('opinion_genre', 39),
 ('opinion_device', 37),
 ('opinion_duration', 34),
 ('yes_no_question_location', 33),
 ('open_question_factual_location', 27),
 ('opinion_sport', 25),
 ('opinion_number', 23),
 ('yes_no_question_duration', 20),
 ('opinion_date', 20),
 ('open_question_opinion_location', 20),
 ('opinion_vehicle', 18),
 ('yes_no_question_person', 18),
 ('yes_no_question_date', 17),
 ('opinion_organization', 16),
 ('statement_sport', 14),
 ('yes_no_question_vehicle', 13),
 ('open_question_factual_device', 12),
 ('command_number', 12),
 ('opinion_sportteam', 12),
 ('open_question_factual_vehicle', 12),
 ('open_questi

In [10]:
# Persist the preprocessed Daily Dialogue dataset as indented JSON.
# ensure_ascii=False writes non-ASCII characters verbatim instead of \u escapes,
# hence the explicit UTF-8 encoding on the file handle.
daily_dataset_path = 'data/daily_dataset.json'
with open(daily_dataset_path, mode='w', encoding='utf-8') as out_file:
    json.dump(daily_dataset, out_file, indent=2, ensure_ascii=False)

In [11]:
# Label -> integer id lookup tables, one per label inventory exposed by
# `daily_preproc`. Ids follow the key iteration order of each inventory, so
# they are specific to this preprocessor instance and are assigned
# independently of the ids written for any other corpus.
daily_label_sources = {
    'midas2id': daily_preproc.midas_all,
    'entities2id': daily_preproc.entity_all,
    'target_midas2id': daily_preproc.midas_target,
    'target_entity2id': daily_preproc.entity_target,
    'target_midas_and_entity2id': daily_preproc.midas_and_entity_target,
}
daily_labels = {
    map_name: {label: idx for idx, label in enumerate(source.keys())}
    for map_name, source in daily_label_sources.items()
}

# Write all five lookup tables into a single indented, UTF-8 JSON file.
daily_labels_path = 'data/daily_labels.json'
with open(daily_labels_path, mode='w', encoding='utf-8') as out_file:
    json.dump(daily_labels, out_file, indent=2, ensure_ascii=False)

## Topical Chat

In [12]:
# Run the Topical Chat sequencer; the result is a sized collection whose
# length is inspected in the next cell and which is fed to SequencePreprocessor.
topical = topical_sequencer.transform()

In [13]:
# Number of items produced by the sequencer, before SequencePreprocessor runs.
len(topical)

162494

In [14]:
# Separate preprocessor instance for Topical Chat, so its label attributes
# are independent of the ones held by daily_preproc.
topical_preproc = SequencePreprocessor()

In [15]:
# Preprocess the sequencer output; this result is what gets written to
# data/topical_dataset.json further down.
topical_dataset = topical_preproc.transform(topical)
# Size after preprocessing -- compare with len(topical) above.
len(topical_dataset)

10117

In [16]:
# (label, count) pairs for the midas_target labels, most frequent first.
topical_preproc.midas_target.most_common()

[('opinion', 4636),
 ('statement', 3594),
 ('yes_no_question', 896),
 ('pos_answer', 299),
 ('comment', 209),
 ('command', 151),
 ('open_question_opinion', 118),
 ('open_question_factual', 106),
 ('neg_answer', 70),
 ('complaint', 24),
 ('appreciation', 8),
 ('dev_command', 4),
 ('other_answers', 2)]

In [17]:
# (label, count) pairs for the entity_target labels, most frequent first.
topical_preproc.entity_target.most_common()

[('person', 2626),
 ('videoname', 1711),
 ('location', 1074),
 ('organization', 935),
 ('genre', 631),
 ('device', 576),
 ('sport', 570),
 ('sportteam', 451),
 ('softwareapplication', 389),
 ('duration', 243),
 ('number', 237),
 ('event', 169),
 ('position', 152),
 ('vehicle', 102),
 ('year', 96),
 ('gamename', 63),
 ('party', 47),
 ('date', 31),
 ('bookname', 11),
 ('songname', 3)]

In [18]:
# (label, count) pairs for the combined "<midas>_<entity>" target labels,
# most frequent first. No limit is passed, so every label is listed.
topical_preproc.midas_and_entity_target.most_common()

[('opinion_person', 1334),
 ('opinion_videoname', 815),
 ('statement_person', 809),
 ('statement_videoname', 568),
 ('statement_location', 497),
 ('opinion_organization', 443),
 ('opinion_location', 429),
 ('opinion_genre', 348),
 ('statement_organization', 316),
 ('opinion_sport', 282),
 ('opinion_sportteam', 250),
 ('statement_device', 243),
 ('yes_no_question_person', 236),
 ('opinion_device', 211),
 ('yes_no_question_videoname', 211),
 ('statement_softwareapplication', 178),
 ('statement_duration', 172),
 ('statement_number', 158),
 ('statement_sportteam', 151),
 ('statement_sport', 146),
 ('opinion_softwareapplication', 132),
 ('statement_genre', 124),
 ('yes_no_question_genre', 74),
 ('opinion_event', 71),
 ('opinion_position', 70),
 ('yes_no_question_organization', 69),
 ('comment_person', 65),
 ('yes_no_question_device', 62),
 ('statement_event', 60),
 ('yes_no_question_location', 60),
 ('yes_no_question_sport', 60),
 ('opinion_duration', 58),
 ('pos_answer_genre', 50),
 ('opin

In [19]:
# Persist the preprocessed Topical Chat dataset as indented JSON.
# ensure_ascii=False writes non-ASCII characters verbatim instead of \u escapes,
# hence the explicit UTF-8 encoding on the file handle.
topical_dataset_path = 'data/topical_dataset.json'
with open(topical_dataset_path, mode='w', encoding='utf-8') as out_file:
    json.dump(topical_dataset, out_file, indent=2, ensure_ascii=False)

In [20]:
# Label -> integer id lookup tables, one per label inventory exposed by
# `topical_preproc`. Ids follow the key iteration order of each inventory, so
# they are specific to this preprocessor instance and are assigned
# independently of the ids written for any other corpus.
topical_label_sources = {
    'midas2id': topical_preproc.midas_all,
    'entities2id': topical_preproc.entity_all,
    'target_midas2id': topical_preproc.midas_target,
    'target_entity2id': topical_preproc.entity_target,
    'target_midas_and_entity2id': topical_preproc.midas_and_entity_target,
}
topical_labels = {
    map_name: {label: idx for idx, label in enumerate(source.keys())}
    for map_name, source in topical_label_sources.items()
}

# Write all five lookup tables into a single indented, UTF-8 JSON file.
topical_labels_path = 'data/topical_labels.json'
with open(topical_labels_path, mode='w', encoding='utf-8') as out_file:
    json.dump(topical_labels, out_file, indent=2, ensure_ascii=False)