In [1]:
import json
from collections import Counter

import numpy as np
from nltk.tokenize import sent_tokenize

from utils.data2seq import Dial2seq, SequencePreprocessor

In [2]:
# Annotated dialogue dump that the sequencer reads.
dialogues_path = 'data/topicalchat_midas_cobot_entities_.json'
# NOTE(review): 3 is Dial2seq's second positional argument; the samples printed
# further down carry three previous utterances, so it presumably sets the
# context length -- confirm against utils.data2seq.
sequencer = Dial2seq(dialogues_path, 3)

In [3]:
# Run the sequencer; the result is the collection of sequences that the
# following cells count, slice and feed to the preprocessor.
seqs = sequencer.transform()

In [4]:
# Number of sequences produced by the sequencer.
len(seqs)

162494

In [5]:
# Sanity check: for the first three sequences, show the label of the first
# entity listed under the last item's 'ner' -> 'response' annotation.
for sample in seqs[:3]:
    last_item = sample[-1]
    first_entity = last_item['ner']['response'][0]
    print(first_entity['label'], '\n')

organization 

anaphor 

organization 



In [6]:
# Preprocessor whose transform() turns sequences into dataset samples; its
# label collections (midas_target, entity_target, midas_and_entity_target,
# midas_all, entity_all) are read in later cells.
preproc = SequencePreprocessor()

In [7]:
# Preview: preprocess only the first 50 sequences and print every sample that
# comes out, each followed by a blank line.
# NOTE(review): this goes through the shared `preproc` instance; if its label
# counters accumulate across transform() calls, the preview is also counted in
# the statistics displayed later -- confirm against utils.data2seq.
preview_samples = preproc.transform(seqs[:50])
for sample in preview_samples:
    print(sample, '\n')

{'previous_text': ['Yes it helped him smooth out his dance moves', 'Nice. Do you like Shakespeare?', 'Yes I do. Do you know that he popularized many phrases'], 'previous_midas': [['pos_answer'], ['comment', 'yes_no_question'], ['pos_answer', 'yes_no_question']], 'midas_vectors': [[[0.004489609505981207, 0.005659966263920069, 0.008467592298984528, 0.001957598840817809, 0.0032978374511003494, 0.0016933480510488153, 0.0015114849666133523, 0.001163744367659092, 0.011162664741277695, 0.002088370034471154, 0.8511312007904053, 0.10295984894037247, 0.004416705574840307]], [[0.33323583006858826, 0.0063044424168765545, 0.46820133924484253, 0.007547429762780666, 0.014540870673954487, 0.0016343685565516353, 0.003681134432554245, 0.0049273851327598095, 0.0934942215681076, 0.006875962950289249, 0.016408585011959076, 0.037494078278541565, 0.005654338281601667], [0.005343989469110966, 0.03268001973628998, 0.00530182896181941, 0.008314643055200577, 0.018734272569417953, 0.004701630212366581, 0.00695905

In [8]:
# Build the full dataset with a *fresh* preprocessor so that the label
# collections read afterwards (midas_target, entity_target,
# midas_and_entity_target, midas_all, entity_all) describe exactly this one pass
# over `seqs`. Reusing the instance that already handled the 50-sequence preview
# appears to double count: the recorded label counts sum to 10566 while
# len(dataset) is 10565. Re-creating the instance also makes this cell
# idempotent when it is re-run.
preproc = SequencePreprocessor()
dataset = preproc.transform(seqs)

# Number of samples kept by the preprocessor.
len(dataset)

10565

In [9]:
# (label, count) pairs recorded in preproc.midas_target, highest count first.
preproc.midas_target.most_common()

[('opinion', 4855),
 ('statement', 3256),
 ('yes_no_question', 1355),
 ('comment', 335),
 ('open_question_opinion', 215),
 ('pos_answer', 200),
 ('open_question_factual', 170),
 ('command', 87),
 ('neg_answer', 53),
 ('complaint', 22),
 ('dev_command', 8),
 ('appreciation', 6),
 ('other_answers', 4)]

In [10]:
# (label, count) pairs recorded in preproc.entity_target, highest count first.
preproc.entity_target.most_common()

[('person', 2516),
 ('videoname', 1625),
 ('location', 1016),
 ('device', 976),
 ('organization', 833),
 ('genre', 638),
 ('sport', 467),
 ('number', 408),
 ('softwareapplication', 376),
 ('sportteam', 374),
 ('duration', 281),
 ('year', 195),
 ('position', 156),
 ('event', 128),
 ('vehicle', 127),
 ('date', 105),
 ('gamename', 99),
 ('wear', 76),
 ('channelname', 49),
 ('sportrole', 49),
 ('songname', 33),
 ('party', 29),
 ('bookname', 8),
 ('venue', 1),
 ('ordinal', 1)]

In [11]:
# (label, count) pairs recorded in preproc.midas_and_entity_target, highest
# count first; the keys shown in the output have the form '<midas>_<entity>'.
preproc.midas_and_entity_target.most_common()

[('opinion_person', 1270),
 ('opinion_videoname', 775),
 ('statement_person', 658),
 ('statement_videoname', 458),
 ('opinion_location', 426),
 ('statement_location', 406),
 ('opinion_organization', 372),
 ('statement_device', 361),
 ('opinion_device', 358),
 ('opinion_genre', 350),
 ('yes_no_question_person', 312),
 ('yes_no_question_videoname', 282),
 ('statement_organization', 280),
 ('statement_number', 221),
 ('opinion_sportteam', 218),
 ('opinion_sport', 215),
 ('yes_no_question_device', 148),
 ('opinion_softwareapplication', 144),
 ('statement_softwareapplication', 135),
 ('statement_duration', 128),
 ('opinion_number', 126),
 ('statement_genre', 111),
 ('yes_no_question_genre', 104),
 ('opinion_duration', 103),
 ('yes_no_question_organization', 96),
 ('statement_year', 96),
 ('statement_sportteam', 95),
 ('statement_sport', 92),
 ('comment_person', 90),
 ('yes_no_question_sport', 90),
 ('yes_no_question_location', 87),
 ('opinion_position', 82),
 ('opinion_date', 81),
 ('opinio

In [36]:
# Persist the preprocessed samples as pretty-printed JSON; ensure_ascii=False
# keeps non-ASCII characters as-is instead of \u escapes.
dataset_json = json.dumps(dataset, ensure_ascii=False, indent=2)
with open('data/dataset.json', 'w', encoding='utf-8') as out_file:
    out_file.write(dataset_json)

In [37]:
# Output key -> preprocessor label collection it is derived from.
label_sources = {
    'midas2id': preproc.midas_all,
    'entities2id': preproc.entity_all,
    'target_midas2id': preproc.midas_target,
    'target_entity2id': preproc.entity_target,
    'target_midas_and_entity2id': preproc.midas_and_entity_target,
}

# Give every label an integer id, numbered in the order the collection's keys
# iterate (ids therefore follow key order, not frequency).
labels = {
    mapping_name: {label: idx for idx, label in enumerate(collection.keys())}
    for mapping_name, collection in label_sources.items()
}

# Store all label-to-id mappings in one pretty-printed JSON file.
with open('data/labels.json', 'w', encoding='utf-8') as labels_file:
    json.dump(labels, labels_file, ensure_ascii=False, indent=2)