In [1]:
import json
from collections import Counter

import numpy as np
from nltk.tokenize import sent_tokenize

from utils.data2seq import Dial2seq, SequencePreprocessor

In [2]:
# Build the dialogue-to-sequence converter on top of the JSON file at this path.
# NOTE(review): the second argument (3) is presumably the number of utterances
# per sequence / context window -- confirm against Dial2seq's signature.
sequencer = Dial2seq('data/topicalchat_midas_cobot_entities_.json', 3)

In [3]:
# Run the converter. The result is a sized, sliceable collection: later cells
# call len() on it and slice it.
seqs = sequencer.transform()

In [4]:
# Total number of sequences produced by the converter.
len(seqs)

162494

In [5]:
# For the first three sequences only, show the label of the first NER entity
# recorded under 'response' in the sequence's last element.
for sequence in seqs[:3]:
    last_item = sequence[-1]
    first_entity = last_item['ner']['response'][0]
    print(first_entity['label'], '\n')

organization 

anaphor 

organization 



In [6]:
# Preprocessor that turns sequences into dataset samples via transform(); it
# also exposes label counters (midas_target, entity_target, ...) that later
# cells display and export.
preproc = SequencePreprocessor()

In [7]:
# Preview the preprocessing output on a small slice of the data.
# A throwaway SequencePreprocessor is used here on purpose: the long-lived
# `preproc` exposes label counters (midas_target, entity_target, ...) that later
# cells display and export, and previewing through it would leave this slice's
# labels in those counters if transform() accumulates rather than resets them.
# NOTE(review): whether transform() resets its counters cannot be seen from
# this notebook -- confirm in utils.data2seq.
preview_preproc = SequencePreprocessor()
for sample in preview_preproc.transform(seqs[0:50]):
    print(sample, '\n')

{'previous_text': ['Google is leading the alphabet subsidiary and will continue to be the Umbrella company for Alphabet internet interest.', 'Did you know Google had hundreds of live goats to cut the grass in the past? \n', 'It is very interesting. Google provide "Chrome OS" which is a light weight OS. Google provided a lot of hardware mainly in 2010 to 2015. '], 'previous_midas': [['statement'], ['yes_no_question'], ['comment', 'statement', 'statement']], 'midas_vectors': [[[0.001440016203559935, 0.011605668812990189, 0.0031229499727487564, 0.0031691223848611116, 0.004141080193221569, 0.0013456976739689708, 0.002711065113544464, 0.001342577626928687, 0.1014678105711937, 0.0014571526553481817, 0.0017298738239333034, 0.864709734916687, 0.001757223391905427]], [[0.009326450526714325, 0.12051617354154587, 0.008179610595107079, 0.020255884155631065, 0.03977271541953087, 0.008093005046248436, 0.012120380997657776, 0.009426798671483994, 0.0024891614448279142, 0.004480868577957153, 0.03910575

In [8]:
# Preprocess every sequence; this result is what gets written to
# data/dataset.json further down.
dataset = preproc.transform(seqs)
# Number of samples produced (far fewer than len(seqs) in the recorded run).
len(dataset)

12055

In [9]:
# Occurrences of each target MIDAS label, most frequent first.
# NOTE(review): assumes midas_target is a collections.Counter filled in by
# transform() -- confirm in SequencePreprocessor.
preproc.midas_target.most_common()

[('opinion', 5508),
 ('statement', 4455),
 ('yes_no_question', 1226),
 ('comment', 209),
 ('open_question_opinion', 174),
 ('open_question_factual', 169),
 ('pos_answer', 148),
 ('command', 105),
 ('neg_answer', 35),
 ('complaint', 19),
 ('dev_command', 7),
 ('appreciation', 1)]

In [10]:
# Occurrences of each target entity label, most frequent first.
# NOTE(review): assumes entity_target is a collections.Counter filled in by
# transform() -- confirm in SequencePreprocessor.
preproc.entity_target.most_common()

[('person', 2669),
 ('videoname', 1570),
 ('location', 1369),
 ('organization', 1127),
 ('device', 1012),
 ('number', 734),
 ('genre', 677),
 ('sport', 566),
 ('sportteam', 421),
 ('softwareapplication', 392),
 ('duration', 276),
 ('year', 237),
 ('position', 189),
 ('event', 185),
 ('vehicle', 136),
 ('gamename', 103),
 ('wear', 83),
 ('date', 83),
 ('sportrole', 67),
 ('channelname', 64),
 ('party', 49),
 ('songname', 35),
 ('ordinal', 5),
 ('bookname', 5),
 ('venue', 2)]

In [11]:
# Occurrences of each combined "<midas>_<entity>" target label, most frequent
# first.
# NOTE(review): assumes midas_and_entity_target is a collections.Counter filled
# in by transform() -- confirm in SequencePreprocessor.
preproc.midas_and_entity_target.most_common()

[('opinion_person', 1316),
 ('statement_person', 868),
 ('opinion_videoname', 759),
 ('opinion_location', 588),
 ('statement_location', 588),
 ('opinion_organization', 507),
 ('statement_videoname', 506),
 ('statement_number', 478),
 ('statement_organization', 431),
 ('opinion_device', 421),
 ('statement_device', 413),
 ('opinion_genre', 394),
 ('yes_no_question_person', 285),
 ('opinion_sport', 277),
 ('yes_no_question_videoname', 231),
 ('opinion_sportteam', 226),
 ('opinion_number', 192),
 ('statement_softwareapplication', 170),
 ('statement_duration', 161),
 ('statement_sport', 151),
 ('opinion_softwareapplication', 143),
 ('statement_sportteam', 135),
 ('statement_genre', 130),
 ('statement_year', 124),
 ('yes_no_question_location', 112),
 ('yes_no_question_device', 106),
 ('yes_no_question_organization', 100),
 ('opinion_position', 98),
 ('opinion_duration', 89),
 ('yes_no_question_genre', 87),
 ('opinion_year', 83),
 ('opinion_event', 80),
 ('yes_no_question_sport', 77),
 ('stat

In [12]:
# Write the preprocessed samples to disk as indented UTF-8 JSON, keeping
# non-ASCII characters unescaped.
with open('data/dataset.json', mode='w', encoding='utf-8') as dataset_file:
    json.dump(obj=dataset, fp=dataset_file, ensure_ascii=False, indent=2)

In [13]:
# Assign every label an integer id, numbering labels in the order in which the
# corresponding counter on `preproc` yields its keys.
label_sources = {
    'midas2id': preproc.midas_all,
    'entities2id': preproc.entity_all,
    'target_midas2id': preproc.midas_target,
    'target_entity2id': preproc.entity_target,
    'target_midas_and_entity2id': preproc.midas_and_entity_target,
}
labels = {
    mapping_name: {label: idx for idx, label in enumerate(source.keys())}
    for mapping_name, source in label_sources.items()
}

# Save all mappings together as a single indented UTF-8 JSON document.
with open('data/labels.json', 'w', encoding='utf-8') as labels_file:
    json.dump(labels, labels_file, ensure_ascii=False, indent=2)