In [16]:
import json
from collections import Counter

import numpy as np

from nltk.tokenize import sent_tokenize

In [114]:
class Dial2seq():
    """ 
    a class to transform dialogues into a sequence of utterances and labels
    The sequence consists of n previous uterrances. 
    
    There are no constraints
    on number of entities or midas types in the them, however a sequence is 
    valid only when it is followed by an utterance with a single entity and
    its midas label is not MISC or ANAPHOR.
    
    params:
    path2data: str - a path to a json file with dialogues and their midas and cobot labels
    seqlen: int - a number of utterances to use to predict next midas labels and entities
    """
    def __init__(self, path2data: str, seqlen=2):
        self.data = self.__load_data(path2data)
        self.seqlen = seqlen

        
    def transform(self) -> list:
        """transforms dialogues into a set of sequences of size seqlen n+1"""
        return [seq for dial in self for seq in self.__ngrammer(dial)]

    
    def __ngrammer(self, dialogue: list) -> list:
        """ transforms a dialogue into a set of sequences (ngram style) """
        return [dialogue[i:i+self.seqlen+1] for i in range(len(dialogue)-(self.seqlen+1)+1)]
        
        
    def __load_data(self, path: str) -> dict:
        """ loads data from a json file """
        with open(path, 'r') as f:
            data = json.load(f)
        return data
    
    
    def __len__(self) -> int:
        return len(self.data)
    
    
    def __iter__(self):
        """iterates over all the dialogues in the file """
        for dialogue in self.data.values():
            yield dialogue

In [115]:
sequencer = Dial2seq('data/topicalchat_midas_cobot_entities_.json', 3)

In [116]:
seqs = sequencer.transform()

In [117]:
len(seqs)

162494

In [120]:
for seq in seqs[0:3]:
    print(seq[-1]['ner']['response'][0]['label'], '\n')

organization 

anaphor 

organization 



In [121]:
class SequencePreprocessor():
    """ 
    preprocesses sequences
    to filter only those that are relevant for the task
    
    params:
    num_entities: int - maximum size of a last sentence in a sequence 
    in terms of number of cobot entities 
    
    entities: list - if these cobot entities labels are in the last sentence
    of a sequence, skip this seqence
    """

    def __init__(self, num_entities=1, entities: list = ['misc', 'anaphor']):
        self.num_entities = num_entities
        self.stoplist = entities
        self.midas = Counter()
        self.entities = Counter()
        
        
    def transform(self, sequences: list) -> list:
        """ extract necessary data from sequences """
        seqs = list()
        
        for seq in sequences:
            sents, midas_labels, _, entities = self.preproc(seq[-1])
            if not self.__is_valid(sents, midas_labels, entities):
                continue
            sample = self.__get_dict_entry(self.__shape_output(seq))
            seqs.append(sample)
            
        return seqs
    

    def preproc(self, ut) -> tuple:
        """ 
        opens up a single utterance to extract:
        1. sentances (with nltk.sent_tokenize)
        2. midas probability vector
        3. all entities in this utterance
        
        returns tuple
        """
        try:
            sents = sent_tokenize(ut['text'])
        except IndexError:
            # handles utterances with to much punctuation
            sents = [ut['text']]
        
        midas_labels, midas_vectors = self.__get_midas(ut['midas_label'])
                         
        try:
            entities = ut['ner']['response']
        except KeyError:
            # handles mislabelled samples
            entities = []
        
        return sents, midas_labels, midas_vectors, entities

    
    def __is_valid(self, sents:list, midas_labels:list, entities:list) -> bool:
        """
        checks if all the requirements for an utterance are met:
        1. number of sents == number of midas_labels
        2. an uterrance is one sentence, one midas label and 
        includes only one entity which is not in the stoplist
        3. when an utterance has 2+ sentence, it will be valid if
        the requirement 2 is applicable to the first sentence while
        other sentences are omitted
            
        input:
        sents: list - an utterance tokenized into sentences
        midas_labels: list - midas_label per each sentence
        entities: list of dicts - all entities in a given utterance (not mapped)
        
        output: bool
        """
        if len(sents) != len(midas_labels) or not entities:
            return False
        
        if len(sents) == 1 and len(entities) > 1:
            return False
        
        sent_ents = self.__get_entities(sents[0], entities)
        
        if len(sent_ents) != 1:
            return False

        return sent_ents[0]['label'] not in self.stoplist
    
    
    def __shape_output(self, seq: list) -> list:
        """ shapes sequence in order to keep only the necessary data """
        
        output = list()
        
        for ut in seq[:-1]:
            try:
                entities = ut['ner']['response']
            except KeyError:
                # handles mislabelled samples
                # TODO: fix labelling
                entities = []
                
            midas_labels, midas_vectors = self.__get_midas(ut['midas_label'])
            
            output.append((
                # tuple of text, midas labels, entities for a utterance
                ut['text'], midas_labels, midas_vectors, entities))

        # preprocess last sentence in the sequence
        sents, midas_labels, midas_vectors, entities = self.preproc(seq[-1])
        output.append(
            (sents[0], midas_labels[0:1], self.__get_entities(sents[0], entities)))
        
        return output
    
    
    def __get_dict_entry(self, seq) -> dict:
        """ creates a proper dict entry to dump into a file """
        entry = dict()
        
        for s in seq:
            self.midas.update(s[1])
            self.entities.update([label['label'] for label in s[-1]])

        entry['previous_text'] = [s[0] for s in seq[:-1]]
        entry['previous_midas'] = [s[1] for s in seq[:-1]]
        entry['midas_vectors'] = [s[2] for s in seq[:-1]]
        entry['previous_entities'] = [s[-1] for s in seq[:-1]]
        entry['predict'] = {}
        entry['predict']['text'] = seq[-1][0]
        entry['predict']['midas'] = seq[-1][1][0]
        entry['predict']['entity'] = seq[-1][2][0]
        
        return entry
            
        
    def __get_midas(self, midas_labels: list) -> tuple:
        """ 
        extracts midas labels with max value per each sentence in an utterance
        and return a midas vector per each sentence
        """
        labels = []
        vectors = []
        
        for sample in midas_labels:
            labels.append(max(sample, key=sample.get))
            vectors.append(list(sample.values()))
            
        return labels, vectors
    
    
    def __get_entities(self, sentence, entities) -> list:
        """
        returns entities from a given list of entities
        that are present in a given sentence
        """
        return [ent for ent in entities if ent['text'] in sentence]

In [122]:
preproc = SequencePreprocessor()

In [123]:
for seq in preproc.transform(seqs[0:50]):
    print(seq, '\n')

{'previous_text': ['Yes it helped him smooth out his dance moves', 'Nice. Do you like Shakespeare?', 'Yes I do. Do you know that he popularized many phrases'], 'previous_midas': [['pos_answer'], ['comment', 'yes_no_question'], ['pos_answer', 'yes_no_question']], 'midas_vectors': [[[0.004489609505981207, 0.005659966263920069, 0.008467592298984528, 0.001957598840817809, 0.0032978374511003494, 0.0016933480510488153, 0.0015114849666133523, 0.001163744367659092, 0.011162664741277695, 0.002088370034471154, 0.8511312007904053, 0.10295984894037247, 0.004416705574840307]], [[0.33323583006858826, 0.0063044424168765545, 0.46820133924484253, 0.007547429762780666, 0.014540870673954487, 0.0016343685565516353, 0.003681134432554245, 0.0049273851327598095, 0.0934942215681076, 0.006875962950289249, 0.016408585011959076, 0.037494078278541565, 0.005654338281601667], [0.005343989469110966, 0.03268001973628998, 0.00530182896181941, 0.008314643055200577, 0.018734272569417953, 0.004701630212366581, 0.00695905

In [124]:
dataset = preproc.transform(seqs)
len(dataset)

10565

In [125]:
preproc.entities

Counter({'anaphor': 16742,
         'misc': 38446,
         'person': 11002,
         'videoname': 5867,
         'location': 6126,
         'duration': 1566,
         'number': 4912,
         'position': 1129,
         'date': 282,
         'softwareapplication': 1431,
         'year': 2266,
         'vehicle': 562,
         'wear': 354,
         'genre': 3073,
         'device': 4090,
         'organization': 5312,
         'event': 1136,
         'sport': 2455,
         'party': 324,
         'ordinal': 34,
         'bookname': 30,
         'songname': 238,
         'gamename': 320,
         'sportteam': 1907,
         'channelname': 335,
         'sportrole': 250,
         'venue': 13,
         'albumname': 7})

In [130]:
dataset[5]

{'previous_text': ['Do you like Star Wars ? The movie about Hans Solo just came out on Netflix. ',
  'Is that the one where its about a young Han Solo?',
  'Yeah it is the one, so Han Solo is not played by Harrison Ford, does it affect your opinion of the movie ?'],
 'previous_midas': [['yes_no_question', 'statement'],
  ['yes_no_question'],
  ['yes_no_question']],
 'midas_vectors': [[[0.005408135242760181,
    0.03774670138955116,
    0.004193521570414305,
    0.005548159126192331,
    0.01911454275250435,
    0.0036064106971025467,
    0.00729062594473362,
    0.010243617929518223,
    0.0017117718234658241,
    0.0031786616891622543,
    0.016034064814448357,
    0.004651328548789024,
    0.8812724947929382],
   [0.0008340697386302054,
    0.002748156199231744,
    0.0018400404369458556,
    0.0013773691607639194,
    0.0016404872294515371,
    0.0005165188922546804,
    0.0008629477233625948,
    0.000587132410146296,
    0.040731094777584076,
    0.0008753170259296894,
    0.00084

In [128]:
with open('data/dataset.json', 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=2)

In [15]:
labels = dict()

labels['midas2id'] = {label: i for i, label in enumerate(preproc.midas.keys())}
labels['entities2id'] = {label: i for i, label in enumerate(preproc.entities.keys())}

with open('data/labels.json', 'w', encoding='utf-8') as f:
    json.dump(labels, f, ensure_ascii=False, indent=2)