In [39]:
import json
import random
import os
from collections import Counter

import pandas as pd


from sklearn.model_selection import train_test_split

from utils.data2seq import Dial2seq

# Data

### Merge Datasets

In [6]:
with open('data/daily_dialogue_annotated_update.json', 'r', encoding='utf8') as f:
    daily = json.load(f)
    
with open('data/topical_chat_annotated_update.json', 'r', encoding='utf8') as f:
    topical = json.load(f)

In [9]:
f'Length in dialogues: topical chat - {len(topical)}, daily dialog - {len(daily)}'

'Length in dialogues: topical chat - 8628, daily dialog - 12376'

In [12]:
total = list(topical.items()) + list(daily.items())
len(total)

21004

In [13]:
train, val_test = train_test_split(total, test_size=0.2, random_state=42)
val, test = train_test_split(val_test, test_size=500, random_state=42)

train, val, test = dict(train), dict(val), dict(test)
len(train), len(val), len(test)

(16803, 3701, 500)

save data files

Total number of dialogues


(12376, 8628)

In [16]:
with open('data/annotated/train_dialogues.json', 'w', encoding='utf-8') as f:
    json.dump(train, f, ensure_ascii=False, indent=2)

In [17]:
with open('data/annotated/val_dialogues.json', 'w', encoding='utf-8') as f:
    json.dump(val, f, ensure_ascii=False, indent=2)

In [18]:
with open('data/annotated/test_dialogues.json', 'w', encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=2)

### Load datasets

In [55]:
train_seqs = Dial2seq('data/annotated/train_dialogues.json', 3).transform()
print('Total number of sequences in the Train dataset is ', len(train_seqs))
val_seqs = Dial2seq('data/annotated/val_dialogues.json', 3).transform()
print('Total number of sequences in the Validation dataset is', len(val_seqs))
test_seqs = Dial2seq('data/annotated/test_dialogues.json', 3).transform()
print('Total number of sequences in the Test dataset is', len(test_seqs))

Total number of sequences in the Train dataset is  179286
Total number of sequences in the Validation dataset is 39089
Total number of sequences in the Test dataset is 5206


## Preprocess datasets

In [348]:
from abc import ABC, abstractmethod

class Validator(ABC):
    
    @abstractmethod
    def is_valid(self, seq: dict):
        pass
    
    
class OneEntity(Validator):
    
    def __init__(self, stoplist: list = ['misc', 'anaphor']):
        self.stoplist = stoplist
    
    def is_valid(self, seq:dict) -> bool:
        """
        checks if the first sentence of the sequence has
        one annotated entity and it is not in the stoplist
        """
        
        def __init__(self, stoplist: list = ['misc', 'anaphor']):
            self.stoplist = stoplist

        if len(seq['entities'][0]) != 1:
            return False
        
        return seq['entities'][0][0]['label'] not in self.stoplist
    
    
class NoEntity(Validator):
    
    def is_valid(self, seq:dict) -> bool:
        """
        checks if the first sentence of the sequence has
        one annotated entity and it is not in the stoplist
        """
        return len(seq['entities'][0]) == 0
        
    
    
class SequencePreprocessor():
    """ 
    preprocesses sequences
    to filter only those that are relevant for the task
    
    params:
    num_entities: int - maximum size of a last sentence in a sequence 
    in terms of number of annotated entities 
    """

    def __init__(self, stoplist_labels: list = ['misc', 'anaphor'],
                 seq_validator=None):
        self.stoplist_labels = stoplist_labels
        self.seq_validator = seq_validator
        
    def transform(self, sequences: list) -> list:
        """ extract only necessary data from sequences """
        seqs = list()
        
        for seq in sequences:
            if self.seq_validator and not self.seq_validator.is_valid(seq[-1]):
                # validate final utterance if necessary
                continue
            sample = self.__get_dict_entry(self.__shape_output(seq))
            seqs.append(sample)
            
        return seqs
    

    def __shape_output(self, seq: list) -> list:
        """ shapes sequence in order to keep only the necessary data """
        output = list()
        
        # preprocess context
        for ut in seq[:-1]:
            midas_labels, midas_vectors = self.__get_midas(ut['midas'])
            output.append((
                ut['text'], midas_labels, midas_vectors, ut['entities']))

        # preprocess target: only the first sentence of 
        # the last utterance in the sequence
        midas_labels, midas_vectors = self.__get_midas(seq[-1]['midas'])
        midas_labels, midas_vectors = midas_labels[0:1], midas_vectors[0:1]
        sentence = seq[-1]['text'][0].lower()
        entities = seq[-1]['entities'][0]
        
        if entities:
            # filter out labels from stoplist
            entities = [e for e in entities if e['label'] not in self.stoplist_labels]
            # pre-sort them -> longest first to prevent mess with overlapping entities
            entities = sorted(entities, key=lambda x: len(x['text']), reverse=True) 
        
        ## replace entities with their labels
        for ent in entities:
            sentence = sentence.replace(ent['text'], ent['label'].upper())
            
        output.append(
            (sentence, midas_labels[0], entities))
        
        return output
    
    
    def __get_dict_entry(self, seq) -> dict:
        """ creates a proper dict entry to dump into a file """
        entry = dict()
        entry['previous_text'] = [s[0] for s in seq[:-1]]
        entry['previous_midas'] = [s[1] for s in seq[:-1]]
        entry['midas_vectors'] = [s[2] for s in seq[:-1]]
        entry['previous_entities'] = [s[-1] for s in seq[:-1]]
        entry['predict'] = {}
        entry['predict']['text'] = seq[-1][0]
        entry['predict']['midas'] = seq[-1][1]
        entry['predict']['entities'] = seq[-1][2]
        
        return entry
            
        
    def __get_midas(self, midas_labels: list) -> tuple:
        """ 
        extracts midas labels with max value per each sentence in an utterance
        and return a midas vector per each sentence
        """
        labels = []
        vectors = []
        
        for sentence_labels in midas_labels:
            labels.append(max(sentence_labels, key=sentence_labels.get))
            vectors.append(list(sentence_labels.values()))
            
        return labels, vectors

### Preprocess targets for Convert

In [368]:
for_convert = SequencePreprocessor(seq_validator=OneEntity())
targets = for_convert.transform(train_seqs)

In [369]:
for_convert = SequencePreprocessor(seq_validator=NoEntity())
targets_zero = for_convert.transform(train_seqs)

In [370]:
len(targets), len(targets_zero)

(9991, 81103)

In [371]:
_, targets_zero = train_test_split(targets_zero, test_size=len(targets), random_state=42)

In [372]:
len(targets_zero)

9991

In [373]:
convert_contexts, convert_responses = list(), list()

midas_counter = Counter()
entity_counter = Counter()
midas_entity_counter = Counter()

for seq in targets:
    convert_contexts.append(seq['previous_text'])
    
    midas = seq['predict']['midas']
    entity = seq['predict']['entities']
    entity = entity[0]['label'] if entity else None
    text = seq['predict']['text']
    convert_responses.append((midas,entity,text))
    
    # calc stats
    midas_counter.update([midas])
    entity_counter.update([entity])
    midas_entity_counter.update([f'{midas} {entity}'])

In [374]:
with open('data/annotated/convert_responses.json', 'w', encoding='utf-8') as f:
    json.dump(convert_responses, f, ensure_ascii=False, indent=2)

In [378]:
zero_contexts, zero_responses = list(), list()

zero_midas_counter = Counter()

for seq in targets_zero:
    zero_contexts.append(seq['previous_text'])
    
    midas = seq['predict']['midas']
    entity = None
    text = seq['predict']['text']
    zero_responses.append((midas,entity,text))
    
    # calc stats
    zero_midas_counter.update([midas])

In [386]:
with open('data/annotated/convert_responses_zero.json', 'w', encoding='utf-8') as f:
    json.dump(zero_responses, f, ensure_ascii=False, indent=2)

#### Convert stats

In [375]:
midas_counter.most_common()

[('opinion', 4096),
 ('statement', 4029),
 ('yes_no_question', 868),
 ('pos_answer', 214),
 ('comment', 177),
 ('open_question_factual', 171),
 ('command', 170),
 ('open_question_opinion', 168),
 ('neg_answer', 66),
 ('complaint', 16),
 ('dev_command', 10),
 ('appreciation', 5),
 ('other_answers', 1)]

In [380]:
zero_midas_counter.most_common()

[('opinion', 2822),
 ('pos_answer', 2134),
 ('statement', 2091),
 ('comment', 860),
 ('neg_answer', 524),
 ('appreciation', 351),
 ('complaint', 338),
 ('open_question_factual', 220),
 ('yes_no_question', 210),
 ('other_answers', 179),
 ('command', 137),
 ('open_question_opinion', 102),
 ('dev_command', 23)]

In [376]:
entity_counter.most_common()

[('person', 2277),
 ('videoname', 1477),
 ('location', 1136),
 ('organization', 813),
 ('device', 660),
 ('duration', 629),
 ('genre', 542),
 ('sport', 508),
 ('number', 465),
 ('sportteam', 394),
 ('softwareapplication', 310),
 ('vehicle', 177),
 ('event', 148),
 ('position', 133),
 ('date', 130),
 ('year', 85),
 ('gamename', 57),
 ('party', 37),
 ('bookname', 9),
 ('songname', 4)]

In [377]:
midas_entity_counter.most_common()

[('opinion person', 1116),
 ('statement person', 730),
 ('opinion videoname', 694),
 ('statement location', 545),
 ('statement videoname', 518),
 ('statement duration', 493),
 ('opinion location', 394),
 ('opinion organization', 374),
 ('statement number', 366),
 ('opinion genre', 314),
 ('statement organization', 289),
 ('statement device', 271),
 ('opinion sport', 251),
 ('opinion sportteam', 221),
 ('opinion device', 218),
 ('yes_no_question person', 215),
 ('yes_no_question videoname', 169),
 ('statement softwareapplication', 148),
 ('statement sport', 135),
 ('statement sportteam', 133),
 ('statement genre', 105),
 ('opinion softwareapplication', 99),
 ('statement vehicle', 88),
 ('yes_no_question device', 83),
 ('yes_no_question location', 81),
 ('opinion duration', 76),
 ('statement date', 71),
 ('opinion position', 65),
 ('opinion event', 60),
 ('yes_no_question sport', 59),
 ('yes_no_question organization', 57),
 ('opinion number', 53),
 ('comment person', 53),
 ('yes_no_quest

### Preprocess datasets for train and evaluation

In [56]:
for_train_eval =  SequencePreprocessor(seq_validator=None)
train_dataset = for_train_eval.transform(train_seqs)
val_dataset = for_train_eval.transform(val_seqs)
test_dataset = for_train_eval.transform(test_seqs)
len(train_dataset), len(val_dataset), len(test_dataset)

(179286, 39089, 5206)

#### save context for convert

In [105]:
val_contexts = list()
test_contexts = list()

for seq in val_dataset:
    val_contexts.append(seq['previous_text'])
    
for seq in test_dataset:
    test_contexts.append(seq['previous_text'])
    
with open('data/annotated/val_contexts.json', 'w', encoding='utf-8') as f:
    json.dump(val_contexts, f, ensure_ascii=False, indent=2)
    
with open('data/annotated/test_contexts.json', 'w', encoding='utf-8') as f:
    json.dump(test_contexts, f, ensure_ascii=False, indent=2)  

In [61]:
train_dataset[0]

{'previous_text': [['Hi!', ' Have you every played golf?'],
  ['Hey I actually have played golf.',
   "Even though I'm horrible haha what about you?"],
  ['I have made noble attempts.',
   'but I am horrible.',
   ' Definitely not going to quit my day job.']],
 'previous_midas': [['statement', 'yes_no_question'],
  ['statement', 'open_question_opinion'],
  ['statement', 'opinion', 'opinion']],
 'midas_vectors': [[[0.008263594470918179,
    0.01513385958969593,
    0.012243056669831276,
    0.013781467452645302,
    0.016995860263705254,
    0.0018954542465507984,
    0.007130841724574566,
    0.00389679754152894,
    0.055932093411684036,
    0.004719457123428583,
    0.004707093816250563,
    0.8473060727119446,
    0.007994434796273708],
   [0.0053682513535022736,
    0.08350381255149841,
    0.005704871378839016,
    0.02419126406311989,
    0.021962281316518784,
    0.0038323821499943733,
    0.02248297818005085,
    0.011466488242149353,
    0.0012161567574366927,
    0.0028420155

## Extract features

In [114]:
import pickle
from collections import Counter

import numpy as np
import tensorflow_hub as hub

from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
os.environ['TFHUB_CACHE_DIR'] = './models/tf_cache'
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
encoder = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [74]:
entities_cnt = Counter()

for sample in train_dataset:
    for ut in sample['previous_entities']:
        entities_cnt.update([ent['label'] for sent in ut for ent in sent if sent])

In [78]:
Entity2ID = {label[0]: i for i, label in enumerate(entities_cnt.most_common())}

In [79]:
Midas2ID = {
    "appreciation": 0, "command": 1, "comment": 2,"complaint": 3,
    "dev_command": 4, "neg_answer": 5, "open_question_factual": 6,
    "open_question_opinion": 7, "opinion": 8, "other_answers": 9,
    "pos_answer": 10, "statement": 11, "yes_no_question": 12,
}

In [80]:
Entity2ID

{'misc': 0,
 'person': 1,
 'location': 2,
 'videoname': 3,
 'organization': 4,
 'device': 5,
 'sport': 6,
 'duration': 7,
 'number': 8,
 'genre': 9,
 'sportteam': 10,
 'position': 11,
 'event': 12,
 'softwareapplication': 13,
 'anaphor': 14,
 'vehicle': 15,
 'party': 16,
 'year': 17,
 'date': 18,
 'gamename': 19,
 'songname': 20,
 'bookname': 21}

In [138]:
EntityLabelEncoder = MultiLabelBinarizer()
EntityLabelEncoder.classes = [label for label in Entity2ID if label not in ['misc', 'anaphor']]

In [98]:
class SampleVectorizer:
    
    def __init__(
        self, text_vectorizer, entity2id:dict, midas2id:dict,
        context_len:int=3, embed_dim:int = 512):
        
        self.vectorizer = text_vectorizer
        self.entity2id = entity2id
        self.midas2id = midas2id
        self.context_len = context_len
        # 512 + 13 + 22 = 547
        self.utterance_vec_size = embed_dim + len(midas2id) + len(entity2id)
        # 3 * 547 = 1641
        self.vector_size = self.context_len * self.utterance_vec_size
        
        
    def context_vector(
        self, context: list, midas_vectors: list, entities: list) -> tuple:
        """
        vectorizes the previous context by concatenating text embeddings,
        midas probas and one-hot encoded entities """
        embedding = self.__embed(context)
        midas = self.__norm_midas(midas_vectors)
        entities = self.__oh_encode(entities)
        return self.__get_context_vec(embedding, midas, entities)
        
        
    def __embed(self, utterances: list) -> np.ndarray:
        """ 
        vectorizes a list of N previous utterances using a provided encoder
        input: List[str]
        output: numpy array (len(utterance), embed_dim)
        """
        return self.vectorizer([" ".join(ut) for ut in utterances]).numpy()
    
    
    def __norm_midas(self, midas_vectors: list) -> np.ndarray:
        """ 
        takes midas vectors of all sentences in the utterance
        and returns a vector with max values per midas label
        """
        vecs = np.zeros((len(midas_vectors), 13))
        
        for i, vec in enumerate(midas_vectors):
            # get max probability per each midas labels
            vecs[i] = np.max(np.array(vec), axis=0)

        # return normalized
        return vecs
    
    
    def __oh_encode(self, entities) -> np.ndarray:
        """ one-hot encoding of entities per each sample """
        entities = [[ent['label'] for sent in ut for ent in sent] for ut in entities]
        ohe_vec = np.zeros((len(entities), len(self.entity2id)))
        
        for i, ut in enumerate(entities):
            for ent in set(ut):
                ent_id = self.entity2id.get(ent, None)
                if not ent_id:
                    continue
                ohe_vec[i][ent_id] = 1
                
        return ohe_vec
    
    
    def __get_context_vec(self, embedding: np.ndarray,
                      midas_vec: np.ndarray, 
                      ohe_vec: np.ndarray) -> np.ndarray:
        """ 
        concatenates text embeddings with midas vectors 
        and one-hot encoded entities
        
        The output vector will be (n_utterances, self.vector_dim)
        Vector dim comes from:
        1. [embedding of utterance(i-2)]
        2. [midas proba distribution utterance(i-2)]
        3. [entity type one-hot utterance(i-2)]
        4. [embedding (i-1)]
        5. [midas (i-1)][entity (i-1)]
        6. [embedding (i)] 
        7. [midas (i)]
        8. [entity (i)]
        """
        assert embedding.shape[0] == midas_vec.shape[0] == ohe_vec.shape[0]
        vecs = np.zeros((self.context_len, self.utterance_vec_size))

        vecs[:,:embedding.shape[1]] = embedding
        vecs[:,embedding.shape[1]:embedding.shape[1]+midas_vec.shape[1]] = midas_vec
        vecs[:,embedding.shape[1]+midas_vec.shape[1]:] = ohe_vec
        
        vecs = vecs.reshape(-1)
        
        assert vecs.shape[0] == self.vector_size

        # returned context vector (1, n_ut * utterance_dim)
        return vecs.reshape(-1)

In [99]:
vectorizer = SampleVectorizer(
    text_vectorizer=encoder, # USE
    entity2id=Entity2ID,
    midas2id=Midas2ID,
    context_len=3,
    embed_dim=512)

In [104]:
vectorizer.context_vector(
    train_dataset[0]['previous_text'],
    train_dataset[0]['midas_vectors'],
    train_dataset[0]['previous_entities']
)[500:550]

array([ 0.03675972, -0.03856859,  0.0011045 ,  0.06951697,  0.0160016 ,
        0.06346866, -0.09389518,  0.02070764,  0.04127781, -0.01470397,
       -0.04645021,  0.00437406,  0.00826359,  0.08350381,  0.01224306,
        0.02419126,  0.02196228,  0.00383238,  0.02248298,  0.01146649,
        0.05593209,  0.00471946,  0.00548276,  0.84730607,  0.80616486,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.01185059, -0.01601361, -0.0223879 ])

In [145]:
import numpy as np
from tensorflow.keras.utils import Sequence

class Dataset(Sequence):
    
    """ customized Dataset class from torch """
    
    def __init__(self, data: list, vectorizer, batch_size: int = 32, shuffle: bool = False):
        self.data = data
        self.indexes = np.arange(len(self.data))
        self.vectorizer = vectorizer
        self.batch_size = batch_size
        self.shuffle=shuffle
        
    def __len__(self):
        """
        Denotes the number of batches per epoch
        A common practice is to set this value to [num_samples / batch size⌋
        so that the model sees the training samples at most once per epoch.
        """
        return int(np.ceil(len(self.data) / self.batch_size))

    def on_epoch_end(self):
        """
        Updates indexes after each epoch
        Shuffling the order so that batches between epochs do not look alike.
        It can make a model more robust.
        """
        if self.shuffle:
            np.random.shuffle(self.indexes)
            
    def __getitem__(self, idx: int):
        """ get batch_id and return its vectorized representation """
        indexes = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch = [self.data[index] for index in indexes]
        
        x_batch = np.zeros([len(batch), self.vectorizer.vector_size])
        y_batch = list()
        
        for i, sample in enumerate(batch):
            x_batch[i, :] = self.vectorizer.context_vector(
                sample['previous_text'],
                sample['midas_vectors'], sample['previous_entities'])
            
            midas_label = sample['predict']['midas']
            
            entity_labels = [ent['label'] for ent in sample['predict']['entities'] if ent]
            y_batch.append([midas_label, entity_labels])
        
        
        return x_batch, y_batch

In [146]:
train_loader = Dataset(
    data=train_dataset,
    vectorizer=vectorizer,
    batch_size=32,
    shuffle=False)

In [147]:
x, y = train_loader[17]
    
print(x.shape)

(32, 1641)


In [148]:
y

[['opinion', ['organization']],
 ['statement', ['location', 'device']],
 ['opinion', []],
 ['pos_answer', []],
 ['statement', []],
 ['opinion', ['person', 'device', 'device', 'person']],
 ['statement', ['genre']],
 ['statement', ['videoname']],
 ['opinion', ['person']],
 ['statement', []],
 ['opinion', ['location']],
 ['statement', ['videoname']],
 ['opinion', []],
 ['opinion', []],
 ['statement', []],
 ['opinion', ['person']],
 ['statement', []],
 ['pos_answer', []],
 ['comment', []],
 ['statement', []],
 ['opinion', ['vehicle', 'vehicle']],
 ['opinion', []],
 ['statement', ['location']],
 ['comment', []],
 ['statement', []],
 ['comment', []],
 ['statement', []],
 ['opinion', []],
 ['opinion', []],
 ['opinion', []],
 ['statement', []],
 ['opinion', []]]

#### split midas and entity labels to encode them separately

In [149]:
# split midas and entites
y_midas = list()
y_entity = list()

for label in y:
    # encode midas labels
    y_midas.append(Midas2ID[label[0]])
    y_entity.append(label[1])
    
# encode entity labels
y_entity = EntityLabelEncoder.fit_transform(y_entity)

In [150]:
y_entity

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0,

In [152]:
# check OOV labels
EntityLabelEncoder.fit_transform([['asd', 'person'], ['person', 'vehicle']])

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

#### train on a single dataset

In [154]:
train_loader = Dataset(
    data=train_dataset,
    vectorizer=vectorizer,
    batch_size=len(train_dataset),
    shuffle=False)

val_loader = Dataset(
    data=val_dataset,
    vectorizer=vectorizer,
    batch_size=len(val_dataset),
    shuffle=False)

In [155]:
for X_train, y_train in train_loader:
    break

In [158]:
# split midas and entites
y_midas_train = list()
y_entity_train = list()

for label in y_train:
    # encode midas labels
    y_midas_train.append(Midas2ID[label[0]])
    y_entity_train.append(label[1])
    
# encode entity labels
y_entity_train = EntityLabelEncoder.fit_transform(y_entity_train)

In [159]:
with open('data/annotated/vectorized_train.npy', 'wb') as f:
    np.save(f, X_train)
    np.save(f, y_midas_train)
    np.save(f, y_entity_train)        

In [156]:
for X_val, y_val in val_loader:
    break

In [160]:
# split midas and entites
y_midas_val = list()
y_entity_val = list()

for label in y_val:
    # encode midas labels
    y_midas_val.append(Midas2ID[label[0]])
    y_entity_val.append(label[1])
    
# encode entity labels
y_entity_val = EntityLabelEncoder.fit_transform(y_entity_val)

In [161]:
with open('data/annotated/vectorized_val.npy', 'wb') as f:
    np.save(f, X_val)
    np.save(f, y_midas_val)
    np.save(f, y_entity_val) 

In [167]:
X_train.shape, len(y_midas_train), len(y_entity_train)

((179286, 1641), 179286, 179286)

# MODELS

with open('data/annotated/vectorized_train.npy', 'rb') as f:
    X_train = np.load(f)
    y_midas_train = np.load(f)
    y_entity_train = np.load(f)

with open('data/annotated/vectorized_val.npy', 'rb') as f:
    X_val = np.load(f)
    y_midas_val = np.load(f)
    y_entity_val = np.load(f)

In [216]:
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

### Midas

In [210]:
rfc_midas = RandomForestClassifier(max_depth=20, max_samples=0.40, random_state=42)

In [211]:
rfc_midas.fit(X_train, y_midas_train)

RandomForestClassifier(max_depth=20, max_samples=0.4, random_state=42)

In [212]:
midas_preds = rfc_midas.predict(X_val)

In [213]:
Counter(midas_preds)

Counter({11: 13016,
         8: 25526,
         2: 88,
         10: 252,
         12: 64,
         5: 32,
         1: 28,
         7: 15,
         6: 42,
         3: 19,
         0: 5,
         4: 1,
         9: 1})

In [214]:
accuracy_score(y_midas_val, midas_preds)

0.40740361738596537

In [215]:
f1_score(y_midas_val, midas_preds, average='weighted')

0.3172102899963896

In [218]:
# Store data (serialize)
with open('models/rfcME_midas_depth20_maxsample04.pickle', 'wb') as f:
    pickle.dump(rfc_midas, f, protocol=pickle.HIGHEST_PROTOCOL)

### Entity

In [300]:
from sklearn.multiclass import OneVsRestClassifier
from joblib import dump, load

In [310]:
rfc_entity = RandomForestClassifier(max_depth=20, max_samples=0.40, random_state=42)

In [311]:
entity_clf = OneVsRestClassifier(rfc_entity)

In [312]:
entity_clf.fit(X_train, y_entity_train)

OneVsRestClassifier(estimator=RandomForestClassifier(max_depth=20,
                                                     max_samples=0.4,
                                                     random_state=42))

In [313]:
entity_preds = entity_clf.predict_proba(X_val)

In [314]:
preds_cnt = Counter()

for ent in entity_preds:
    pr = np.argmax(ent)
    preds_cnt.update([pr])
    
preds_cnt.most_common()

[(0, 13547),
 (1, 8173),
 (2, 3803),
 (3, 2909),
 (4, 2213),
 (5, 2003),
 (6, 1447),
 (7, 1237),
 (8, 912),
 (9, 830),
 (13, 653),
 (12, 485),
 (10, 357),
 (16, 196),
 (11, 121),
 (14, 108),
 (15, 50),
 (17, 32),
 (19, 8),
 (18, 5)]

In [316]:
entity_preds = entity_clf.predict(X_val)

In [317]:
accuracy_score(y_entity_val, entity_preds)

0.8460436439919159

In [318]:
for ent in entity_preds:
    if sum(ent) > 0:
        print(ent)

[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [320]:
f1_score(y_entity_val, entity_preds, average=None)

array([0.00149589, 0.        , 0.        , 0.        , 0.00410678,
       0.01058201, 0.0344086 , 0.00438596, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [315]:
dump(entity_clf, 'models/entity_clf_depth20_maxsplit_04.joblib')

['models/entity_clf_depth20_maxsplit_04.joblib']

In [325]:
loaded_clf = load('models/entity_clf_depth20_maxsplit_04.joblib')

In [327]:
preds_cnt = Counter()

for ent in loaded_clf.predict_proba(X_val):
    pr = np.argmax(ent)
    preds_cnt.update([pr])
    
preds_cnt.most_common()

[(0, 13547),
 (1, 8173),
 (2, 3803),
 (3, 2909),
 (4, 2213),
 (5, 2003),
 (6, 1447),
 (7, 1237),
 (8, 912),
 (9, 830),
 (13, 653),
 (12, 485),
 (10, 357),
 (16, 196),
 (11, 121),
 (14, 108),
 (15, 50),
 (17, 32),
 (19, 8),
 (18, 5)]

# Inference

possible responses

In [None]:
import pandas as pd

In [387]:
with open('data/annotated/convert_responses.json', 'r', encoding='utf8') as f:
    responses = json.load(f)
    
with open('data/annotated/convert_responses_zero.json', 'r', encoding='utf8') as f:
    responses_zero = json.load(f)

responses = pd.DataFrame(responses + responses_zero, columns=['midas', 'entity', 'text'])

In [390]:
responses.head()

Unnamed: 0,midas,entity,text
0,statement,sport,i didn't even realize that he played any SPORT.
1,opinion,person,i head it was because PERSON was so dominant a...
2,statement,organization,usually though ORGANIZATION.
3,statement,location,i don't think anything is happening in LOCATIO...
4,statement,softwareapplication,not unless SOFTWAREAPPLICATION publishes their...


In [476]:
responses.index[(responses.midas == 'opinion') & (responses.entity.isnull())]

Int64Index([10006, 10010, 10013, 10016, 10018, 10022, 10027, 10028, 10032,
            10033,
            ...
            19945, 19951, 19952, 19957, 19958, 19963, 19970, 19975, 19979,
            19980],
           dtype='int64', length=2822)

In [391]:
with open('data/convert_vecs/resp_encoding.npy', 'rb') as f: 
    responses_vecs = np.load(f)
    
with open('data/convert_vecs/resp_encoding_zero.npy', 'rb') as f: 
    responses_vecs_zero = np.load(f)
    


In [458]:
with open('data/convert_vecs/test_context_encoding.npy', 'rb') as f: 
    context_vecs = list()
    for i in range(len(test_seqs)):
        context_vecs.append(np.load(f))
        
context_vecs = np.vstack(context_vecs)

In [459]:
responses_vecs.shape, responses_vecs_zero.shape, context_vecs.shape

((19982, 512), (9991, 512), (5206, 512))

In [396]:
responses_vecs.shape

(19982, 512)

In [395]:
responses_vecs = np.vstack((responses_vecs, responses_vecs_zero))

In [397]:
vectorizer = SampleVectorizer(
    text_vectorizer=encoder, # USE
    entity2id=Entity2ID,
    midas2id=Midas2ID,
    context_len=3,
    embed_dim=512)

In [401]:
class EntityChecker:
    
    def __init__(
        self, entity2id: list, stoplist: list = ['misc', 'anaphor']):
        self.entity2id = entity2id
        self.stoplist = stoplist
        
    def get_context_entities(self, sample: dict) -> list:
        """ 
        returns all the entities from the context
        except those from the stoplist
        
        output:
        entities: List[Tuple]
        """
        entities = [self.__get_entities(ut) for ut in sample['previous_entities']]
        
        return entities
    
    def __get_entities(self, ut: list) -> list:
        """ extract entities from a single utterance """
        ents = [
            (ent['label'], ent['text'], self.entity2id[ent['label']]) 
            for sent in ut for ent in sent if sent and ent['label'] not in self.stoplist]

        return ents

In [484]:
class Inference:
    
    def __init__(self, vectorizer, midas_clf, 
                 entity_clf, entity_checker, 
                 responses, responses_vecs):
        self.vectorizer = vectorizer
        self.midas_clf = midas_clf
        self.entity_clf = entity_clf
        self.entity_checker = entity_checker
        self.id2midas = list(vectorizer.midas2id)
        self.id2entity = list(vectorizer.entity2id)
        self.responses = responses
        self.responses_vecs = responses_vecs
        
    def infer_labels(self, sample) -> tuple:
        """
        takes context, midas_vectors and annotated entities
        and predicts midas label and entity for the next utterance
        """
        vec = self.vectorizer.context_vector(
            sample['previous_text'],
            sample['midas_vectors'],
            sample['previous_entities'])
        # (n_features,) -> (1, n_features)
        vec = vec[None,:]
        midas_id = self.midas_clf.predict(vec)[0]
        midas_pred = self.id2midas[midas_id]
        
        entities = self.entity_checker.get_context_entities(sample)
        
        entity_ids = [entity[2] for sent in entities if sent for entity in sent]
        
        entity_pred, entity_text = None, ""

        if entity_ids:
            # predict entity if there are of them in the context
            entity_probas = self.entity_clf.predict_proba(vec)[0]
            entity2proba = dict(zip(entity_ids, entity_probas[entity_ids]))
            entity_id = max(entity2proba, key=entity2proba.get)
            entity_pred = self.id2entity[entity_id]
    
        if entity_pred:
            for sent in entities:
                for ent in sent:
                    if ent[0] == entity_pred:
                        entity_text = ent[1]
                
            
        return midas_pred, entity_pred, entity_text
    
    def get_candidate_ids(self, midas_label, entity_label) -> list:
        """
        filters bank of responses and returns ids of candidates
        meeting the midas_entity requirements
        """
        midas_mask = self.responses.midas == midas_label
        
        if entity_label:
            entity_mask = self.responses.entity == entity_label
        else:
            entity_mask = self.responses.entity.isnull()

        candidate_ids = self.responses.index[(midas_mask) & (entity_mask)]
        return candidate_ids.tolist()

In [485]:
inference = Inference(
    vectorizer=vectorizer,
    midas_clf=rfc_midas,
    entity_clf=entity_clf,
    entity_checker=EntityChecker(Entity2ID),
    responses=responses,
    responses_vecs=responses_vecs
)

In [489]:
for sample, context_vec in zip(test_dataset[1000:1050], context_vecs[1000:1050]):
    
    midas_label, ent_label, ent_text = inference.infer_labels(sample)
    candidate_ids = inference.get_candidate_ids(midas_label, ent_label)
    candidate_vecs = inference.responses_vecs[candidate_ids]
    scores = context_vec.dot(candidate_vecs.T)
    candidate_pos = np.argmax(scores)
    candidate_id = candidate_ids[candidate_pos]

    context = [" ".join(ut) for ut in sample['previous_text']]
    for ut in context:
        print(ut)
    
    print('\n')        
    text = inference.responses.at[candidate_id, 'text']
    print(text)
    if ent_text:
        print(text.replace(ent_label.upper(), ent_text.upper()))
    print('\n\n\n')

Must be Las Vegas... Did you know Elmo is the only non-human to testify to the US congress?
How is that possible lol. the congress is a circus at times
Yep. I didn't know the Rep's and Dem's played a baseball game every year since 1909.


in 1976 PERSON was a co-founder.
in 1976 ELMO was a co-founder.




How is that possible lol. the congress is a circus at times
Yep. I didn't know the Rep's and Dem's played a baseball game every year since 1909.
So they also have a rivalry in sports. do you know which side is currently the winner?


i don't really...know too much about SPORT.
i don't really...know too much about BASEBALL.




Yep. I didn't know the Rep's and Dem's played a baseball game every year since 1909.
So they also have a rivalry in sports. do you know which side is currently the winner?
The GOP is ahead 3 games. I can't believe Norway donated $1B to save the Amazon rainforest.  Good for them!


i wonder if it did all the SPORT for them?
i wonder if it did all the BASEBALL for

In [445]:
inference.responses.head()

Unnamed: 0,midas,entity,text
0,statement,sport,i didn't even realize that he played any SPORT.
1,opinion,person,i head it was because PERSON was so dominant a...
2,statement,organization,usually though ORGANIZATION.
3,statement,location,i don't think anything is happening in LOCATIO...
4,statement,softwareapplication,not unless SOFTWAREAPPLICATION publishes their...
