In [5]:
import pandas as pd
import re, nltk, random, os, json
import numpy as np
from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi, BM25L, BM25Plus

In [872]:
from simpletransformers.question_answering import QuestionAnsweringModel
import json
import os
import logging


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Create the QuestionAnsweringModel
model = QuestionAnsweringModel('distilbert', 'distilbert-base-uncased-distilled-squad', use_cuda=False, args={'reprocess_input_data': True, 'overwrite_output_dir': True, 'use_cuda': False})

In [531]:
# idx = 0
# to_predict = []
# for context in context_sentences:
#     qas = []
#     for prop in cand_properties:
#         qas.append({'question': cand_properties[prop], 'id': idx})
#         idx += 1
#     to_predict.append({'context': context, 'qas':qas})

In [6]:
stop_words = set(stopwords.words('english'))

regex = re.compile('[^a-zA-Z0-9.,]')
#First parameter is the replacement, second parameter is your input string
def preprocessingText(doc,enableStopword=False):
#     doc = regex.sub(' ', doc)
    if enableStopword:
        doc = " ".join([w for w in doc.split() if not w in stop_words])
    return doc.lower().replace("\n","")

df_wiki = pd.read_csv("data/wikihowSep.csv")
df_wiki['title'] = df_wiki['title'].str.replace("How to ", "")
for col in ['overview', 'headline', 'text', 'sectionLabel', 'title']:
    df_wiki[col] = [preprocessingText(str(i), False) for i in df_wiki[col]]
    
df_wiki['title'] = [i if not i[-1].isdigit() else i[:-1] for i in df_wiki['title']]

In [7]:
import glob, re, math

def camel_case_split(str): 
    words = [[str[0]]] 
    for c in str[1:]: 
        if words[-1][-1].islower() and c.isupper(): 
            words.append(list(c)) 
        else: 
            words[-1].append(c) 

    return " ".join([i.lower() for i in [''.join(word) for word in words]])    

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext.lower()

def simplifyDomainName(oldName):
    special = {"buse": "bus", "rentalcar": "rent car", "ridesharing": "taxi", "messaging": "message"}
    newName = oldName.split("_")[0].lower()
    if newName[-1] == "s":
        newName = newName[:-1]
    if newName in special:
        newName = special[newName]
    return newName

schema = pd.read_json("data/dialog/dstc8-schema-guided-dialogue/train/schema.json")
schema = pd.concat([schema, pd.read_json("data/dialog/dstc8-schema-guided-dialogue/dev/schema.json")])
schema = pd.concat([schema, pd.read_json("data/dialog/dstc8-schema-guided-dialogue/test/schema.json")])

schema['service_name'] = [simplifyDomainName(i) for i in schema.service_name.tolist()]
domains = schema.service_name.unique().tolist()

orgType = pd.read_csv("data/dialog/all-layers-types.csv")
orgProp = pd.read_csv("data/dialog/all-layers-properties.csv")

schemaCorpus = [camel_case_split(i[0]) + " #SEP# " + cleanhtml(i[1]).lower() for i in orgType[['label', 'comment']].values]
bm25s = BM25L([doc.split() for doc in schemaCorpus])

sProp2Desc = {camel_case_split(i[0]):cleanhtml(i[1]).lower() for i in np.concatenate([orgProp[['label', 'comment']].values, orgType[['label', 'comment']].values])}
sType2Prop = {}
sType2Subtype = {}
for i,j,k in orgType[['label', 'subTypeOf', 'properties']].values:
    if type(j) == str:
        sType2Subtype[camel_case_split(i)] = [camel_case_split(z) for z in j.replace("http://schema.org/", "").split(", ")]
                           
    if type(k) == str:
        sType2Prop[camel_case_split(i)] = [camel_case_split(z) for z in k.replace("http://schema.org/", "").split(", ")]
    

In [8]:
corpus = []
doc2title = {}
for name, group in df_wiki.groupby("title"):
    doc = []
    for col in ["overview", "sectionLabel", "headline", "text"]:
        doc.append(" ".join(group[col].unique().tolist()))
        
    doc = name+" #SEP# "+ " ".join(doc)
    corpus.append(doc)
    doc2title[doc] = name
bm25w = BM25L([doc.split() for doc in corpus])

In [456]:
# distant
from textblob import TextBlob
import nltk
unknown_services = ['alarm', 'weather']
matches = []
retrieved_properties = []
for service, desc in schema[['service_name', 'description']].values:
#   ignore weather and alarm
    if service in unknown_services:
        continue
        
    types = [i.split(" #SEP# ")[0] for i in bm25s.get_top_n(service.split(), schemaCorpus, n=5)]
    wiki2desc = dict(i.split(" #SEP# ") for i in bm25w.get_top_n(preprocessingText(desc).split(), corpus, n=5))

    #   get subtypes
    subtypes = []
    for t in types:
        if t in sType2Subtype:
            subtypes.extend(sType2Subtype[t])
    subtypes = list(set(subtypes))
    
    cand_props = []
    for t in types:
        if t in sType2Prop:
            cand_props.extend(sType2Prop[t])
            retrieved_properties.extend(sType2Prop[t])
    cand_props += types
    
    seen_prop = [""]
    for prop in cand_props:
        if prop in schema.service_name.unique().tolist():
            continue
        propLast = prop.split()[-1]
        #   check if nous
        tokenized = nltk.word_tokenize(propLast)
        nouns = [word  for (word, pos) in nltk.pos_tag(tokenized) if(pos[:2] == 'NN')]
        if len(nouns) == 0:
            propLast = ""
            
        for wiki in wiki2desc:
            for sentence in wiki2desc[wiki].split(". "):
                found_exact_match = False
                exact_match_start = sentence.find(prop)
                if exact_match_start > -1:
                    matches.append([sProp2Desc[prop], prop, exact_match_start, sentence])
                    found_exact_match = True
                    
                if propLast not in seen_prop and not found_exact_match:

                    exact_match_start = sentence.find(propLast)
                    if exact_match_start > -1:
                        matches.append([sProp2Desc[prop], propLast, exact_match_start, sentence])
        seen_prop.append(propLast)

In [None]:
idx = 1
trainData = []
for i in matches:
    question, text, answer_start, context = i
    neg_question = random.choice(list(sProp2Desc.values()))
    while neg_question == question:
        neg_question = random.choice(list(sProp2Desc.values()))
                        
    tmp = {"context": context, 'qas':[{'id':str(idx+1), 'is_impossible': True, 'question': neg_question, 'answers':[]},{'id':str(idx), 'is_impossible': False, 'question': question, 'answers':[{'text': i[1], 'answer_start': i[2]}]}]}
    trainData.append(tmp)
    idx += 2

os.makedirs('data', exist_ok=True)
with open('data/dialog/distantWithNegativeTrain.json', 'w') as f:
    json.dump(trainData, f)
# model.train_model('data/dialog/distantWithNegativeTrain.json')

In [113]:
schema

Unnamed: 0,service_name,description,slots,intents
0,bank,Manage bank accounts and transfer money,"[{'name': 'account_type', 'description': 'The ...","[{'name': 'CheckBalance', 'description': 'Chec..."
1,bus,Book bus journeys from the biggest bus network...,"[{'name': 'from_location', 'description': 'Cit...","[{'name': 'FindBus', 'description': 'Find a bu..."
2,bus,Find a bus to take you to the city you want,"[{'name': 'origin', 'description': 'Origin cit...","[{'name': 'FindBus', 'description': 'Find a bu..."
3,calendar,Calendar service to manage personal events and...,"[{'name': 'event_date', 'description': 'Date o...","[{'name': 'GetEvents', 'description': 'Get lis..."
4,event,The comprehensive portal to find and reserve s...,"[{'name': 'category', 'description': 'Type of ...","[{'name': 'FindEvents', 'description': 'Find e..."
...,...,...,...,...
16,service,A widely used service for finding and reservin...,[{'description': 'Name of the hair stylist/sal...,[{'description': 'Book an appointment at a hai...
17,service,Discover the right therapist for you and make ...,"[{'description': 'Name of the therapist', 'is_...",[{'description': 'Make a reservation with the ...
18,train,Service to find and reserve train journeys bet...,[{'description': 'Starting city for train jour...,[{'description': 'Reserve tickets for train jo...
19,travel,The biggest database of tourist attractions an...,[{'description': 'City or town where the attra...,[{'description': 'Browse attractions in a give...


In [134]:
# Unsupervised Model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

query = ["order", "pizza"]
types = [i.split(" #SEP# ")[0] for i in bm25s.get_top_n(query, schemaCorpus, n=5)]
print(types)
cand_props = []
for t in types:
    if t in sType2Prop:
        cand_props.extend(sType2Prop[t])
cand_props = list(set(cand_props))

tmp = df_wiki[(df_wiki.title.str.contains(query[0])) & (df_wiki.title.str.contains(query[1]))]
sentences = []
for i in tmp.text.tolist():
    sentences.extend(i.split("·"))

sentence_embs = model.encode(sentences)
property_embs = model.encode([sProp2Desc[i] for i in cand_props if i in sProp2Desc])

from sklearn.metrics.pairwise import cosine_similarity
memo = []
for p, p_emb in zip(cand_props, property_embs):
    max_sim = 0
    max_sen = ""
    for s, s_emb in zip(sentences, sentence_embs):
        sim = cosine_similarity([p_emb], [s_emb])[0][0]
        if sim > max_sim:
            max_sim = sim
            max_sen = s
    memo.append([p, max_sim, max_sen])
[i[0] for i in sorted(memo, key=lambda x: -x[1])[:20]]

['order processing', 'order returned', 'order in transit', 'order pickup available', 'order item']


['ordered item',
 'order item number',
 'order delivery',
 'order item status',
 'disambiguating description',
 'identifier',
 'description',
 'additional type',
 'main entity of page',
 'same as',
 'url',
 'order quantity',
 'name',
 'image',
 'potential action',
 'alternate name',
 'subject of']

In [99]:
types

['car', 'auto dealer', 'auto repair', 'auto wash', 'auto rental']

In [10]:
wiki = pd.read_csv("data/cedr/akgg-wikipedia.tsv", sep="\t", names=['na', 'qid', 'text'])
queries = pd.read_csv("data/cedr/akgg-query.tsv", sep="\t", names=['na', 'qid', 'text', 'entity'])
doc = pd.read_csv("data/cedr/akgg-doc.tsv", sep="\t", names=['na', 'qid', 'text'])
prop2desc = dict(zip([camel_case_split(i) for i in orgProp.label], [cleanhtml(i) for i in orgProp.comment]))
type2prop = dict(zip([camel_case_split(i) for i in orgType.label], [cleanhtml(i) for i in orgType.comment]))
doc2id =  dict(zip([camel_case_split(i) for i in doc.text], doc.qid))
id2wiki = dict(zip(wiki.qid, wiki.text))

In [11]:
df = pd.read_csv("data/AKG/Test Collection/AKGG/akg_standard_akgg_property_rele.csv")
df_action = pd.read_csv("data/AKG/Test Collection/AM/akg_standard_am_verb_object_rele.csv")

with open("data/AKG/Formal Run Topics/AKGG_Formal_Run_Topic.json") as json_file:
    data = json.load(json_file)
    qid, query, entity, entityType, action = [], [], [], [], []
    for p in data['queries']:
        qid.append(p['queryId'])
        query.append(p['query'])   
        entity.append(p['entity'])
        entityType.append(' '.join(p['entityTypes']))    
        action.append(p['action'])
topic = pd.DataFrame({"query_id": qid, "query": query, "entity": entity, "entityType": entityType, "action":action})
for c in ["query", "action", "entity"]:
    topic[c] = topic[c].str.lower().replace("\'", "")
    
df = df.merge(topic, how="inner", on="query_id")
df['entityType'] = [camel_case_split(i.split()[-1]) for i in df.entityType.tolist()]
df = df[['query_id', 'entity', 'action', 'entityType']].drop_duplicates()
id2entityType = dict(zip(df.query_id, df.entityType))


In [12]:
# org = orgType[['label', 'comment']]
# org = org.append(orgProp[['label', 'comment']])
org = orgProp[['label', 'comment']]
org['label'] = [camel_case_split(i) for i in org['label']]
org['comment'] = [cleanhtml(i) for i in org['comment']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [90]:
prop2emb = dict(zip(org.label, model.encode(org.label)))

In [14]:
import modeling
import Data
import train5foldCV

In [18]:
import io
qrels = Data.read_qrels_dict(io.TextIOWrapper(io.open("data/cedr/akgg-qrel.tsv",'rb'), 'UTF-8'))

                                           

In [91]:
ndcg = []
for idx, row in queries.iterrows():
#     w_emb = model.encode(id2wiki[row['qid']].split(". ") + [row['text']])
    w_emb = model.encode([row['text']])
    memo = []
    if id2entityType[row['qid']] in sType2Prop:
        for prop in sType2Prop[id2entityType[row['qid']]]:
            if prop in prop2emb and prop in doc2id:
                memo.append((doc2id[prop], max(cosine_similarity([prop2emb[prop]], w_emb)[0])))
    if len(memo) > 0:
        ranked_list = [str(i[0]) for i in sorted(memo, key=lambda x : -x[1])]
        result = train5foldCV.eval(qrels[str(row['qid'])], ranked_list)
        ndcg.extend([result['ndcg@10']] * freq[row['qid']])
    else:
        ndcg.extend([0]*freq[row['qid']])

In [92]:
np.mean(ndcg)

0.35829920882793

In [63]:
dtest = pd.read_csv("data/cedr/akgg-test0.tsv", sep="\t", names=['qid'], usecols=[0])
for i in [1,2,3,4]:
    dtest = dtest.append(pd.read_csv("data/cedr/akgg-test%d.tsv" % i, sep="\t", names=['qid'], usecols=[0]))

In [68]:
from collections import Counter
freq = Counter(dtest.qid)

In [69]:
freq[1]

20

In [749]:
# generate dstc8-schema-guided-dialogue
# Train and Test set
from sklearn.model_selection import train_test_split
import random
import glob

# read dialog datasets
txtfiles = []
dialog = None
for i in ['train', 'dev', 'test']:
    for file in glob.glob("data/dialog/dstc8-schema-guided-dialogue/%s/*.json" % i):
        if "schema.json" in file:
            continue
        txtfiles.append(file)
        dialog = pd.concat([dialog, pd.read_json(file)])
        
# # extract single domains
unique_single_domains = list(set([simplifyDomainName(i[0]) for i in dialog[dialog.services.str.len() == 1]['services']]))
# split domains to train and test sets
trainIds, testIds = train_test_split(list(set(unique_single_domains)), test_size=0.5, random_state=2020)

prop2desc = {}
schemaT = pd.read_json("data/dialog/dstc8-schema-guided-dialogue/train/schema.json")
schemaT = pd.concat([schemaT, pd.read_json("data/dialog/dstc8-schema-guided-dialogue/dev/schema.json")])
schemaT = pd.concat([schemaT, pd.read_json("data/dialog/dstc8-schema-guided-dialogue/test/schema.json")])
for idx, s in schemaT.iterrows():
    service = s.service_name
    for i in s.slots:
        prop2desc[service+"_"+i['name']] = i['description']


single_domain_dialogs = dialog[dialog.services.str.len() == 1]
# single_domain_dialogs['domain'] = [i[0] for i in single_domain_dialogs['services']]

In [755]:
trainIds = []
testIds = list(set(unique_single_domains))

In [756]:
trainIds, testIds

([],
 ['bus',
  'media',
  'event',
  'restaurant',
  'taxi',
  'bank',
  'calendar',
  'flight',
  'payment',
  'rent car',
  'travel',
  'alarm',
  'weather',
  'movie',
  'train',
  'service',
  'home',
  'music',
  'hotel'])

In [757]:
train_data = []
test_data = []

qid = 1
for idx, row in single_domain_dialogs.iterrows():
    frames = row['turns']
    context = []
    
    if str(frames)=='nan':
        continue
        
    for f in frames:
        service = f['frames'][0]['service']
#         service = service.split("_")[0].lower()
        if len(f['frames']) == 1:
            utterance = f['utterance']
            if len(f['frames'][0]['slots']) != 0:
                terms = []
                for s in f['frames'][0]['slots']:
                    
#                   replace text with slot name
#                     _context = " ".join(context)
#                     answer_start = len(_context) + s["start"] + 1
#                     new_answer = utterance[:s["start"]] + s['slot'].replace("_", " ") + utterance[s["exclusive_end"]+1:]
#                     _context += " " + new_answer
#                     _json = {'context': _context, 'qas':[{'id': str(qid), 'is_impossible': False, 
#                                                           'question': prop2desc[service+"_"+s['slot']], 'answers':[{'text':s['slot'].replace("_"," "), 
#                                                            'answer_start': answer_start}]}]}
                
                    _context = " ".join(context)
                    answer_start = len(_context) + s["start"] + 1
#                     new_answer = utterance[:s["start"]] + s['slot'].replace("_", " ") + utterance[s["exclusive_end"]+1:]
                    _context += " " + utterance
                    question = prop2desc[service+"_"+s['slot']]
#                   random negative question
                    neg_question = random.choice(list(prop2desc.values()))
                    while neg_question == question:
                        neg_question = random.choice(list(prop2desc.values()))
                        
                    _json = {'context': _context, 'qas':[{'id': str(qid), 'service':service, 'is_impossible': False, 
                                                          'question': question, 'answers':[{'text':utterance[s['start']: s['exclusive_end']], 
                                                           'answer_start': answer_start}]}, {'id': str(qid+1), 'is_impossible': True, 'question': neg_question, 'answers':[] }]}
        
                    if simplifyDomainName(service) in trainIds:
                        train_data.append(_json)
                    else:
                        test_data.append(_json)
                    qid += 2
            context.append(utterance)
#     if len(train_data) > 0:
#         break

In [758]:
len(train_data), len(test_data)

(0, 73374)

In [759]:
### Save as a JSON file
import json
with open('data/dialog/allTrain.json', 'w') as f:
    json.dump(train_data, f)
import json
with open('data/dialog/allTest.json', 'w') as f:
    json.dump(test_data, f)

In [754]:
model.train_model('data/dialog/unseenTrain.json')

INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
  0%|          | 0/68922 [00:00<?, ?it/s]INFO:simpletransformers.question_answering.question_answering_utils:*** Example ***
INFO:simpletransformers.question_answering.question_answering_utils:unique_id: 1000000000
INFO:simpletransformers.question_answering.question_answering_utils:example_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:doc_span_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:tokens: [CLS] [UNK] of the song [SEP] [UNK] want to listen to some songs . [UNK] , how about the song [UNK] [UNK] by the [UNK] [UNK] [UNK] [UNK] on their album [UNK] [UNK] ? [SEP]
INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 6:0 7:1 8:2 9:3 10:4 11:5 12:6 13:6 14:7 15:7 16:8 17:9 18:10 19:11 20:12 21:13 22:14 23:15 24:16 25:17 26:18 27:19 28:20 29:21 30:22 31:23 32:24 33:24
INFO:simpletransformers.qu

INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 10:0 11:1 12:2 13:3 14:4 15:5 16:6 17:6 18:7 19:7 20:8 21:9 22:10 23:11 24:12 25:13 26:14 27:15 28:16 29:17 30:18 31:19 32:20 33:21 34:22 35:23 36:24 37:24
INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 10:True 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True
INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 1997 6001 2000 3914 3345 9735 2005 102 100 2215 2000 4952 2000 2070 2774 1012 100 1010 2129 2055 1996 2299 100 100 2011 1996 100 100 100 100 2006 2037 2201 100 100 1029 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:

INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 1996 2299 7460 2000 102 100 2215 2000 4952 2000 2070 2774 1012 100 1010 2129 2055 1996 2299 100 100 2011 1996 100 100 100 100 2006 2037 2201 100 100 1029 100 1005 1055 2298 2005 2242 2842 1010 2151 6907 2097 2147 1012 100 2055 2242 2006 1996 2201 100 100 100 1029 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:simpletransformers.question_answering.question_answering_utils:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:simpletransformers.question_answering.question_answering_utils:segment_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 6:0 7:1 8:2 9:3 10:4 11:5 12:6 13:6 14:7 15:7 16:8 17:9 18:10 19:11 20:12 21:13 22:14 23:15 24:16 25:17 26:18 27:19 28:20 29:21 30:22 31:23 32:24 33:24 34:25 35:25 36:25 37:26 38:27 39:28 40:29 41:29 42:30 43:31 44:32 45:33 46:33 47:34 48:35 49:36 50:37 51:38 52:39 53:40 54:41 55:42 56:42 57:43 58:44 59:45 60:46 61:47 62:48 63:49 64:50 65:51 66:52 67:53 68:54 69:55 70:55
INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 6:True 7:True 8:True 9:True 10:True 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:Tr

INFO:simpletransformers.question_answering.question_answering_utils:segment_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:simpletransformers.question_answering.question_answering_utils:impossible example
INFO:simpletransformers.question_answering.question_answering_utils:*** Example ***
INFO:simpletransformers.question_answering.question_answering_utils:unique_id: 1000000012
INFO:simpletransformers.question_answering.question_answering_utils:example_index: 12
INFO:simpletransformers.question_answering.question_answering_utils:doc_span_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:tokens: [CLS] [UNK] the song belongs to [SEP] [UNK] want to listen to some songs . [UNK] , how about the song [UNK] [UNK] by the [UNK] [UNK] [UNK] [UNK] on their 

INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 6:True 7:True 8:True 9:True 10:True 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True
INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 1997 1996 2299 102 100 2215 2000 4952 2000 2070 2774 1012 100 1010

INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 1997 1996 2299 102 100 2215 2000 4952 2000 2070 2774 1012 100 1010 2129 2055 1996 2299 100 100 2011 1996 100 100 100 100 2006 2037 2201 100 100 1029 100 1005 1055 2298 2005 2242 2842 1010 2151 6907 2097 2147 1012 100 2055 2242 2006 1996 2201 100 100 100 1029 100 2000 2963 100 100 2006 1996 100 100 2201 100 100 100 1029 100 1010 2008 1005 1055 2307 1012 100 1045 2377 2009 2085 1029 100 1010 2377 2009 1012 100 12210 2008 100 2323 2377 100 100 2006 1996 100 1012 100 1010 2377 2009 2006 1996 3829 5882 1012 100 1010 12210 2008 100 1005 1049 2652 100 100 2006 1996 3829 5882 1012 102 0 0
INFO:simpletransformers.question_answering.question_answering_utils:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 11:0 12:1 13:2 14:3 15:4 16:5 17:5 18:6 19:6 20:6 21:7 22:8 23:9 24:9
INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True
INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 1997 1996 3063 1996 2299 2003 2864 2011 102 100 2017 2377 2033 2070 2189 1029 100 1005 1049 3241 100 3892 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:simpletransformers.question_answering.question_answering_utils:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

KeyboardInterrupt: 

In [700]:
result, out = model.eval_model('data/dialog/unseenTest.json')

INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
  0%|          | 0/38 [00:00<?, ?it/s]INFO:simpletransformers.question_answering.question_answering_utils:*** Example ***
INFO:simpletransformers.question_answering.question_answering_utils:unique_id: 1000000000
INFO:simpletransformers.question_answering.question_answering_utils:example_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:doc_span_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:tokens: [CLS] [UNK] where the event is taking place [SEP] [UNK] ' d like to find an event . [UNK] are you interested in , and do you have a city preference ? [UNK] ' m looking for music events in [UNK] . [SEP]
INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 9:0 10:0 11:0 12:1 13:2 14:3 15:4 16:5 17:5 18:6 19:7 20:8 21:9 22:10 23:10 24:11 25:12 26:13 27:14 28:15 29:16 30:17 31:17 32:18 33:18 34:18 35:19 36:

INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 12:0 13:0 14:0 15:1 16:2 17:3 18:4 19:5 20:5 21:6 22:7 23:8 24:9 25:10 26:10 27:11 28:12 29:13 30:14 31:15 32:16 33:17 34:17 35:18 36:18 37:18 38:19 39:20 40:21 41:22 42:23 43:24 44:24 45:25 46:26 47:27 48:28 49:28 50:29 51:30 52:31 53:32 54:33 55:34 56:35 57:36 58:36
INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True
INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 2051 1997 6712 1997 3462 2013 4761 2000 7688 102 100 1005 1040 2066 2000 2424 2019 2724 101

INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 5:True 6:True 7:True 8:True 9:True 10:True 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True
INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 1997 2724 102 100 1005 1040 2066 2000 2424 2019 2724 1012 100 2024 2017 4699 1999 1010 1998 2079 2017 2031 1037 2103 12157 1029 100 1005 1049 2559 2005 2189 2824 1999 100 1012 100 2179 2184 2824 1012 100 2515 100 100 100 2012 100 2614 1029 100 2017 2031 2151 2006 100 4343 1029 100 2066 2600 2824 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 6:0 7:0 8:0 9:1 10:2 11:3 12:4 13:5 14:5 15:6 16:7 17:8 18:9 19:10 20:10 21:11 22:12 23:13 24:14 25:15 26:16 27:17 28:17 29:18 30:18 31:18 32:19 33:20 34:21 35:22 36:23 37:24 38:24 39:25 40:26 41:27 42:28 43:28 44:29 45:30 46:31 47:32 48:33 49:34 50:35 51:36 52:36 53:37 54:38 55:39 56:40 57:41 58:42 59:43 60:43 61:44 62:45 63:46 64:47 65:47
INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 6:True 7:True 8:True 9:True 10:True 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True
INFO

INFO:simpletransformers.question_answering.question_answering_utils:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 32%|███▏      | 12/38 [00:00<00:00, 116.68it/s]INFO:simpletransformers.question_answering.question_answering_utils:*** Example ***
INFO:simpletransformers.question_answering.question_answering_utils:unique_id: 1000000012
INFO:simpletransformers.question_answering.question_answering_utils:example_index: 12
INFO:simpletransformers.question_answering.question_answering_utils:doc_span_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:tokens: [CLS] [UNK] venue of event [SEP] [UNK] ' d like to find an event . [UNK] are you interested in , and do you have a city preference ? [UNK] ' m looking for music events in [UNK] . [UNK] found 10

INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 6:True 7:True 8:True 9:True 10:True 11:True 12:True 13:True 14:True 15:True 16:True 17:True 18:True 19:True 20:True 21:True 22:True 23:True 24:True 25:True 26:True 27:True 28:True 29:True 30:True 31:True 32:True 33:True 34:True 35:True 36:True 37:True 38:True 39:True 40:True 41:True 42:True 43:True 44:True 45:True 46:True 47:True 48:True 49:True 50:True 51:True 52:True 53:True 54:True 55:True 56:True 57:True 58:True 59:True 60:True 61:True 62:True 63:True 64:True 65:True 66:True 67:True 68:True 69:True 70:True 71:True 72:True 73:True 74:True 75:True 76:True 77:True 78:True 79:True 80:True 81:True 82:True 83:True 84:True 85:True 86:True 87:True 88:True 89:True 90:True 91:True 92:True 93:True 94:True 95:True 96:True 97:True 98:True 99:True 100:True 101:True 102:True 103:True 104:True 105:True 106:True 107:True 108:True 109:True
INFO:simpletransformers.question_answering.question_answering_utils:inpu

INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 1997 2724 6891 102 100 1005 1040 2066 2000 2424 2019 2724 1012 100 2024 2017 4699 1999 1010 1998 2079 2017 2031 1037 2103 12157 1029 100 1005 1049 2559 2005 2189 2824 1999 100 1012 100 2179 2184 2824 1012 100 2515 100 100 100 2012 100 2614 1029 100 2017 2031 2151 2006 100 4343 1029 100 2066 2600 2824 1012 100 2179 1018 2824 1012 100 2055 100 2012 1996 100 1012 100 2515 2009 2707 1010 1998 2073 2003 1996 6891 2284 1029 100 2724 4627 2012 1019 1024 2382 7610 1010 1996 6891 2003 2284 2012 2459 2789 12602 1010 6613 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:simpletransformers.question_answering.question_answering_utils:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:simpletransf

INFO:simpletransformers.question_answering.question_answering_utils:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:simpletransformers.question_answering.question_answering_utils:segment_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:simpletransformers.question_answering.question_answering_utils:*** Example ***
INFO:simpletransformers.question_answering.question_answering_utils:unique_id: 1000000019
INFO:simpletransformers.question_answering.question_answering_utils:example_index: 18
INFO:simpletransformers.question_answering.question_answering_utils

INFO:simpletransformers.question_answering.question_answering_utils:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
100%|██████████| 38/38 [00:00<00:00, 80.03it/s]


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

INFO:simpletransformers.question_answering.question_answering_utils:Writing predictions to: outputs/predictions_test.json
INFO:simpletransformers.question_answering.question_answering_utils:Writing nbest to: outputs/nbest_predictions_test.json
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] ' m looking for music events in [UNK]' in 'I'm looking for music events in NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] .' in 'NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'music events in [UNK]' in 'music events in NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'in [UNK]' in 'in NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'looking for music events i

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] ' m looking for music' in 'I'm looking for music'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '. [UNK] are you interested in , and do you have a city preference ? [UNK] ' m looking for music events in [UNK]' in 'event. What are you interested in, and do you have a city preference? I'm looking for music events in NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'music events in [UNK] .' in 'music events in NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'I'm'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'looking for music events in [UNK] .' in 'looking for music events in NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] are you interested in , and do you

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '10 events . [UNK] does [UNK] [UNK] [UNK] at [UNK]' in '10 events. How does Amber RUn Brooklyn at Warsaw'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] at [UNK]' in 'Brooklyn at Warsaw'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] does [UNK] [UNK] [UNK] at [UNK] sound' in 'How does Amber RUn Brooklyn at Warsaw sound?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] . [UNK] found 10 events . [UNK] does [UNK] [UNK] [UNK] at [UNK] sound' in 'NY. I found 10 events. How does Amber RUn Brooklyn at Warsaw sound?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] at [UNK]' in 'RUn Brooklyn at Warsaw'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK] at

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] 4th' in 'March 4th?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] sound ? [UNK] you have any on [UNK] 4th ? [UNK] like rock events' in 'Warsaw sound? Do you have any on March 4th? I like rock events.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] sound ? [UNK] you have any on [UNK] 4th ? [UNK] like rock events .' in 'Warsaw sound? Do you have any on March 4th? I like rock events.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] found 10 events . [UNK] does [UNK] [UNK] [UNK] at [UNK] sound ? [UNK] you have any on [UNK] 4th ? [UNK] like rock' in 'I found 10 events. How does Amber RUn Brooklyn at Warsaw sound? Do you have any on March 4th? I like rock'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] 4

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] found 10 events . [UNK] does [UNK] [UNK] [UNK] at [UNK] sound ? [UNK] you have any on [UNK] 4th ? [UNK] like rock events .' in 'I found 10 events. How does Amber RUn Brooklyn at Warsaw sound? Do you have any on March 4th? I like rock events.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] . [UNK] found 10 events . [UNK] does [UNK] [UNK] [UNK] at [UNK]' in 'NY. I found 10 events. How does Amber RUn Brooklyn at Warsaw'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'music events in [UNK]' in 'music events in NY.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] like rock events' in 'I like rock events.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] . [UNK] found 10 events . [UNK] does [UNK] [UNK] [UNK] at [UN




INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] at the [UNK]' in 'Low at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] about [UNK] at the [UNK]' in 'What about Low at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] .' in 'Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'Low'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'the [UNK]' in 'the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] at the [UNK] .' in 'Low at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] about [UNK]' in 'What about Low

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'at the [UNK]' in 'at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] like rock events . [UNK] found 4 events . [UNK] about [UNK] at the [UNK]' in 'I like rock events. I found 4 events. What about Low at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] .' in 'Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'the [UNK]' in 'the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] at the [UNK]' in 'Low at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] about [UNK] at the [UNK]' in 'What about Low at the Murmrr.'
IN

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] found 10 events . [UNK] does [UNK] [UNK] [UNK] at [UNK] sound ? [UNK] you have any on [UNK] 4th ? [UNK] like rock events . [UNK] found 4 events . [UNK] about [UNK] at the [UNK]' in 'I found 10 events. How does Amber RUn Brooklyn at Warsaw sound? Do you have any on March 4th? I like rock events. I found 4 events. What about Low at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'about [UNK]' in 'about Low'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] sound ? [UNK] you have any on [UNK] 4th ? [UNK] like rock events . [UNK] found 4 events . [UNK] about [UNK] at the [UNK]' in 'Warsaw sound? Do you have any on March 4th? I like rock events. I found 4 events. What about Low at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] ' m lookin

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK] ,' in 'The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK]' in 'Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '5 : 30 pm , the venue is located at 17 eastern parkway , brooklyn . [UNK] you find me something else ? [UNK] event is [UNK] [UNK] [UNK]' in '5:30 pm, the venue is located at 17 eastern parkway, brooklyn. Could you find me something else? The event is The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK]' in 'The Paper'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] event starts at 5 : 30 pm , the venue is located at 17 eastern p

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '5 : 30 pm , the venue is located at 17 eastern parkway , brooklyn . [UNK] you find me something else ? [UNK] event is [UNK]' in '5:30 pm, the venue is located at 17 eastern parkway, brooklyn. Could you find me something else? The event is The'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] event is [UNK]' in 'The event is The'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '5 : 30 pm , the venue is located at 17 eastern parkway , brooklyn . [UNK] you find me something else ? [UNK]' in '5:30 pm, the venue is located at 17 eastern parkway, brooklyn. Could you find me something else? The'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'The'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] event starts at 5 : 30 p

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'on next monday , the event is [UNK] [UNK] [UNK] , at the [UNK] .' in 'on next monday, the event is The Paper Kites, at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'at the [UNK]' in 'at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'the event is [UNK] [UNK] [UNK] , at the [UNK] .' in 'the event is The Paper Kites, at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'monday , the event is [UNK] [UNK] [UNK]' in 'monday, the event is The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK]' in 'The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'next monday , the event is [UNK]' in 'next monday, the event is The'
I

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'monday , the event is [UNK] [UNK] [UNK] , at the [UNK]' in 'monday, the event is The Paper Kites, at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'the venue is located at 17 eastern parkway , brooklyn . [UNK] you find me something else ? [UNK] event is [UNK] [UNK] [UNK]' in 'the venue is located at 17 eastern parkway, brooklyn. Could you find me something else? The event is The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'brooklyn . [UNK] you find me something else ? [UNK] event is [UNK] [UNK] [UNK]' in 'brooklyn. Could you find me something else? The event is The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '17 eastern parkway , brooklyn . [UNK] you find me something else ? [UNK] event is [UNK] [UNK] [UNK]' in '17 eastern par

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'brooklyn . [UNK] you find me something else ? [UNK] event is [UNK] [UNK] [UNK]' in 'brooklyn. Could you find me something else? The event is The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] event is on next monday , the event is [UNK] [UNK] [UNK]' in 'The event is on next monday, the event is The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK] ,' in 'The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK] , at the [UNK] .' in 'The Paper Kites, at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] , at the [UNK]' in 'Kites, at th

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'next monday , the event is [UNK] [UNK] [UNK] , at the [UNK]' in 'next monday, the event is The Paper Kites, at the Murmrr.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] event is on next monday , the event is [UNK] [UNK] [UNK] , at the [UNK] . [UNK] is the venue located ? [UNK] venue is found at 17 eastern parkway , brooklyn . [UNK] does it start ? [UNK] starts at 5 : 30 pm .' in 'The event is on next monday, the event is The Paper Kites, at the Murmrr. Where is the venue located? The venue is found at 17 eastern parkway, brooklyn. When does it start? It starts at 5:30 pm.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '17 eastern parkway , brooklyn . [UNK] does it start ? [UNK] starts at 5 : 30 pm .' in '17 eastern parkway, brooklyn. When does it start? It starts at 5:30 pm.'
INFO:simpletransformers.quest

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'brooklyn . [UNK] does it start ? [UNK] starts at 5 : 30 pm' in 'brooklyn. When does it start? It starts at 5:30 pm.'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK]' in 'The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK]' in 'New York?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'New'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] event is [UNK] [UNK] [UNK]' in 'The event is The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK] ,' in 'The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'York?'
INFO:simpletransformers.question_answe

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'next monday in [UNK] [UNK]' in 'next monday in New York?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] ?' in 'New York?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'next monday in [UNK]' in 'next monday in New'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] event is [UNK] [UNK] [UNK] , it is alos' in 'The event is The Paper Kites, it is alos'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: 'monday in [UNK] [UNK]' in 'monday in New York?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '5 : 30 pm , the venue is located at 17 eastern parkway , brooklyn . [UNK] you find me something else ? [UNK] event is [UNK] [UNK] [UNK] , it is alos' in '5:30 pm, the venue is located a

INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'The'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK]' in 'New York?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'New'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'York?'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] [UNK] [UNK]' in 'The Paper Kites,'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '17 eastern parkway , brooklyn . [UNK] you find me something else ? [UNK] event is [UNK] [UNK] [UNK]' in '17 eastern parkway, brooklyn. Could you find me something else? The event is The Paper Kites,'
INFO:simpletransformers.question

In [686]:
for i in text['similar_text']:
    print(text['similar_text'][i])

{'truth': 'NY', 'predicted': 'NY.', 'question': 'City where the event is taking place'}
{'truth': 'Amber RUn Brooklyn', 'predicted': '', 'question': 'Name of match or artist for event'}
{'truth': 'Warsaw', 'predicted': '', 'question': 'Exact venue of event'}
{'truth': 'March 4th', 'predicted': '', 'question': 'Date of event'}
{'truth': 'rock', 'predicted': '', 'question': 'The sport or music subcategory'}
{'truth': 'Murmrr', 'predicted': 'Murmrr.', 'question': 'Exact venue of event'}
{'truth': '5:30 pm', 'predicted': '', 'question': 'Starting time for event'}
{'truth': 'The Paper Kites', 'predicted': 'The Paper Kites,', 'question': 'Name of match or artist for event'}
{'truth': 'The Paper Kites', 'predicted': '', 'question': 'Name of match or artist for event'}
{'truth': 'Murmrr', 'predicted': 'Murmrr.', 'question': 'Exact venue of event'}
{'truth': '5:30 pm', 'predicted': '', 'question': 'Starting time for event'}
{'truth': 'The Paper Kites', 'predicted': '', 'question': 'Name of matc

In [701]:
with open('data/dialog/test.out', 'w') as f:
    json.dump(out, f)

In [140]:
# Failure Analysis
def read_dict(filename):
    with open(filename) as f:
        return json.load(f)
def read_json(filename):
    with open(filename) as f:
        return json.load(f)
    
# filename = "data/dialog/out/bert_bert-large-uncased-whole-word-masking-finetuned-squad_unseenTrain_unseenTest.out"
filename = "data/dialog/out/bert_bert-large-uncased-whole-word-masking-finetuned-squad_distantWithNegativeTrain_unseenTest.out"
out = read_dict(filename)
filename = "data/dialog/unseenTest.json"
test = read_json(filename)


In [141]:
incorrect_ids = out['incorrect_text'].keys()

In [142]:
def getId2Service(test):
    id2service = {}
    for i in test:
        for j in i['qas']:
            if 'service' in j:
                id2service[j['id']] = simplifyDomainName(j['service'])
    return id2service
id2service = getId2Service(test)

In [143]:
from collections import Counter
def printFreq(l):
    for i in sorted(Counter(l).items(), key=lambda x: -x[1])[:10]:
         print(i[0] +","+ str(i[1]))
printFreq([id2service[i] for i in incorrect_ids])
print()
printFreq([out['incorrect_text'][i]['question'] for i in out['incorrect_text']])

hotel,2678
service,1531
rent car,1258
event,1071
media,368
movie,215
taxi,212
weather,113
bank,59
travel,18

Location of the house,473
Date for the appointment,400
Review rating of the house,396
Number of days in the reservation,335
Address of the house,329
Starting time for event,277
Time of the appointment,264
Name of the dentist,261
Date of rental car pickup,235
Name of the director of the movie,219


In [144]:
out['incorrect_text']

{'17': {'truth': '17 eastern parkway, brooklyn',
  'predicted': 'venue',
  'question': 'Address of event venue'},
 '21': {'truth': 'Murmrr',
  'predicted': 'venue',
  'question': 'Exact venue of event'},
 '27': {'truth': 'Murmrr',
  'predicted': 'venue',
  'question': 'Exact venue of event'},
 '29': {'truth': '17 eastern parkway, brooklyn',
  'predicted': 'venue',
  'question': 'Address of event venue'},
 '45': {'truth': '5:30 pm',
  'predicted': 'time',
  'question': 'Starting time for event'},
 '95': {'truth': '7 pm',
  'predicted': 'time',
  'question': 'Starting time for event'},
 '153': {'truth': '56-15 Northern Boulevard, Woodside',
  'predicted': 'venue',
  'question': 'Address of event venue'},
 '161': {'truth': 'La Boom',
  'predicted': 'venue',
  'question': 'Exact venue of event'},
 '163': {'truth': '56-15 Northern Boulevard, Woodside',
  'predicted': 'venue',
  'question': 'Address of event venue'},
 '169': {'truth': 'La Boom',
  'predicted': 'venue',
  'question': 'Exact v