From 7587e187ebf772d1d79a096796df1a32e7cecee2 Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Fri, 6 May 2022 08:46:27 +0300 Subject: [PATCH 1/2] fix: KBQA static EL and Wikidata URLs replaced with environment variables (#151) * fix: kbqa static el and wikidata urls * refactor: removed redundant f-string in dialogflows/flows/sport.py --- .env | 4 ++-- annotators/kbqa/kbqa_cq.json | 8 +++++--- annotators/kbqa/kbqa_cq_mt_bert.json | 8 +++++--- annotators/kbqa/kbqa_cq_mt_bert_lite.json | 5 +++-- common/custom_requests.py | 4 ++-- .../dff_bot_persona_skill/dialogflows/flows/shared.py | 4 ++-- skills/dff_gossip_skill/dialogflows/flows/utils.py | 4 ++-- skills/dff_sport_skill/dialogflows/flows/sport.py | 10 +++++----- 8 files changed, 26 insertions(+), 21 deletions(-) diff --git a/.env b/.env index 056a9728f4..2510758678 100644 --- a/.env +++ b/.env @@ -22,8 +22,8 @@ COMET_SERVICE_URL=http://comet-atomic:8053/comet CONCEPTNET_SERVICE_URL=http://comet-conceptnet:8065/comet MASKED_LM_SERVICE_URL=http://masked-lm:8088/respond SENTIMENT_CLASSIFICATION_SERVICE_URL=http://sentiment-classification:8024/model -WIKIDATA_URL=http://wiki-parser:8077/model -ENTITY_LINKING_URL=http://entity-linking:8075/model +DP_WIKIDATA_URL=http://wiki-parser:8077/model +DP_ENTITY_LINKING_URL=http://entity-linking:8075/model KNOWLEDGE_GROUNDING_SERVICE_URL=http://knowledge-grounding:8083/respond WIKIDATA_DIALOGUE_SERVICE_URL=http://wikidata-dial-service:8092/model NEWS_API_ANNOTATOR_URL=http://news-api-annotator:8112/respond diff --git a/annotators/kbqa/kbqa_cq.json b/annotators/kbqa/kbqa_cq.json index b142ea33b5..7f874ea452 100644 --- a/annotators/kbqa/kbqa_cq.json +++ b/annotators/kbqa/kbqa_cq.json @@ -37,14 +37,14 @@ { "class_name": "api_requester", "id": "linker_entities", - "url": "http://entity-linking:8075/model", + "url": "{ENTITY_LINKING_URL}", "out": ["entity_ids"], "param_names": ["entity_substr", "template_found"] }, { "class_name": "api_requester", "id": "wiki_p", - "url": "http://wiki-parser:8077/model", + "url": "{WIKIDATA_URL}", "out": ["wiki_parser_output"], "param_names": ["parser_info", "query"] }, @@ -132,7 +132,9 @@ "MODELS_PATH": "{ROOT_PATH}/models", "BERT_PATH": "{DOWNLOADS_PATH}/bert_models_kbqa/cased_L-12_H-768_A-12", "NER_PATH": "{MODELS_PATH}/ner_lcquad_ent_and_type", - "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" + "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", + "ENTITY_LINKING_URL": "http://entity-linking:8075/model", + "WIKIDATA_URL": "http://wiki-parser:8077/model" }, "requirements": [ "{DEEPPAVLOV_PATH}/requirements/tf.txt", diff --git a/annotators/kbqa/kbqa_cq_mt_bert.json b/annotators/kbqa/kbqa_cq_mt_bert.json index 863be83793..d35b1eeeec 100644 --- a/annotators/kbqa/kbqa_cq_mt_bert.json +++ b/annotators/kbqa/kbqa_cq_mt_bert.json @@ -117,14 +117,14 @@ { "class_name": "api_requester", "id": "linker_entities", - "url": "http://entity-linking:8075/model", + "url": "{ENTITY_LINKING_URL}", "out": ["entity_ids"], "param_names": ["entity_substr", "template_found"] }, { "class_name": "api_requester", "id": "wiki_p", - "url": "http://wiki-parser:8077/model", + "url": "{WIKIDATA_URL}", "out": ["wiki_parser_output"], "param_names": ["parser_info", "query"] }, @@ -228,7 +228,9 @@ "MODELS_PATH": "{ROOT_PATH}/models", "BERT_PATH": "{DOWNLOADS_PATH}/bert_models_kbqa/cased_L-12_H-768_A-12", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", - "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa" + "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa", + "ENTITY_LINKING_URL": "http://entity-linking:8075/model", + 
"WIKIDATA_URL": "http://wiki-parser:8077/model" }, "requirements": [ "{DEEPPAVLOV_PATH}/requirements/tf.txt", diff --git a/annotators/kbqa/kbqa_cq_mt_bert_lite.json b/annotators/kbqa/kbqa_cq_mt_bert_lite.json index fb3032a37f..aa0281db14 100644 --- a/annotators/kbqa/kbqa_cq_mt_bert_lite.json +++ b/annotators/kbqa/kbqa_cq_mt_bert_lite.json @@ -12,7 +12,7 @@ { "class_name": "api_requester", "id": "linker_entities", - "url": "http://entity-linking:8075/model", + "url": "{ENTITY_LINKING_URL}", "out": ["entity_ids"], "param_names": ["entity_substr", "template_found"] }, @@ -90,7 +90,8 @@ "MODELS_PATH": "{ROOT_PATH}/models", "BERT_PATH": "{DOWNLOADS_PATH}/bert_models_kbqa/cased_L-12_H-768_A-12", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", - "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa" + "MT_BERT_PATH": "{MODELS_PATH}/mt_bert_kbqa", + "ENTITY_LINKING_URL": "http://entity-linking:8075/model" }, "requirements": [ "{DEEPPAVLOV_PATH}/requirements/tf.txt", diff --git a/common/custom_requests.py b/common/custom_requests.py index c8bffb796f..4ccf621f74 100644 --- a/common/custom_requests.py +++ b/common/custom_requests.py @@ -8,8 +8,8 @@ sentry_sdk.init(getenv("SENTRY_DSN")) logger = logging.getLogger(__name__) -WIKIDATA_URL = getenv("WIKIDATA_URL") -ENTITY_LINKING_URL = getenv("ENTITY_LINKING_URL") +WIKIDATA_URL = getenv("DP_WIKIDATA_URL") +ENTITY_LINKING_URL = getenv("DP_ENTITY_LINKING_URL") assert WIKIDATA_URL and ENTITY_LINKING_URL diff --git a/skills/dff_bot_persona_skill/dialogflows/flows/shared.py b/skills/dff_bot_persona_skill/dialogflows/flows/shared.py index 15f7389f86..91e6f08497 100644 --- a/skills/dff_bot_persona_skill/dialogflows/flows/shared.py +++ b/skills/dff_bot_persona_skill/dialogflows/flows/shared.py @@ -9,8 +9,8 @@ sentry_sdk.init(dsn=os.getenv("SENTRY_DSN")) -ENTITY_LINKING_URL = os.getenv("ENTITY_LINKING_URL") -WIKIDATA_URL = os.getenv("WIKIDATA_URL") +ENTITY_LINKING_URL = os.getenv("DP_ENTITY_LINKING_URL") +WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL") assert ENTITY_LINKING_URL, ENTITY_LINKING_URL assert WIKIDATA_URL, WIKIDATA_URL diff --git a/skills/dff_gossip_skill/dialogflows/flows/utils.py b/skills/dff_gossip_skill/dialogflows/flows/utils.py index 85e8b54d19..a1a0923658 100644 --- a/skills/dff_gossip_skill/dialogflows/flows/utils.py +++ b/skills/dff_gossip_skill/dialogflows/flows/utils.py @@ -18,8 +18,8 @@ sentry_sdk.init(dsn=os.getenv("SENTRY_DSN")) -ENTITY_LINKING_URL = os.getenv("ENTITY_LINKING_URL") -WIKIDATA_URL = os.getenv("WIKIDATA_URL") +ENTITY_LINKING_URL = os.getenv("DP_ENTITY_LINKING_URL") +WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL") assert ENTITY_LINKING_URL, ENTITY_LINKING_URL assert WIKIDATA_URL, WIKIDATA_URL diff --git a/skills/dff_sport_skill/dialogflows/flows/sport.py b/skills/dff_sport_skill/dialogflows/flows/sport.py index 81bf7bbdab..558dae892d 100644 --- a/skills/dff_sport_skill/dialogflows/flows/sport.py +++ b/skills/dff_sport_skill/dialogflows/flows/sport.py @@ -245,7 +245,7 @@ def entity_in_last_uttr_from_sport_area(vars): def get_dict_entity(entity_substr, entity_ids, type): try: - WIKIDATA_URL = "http://wiki-parser:8077/model" + WIKIDATA_URL = os.getenv("DP_WIKIDATA_URL") dict_result = requests.post( WIKIDATA_URL, json={ @@ -352,8 +352,8 @@ def user_ask_about_sport_request(ngrams, vars): def lets_chat_about_sport_response(vars): # USR_ASK_ABOUT_SPORT responses = [ - f"I have no physical embodiment. Sport is interesting and useful. 
Tell me what sport do you enjoy?", - f"I live on a cloud, so i can't do sport , but I'm really curious about what sport are you fond of?", + "I have no physical embodiment. Sport is interesting and useful. Tell me what sport do you enjoy?", + "I live on a cloud, so i can't do sport , but I'm really curious about what sport are you fond of?", ] try: state_utils.set_confidence(vars, confidence=SUPER_CONFIDENCE) @@ -398,7 +398,7 @@ def user_ask_about_athletes_response(vars): try: state_utils.set_confidence(vars, confidence=SUPER_CONFIDENCE) state_utils.set_can_continue(vars, continue_flag=MUST_CONTINUE) - return f"I know all the athletes on this planet. Which athlete do you like the most?" + return "I know all the athletes on this planet. Which athlete do you like the most?" except Exception as exc: logger.exception(exc) sentry_sdk.capture_exception(exc) @@ -922,7 +922,7 @@ def last_chance_response(vars): def error_response(vars): state_utils.set_confidence(vars, ZERO_CONFIDENCE) - return f"" + return "" ################################################################################################################## From 84762881270e452c7c1d983bc407b7c12edd4a3f Mon Sep 17 00:00:00 2001 From: dmitrijeuseew Date: Fri, 6 May 2022 13:00:42 +0300 Subject: [PATCH 2/2] fix uppercase in entity detection (#154) * Fix requirements.txt (#84) * fix itsdangerous requirements * pin itsdangerous requirements for all flask==1.1.1 servers * fix uppercase * fix confidence * fix lowercase * capitalize in ner * use get entities * fixes * fix capitalize Co-authored-by: Andrii.Hura <54397922+AndriiHura@users.noreply.github.com> Co-authored-by: mtalimanchuk Co-authored-by: Dilyara Baymurzina --- annotators/NER/server.py | 11 ++++++++++- annotators/entity_detection/server.py | 17 ++++++++++++++--- annotators/entity_linking/server.py | 1 + annotators/wiki_parser/wiki_parser.py | 2 ++ skills/factoid_qa/server.py | 3 --- state_formatters/dp_formatters.py | 16 +++++++++------- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/annotators/NER/server.py b/annotators/NER/server.py index 530621f4be..f6610b93aa 100644 --- a/annotators/NER/server.py +++ b/annotators/NER/server.py @@ -43,7 +43,8 @@ def extract_good_entities(preds, sentences): good_entities_for_sent = [] for ent in entities_for_sent: - ent_text = ent["text"].lower() + ent_text = ent["text"] + ent_text = " ".join([ent_word[0].capitalize() + ent_word[1:] for ent_word in ent_text.split()]) # remove everything except of letters, digitals, spaces and - ent_text = EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE.sub(" ", ent_text) ent_text = DOUBLE_SPACES.sub(" ", ent_text).strip() @@ -56,6 +57,7 @@ def extract_good_entities(preds, sentences): is_long_enough = len(ent_text) > 2 is_not_banned = not re.match(BANNED_ENTITIES, ent_text) if is_not_stopword and is_not_banned and is_long_enough: + ent["text"] = ent_text good_entities_for_sent.append(deepcopy(ent)) good_preds.append(good_entities_for_sent) @@ -64,7 +66,14 @@ def extract_good_entities(preds, sentences): def get_predictions_for_list_sentences(sentences): sents = [word_tokenize(sent.lower()) for sent in sentences] + sents_upper = [word_tokenize(sent) for sent in sentences] preds = ner.predict(sents) + for i in range(len(preds)): + sent_upper = sents_upper[i] + for j in range(len(preds[i])): + ent_upper = " ".join(sent_upper[preds[i][j]["start_pos"] : preds[i][j]["end_pos"]]) + if ent_upper.lower() == preds[i][j]["text"]: + preds[i][j]["text"] = ent_upper # each sample is a list of sentences of current 
utterance # so, preds is a list of length = number of sents in utterances # each element of preds is a list of entities. diff --git a/annotators/entity_detection/server.py b/annotators/entity_detection/server.py index 6b2fe32255..961431d3f6 100644 --- a/annotators/entity_detection/server.py +++ b/annotators/entity_detection/server.py @@ -49,6 +49,7 @@ def get_result(request, what_to_annotate): logger.info(f"annotating: {what_to_annotate}, input (the last utterances): {last_utts}") utts_list = [] + utts_list_init = [] utts_nums = [] last_utt_starts = [] for n, hist_utt in enumerate(last_utts): @@ -71,6 +72,7 @@ def get_result(request, what_to_annotate): utts_list.append(concat_utt.lower()) else: utts_list.append(concat_utt) + utts_list_init.append(concat_utt) utts_nums.append(n) utt_entities_batch = [{} for _ in last_utts] @@ -91,14 +93,23 @@ def get_result(request, what_to_annotate): ) = entity_detection(utts_list) logger.info(f"entity_substr_batch {entity_substr_batch} finegrained_tags_batch {finegrained_tags_batch}") - for entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list, last_utt_start, num in zip( - entity_substr_batch, tags_batch, finegrained_tags_batch, entity_offsets_batch, last_utt_starts, utts_nums + for entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list, last_utt_start, uttr, num in zip( + entity_substr_batch, + tags_batch, + finegrained_tags_batch, + entity_offsets_batch, + last_utt_starts, + utts_list_init, + utts_nums, ): utt_entities = {} for entity, tag, finegrained_tag, (start_offset, end_offset) in zip( entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list ): - if entity not in stopwords and len(entity) > 2 and start_offset >= last_utt_start: + entity_init = uttr[start_offset:end_offset] + if entity_init.lower() == entity: + entity = entity_init + if entity.lower() not in stopwords and len(entity) > 2 and start_offset >= last_utt_start: entity = EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE.sub(" ", entity) entity = DOUBLE_SPACES.sub(" ", entity).strip() if finegrained_tag[0][0] > 0.5: diff --git a/annotators/entity_linking/server.py b/annotators/entity_linking/server.py index ee88629855..8b91dc0dda 100644 --- a/annotators/entity_linking/server.py +++ b/annotators/entity_linking/server.py @@ -87,6 +87,7 @@ def respond(): entity_substr_batch = inp.get("entity_substr", [[""]]) template_batch = inp.get("template", [""]) context_batch = inp.get("context", [[""]]) + logger.info(f"entity linking, input {entity_substr_batch}") long_context_batch = [] short_context_batch = [] for entity_substr_list, context_list in zip(entity_substr_batch, context_batch): diff --git a/annotators/wiki_parser/wiki_parser.py b/annotators/wiki_parser/wiki_parser.py index 3046b942fd..41341a49e3 100644 --- a/annotators/wiki_parser/wiki_parser.py +++ b/annotators/wiki_parser/wiki_parser.py @@ -620,6 +620,8 @@ def find_top_triplets(entity, entity_substr, pos=None, token_conf=None, conf=Non triplets["token_conf"] = token_conf if conf is not None: triplets["conf"] = conf + if entity_substr.lower() in entity_label.lower(): + entity_substr = entity_label triplets_info[entity_substr] = triplets return triplets_info diff --git a/skills/factoid_qa/server.py b/skills/factoid_qa/server.py index c81d44e3ca..1fa0c84b99 100644 --- a/skills/factoid_qa/server.py +++ b/skills/factoid_qa/server.py @@ -363,9 +363,6 @@ def respond(): logger.info("Question is not classified as factoid.") response = "" confidence = 0.0 - - if confidence == 1.0: - confidence = 
0.99 responses.append(response) confidences.append(confidence) attributes.append(attr) diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index 3efd88fbf7..b4fcb44098 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -535,21 +535,23 @@ def el_formatter_dialog(dialog: Dict): # Used by: entity_linking annotator num_last_utterances = 2 ner_output = get_entities(dialog["human_utterances"][-1], only_named=True, with_labels=True) - nounphrases = dialog["human_utterances"][-1]["annotations"].get("cobot_entities", {}).get("entities", []) - entity_substr = [] + nounphrases = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=False) + entity_substr_list = [] if ner_output: for entity in ner_output: if entity and isinstance(entity, dict) and "text" in entity and entity["text"].lower() != "alexa": - entity_substr.append(entity["text"].lower()) - + entity_substr_list.append(entity["text"]) + entity_substr_lower_list = {entity_substr.lower() for entity_substr in entity_substr_list} dialog = utils.get_last_n_turns(dialog, bot_last_turns=1) dialog = utils.replace_with_annotated_utterances(dialog, mode="punct_sent") context = [[uttr["text"] for uttr in dialog["utterances"][-num_last_utterances:]]] if nounphrases: - entity_substr += [nounphrase.lower() for nounphrase in nounphrases] - entity_substr = list(set(entity_substr)) + entity_substr_list += [ + nounphrase for nounphrase in nounphrases if nounphrase.lower() not in entity_substr_lower_list + ] + entity_substr_list = list(set(entity_substr_list)) - return [{"entity_substr": [entity_substr], "template": [""], "context": context}] + return [{"entity_substr": [entity_substr_list], "template": [""], "context": context}] def kbqa_formatter_dialog(dialog: Dict):
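Note on PATCH 1/2: the KBQA configs now reference {ENTITY_LINKING_URL} and {WIKIDATA_URL}, with the old hard-coded addresses kept as defaults in the configs' variables section, while the pipeline-level names in .env gained a DP_ prefix. DeepPavlov resolves such placeholders from metadata.variables and, as far as I know, lets a DP_-prefixed environment variable override the default, which is presumably the reason for the rename. A minimal sketch of that precedence, using a hypothetical helper name (illustrative only, not DeepPavlov's actual implementation):

    import os


    def resolve_config_variable(name: str, default: str) -> str:
        # The default mirrors metadata.variables in the JSON config; a DP_-prefixed
        # environment variable such as DP_WIKIDATA_URL wins if it is set.
        return os.environ.get(f"DP_{name}", default)


    # The defaults below are the ones added to the KBQA configs in this patch.
    ENTITY_LINKING_URL = resolve_config_variable("ENTITY_LINKING_URL", "http://entity-linking:8075/model")
    WIKIDATA_URL = resolve_config_variable("WIKIDATA_URL", "http://wiki-parser:8077/model")

Skills and common code read the same variables directly (os.getenv("DP_WIKIDATA_URL"), os.getenv("DP_ENTITY_LINKING_URL")) and assert that they are set, so a single .env entry now points both the annotator configs and the Python callers at the same services.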
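Note on PATCH 2/2: NER still predicts on lowercased tokens; the original casing is restored afterwards by re-reading the predicted token span from the untouched utterance and keeping it only when it matches the prediction case-insensitively. Below is a standalone sketch of that idea with a hypothetical function name, assuming preds carry token-level start_pos/end_pos offsets as in annotators/NER/server.py, and using nltk's word_tokenize (punkt data required) like the annotator itself:

    from nltk.tokenize import word_tokenize


    def restore_entity_casing(utterance: str, preds: list) -> list:
        """Restore original casing for entities predicted on a lowercased utterance."""
        original_tokens = word_tokenize(utterance)
        for ent in preds:
            original_span = " ".join(original_tokens[ent["start_pos"]:ent["end_pos"]])
            # Keep the original-cased span only if the two tokenizations stayed aligned,
            # i.e. it is the same entity modulo case.
            if original_span.lower() == ent["text"]:
                ent["text"] = original_span
        return preds


    preds = [{"text": "elon musk", "start_pos": 3, "end_pos": 5}]
    restore_entity_casing("Do you know Elon Musk?", preds)
    # preds[0]["text"] is now "Elon Musk"

annotators/entity_detection/server.py applies the same check with character offsets (entity_init = uttr[start_offset:end_offset]), and the entity-linking formatter deduplicates case-insensitively so that the original-cased NER spans are preferred over lowercased noun phrases.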