Commit c55f796: update entity detection (#309)
* Fix requirements.txt (#84)

* fix itsdangerous requirements

* pin itsdangerous requirements for all flask==1.1.1 servers

* update entity detection

* fix

* fix

* codestyle

* fix a typo

* add: use related tags

* fix test entity linking

---------

Co-authored-by: Andrii.Hura <54397922+AndriiHura@users.noreply.github.com>
Co-authored-by: mtalimanchuk <mtalimanchuk@gmail.com>
Co-authored-by: Dilyara Baymurzina <dilyara.rimovna@gmail.com>
4 people committed Feb 15, 2023
1 parent 5edea73 commit c55f796
Showing 16 changed files with 88 additions and 1,823 deletions.
annotators/entity_detection/Dockerfile (14 changes: 2 additions & 12 deletions)
@@ -1,22 +1,16 @@
FROM deeppavlov/base-gpu:0.12.1
RUN pip install --upgrade pip && pip install git+https://github.com/deeppavlov/DeepPavlov.git@0.12.1
FROM deeppavlov/base-gpu:0.17.6

RUN apt-get update && apt-get install git -y

ARG SEQ_TAG_CONFIG
ARG EL_TAG_CONFIG
ARG CONFIG
ARG FINEGRAINED
ARG LOWERCASE
ARG PORT
ARG SRC_DIR
ARG SED_ARG=" | "

ENV SEQ_TAG_CONFIG=$SEQ_TAG_CONFIG
ENV EL_TAG_CONFIG=$EL_TAG_CONFIG
ENV CONFIG=$CONFIG
ENV FINEGRAINED=$FINEGRAINED
ENV LOWERCASE=$LOWERCASE
ENV PORT=$PORT

COPY ./annotators/entity_detection/requirements.txt /src/requirements.txt
@@ -28,9 +22,5 @@ COPY $SRC_DIR /src

WORKDIR /src
RUN python -m deeppavlov install $SEQ_TAG_CONFIG
RUN python -m deeppavlov install $EL_TAG_CONFIG

RUN sed -i "s|$SED_ARG|g" "$SEQ_TAG_CONFIG"
RUN sed -i "s|$SED_ARG|g" "$EL_TAG_CONFIG"

CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8103
CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:$PORT
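
The build surface shrinks to a generic CONFIG plus PORT. A minimal sketch of how the two surviving variables reach the service inside the container, assuming the ENV lines above (the example CONFIG value is hypothetical):

    import os

    # CONFIG names the DeepPavlov pipeline file; server.py below reads it the same way.
    config_name = os.getenv("CONFIG")  # e.g. "entity_detection_eng.json" (hypothetical value)
    # gunicorn now binds 0.0.0.0:$PORT instead of the previously hardcoded 8103;
    # the fallback here is only for the sketch, the Dockerfile passes PORT explicitly.
    port = int(os.getenv("PORT", "8103"))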
annotators/entity_detection/entity_detection_eng.json (27 changes: 9 additions & 18 deletions)
@@ -3,12 +3,12 @@
"in": ["text"],
"pipe": [
{
"class_name": "src.ner_chunker:NerChunker",
"class_name": "ner_chunker",
"batch_size": 16,
"max_chunk_len" : 180,
"max_seq_len" : 400,
"vocab_file": "{TRANSFORMER}",
"do_lower_case": false,
"lowercase": true,
"in": ["text"],
"out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"]
},
@@ -17,35 +17,26 @@
"o_tag": "O",
"tags_file": "{NER_PATH}/tag.dict",
"return_entities_with_tags": true,
"add_nouns": true,
"class_name": "src.entity_detection_parser:EntityDetectionParser",
"class_name": "entity_detection_parser",
"id": "edp"
},
{
"class_name": "src.ner_chunker:NerChunkModel",
"add_nouns": true,
"ner": {"config_path": "src/wikipedia_entity_detection_distilbert.json"},
"class_name": "ner_chunk_model",
"ner": {"config_path": "./wikipedia_entity_detection_distilbert.json"},
"ner_parser": "#edp",
"in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"],
"out": ["entity_substr", "entity_offsets", "tags", "probas", "sentences_offsets",
"sentences", "tokens", "tokens_conf", "entity_positions", "sentences_tokens"]
},
{
"config_path": "src/el_tags_infer.json",
"in": ["text", "entity_offsets", "entity_substr"],
"out": ["finegrained_tags"]
"out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
}
],
"out": ["entity_substr", "entity_offsets", "entity_positions", "tokens", "tags", "finegrained_tags", "sentences_offsets", "sentences", "probas", "tokens_conf"]
"out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
"CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
"NER_PATH": "{MODELS_PATH}/entity_detection_wikipedia_distilbert",
"TRANSFORMER": "{DOWNLOADS_PATH}/torch_bert_models/distilbert_base_uncased"
"NER_PATH": "{ROOT_PATH}/models/dialog_entity_detection",
"TRANSFORMER": "bert-base-uncased"
},
"download": [
]
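
The rewritten config registers components by DeepPavlov registry name (ner_chunker, entity_detection_parser, ner_chunk_model) instead of src.* module paths, and drops the separate el_tags_infer step, so the pipeline now returns one tag per entity. A minimal sketch of loading it, mirroring the warm-up call in server.py below:

    from deeppavlov import build_model

    # build_model resolves registry names such as "ner_chunker" from the config's pipe;
    # download=True fetches the model files on first run.
    entity_detection = build_model("entity_detection_eng.json", download=True)
    entity_detection(["What is the capital of Russia?"])  # warm-up call, as in server.py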
annotators/entity_detection/requirements.txt (5 changes: 1 addition & 4 deletions)
@@ -1,14 +1,11 @@
Flask==1.1.1
itsdangerous==2.0.1
nltk==3.4.5
setuptools<=65.5.1
gunicorn==19.9.0
requests==2.22.0
jinja2<=3.0.3
Werkzeug<=2.0.3
sentry-sdk==0.12.3
torch==1.6.0
transformers==4.6.0
pydantic==1.3
deeppavlov==0.17.3
deeppavlov==1.0.1
spacy==2.2.3
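
The itsdangerous==2.0.1 pin matters because Flask 1.1.x imports a json helper that itsdangerous 2.1 removed, so an unpinned install fails at startup. A sketch of the failure mode the pin avoids:

    # flask==1.1.1 performs this import internally; with itsdangerous>=2.1 it raises
    # ImportError: cannot import name 'json' from 'itsdangerous'
    from itsdangerous import json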
annotators/entity_detection/server.py (106 changes: 50 additions & 56 deletions)
@@ -4,6 +4,7 @@
import time

import sentry_sdk
import spacy
from flask import Flask, jsonify, request
from nltk.corpus import stopwords

@@ -16,19 +17,8 @@
app = Flask(__name__)

config_name = os.getenv("CONFIG")
lowercase = int(os.getenv("LOWERCASE", "1"))
finegrained = int(os.getenv("FINEGRAINED", "0"))


def replace_finegrained_tags(tag):
if tag in {"loc", "city", "country", "county", "river", "us_state", "road"}:
return "location"
elif tag in {"per", "musician", "writer", "athlete", "politician", "actor"}:
return "person"
else:
return tag


try:
entity_detection = build_model(config_name, download=True)
entity_detection(["What is the capital of Russia?"])
@@ -41,6 +31,35 @@ def replace_finegrained_tags(tag):
EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE = re.compile(r"[^a-zA-Z0-9 \-&*+]")
DOUBLE_SPACES = re.compile(r"\s+")
stopwords = set(stopwords.words("english"))
nlp = spacy.load("en_core_web_sm")

replace_tag_dict = {
"softwareapplication": "product",
"gamename": "product",
"device": "product",
"vehicle": "product",
"channelname": "org",
"organization": "org",
"wear": "misc",
"sportrole": "misc",
"bookname": "literary_work",
"party": "political_party",
"position": "occupation",
"sport": "type_of_sport",
"venue": "sports_venue",
"genre": "music_genre",
"videoname": "film",
"songname": "song",
"location": "loc",
"person": "per",
"sportteam": "sport_team",
"anaphor": "black",
"date": "black",
"number": "black",
"year": "black",
"duration": "black",
"ordinal": "black",
}


def get_result(request, what_to_annotate):
@@ -51,7 +70,6 @@ def get_result(request, what_to_annotate):
utts_list = []
utts_list_init = []
utts_nums = []
last_utt_starts = []
for n, hist_utt in enumerate(last_utts):
if isinstance(hist_utt, str):
hist_utt = [hist_utt]
@@ -63,15 +81,10 @@
prev_utt = hist_utt[-2]
if prev_utt and prev_utt[-1] not in {".", "!", "?"}:
prev_utt = f"{prev_utt}."
last_utt_starts.append(len(prev_utt) + 1)
concat_utt = f"{prev_utt} {last_utt}"
else:
last_utt_starts.append(0)
concat_utt = last_utt
if lowercase:
utts_list.append(concat_utt.lower())
else:
utts_list.append(concat_utt)
utts_list.append(concat_utt)
utts_list_init.append(concat_utt)
utts_nums.append(n)

@@ -84,74 +97,55 @@
entity_substr_batch,
entity_offsets_batch,
entity_positions_batch,
tokens_batch,
tags_batch,
finegrained_tags_batch,
sentences_offsets_batch,
sentences_batch,
probas_batch,
tokens_conf_batch,
) = entity_detection(utts_list)
logger.info(f"entity_substr_batch {entity_substr_batch} finegrained_tags_batch {finegrained_tags_batch}")
for (
entity_substr_list,
tags_list,
finegrained_tags_list,
entity_offsets_list,
last_utt_start,
uttr,
num,
) in zip(
logger.info(f"entity_substr_batch {entity_substr_batch} tags_batch {tags_batch}")
for (entity_substr_list, tags_list, probas_list, entity_offsets_list, uttr, num,) in zip(
entity_substr_batch,
tags_batch,
finegrained_tags_batch,
probas_batch,
entity_offsets_batch,
last_utt_starts,
utts_list_init,
utts_nums,
):
utt_entities = {}
for entity, tag, finegrained_tags, (start_offset, end_offset) in zip(
entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list
for entity, tag, proba, (start_offset, end_offset) in zip(
entity_substr_list, tags_list, probas_list, entity_offsets_list
):
entity_init = uttr[start_offset:end_offset]
if entity_init.lower() == entity:
entity = entity_init
if entity.lower() not in stopwords and len(entity) > 2 and start_offset >= last_utt_start:
if (
entity.lower() not in stopwords
and len(entity) > 2
and not (len(entity.split()) == 1 and nlp(entity)[0].pos_ == "PRON")
):
entity = EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE.sub(" ", entity)
entity = DOUBLE_SPACES.sub(" ", entity).strip()
filtered_finegrained_tags = []
if finegrained_tags[0][0] > 0.5:
tag = finegrained_tags[0][1].lower()
conf, finegrained_tag = finegrained_tags[0]
filtered_finegrained_tags.append((finegrained_tag.lower(), round(conf, 3)))
for finegrained_elem in finegrained_tags[1:]:
conf, finegrained_tag = finegrained_elem
if conf > 0.2:
filtered_finegrained_tags.append((finegrained_tag.lower(), round(conf, 3)))
else:
tag = "misc"
filtered_finegrained_tags.append(("misc", 1.0))
if not finegrained:
tag = replace_finegrained_tags(tag)
finegrained_tag = replace_tag_dict.get(tag.lower(), tag.lower())
if finegrained_tag == "black":
continue
if "entities" in utt_entities:
utt_entities["entities"].append(entity)
utt_entities["labelled_entities"].append(
{
"text": entity,
"label": tag,
"finegrained_label": filtered_finegrained_tags,
"offsets": (start_offset - last_utt_start, end_offset - last_utt_start),
"label": tag.lower(),
"finegrained_label": [(finegrained_tag, proba)],
"offsets": (start_offset, end_offset),
}
)
else:
utt_entities["entities"] = [entity]
utt_entities["labelled_entities"] = [
{
"text": entity,
"label": tag,
"finegrained_label": filtered_finegrained_tags,
"offsets": (start_offset - last_utt_start, end_offset - last_utt_start),
"label": tag.lower(),
"finegrained_label": [(finegrained_tag, proba)],
"offsets": (start_offset, end_offset),
}
]

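
Taken together, server.py now normalizes model tags through replace_tag_dict, treats the "black" label as a blacklist (dates, numbers, anaphora and similar are dropped), filters bare pronouns with spaCy, and reports offsets over the whole concatenated utterance now that the last_utt_start bookkeeping is gone. A condensed sketch of that post-processing path, with a two-entry excerpt of the dict and hypothetical sample entities (assumes en_core_web_sm is installed):

    import spacy

    nlp = spacy.load("en_core_web_sm")

    # two-entry excerpt of replace_tag_dict from the diff above
    replace_tag_dict = {"videoname": "film", "date": "black"}

    def keep_entity(entity: str) -> bool:
        # single-token pronouns such as "it" are now rejected via spaCy POS tagging
        return not (len(entity.split()) == 1 and nlp(entity)[0].pos_ == "PRON")

    for entity, tag in [("titanic", "videoname"), ("yesterday", "date"), ("it", "misc")]:
        finegrained_tag = replace_tag_dict.get(tag.lower(), tag.lower())
        if finegrained_tag == "black" or not keep_entity(entity):
            continue  # blacklisted tags and bare pronouns are skipped
        print(entity, finegrained_tag)  # only "titanic film" survives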
annotators/entity_detection/src/el_tags_infer.json (60 changes: 0 additions & 60 deletions)

This file was deleted.

(diff for the remaining 11 changed files not shown)
