Commit c55f796: update entity detection (#309)
* Fix requirements.txt (#84)

* fix itsdangerous requirements

* pin itsdangerous requirements for all flask==1.1.1 servers

* update entity detection

* fix

* fix

* codestyle

* fix a typo

* add: use related tags

* fix test entity linking

---------

Co-authored-by: Andrii.Hura <54397922+AndriiHura@users.noreply.github.com>
Co-authored-by: mtalimanchuk <mtalimanchuk@gmail.com>
Co-authored-by: Dilyara Baymurzina <dilyara.rimovna@gmail.com>
4 people committed Feb 15, 2023
1 parent 5edea73 commit c55f796
Showing 16 changed files with 88 additions and 1,823 deletions.
annotators/entity_detection/Dockerfile (14 changes: 2 additions & 12 deletions)
@@ -1,22 +1,16 @@
FROM deeppavlov/base-gpu:0.12.1
RUN pip install --upgrade pip && pip install git+https://github.com/deeppavlov/DeepPavlov.git@0.12.1
FROM deeppavlov/base-gpu:0.17.6

RUN apt-get update && apt-get install git -y

ARG SEQ_TAG_CONFIG
ARG EL_TAG_CONFIG
ARG CONFIG
ARG FINEGRAINED
ARG LOWERCASE
ARG PORT
ARG SRC_DIR
ARG SED_ARG=" | "

ENV SEQ_TAG_CONFIG=$SEQ_TAG_CONFIG
ENV EL_TAG_CONFIG=$EL_TAG_CONFIG
ENV CONFIG=$CONFIG
ENV FINEGRAINED=$FINEGRAINED
ENV LOWERCASE=$LOWERCASE
ENV PORT=$PORT

COPY ./annotators/entity_detection/requirements.txt /src/requirements.txt
@@ -28,9 +22,5 @@ COPY $SRC_DIR /src

WORKDIR /src
RUN python -m deeppavlov install $SEQ_TAG_CONFIG
RUN python -m deeppavlov install $EL_TAG_CONFIG

RUN sed -i "s|$SED_ARG|g" "$SEQ_TAG_CONFIG"
RUN sed -i "s|$SED_ARG|g" "$EL_TAG_CONFIG"

CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8103
CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:$PORT
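
The build surface shrinks to a generic CONFIG plus PORT. A minimal sketch of how the two surviving variables reach the service inside the container, assuming the ENV lines above (the example CONFIG value is hypothetical):

    import os

    # CONFIG names the DeepPavlov pipeline file; server.py below reads it the same way.
    config_name = os.getenv("CONFIG")  # e.g. "entity_detection_eng.json" (hypothetical value)
    # gunicorn now binds 0.0.0.0:$PORT instead of the previously hardcoded 8103;
    # the fallback here is only for the sketch, the Dockerfile passes PORT explicitly.
    port = int(os.getenv("PORT", "8103"))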
annotators/entity_detection/entity_detection_eng.json (27 changes: 9 additions & 18 deletions)
@@ -3,12 +3,12 @@
"in": ["text"],
"pipe": [
{
"class_name": "src.ner_chunker:NerChunker",
"class_name": "ner_chunker",
"batch_size": 16,
"max_chunk_len" : 180,
"max_seq_len" : 400,
"vocab_file": "{TRANSFORMER}",
"do_lower_case": false,
"lowercase": true,
"in": ["text"],
"out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"]
},
@@ -17,35 +17,26 @@
"o_tag": "O",
"tags_file": "{NER_PATH}/tag.dict",
"return_entities_with_tags": true,
"add_nouns": true,
"class_name": "src.entity_detection_parser:EntityDetectionParser",
"class_name": "entity_detection_parser",
"id": "edp"
},
{
"class_name": "src.ner_chunker:NerChunkModel",
"add_nouns": true,
"ner": {"config_path": "src/wikipedia_entity_detection_distilbert.json"},
"class_name": "ner_chunk_model",
"ner": {"config_path": "./wikipedia_entity_detection_distilbert.json"},
"ner_parser": "#edp",
"in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"],
"out": ["entity_substr", "entity_offsets", "tags", "probas", "sentences_offsets",
"sentences", "tokens", "tokens_conf", "entity_positions", "sentences_tokens"]
},
{
"config_path": "src/el_tags_infer.json",
"in": ["text", "entity_offsets", "entity_substr"],
"out": ["finegrained_tags"]
"out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
}
],
"out": ["entity_substr", "entity_offsets", "entity_positions", "tokens", "tags", "finegrained_tags", "sentences_offsets", "sentences", "probas", "tokens_conf"]
"out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
"CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
"NER_PATH": "{MODELS_PATH}/entity_detection_wikipedia_distilbert",
"TRANSFORMER": "{DOWNLOADS_PATH}/torch_bert_models/distilbert_base_uncased"
"NER_PATH": "{ROOT_PATH}/models/dialog_entity_detection",
"TRANSFORMER": "bert-base-uncased"
},
"download": [
]
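
The rewritten config registers components by DeepPavlov registry name (ner_chunker, entity_detection_parser, ner_chunk_model) instead of src.* module paths, and drops the separate el_tags_infer step, so the pipeline now returns one tag per entity. A minimal sketch of loading it, mirroring the warm-up call in server.py below:

    from deeppavlov import build_model

    # build_model resolves registry names such as "ner_chunker" from the config's pipe;
    # download=True fetches the model files on first run.
    entity_detection = build_model("entity_detection_eng.json", download=True)
    entity_detection(["What is the capital of Russia?"])  # warm-up call, as in server.py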
annotators/entity_detection/requirements.txt (5 changes: 1 addition & 4 deletions)
@@ -1,14 +1,11 @@
Flask==1.1.1
itsdangerous==2.0.1
nltk==3.4.5
setuptools<=65.5.1
gunicorn==19.9.0
requests==2.22.0
jinja2<=3.0.3
Werkzeug<=2.0.3
sentry-sdk==0.12.3
torch==1.6.0
transformers==4.6.0
pydantic==1.3
deeppavlov==0.17.3
deeppavlov==1.0.1
spacy==2.2.3
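
The itsdangerous==2.0.1 pin matters because Flask 1.1.x imports a json helper that itsdangerous 2.1 removed, so an unpinned install fails at startup. A sketch of the failure mode the pin avoids:

    # flask==1.1.1 performs this import internally; with itsdangerous>=2.1 it raises
    # ImportError: cannot import name 'json' from 'itsdangerous'
    from itsdangerous import json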
annotators/entity_detection/server.py (106 changes: 50 additions & 56 deletions)
@@ -4,6 +4,7 @@
import time

import sentry_sdk
import spacy
from flask import Flask, jsonify, request
from nltk.corpus import stopwords

@@ -16,19 +17,8 @@
app = Flask(__name__)

config_name = os.getenv("CONFIG")
lowercase = int(os.getenv("LOWERCASE", "1"))
finegrained = int(os.getenv("FINEGRAINED", "0"))


def replace_finegrained_tags(tag):
if tag in {"loc", "city", "country", "county", "river", "us_state", "road"}:
return "location"
elif tag in {"per", "musician", "writer", "athlete", "politician", "actor"}:
return "person"
else:
return tag


try:
entity_detection = build_model(config_name, download=True)
entity_detection(["What is the capital of Russia?"])
@@ -41,6 +31,35 @@ def replace_finegrained_tags(tag):
EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE = re.compile(r"[^a-zA-Z0-9 \-&*+]")
DOUBLE_SPACES = re.compile(r"\s+")
stopwords = set(stopwords.words("english"))
nlp = spacy.load("en_core_web_sm")

replace_tag_dict = {
"softwareapplication": "product",
"gamename": "product",
"device": "product",
"vehicle": "product",
"channelname": "org",
"organization": "org",
"wear": "misc",
"sportrole": "misc",
"bookname": "literary_work",
"party": "political_party",
"position": "occupation",
"sport": "type_of_sport",
"venue": "sports_venue",
"genre": "music_genre",
"videoname": "film",
"songname": "song",
"location": "loc",
"person": "per",
"sportteam": "sport_team",
"anaphor": "black",
"date": "black",
"number": "black",
"year": "black",
"duration": "black",
"ordinal": "black",
}


def get_result(request, what_to_annotate):
@@ -51,7 +70,6 @@ def get_result(request, what_to_annotate):
utts_list = []
utts_list_init = []
utts_nums = []
last_utt_starts = []
for n, hist_utt in enumerate(last_utts):
if isinstance(hist_utt, str):
hist_utt = [hist_utt]
@@ -63,15 +81,10 @@
prev_utt = hist_utt[-2]
if prev_utt and prev_utt[-1] not in {".", "!", "?"}:
prev_utt = f"{prev_utt}."
last_utt_starts.append(len(prev_utt) + 1)
concat_utt = f"{prev_utt} {last_utt}"
else:
last_utt_starts.append(0)
concat_utt = last_utt
if lowercase:
utts_list.append(concat_utt.lower())
else:
utts_list.append(concat_utt)
utts_list.append(concat_utt)
utts_list_init.append(concat_utt)
utts_nums.append(n)

@@ -84,74 +97,55 @@
entity_substr_batch,
entity_offsets_batch,
entity_positions_batch,
tokens_batch,
tags_batch,
finegrained_tags_batch,
sentences_offsets_batch,
sentences_batch,
probas_batch,
tokens_conf_batch,
) = entity_detection(utts_list)
logger.info(f"entity_substr_batch {entity_substr_batch} finegrained_tags_batch {finegrained_tags_batch}")
for (
entity_substr_list,
tags_list,
finegrained_tags_list,
entity_offsets_list,
last_utt_start,
uttr,
num,
) in zip(
logger.info(f"entity_substr_batch {entity_substr_batch} tags_batch {tags_batch}")
for (entity_substr_list, tags_list, probas_list, entity_offsets_list, uttr, num,) in zip(
entity_substr_batch,
tags_batch,
finegrained_tags_batch,
probas_batch,
entity_offsets_batch,
last_utt_starts,
utts_list_init,
utts_nums,
):
utt_entities = {}
for entity, tag, finegrained_tags, (start_offset, end_offset) in zip(
entity_substr_list, tags_list, finegrained_tags_list, entity_offsets_list
for entity, tag, proba, (start_offset, end_offset) in zip(
entity_substr_list, tags_list, probas_list, entity_offsets_list
):
entity_init = uttr[start_offset:end_offset]
if entity_init.lower() == entity:
entity = entity_init
if entity.lower() not in stopwords and len(entity) > 2 and start_offset >= last_utt_start:
if (
entity.lower() not in stopwords
and len(entity) > 2
and not (len(entity.split()) == 1 and nlp(entity)[0].pos_ == "PRON")
):
entity = EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE.sub(" ", entity)
entity = DOUBLE_SPACES.sub(" ", entity).strip()
filtered_finegrained_tags = []
if finegrained_tags[0][0] > 0.5:
tag = finegrained_tags[0][1].lower()
conf, finegrained_tag = finegrained_tags[0]
filtered_finegrained_tags.append((finegrained_tag.lower(), round(conf, 3)))
for finegrained_elem in finegrained_tags[1:]:
conf, finegrained_tag = finegrained_elem
if conf > 0.2:
filtered_finegrained_tags.append((finegrained_tag.lower(), round(conf, 3)))
else:
tag = "misc"
filtered_finegrained_tags.append(("misc", 1.0))
if not finegrained:
tag = replace_finegrained_tags(tag)
finegrained_tag = replace_tag_dict.get(tag.lower(), tag.lower())
if finegrained_tag == "black":
continue
if "entities" in utt_entities:
utt_entities["entities"].append(entity)
utt_entities["labelled_entities"].append(
{
"text": entity,
"label": tag,
"finegrained_label": filtered_finegrained_tags,
"offsets": (start_offset - last_utt_start, end_offset - last_utt_start),
"label": tag.lower(),
"finegrained_label": [(finegrained_tag, proba)],
"offsets": (start_offset, end_offset),
}
)
else:
utt_entities["entities"] = [entity]
utt_entities["labelled_entities"] = [
{
"text": entity,
"label": tag,
"finegrained_label": filtered_finegrained_tags,
"offsets": (start_offset - last_utt_start, end_offset - last_utt_start),
"label": tag.lower(),
"finegrained_label": [(finegrained_tag, proba)],
"offsets": (start_offset, end_offset),
}
]

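
Taken together, server.py now normalizes model tags through replace_tag_dict, treats the "black" label as a blacklist (dates, numbers, anaphora and similar are dropped), filters bare pronouns with spaCy, and reports offsets over the whole concatenated utterance now that the last_utt_start bookkeeping is gone. A condensed sketch of that post-processing path, with a two-entry excerpt of the dict and hypothetical sample entities (assumes en_core_web_sm is installed):

    import spacy

    nlp = spacy.load("en_core_web_sm")

    # two-entry excerpt of replace_tag_dict from the diff above
    replace_tag_dict = {"videoname": "film", "date": "black"}

    def keep_entity(entity: str) -> bool:
        # single-token pronouns such as "it" are now rejected via spaCy POS tagging
        return not (len(entity.split()) == 1 and nlp(entity)[0].pos_ == "PRON")

    for entity, tag in [("titanic", "videoname"), ("yesterday", "date"), ("it", "misc")]:
        finegrained_tag = replace_tag_dict.get(tag.lower(), tag.lower())
        if finegrained_tag == "black" or not keep_entity(entity):
            continue  # blacklisted tags and bare pronouns are skipped
        print(entity, finegrained_tag)  # only "titanic film" survives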
annotators/entity_detection/src/el_tags_infer.json (60 changes: 0 additions & 60 deletions)

This file was deleted.

(diff for the remaining 11 changed files not shown)
