Skip to content

Commit

Permalink
Filter entities by offsets (#497)
Browse files (browse the repository at this point in the history)
* Fix requirements.txt (#84)

* fix itsdangerous requirements

* pin itsdangerous requirements for all flask==1.1.1 servers

* fix entity offsets

* codestyle

---------

Co-authored-by: Andrii.Hura <54397922+AndriiHura@users.noreply.github.com>
Co-authored-by: mtalimanchuk <mtalimanchuk@gmail.com>
Co-authored-by: Dilyara Baymurzina <dilyara.rimovna@gmail.com>
  • Loading branch information
4 people committed Jun 22, 2023
1 parent 04dbfbb commit 407b48f
Showing 1 changed file with 9 additions and 0 deletions.
9 changes: 9 additions & 0 deletions annotators/entity_detection/server.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -70,6 +70,7 @@ def get_result(request, what_to_annotate):
utts_list = []
utts_list_init = []
utts_nums = []
uttr_offsets = []
for n, hist_utt in enumerate(last_utts):
if isinstance(hist_utt, str):
hist_utt = [hist_utt]
Expand All @@ -81,12 +82,15 @@ def get_result(request, what_to_annotate):
prev_utt = hist_utt[-2]
if prev_utt and prev_utt[-1] not in {".", "!", "?"}:
prev_utt = f"{prev_utt}."
offset = len(prev_utt)
concat_utt = f"{prev_utt} {last_utt}"
else:
offset = 0
concat_utt = last_utt
utts_list.append(concat_utt)
utts_list_init.append(concat_utt)
utts_nums.append(n)
uttr_offsets.append(offset)

utt_entities_batch = [{} for _ in last_utts]
utt_entities = {}
Expand All @@ -110,13 +114,15 @@ def get_result(request, what_to_annotate):
entity_offsets_list,
uttr,
num,
uttr_offset,
) in zip(
entity_substr_batch,
tags_batch,
probas_batch,
entity_offsets_batch,
utts_list_init,
utts_nums,
uttr_offsets,
):
utt_entities = {}
for entity, tag, proba, (start_offset, end_offset) in zip(
Expand All @@ -129,7 +135,10 @@ def get_result(request, what_to_annotate):
entity.lower() not in stopwords
and len(entity) > 2
and not (len(entity.split()) == 1 and nlp(entity)[0].pos_ == "PRON")
and start_offset >= uttr_offset
):
start_offset -= uttr_offset
end_offset -= uttr_offset
entity = EVERYTHING_EXCEPT_LETTERS_DIGITALS_AND_SPACE.sub(" ", entity)
entity = DOUBLE_SPACES.sub(" ", entity).strip()
finegrained_tag = replace_tag_dict.get(tag.lower(), tag.lower())
Expand Down

0 comments on commit 407b48f

Please sign in to comment.