Feat/sentence ranker as a service #191

Merged: 19 commits, Aug 29, 2022
6 changes: 3 additions & 3 deletions assistant_dists/dream_mini/cpu.yml
@@ -1,14 +1,14 @@
version: '3.7'
services:
convers-evaluator-annotator:
dialogpt:
environment:
DEVICE: cpu
CUDA_VISIBLE_DEVICES: ""
dialogpt:
intent-catcher:
environment:
DEVICE: cpu
CUDA_VISIBLE_DEVICES: ""
intent-catcher:
sentence-ranker:
environment:
DEVICE: cpu
CUDA_VISIBLE_DEVICES: ""
11 changes: 5 additions & 6 deletions assistant_dists/dream_mini/dev.yml
@@ -5,12 +5,6 @@ services:
- ".:/dp-agent"
ports:
- 4242:4242
convers-evaluator-annotator:
volumes:
- "./annotators/ConversationEvaluator:/src"
- "~/.deeppavlov:/root/.deeppavlov"
ports:
- 8004:8004
dff-program-y-skill:
volumes:
- "./skills/dff_program_y_skill:/src"
@@ -57,4 +51,9 @@ services:
- "./services/dialogpt:/src"
ports:
- 8125:8125
sentence-ranker:
volumes:
- "./services/sentence_ranker:/src"
ports:
- 8128:8128
version: "3.7"
41 changes: 20 additions & 21 deletions assistant_dists/dream_mini/docker-compose.override.yml
@@ -2,29 +2,10 @@ services:
agent:
command: sh -c 'bin/wait && python -m deeppavlov_agent.run agent.pipeline_config=assistant_dists/dream_mini/pipeline_conf.json'
environment:
WAIT_HOSTS: "convers-evaluator-annotator:8004, dff-program-y-skill:8008, sentseg:8011, convers-evaluation-selector:8009,
WAIT_HOSTS: "dff-program-y-skill:8008, sentseg:8011, convers-evaluation-selector:8009,
dff-intent-responder-skill:8012, intent-catcher:8014, badlisted-words:8018,
spelling-preprocessing:8074, dialogpt:8125"
spelling-preprocessing:8074, dialogpt:8125, sentence-ranker:8128"
WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480}
convers-evaluator-annotator:
env_file: [.env]
build:
args:
CONFIG: conveval.json
PORT: 8004
DATA_URL: https://files.deeppavlov.ai/alexaprize_data/cobot_conveval2.tar.gz
context: .
dockerfile: ./annotators/ConversationEvaluator/Dockerfile
environment:
- CUDA_VISIBLE_DEVICES=0
deploy:
mode: replicated
replicas: 1
resources:
limits:
memory: 2G
reservations:
memory: 2G

dff-program-y-skill:
env_file: [.env]
@@ -173,4 +154,22 @@ services:
reservations:
memory: 2G

sentence-ranker:
env_file: [ .env ]
build:
args:
SERVICE_PORT: 8128
PRETRAINED_MODEL_NAME_OR_PATH: sentence-transformers/bert-base-nli-mean-tokens
context: ./services/sentence_ranker/
command: flask run -h 0.0.0.0 -p 8128
environment:
- CUDA_VISIBLE_DEVICES=0
- FLASK_APP=server
deploy:
resources:
limits:
memory: 3G
reservations:
memory: 3G

version: '3.7'
6 changes: 3 additions & 3 deletions assistant_dists/dream_mini/pipeline_conf.json
@@ -202,13 +202,13 @@
],
"state_manager_method": "add_hypothesis_annotation_batch"
},
"convers_evaluator_annotator": {
"sentence_ranker": {
"connector": {
"protocol": "http",
"timeout": 1,
"url": "http://convers-evaluator-annotator:8004/batch_model"
"url": "http://sentence-ranker:8128/respond"
},
"dialog_formatter": "state_formatters.dp_formatters:convers_evaluator_annotator_formatter",
"dialog_formatter": "state_formatters.dp_formatters:sentence_ranker_formatter",
"response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
"previous_services": ["skills"],
"state_manager_method": "add_hypothesis_annotation_batch"
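The pipeline entry above points the new `sentence_ranker` hypothesis annotator at `http://sentence-ranker:8128/respond` and a new `sentence_ranker_formatter`. The formatter itself is not part of this diff; the sketch below only illustrates the kind of payload the service would need (the `sentence_pairs` key and the exact dialog fields are assumptions):

```python
# Hypothetical illustration only; the real formatter lives in
# state_formatters/dp_formatters.py and is not shown in this pull request.
def sentence_ranker_formatter_sketch(dialog: dict) -> list:
    # Pair the user's last utterance with every candidate hypothesis so the ranker
    # can score each hypothesis against the current dialog context.
    last_human_utterance = dialog["human_utterances"][-1]["text"]
    hypotheses = dialog["human_utterances"][-1]["hypotheses"]
    return [{"sentence_pairs": [[last_human_utterance, hyp["text"]] for hyp in hypotheses]}]
```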
16 changes: 8 additions & 8 deletions assistant_dists/dream_mini/proxy.yml
@@ -1,12 +1,4 @@
services:
convers-evaluator-annotator:
command: ["nginx", "-g", "daemon off;"]
build:
context: dp/proxy/
dockerfile: Dockerfile
environment:
- PROXY_PASS=dream.deeppavlov.ai:8004
- PORT=8004

dff-program-y-skill:
command: ["nginx", "-g", "daemon off;"]
@@ -80,4 +72,12 @@ services:
- PROXY_PASS=dream.deeppavlov.ai:8125
- PORT=8125

sentence-ranker:
command: [ "nginx", "-g", "daemon off;" ]
build:
context: dp/proxy/
dockerfile: Dockerfile
environment:
- PROXY_PASS=dream.deeppavlov.ai:8128
- PORT=8128
version: '3.7'
12 changes: 0 additions & 12 deletions common/utils.py
@@ -1208,18 +1208,6 @@ def is_special_factoid_question(annotated_utterance):
)


def get_conv_eval_annotations(annotated_utterance):
default_conv_eval = {
"isResponseOnTopic": 0.0,
"isResponseInteresting": 0.0,
"responseEngagesUser": 0.0,
"isResponseComprehensible": 0.0,
"isResponseErroneous": 0.0,
}

return annotated_utterance.get("annotations", {}).get("convers_evaluator_annotator", default_conv_eval)


def get_dialog_breakdown_annotations(annotated_utterance):
breakdown = annotated_utterance.get("annotations", {}).get("dialog_breakdown", {}).get("breakdown", 0.0) > 0.5
return breakdown
24 changes: 8 additions & 16 deletions response_selectors/convers_evaluation_based_selector/server.py
@@ -20,17 +20,14 @@
low_priority_intents,
substitute_nonwords,
is_toxic_or_badlisted_utterance,
get_conv_eval_annotations,
)
from tag_based_selection import tag_based_response_selection
from utils import (
add_question_to_statement,
lower_duplicates_score,
lower_retrieve_skills_confidence_if_scenario_exist,
calculate_single_convers_evaluator_score,
calculate_single_evaluator_score,
downscore_toxic_badlisted_responses,
CONV_EVAL_STRENGTH,
CONFIDENCE_STRENGTH,
how_are_you_spec,
what_i_can_do_spec,
misheard_with_spec1,
@@ -104,7 +101,9 @@ def respond():
)
logger.info(msg)

curr_scores += [get_conv_eval_annotations(skill_data)]
curr_scores += [
calculate_single_evaluator_score(skill_data.get("annotations"), skill_data["confidence"])
]

curr_is_toxics = np.array(curr_is_toxics)
curr_scores = np.array(curr_scores)
@@ -293,24 +292,17 @@ def rule_score_based_selection(dialog, candidates, scores, confidences, is_toxic
dummy_question_human_attr = candidates[i].get("human_attributes", {})

if curr_score is None:
cand_scores = scores[i]
score = scores[i]
confidence = confidences[i]
skill_name = skill_names[i]
score_conv_eval = calculate_single_convers_evaluator_score(cand_scores)
score = CONV_EVAL_STRENGTH * score_conv_eval + CONFIDENCE_STRENGTH * confidence
logger.info(
f"Skill {skill_name} has final score: {score}. Confidence: {confidence}. "
f"Toxicity: {is_toxics[i]}. Cand scores: {cand_scores}"
f"Skill {skill_name} has final score: {score}. Confidence: {confidence}. " f"Toxicity: {is_toxics[i]}"
)
curr_single_scores.append(score)
else:
cand_scores = scores[i]
score = scores[i]
skill_name = skill_names[i]
score_conv_eval = calculate_single_convers_evaluator_score(cand_scores)
score = CONV_EVAL_STRENGTH * score_conv_eval + curr_score
logger.info(
f"Skill {skill_name} has final score: {score}. " f"Toxicity: {is_toxics[i]}. Cand scores: {cand_scores}"
)
logger.info(f"Skill {skill_name} has final score: {score}. " f"Toxicity: {is_toxics[i]}")
curr_single_scores.append(score)

highest_conf_exist = True if any(confidences >= 1.0) else False
response_selectors/convers_evaluation_based_selector/tag_based_selection.py
@@ -28,9 +28,6 @@
get_dialog_breakdown_annotations,
)
from utils import (
calculate_single_convers_evaluator_score,
CONV_EVAL_STRENGTH,
CONFIDENCE_STRENGTH,
how_are_you_spec,
what_i_can_do_spec,
misheard_with_spec1,
@@ -251,28 +248,6 @@ def acknowledgement_decision(all_user_intents):
return False


def compute_curr_single_scores(candidates, scores, confidences):
curr_single_scores = []
if all(["hypothesis_scorer" in cand["annotations"] for cand in candidates]):
for i in range(len(candidates)):
curr_single_scores.append(candidates[i]["annotations"]["hypothesis_scorer"])
else:
for i in range(len(scores)):
cand_scores = scores[i]
confidence = confidences[i]
skill_name = candidates[i]["skill_name"]
if all(["dialogrpt" in cand["annotations"] for cand in candidates]):
score_conv_eval = candidates[i]["annotations"]["dialogrpt"]
else:
score_conv_eval = calculate_single_convers_evaluator_score(cand_scores)
score = CONV_EVAL_STRENGTH * score_conv_eval + CONFIDENCE_STRENGTH * confidence

logger.info(f"Skill {skill_name} has final score: {score}. Confidence: {confidence}.")
curr_single_scores.append(score)

return curr_single_scores


def add_to_top1_category(cand_id, categorized, _is_require_action_intent):
if _is_require_action_intent:
categorized["active_same_topic_entity_no_db_reqda"].append(cand_id)
@@ -351,7 +326,9 @@ def rule_based_prioritization(cand_uttr, dialog):
return flag


def tag_based_response_selection(dialog, candidates, scores, confidences, bot_utterances, all_prev_active_skills=None):
def tag_based_response_selection(
dialog, candidates, curr_single_scores, confidences, bot_utterances, all_prev_active_skills=None
):
all_prev_active_skills = all_prev_active_skills if all_prev_active_skills is not None else []
all_prev_active_skills = Counter(all_prev_active_skills)
annotated_uttr = dialog["human_utterances"][-1]
@@ -423,6 +400,10 @@ def tag_based_response_selection(dialog, candidates, scores, confidences, bot_ut
if confidences[cand_id] == 0.0 and cand_uttr["skill_name"] not in ACTIVE_SKILLS:
logger.info(f"Dropping cand_id: {cand_id} due to toxicity/badlists")
continue
skill_name = cand_uttr["skill_name"]
confidence = confidences[cand_id]
score = curr_single_scores[cand_id]
logger.info(f"Skill {skill_name} has final score: {score}. Confidence: {confidence}.")

all_cand_intents, all_cand_topics, all_cand_named_entities, all_cand_nounphrases = get_main_info_annotations(
cand_uttr
@@ -646,7 +627,6 @@ def tag_based_response_selection(dialog, candidates, scores, confidences, bot_ut

logger.info(f"Current CASE: {CASE}")
# now compute current scores as one float value
curr_single_scores = compute_curr_single_scores(candidates, scores, confidences)

# remove disliked skills from hypotheses
if IGNORE_DISLIKED_SKILLS:
54 changes: 31 additions & 23 deletions response_selectors/convers_evaluation_based_selector/utils.py
@@ -119,8 +119,7 @@ def lower_duplicates_score(candidates, bot_utt_counter, scores, confidences):
# apply penalties to non-script skills and in case if response consists only from duplicates
if confidences[i] < 1.0 or n_duplicates == len(cand_sents):
confidences[i] /= coeff
scores[i]["isResponseInteresting"] /= coeff
scores[i]["responseEngagesUser"] /= coeff
scores[i] /= coeff


def lower_retrieve_skills_confidence_if_scenario_exist(candidates, scores, confidences):
@@ -134,33 +133,42 @@ def lower_retrieve_skills_confidence_if_scenario_exist(candidates, scores, confi
for i, cand in enumerate(candidates):
if cand["skill_name"] in retrieve_skills:
confidences[i] *= lower_coeff
scores[i]["isResponseInteresting"] *= lower_coeff


def calculate_single_convers_evaluator_score(cand_scores):
score_conv_eval = sum(
[
cand_scores["isResponseOnTopic"],
cand_scores["isResponseInteresting"],
cand_scores["responseEngagesUser"],
cand_scores["isResponseComprehensible"],
]
)
score_conv_eval -= cand_scores["isResponseErroneous"]
return score_conv_eval
scores[i] *= lower_coeff


def calculate_single_evaluator_score(hypothesis_annotations, confidence):
if "convers_evaluator_annotator" in hypothesis_annotations:
cand_scores = hypothesis_annotations["convers_evaluator_annotator"]
score_conv_eval = sum(
[
cand_scores["isResponseOnTopic"],
cand_scores["isResponseInteresting"],
cand_scores["responseEngagesUser"],
cand_scores["isResponseComprehensible"],
]
)
score_conv_eval -= cand_scores["isResponseErroneous"]
score = CONV_EVAL_STRENGTH * score_conv_eval + CONFIDENCE_STRENGTH * confidence
return score
elif "dialogrpt" in hypothesis_annotations:
score_conv_eval = hypothesis_annotations["dialogrpt"]
score = CONV_EVAL_STRENGTH * score_conv_eval + CONFIDENCE_STRENGTH * confidence
return score
elif "sentence_ranker" in hypothesis_annotations:
score_conv_eval = hypothesis_annotations["sentence_ranker"]
score = CONV_EVAL_STRENGTH * score_conv_eval + CONFIDENCE_STRENGTH * confidence
return score
elif "hypothesis_scorer" in hypothesis_annotations:
return hypothesis_annotations["hypothesis_scorer"]
else:
return 0.0


def downscore_toxic_badlisted_responses(scores, confidences, is_toxics):
# exclude toxic messages and messages with badlisted phrases
ids = np.arange(len(confidences))[is_toxics]
logger.info(f"Bot excluded utterances: {ids}. is_toxics: {is_toxics}")
scores[ids] = {
"isResponseOnTopic": 0.0,
"isResponseInteresting": 0.0,
"responseEngagesUser": 0.0,
"isResponseComprehensible": 0.0,
"isResponseErroneous": 1.0,
}
scores[ids] = 0.0
confidences[ids] = 0.0

return len(ids), scores, confidences
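For context on the new `calculate_single_evaluator_score` helper above: it dispatches on whichever hypothesis annotation is present (`convers_evaluator_annotator`, `dialogrpt`, `sentence_ranker`, or `hypothesis_scorer`) and mixes the evaluator score with the skill confidence. The weights `CONV_EVAL_STRENGTH` and `CONFIDENCE_STRENGTH` are defined elsewhere in `utils.py` and are not visible in this diff, so the numbers below are placeholders:

```python
# Illustrative call; the annotation value, confidence, and weights are made up for the example.
annotations = {"sentence_ranker": 0.81}
confidence = 0.9
# With hypothetical weights CONV_EVAL_STRENGTH = 0.4 and CONFIDENCE_STRENGTH = 0.6,
# the hypothesis would score 0.4 * 0.81 + 0.6 * 0.9 = 0.864.
score = calculate_single_evaluator_score(annotations, confidence)
```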
23 changes: 23 additions & 0 deletions services/sentence_ranker/Dockerfile
@@ -0,0 +1,23 @@
# syntax=docker/dockerfile:experimental

FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-runtime

WORKDIR /src

ARG PRETRAINED_MODEL_NAME_OR_PATH
ENV PRETRAINED_MODEL_NAME_OR_PATH ${PRETRAINED_MODEL_NAME_OR_PATH}
ARG SERVICE_PORT
ENV SERVICE_PORT ${SERVICE_PORT}

RUN mkdir /data/

COPY ./requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt

RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${PRETRAINED_MODEL_NAME_OR_PATH}');"
RUN python -c "from transformers import AutoModel; AutoModel.from_pretrained('${PRETRAINED_MODEL_NAME_OR_PATH}');"

COPY . /src

CMD gunicorn --workers=1 server:app -b 0.0.0.0:${SERVICE_PORT} --timeout=300

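The image starts `gunicorn server:app`, but `server.py` itself is not included in this diff. Below is a minimal sketch of what such a module could look like, assuming a Flask app, a POST `/respond` endpoint (matching `pipeline_conf.json`), a `sentence_pairs` request key, and mean-pooled embeddings compared by cosine similarity; the actual request/response formats and scoring may differ.

```python
# Sketch only: the actual services/sentence_ranker/server.py is not part of this diff.
# Payload keys, response shape, and the pooling strategy are assumptions.
import os

import torch
from flask import Flask, jsonify, request
from transformers import AutoModel, AutoTokenizer

PRETRAINED_MODEL_NAME_OR_PATH = os.environ.get(
    "PRETRAINED_MODEL_NAME_OR_PATH", "sentence-transformers/bert-base-nli-mean-tokens"
)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
model = AutoModel.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
model.eval()

app = Flask(__name__)


@app.route("/respond", methods=["POST"])
def respond():
    # Assumed payload: {"sentence_pairs": [["context", "hypothesis"], ...]}
    sentence_pairs = request.json.get("sentence_pairs", [])
    scores = []
    with torch.no_grad():
        for first, second in sentence_pairs:
            encoded = tokenizer([first, second], padding=True, truncation=True, return_tensors="pt")
            token_embeddings = model(**encoded).last_hidden_state  # (2, seq_len, hidden)
            mask = encoded["attention_mask"].unsqueeze(-1)
            # Mean-pool token embeddings (ignoring padding), then compare the two sentences.
            sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
            similarity = torch.nn.functional.cosine_similarity(
                sentence_embeddings[0], sentence_embeddings[1], dim=0
            )
            scores.append(float(similarity))
    return jsonify(scores)
```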
9 changes: 9 additions & 0 deletions services/sentence_ranker/README.md
@@ -0,0 +1,9 @@
# Sentence Ranker Service

This is a universal service for evaluating sentence pairs.

The model can be any suitable model from the Hugging Face library, passed via the `PRETRAINED_MODEL_NAME_OR_PATH` parameter.

The service accepts a batch of sentence pairs (a pair is a list of two strings) and returns a batch of floating-point values.

To rank a list of sentence pairs, obtain the floating-point value for each pair and select the pair with the highest value.
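A minimal client sketch for calling the running service (the payload key and response shape follow the assumptions noted elsewhere in this PR and may differ from the actual implementation):

```python
import requests

# Hypothetical example pairs; the first element could be the dialog context
# and the second a candidate response to score against it.
sentence_pairs = [
    ["What do you like to do on weekends?", "I usually go hiking with my dog."],
    ["What do you like to do on weekends?", "Paris is the capital of France."],
]
result = requests.post("http://0.0.0.0:8128/respond", json={"sentence_pairs": sentence_pairs})
print(result.json())  # expected: one floating-point relevance score per pair
```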