Feat/spellchecker levenstein ru #89

Merged: 21 commits, Feb 7, 2022
29 changes: 29 additions & 0 deletions annotators/spelling_preprocessing_ru/Dockerfile
@@ -0,0 +1,29 @@
FROM tensorflow/tensorflow:1.15.2-gpu

RUN apt-get -y update && \
    apt-get install -y software-properties-common && \
    apt-get update && apt-get install git -y

ARG CONFIG
ARG COMMIT=0.13.0
ARG PORT
ARG SRC_DIR
ARG SED_ARG=" | "

ENV CONFIG=$CONFIG
ENV PORT=$PORT

COPY ./annotators/spelling_preprocessing_ru/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt

RUN pip install git+https://github.com/deepmipt/DeepPavlov.git@${COMMIT}

COPY $SRC_DIR /src

WORKDIR /src

RUN python -m deeppavlov install $CONFIG

RUN sed -i "s|$SED_ARG|g" "$CONFIG"

CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8074
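For iterating on the annotator outside docker-compose, the image can also be built directly from the repository root. The sketch below reuses the build args that docker-compose.override.yml passes further down in this PR; the image tag is an assumption made for the example, not something the PR defines.

# A minimal sketch, assuming the repo root as build context (as in the compose override);
# the tag name spelling-preprocessing-ru is arbitrary.
docker build \
    -f annotators/spelling_preprocessing_ru/Dockerfile \
    --build-arg CONFIG=levenshtein_corrector_ru.json \
    --build-arg PORT=8074 \
    --build-arg SRC_DIR=annotators/spelling_preprocessing_ru \
    --build-arg COMMIT=f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144 \
    -t spelling-preprocessing-ru .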
60 changes: 60 additions & 0 deletions annotators/spelling_preprocessing_ru/levenshtein_corrector_ru.json
@@ -0,0 +1,60 @@
{
  "chainer": {
    "in": ["x"],
    "pipe": [
      {
        "class_name": "str_lower",
        "id": "lower",
        "in": ["x"],
        "out": ["x_lower"]
      },
      {
        "class_name": "nltk_moses_tokenizer",
        "id": "tokenizer",
        "in": ["x_lower"],
        "out": ["x_tokens"]
      },
      {
        "id": "vocab",
        "class_name": "simple_vocab",
        "save_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict",
        "load_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict"
      },
      {
        "in": ["x_tokens"],
        "out": ["tokens_candidates"],
        "class_name": "spelling_levenshtein",
        "words": "#vocab.keys()"
      },
      {
        "class_name": "kenlm_elector",
        "in": ["tokens_candidates"],
        "out": ["y_predicted_tokens"],
        "load_path": "{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary"
      },
      {
        "ref": "tokenizer",
        "in": ["y_predicted_tokens"],
        "out": ["y_predicted"]
      }
    ],
    "out": ["y_predicted"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz",
        "subdir": "{DOWNLOADS_PATH}/vocabs"
      },
      {
        "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz",
        "subdir": "{DOWNLOADS_PATH}/language_models"
      }
    ]
  }
}
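The config chains standard DeepPavlov components: str_lower lowercases the input, nltk_moses_tokenizer splits it into tokens, spelling_levenshtein proposes correction candidates from the Russian word vocabulary within a small edit distance, kenlm_elector picks the most probable candidate sequence with the KenLM language model, and the tokenizer ref detokenizes the result. A quick way to try the config outside the service is DeepPavlov's own CLI; this is a sketch that assumes DeepPavlov is installed on the host and the commands are run from the repo root (the -d flag downloads the files listed under metadata.download).

# Sketch: install the config's extra requirements, then interact with it locally;
# -d downloads russian_words_vocab.dict and ru_wiyalen_no_punkt.arpa.binary.
python -m deeppavlov install annotators/spelling_preprocessing_ru/levenshtein_corrector_ru.json
python -m deeppavlov interact annotators/spelling_preprocessing_ru/levenshtein_corrector_ru.json -d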
4 changes: 4 additions & 0 deletions annotators/spelling_preprocessing_ru/requirements.txt
@@ -0,0 +1,4 @@
sentry-sdk[flask]==0.14.1
flask==1.1.1
gunicorn==19.9.0
requests==2.22.0
43 changes: 43 additions & 0 deletions annotators/spelling_preprocessing_ru/server.py
@@ -0,0 +1,43 @@
import logging
import os
import time

import sentry_sdk
from flask import Flask, jsonify, request

from deeppavlov import build_model

sentry_sdk.init(os.getenv("SENTRY_DSN"))

logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__)

config_name = os.getenv("CONFIG")

try:
    spelling_preprocessing_model = build_model(config_name, download=True)
    r = "я ге видел малако"
    logger.info(f"Original: {r}. Corrected: {spelling_preprocessing_model([r])}")
    logger.info("spelling_preprocessing model is loaded.")
except Exception as e:
    sentry_sdk.capture_exception(e)
    logger.exception(e)
    raise e


@app.route("/respond", methods=["POST"])
def respond():
    st_time = time.time()

    sentences = request.json["sentences"]
    sentences = [text.lower() for text in sentences]
    corrected_sentences = spelling_preprocessing_model(sentences)

    total_time = time.time() - st_time
    logger.info(f"spelling_preprocessing exec time: {total_time:.3f}s")
    return jsonify(corrected_sentences)


if __name__ == "__main__":
    app.run(debug=False, host="0.0.0.0", port=8074)
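The server exposes a single POST /respond endpoint: the request body is JSON with a "sentences" list, and the response is the list of corrected sentences. With the container running on port 8074, a request could look like the sketch below (payload borrowed from test_server.py).

# Sketch request against a locally running annotator.
curl -X POST http://0.0.0.0:8074/respond \
    -H "Content-Type: application/json" \
    -d '{"sentences": ["я ге видел малако"]}'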
4 changes: 4 additions & 0 deletions annotators/spelling_preprocessing_ru/test.sh
@@ -0,0 +1,4 @@
#!/bin/bash


python test_server.py
22 changes: 22 additions & 0 deletions annotators/spelling_preprocessing_ru/test_server.py
@@ -0,0 +1,22 @@
import requests


def main():
    url = "http://0.0.0.0:8074/respond"

    request_data = [{"sentences": ["я ге видел малако"]}]

    gold_results = [["я не видел малакон"]]

    count = 0
    for data, gold_result in zip(request_data, gold_results):
        result = requests.post(url, json=data).json()
        if result == gold_result:
            count += 1

    assert count == len(request_data)
    print("Success")


if __name__ == "__main__":
    main()
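Assuming the annotator container is already up on port 8074 and the host has Python with requests available, the included smoke test can be run from the annotator directory, for example:

# Sketch: run the included smoke test against a running container.
cd annotators/spelling_preprocessing_ru && bash test.sh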
3 changes: 2 additions & 1 deletion assistant_dists/dream_russian/dev.yml
@@ -78,7 +78,8 @@ services:
      # - ./venv/data/db_data:/root/data/db
  spelling-preprocessing:
    volumes:
      - "./annotators/spelling_preprocessing:/src"
      - "./annotators/spelling_preprocessing_ru:/src"
      - "~/.deeppavlov:/root/.deeppavlov"
    ports:
      - 8074:8074
  dff-friendship-skill:
12 changes: 9 additions & 3 deletions assistant_dists/dream_russian/docker-compose.override.yml
@@ -189,16 +189,22 @@ services:
  spelling-preprocessing:
    env_file: [.env]
    build:
      context: ./annotators/spelling_preprocessing/
      args:
        CONFIG: levenshtein_corrector_ru.json
        PORT: 8074
        SRC_DIR: annotators/spelling_preprocessing_ru
        COMMIT: f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144
      context: ./
      dockerfile: annotators/spelling_preprocessing_ru/Dockerfile
    command: flask run -h 0.0.0.0 -p 8074
    environment:
      - FLASK_APP=server
    deploy:
      resources:
        limits:
          memory: 50M
          memory: 256M
        reservations:
          memory: 50M
          memory: 256M

  dff-friendship-skill:
    env_file: [.env]
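With the build context moved to the repository root (so the Dockerfile can copy SRC_DIR) and the memory limit raised to 256M, the service can be built and started on its own. The command below is a sketch: it assumes a top-level docker-compose.yml and the multi-file compose invocation commonly used for this repository's distributions; the override and dev file paths come from this PR.

# Sketch: build and start only the spelling-preprocessing service for dream_russian.
docker-compose \
    -f docker-compose.yml \
    -f assistant_dists/dream_russian/docker-compose.override.yml \
    -f assistant_dists/dream_russian/dev.yml \
    up --build spelling-preprocessing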
66 changes: 1 addition & 65 deletions assistant_dists/dream_russian/pipeline_conf.json
@@ -177,8 +177,7 @@
        "annotators.ner"
      ],
      "previous_services": [
        "annotators.entity_detection",
        "annotators.spacy_nounphrases"
        "annotators.entity_detection"
      ]
    },
    "wiki_parser": {
@@ -333,30 +332,6 @@
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "convers_evaluator_annotator": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://convers-evaluator-annotator:8004/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:convers_evaluator_annotator_formatter",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": ["skills"],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "spacy_nounphrases": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://spacy-nounphrases:8006/respond_batch"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypotheses_list",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "entity_detection": {
      "connector": {
        "protocol": "http",
@@ -369,45 +344,6 @@
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "midas_classification": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://midas-classification:8090/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypotheses_list_last_uttr",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "combined_classification": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://combined-classification:8087/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypothesis_histories_list",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "hypothesis_scorer": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://hypothesis-scorer:8110/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypothesis_scorer_formatter",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    }
  },
  "response_selectors": {