Feat/spellchecker levenstein ru #89

Merged: 21 commits, Feb 7, 2022
29 changes: 29 additions & 0 deletions annotators/spelling_preprocessing_ru/Dockerfile
@@ -0,0 +1,29 @@
FROM tensorflow/tensorflow:1.15.2-gpu

RUN apt-get -y update && \
    apt-get install -y software-properties-common && \
    apt-get update && apt-get install git -y

ARG CONFIG
ARG COMMIT=0.13.0
ARG PORT
ARG SRC_DIR
ARG SED_ARG=" | "

ENV CONFIG=$CONFIG
ENV PORT=$PORT

COPY ./annotators/spelling_preprocessing_ru/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt

RUN pip install git+https://github.com/deepmipt/DeepPavlov.git@${COMMIT}

COPY $SRC_DIR /src

WORKDIR /src

RUN python -m deeppavlov install $CONFIG

RUN sed -i "s|$SED_ARG|g" "$CONFIG"

CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8074
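For iterating on the annotator outside docker-compose, the image can also be built directly from the repository root. The sketch below reuses the build args that docker-compose.override.yml passes further down in this PR; the image tag is an assumption made for the example, not something the PR defines.

# A minimal sketch, assuming the repo root as build context (as in the compose override);
# the tag name spelling-preprocessing-ru is arbitrary.
docker build \
    -f annotators/spelling_preprocessing_ru/Dockerfile \
    --build-arg CONFIG=levenshtein_corrector_ru.json \
    --build-arg PORT=8074 \
    --build-arg SRC_DIR=annotators/spelling_preprocessing_ru \
    --build-arg COMMIT=f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144 \
    -t spelling-preprocessing-ru .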
60 changes: 60 additions & 0 deletions annotators/spelling_preprocessing_ru/levenshtein_corrector_ru.json
@@ -0,0 +1,60 @@
{
  "chainer": {
    "in": ["x"],
    "pipe": [
      {
        "class_name": "str_lower",
        "id": "lower",
        "in": ["x"],
        "out": ["x_lower"]
      },
      {
        "class_name": "nltk_moses_tokenizer",
        "id": "tokenizer",
        "in": ["x_lower"],
        "out": ["x_tokens"]
      },
      {
        "id": "vocab",
        "class_name": "simple_vocab",
        "save_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict",
        "load_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict"
      },
      {
        "in": ["x_tokens"],
        "out": ["tokens_candidates"],
        "class_name": "spelling_levenshtein",
        "words": "#vocab.keys()"
      },
      {
        "class_name": "kenlm_elector",
        "in": ["tokens_candidates"],
        "out": ["y_predicted_tokens"],
        "load_path": "{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary"
      },
      {
        "ref": "tokenizer",
        "in": ["y_predicted_tokens"],
        "out": ["y_predicted"]
      }
    ],
    "out": ["y_predicted"]
  },
  "metadata": {
    "variables": {
      "ROOT_PATH": "~/.deeppavlov",
      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
      "MODELS_PATH": "{ROOT_PATH}/models"
    },
    "download": [
      {
        "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz",
        "subdir": "{DOWNLOADS_PATH}/vocabs"
      },
      {
        "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz",
        "subdir": "{DOWNLOADS_PATH}/language_models"
      }
    ]
  }
}
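The config chains standard DeepPavlov components: str_lower lowercases the input, nltk_moses_tokenizer splits it into tokens, spelling_levenshtein proposes correction candidates from the Russian word vocabulary within a small edit distance, kenlm_elector picks the most probable candidate sequence with the KenLM language model, and the tokenizer ref detokenizes the result. A quick way to try the config outside the service is DeepPavlov's own CLI; this is a sketch that assumes DeepPavlov is installed on the host and the commands are run from the repo root (the -d flag downloads the files listed under metadata.download).

# Sketch: install the config's extra requirements, then interact with it locally;
# -d downloads russian_words_vocab.dict and ru_wiyalen_no_punkt.arpa.binary.
python -m deeppavlov install annotators/spelling_preprocessing_ru/levenshtein_corrector_ru.json
python -m deeppavlov interact annotators/spelling_preprocessing_ru/levenshtein_corrector_ru.json -d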
4 changes: 4 additions & 0 deletions annotators/spelling_preprocessing_ru/requirements.txt
@@ -0,0 +1,4 @@
sentry-sdk[flask]==0.14.1
flask==1.1.1
gunicorn==19.9.0
requests==2.22.0
43 changes: 43 additions & 0 deletions annotators/spelling_preprocessing_ru/server.py
@@ -0,0 +1,43 @@
import logging
import os
import time

import sentry_sdk
from flask import Flask, jsonify, request

from deeppavlov import build_model

sentry_sdk.init(os.getenv("SENTRY_DSN"))

logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__)

config_name = os.getenv("CONFIG")

try:
    spelling_preprocessing_model = build_model(config_name, download=True)
    r = "я ге видел малако"
    logger.info(f"Original: {r}. Corrected: {spelling_preprocessing_model([r])}")
    logger.info("spelling_preprocessing model is loaded.")
except Exception as e:
    sentry_sdk.capture_exception(e)
    logger.exception(e)
    raise e


@app.route("/respond", methods=["POST"])
def respond():
    st_time = time.time()

    sentences = request.json["sentences"]
    sentences = [text.lower() for text in sentences]
    corrected_sentences = spelling_preprocessing_model(sentences)

    total_time = time.time() - st_time
    logger.info(f"spelling_preprocessing exec time: {total_time:.3f}s")
    return jsonify(corrected_sentences)


if __name__ == "__main__":
    app.run(debug=False, host="0.0.0.0", port=8074)
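The server exposes a single POST /respond endpoint: the request body is JSON with a "sentences" list, and the response is the list of corrected sentences. With the container running on port 8074, a request could look like the sketch below (payload borrowed from test_server.py).

# Sketch request against a locally running annotator.
curl -X POST http://0.0.0.0:8074/respond \
    -H "Content-Type: application/json" \
    -d '{"sentences": ["я ге видел малако"]}'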
4 changes: 4 additions & 0 deletions annotators/spelling_preprocessing_ru/test.sh
@@ -0,0 +1,4 @@
#!/bin/bash


python test_server.py
22 changes: 22 additions & 0 deletions annotators/spelling_preprocessing_ru/test_server.py
@@ -0,0 +1,22 @@
import requests


def main():
    url = "http://0.0.0.0:8074/respond"

    request_data = [{"sentences": ["я ге видел малако"]}]

    gold_results = [["я не видел малакон"]]

    count = 0
    for data, gold_result in zip(request_data, gold_results):
        result = requests.post(url, json=data).json()
        if result == gold_result:
            count += 1

    assert count == len(request_data)
    print("Success")


if __name__ == "__main__":
    main()
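Assuming the annotator container is already up on port 8074 and the host has Python with requests available, the included smoke test can be run from the annotator directory, for example:

# Sketch: run the included smoke test against a running container.
cd annotators/spelling_preprocessing_ru && bash test.sh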
3 changes: 2 additions & 1 deletion assistant_dists/dream_russian/dev.yml
@@ -78,7 +78,8 @@ services:
      # - ./venv/data/db_data:/root/data/db
  spelling-preprocessing:
    volumes:
      - "./annotators/spelling_preprocessing:/src"
      - "./annotators/spelling_preprocessing_ru:/src"
      - "~/.deeppavlov:/root/.deeppavlov"
    ports:
      - 8074:8074
  dff-friendship-skill:
12 changes: 9 additions & 3 deletions assistant_dists/dream_russian/docker-compose.override.yml
@@ -189,16 +189,22 @@ services:
  spelling-preprocessing:
    env_file: [.env]
    build:
      context: ./annotators/spelling_preprocessing/
      args:
        CONFIG: levenshtein_corrector_ru.json
        PORT: 8074
        SRC_DIR: annotators/spelling_preprocessing_ru
        COMMIT: f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144
      context: ./
      dockerfile: annotators/spelling_preprocessing_ru/Dockerfile
    command: flask run -h 0.0.0.0 -p 8074
    environment:
      - FLASK_APP=server
    deploy:
      resources:
        limits:
          memory: 50M
          memory: 256M
        reservations:
          memory: 50M
          memory: 256M

  dff-friendship-skill:
    env_file: [.env]
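With the build context moved to the repository root (so the Dockerfile can copy SRC_DIR) and the memory limit raised to 256M, the service can be built and started on its own. The command below is a sketch: it assumes a top-level docker-compose.yml and the multi-file compose invocation commonly used for this repository's distributions; the override and dev file paths come from this PR.

# Sketch: build and start only the spelling-preprocessing service for dream_russian.
docker-compose \
    -f docker-compose.yml \
    -f assistant_dists/dream_russian/docker-compose.override.yml \
    -f assistant_dists/dream_russian/dev.yml \
    up --build spelling-preprocessing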
66 changes: 1 addition & 65 deletions assistant_dists/dream_russian/pipeline_conf.json
@@ -177,8 +177,7 @@
        "annotators.ner"
      ],
      "previous_services": [
        "annotators.entity_detection",
        "annotators.spacy_nounphrases"
        "annotators.entity_detection"
      ]
    },
    "wiki_parser": {
@@ -333,30 +332,6 @@
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "convers_evaluator_annotator": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://convers-evaluator-annotator:8004/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:convers_evaluator_annotator_formatter",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": ["skills"],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "spacy_nounphrases": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://spacy-nounphrases:8006/respond_batch"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypotheses_list",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "entity_detection": {
      "connector": {
        "protocol": "http",
@@ -369,45 +344,6 @@
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "midas_classification": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://midas-classification:8090/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypotheses_list_last_uttr",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "combined_classification": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://combined-classification:8087/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypothesis_histories_list",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    },
    "hypothesis_scorer": {
      "connector": {
        "protocol": "http",
        "timeout": 1,
        "url": "http://hypothesis-scorer:8110/batch_model"
      },
      "dialog_formatter": "state_formatters.dp_formatters:hypothesis_scorer_formatter",
      "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
      "previous_services": [
        "skills"
      ],
      "state_manager_method": "add_hypothesis_annotation_batch"
    }
  },
  "response_selectors": {