Skip to content

Commit

Permalink
Merge branch 'docs/add-sphinx-documentation' into docs/refactor-ner-g…
Browse files Browse the repository at this point in the history
…obot
  • Loading branch information
nikolay-bushkov committed Jun 6, 2018
2 parents dea89bf + 65ca607 commit 9d495b7
Show file tree
Hide file tree
Showing 28 changed files with 2,129 additions and 1,092 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,13 @@ Available model configs are:

| Component | Description |
| --------- | ----------- |
| [Slot filling and NER components](deeppavlov/models/ner/README.md) | Based on neural Named Entity Recognition network and fuzzy Levenshtein search to extract normalized slot values from text. The NER component reproduces architecture from the paper [Application of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition](https://arxiv.org/pdf/1709.09686.pdf) which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf. |
| [NER component](deeppavlov/models/ner/README.md) | Based on a neural Named Entity Recognition network. The NER component reproduces the architecture from the paper [Application of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition](https://arxiv.org/pdf/1709.09686.pdf), which is inspired by the Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf. |
| [Slot filling components](deeppavlov/models/slotfill/README.md) | Based on fuzzy Levenshtein search to extract normalized slot values from text. The components either rely on NER results or perform a needle-in-a-haystack search. |
| [Intent classification component](deeppavlov/models/classifiers/intents/README.md) | Based on shallow-and-wide Convolutional Neural Network architecture from [Kim Y. Convolutional neural networks for sentence classification – 2014](https://arxiv.org/pdf/1408.5882). The model allows multilabel classification of sentences. |
| [Automatic spelling correction component](deeppavlov/models/spelling_correction/README.md) | Pipelines that use candidates search in a static dictionary and an ARPA language model to correct spelling errors. |
| [Ranking component](deeppavlov/models/ranking/README.md) | Based on [LSTM-based deep learning models for non-factoid answer selection](https://arxiv.org/abs/1511.04108). The model performs ranking of responses or contexts from some database by their relevance for the given context. |
| [Question Answering component](deeppavlov/models/squad/README.md) | Based on [R-NET: Machine Reading Comprehension with Self-matching Networks](https://www.microsoft.com/en-us/research/publication/mrc/). The model solves the task of finding an answer to a question in a given context ([SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) task format). |
| [Morphological tagging component](deeppavlov/models/morpho_tagger/README.md) | Based on a character-based approach to morphological tagging [Heigold et al., 2017. An extensive empirical evaluation of character-based morphological tagging for 14 languages](http://www.aclweb.org/anthology/E17-1048). A state-of-the-art model for Russian and several other languages. The model assigns morphological tags in UD format to sequences of words. |
| **Skills** | |
| [Goal-oriented bot](deeppavlov/skills/go_bot/README.md) | Based on Hybrid Code Networks (HCNs) architecture from [Jason D. Williams, Kavosh Asadi, Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control with supervised and reinforcement learning – 2017](https://arxiv.org/abs/1702.03274). It predicts responses in goal-oriented dialog. The model is customizable: embeddings, slot filler and intent classifier can be switched on and off on demand. |
| [Seq2seq goal-oriented bot](deeppavlov/skills/seq2seq_go_bot/README.md) | Dialogue agent predicts responses in a goal-oriented dialog and is able to handle multiple domains (pretrained bot allows calendar scheduling, weather information retrieval, and point-of-interest navigation). The model is end-to-end differentiable and does not need to explicitly model dialogue state or belief trackers. |
Expand Down
8 changes: 8 additions & 0 deletions deeppavlov/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import deeppavlov.dataset_readers.typos_reader
import deeppavlov.dataset_readers.basic_classification_reader
import deeppavlov.dataset_readers.squad_dataset_reader
import deeppavlov.dataset_readers.morphotagging_dataset_reader

import deeppavlov.dataset_iterators.dialog_iterator
import deeppavlov.dataset_iterators.kvret_dialog_iterator
import deeppavlov.dataset_iterators.dstc2_ner_iterator
Expand All @@ -49,6 +51,8 @@
import deeppavlov.dataset_iterators.basic_classification_iterator
import deeppavlov.dataset_iterators.squad_iterator
import deeppavlov.dataset_iterators.sqlite_iterator
import deeppavlov.dataset_iterators.morphotagger_iterator

import deeppavlov.models.classifiers.intents.intent_model
import deeppavlov.models.commutators.random_commutator
import deeppavlov.models.embedders.fasttext_embedder
Expand All @@ -63,13 +67,17 @@
import deeppavlov.models.trackers.hcn_et
import deeppavlov.models.preprocessors.str_lower
import deeppavlov.models.preprocessors.squad_preprocessor
import deeppavlov.models.preprocessors.capitalization
import deeppavlov.models.preprocessors.dirty_comments_preprocessor
import deeppavlov.models.tokenizers.nltk_tokenizer
import deeppavlov.models.tokenizers.nltk_moses_tokenizer
import deeppavlov.models.tokenizers.spacy_tokenizer
import deeppavlov.models.tokenizers.split_tokenizer
import deeppavlov.models.tokenizers.ru_tokenizer
import deeppavlov.models.squad.squad
import deeppavlov.models.morpho_tagger.tagger
import deeppavlov.models.morpho_tagger.common

import deeppavlov.skills.go_bot.bot
import deeppavlov.skills.go_bot.network
import deeppavlov.skills.go_bot.tracker
Expand Down
76 changes: 76 additions & 0 deletions deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_predict.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"dataset_reader": {
"name": "morphotagger_dataset_reader",
"data_path": "UD2.0_source",
"language": "hu", "data_types": ["test"]
},
"dataset_iterator": {
"name": "morphotagger_dataset"
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"id": "lowercase_preprocessor",
"name": "lowercase_preprocessor",
"in": ["x"],
"out": ["x_processed"]
},
{
"id": "tag_vocab",
"name": "default_vocab",
"level": "token",
"special_tokens": ["PAD", "BEGIN", "END"],
"load_path": "morpho_tagger/UD2.0/hu/tag.dict",
"save_path": "morpho_tagger/UD2.0/hu/tag.dict"
},
{
"id": "char_vocab",
"name": "default_vocab",
"min_freq": 3,
"special_tokens": ["PAD", "BEGIN", "END"],
"level": "char",
"load_path": "morpho_tagger/UD2.0/hu/char.dict",
"save_path": "morpho_tagger/UD2.0/hu/char.dict"
},
{
"in": ["x_processed"],
"in_y": ["y"],
"out": ["y_predicted"],
"name": "morpho_tagger",
"main": true,
"save_path": "morpho_tagger/UD2.0/hu/model.hdf5",
"load_path": "morpho_tagger/UD2.0/hu/model.hdf5",
"tags": "#tag_vocab",
"symbols": "#char_vocab",
"verbose": 1,
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
"char_filter_multiple": 50, "intermediate_dropout": 0.0,
"word_dropout": 0.2, "lstm_dropout": 0.2, "regularizer": 0.01
},
{
"in": ["x", "y_predicted"],
"out": ["y_prettified"],
"name": "tag_output_prettifier",
"end": "\n"
}
],
"out": ["y_prettified"]
},
"predict": {
"batch_size": 32,
"outfile": "results/ud_hu_test.res"
},
"metadata": {
"download": [
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz",
{
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/hu.tar.gz",
"subdir": "UD2.0_source/hu"
}
],
"telegram_utils": "MorphoTaggerModel"
}
}
78 changes: 78 additions & 0 deletions deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_train.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"dataset_reader": {
"name": "morphotagger_dataset_reader",
"data_path": "UD2.0_source",
"language": "hu", "data_types": ["train", "dev", "test"]
},
"dataset_iterator": {
"name": "morphotagger_dataset"
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"id": "lowercase_preprocessor",
"name": "lowercase_preprocessor",
"in": ["x"],
"out": ["x_processed"]
},
{
"id": "tag_vocab",
"name": "default_vocab",
"fit_on": ["y"],
"level": "token",
"special_tokens": ["PAD", "BEGIN", "END"],
"save_path": "morpho_tagger/UD2.0/hu/tag.dict",
"load_path": "morpho_tagger/UD2.0/hu/tag.dict"
},
{
"id": "char_vocab",
"name": "default_vocab",
"min_freq": 3,
"fit_on": ["x_processed"],
"special_tokens": ["PAD", "BEGIN", "END"],
"level": "char",
"save_path": "morpho_tagger/UD2.0/hu/char.dict",
"load_path": "morpho_tagger/UD2.0/hu/char.dict"
},
{
"in": ["x_processed"],
"in_y": ["y"],
"out": ["y_predicted"],
"name": "morpho_tagger",
"main": true,
"save_path": "morpho_tagger/UD2.0/hu/model.hdf5",
"load_path": "morpho_tagger/UD2.0/hu/model.hdf5",
"tags": "#tag_vocab",
"symbols": "#char_vocab",
"verbose": 1,
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
"char_filter_multiple": 50, "intermediate_dropout": 0.0, "word_dropout": 0.2,
"lstm_dropout": 0.3, "regularizer": 0.01, "lm_dropout": 0.3
}
],
"out": ["y_predicted"]
},
"train": {
"epochs": 50,
"batch_size": 32,

"metrics": ["per_token_accuracy"],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1
},
"metadata": {
"download": [
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz",
{
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/hu.tar.gz",
"subdir": "UD2.0_source/hu"
}
],
"telegram_utils": "MorphoTaggerModel"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"dataset_reader": {
"name": "morphotagger_dataset_reader",
"data_path": "UD2.0_source",
"language": "ru_syntagrus", "data_types": ["test"]
},
"dataset_iterator": {
"name": "morphotagger_dataset"
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"id": "lowercase_preprocessor",
"name": "lowercase_preprocessor",
"in": ["x"],
"out": ["x_processed"]
},
{
"id": "tag_vocab",
"name": "default_vocab",
"level": "token",
"special_tokens": ["PAD", "BEGIN", "END"],
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict"
},
{
"id": "char_vocab",
"name": "default_vocab",
"min_freq": 3,
"special_tokens": ["PAD", "BEGIN", "END"],
"level": "char",
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict"
},
{
"in": ["x_processed"],
"in_y": ["y"],
"out": ["y_predicted"],
"name": "morpho_tagger",
"main": true,
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/model.hdf5",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/model.hdf5",
"tags": "#tag_vocab",
"symbols": "#char_vocab",
"verbose": 1,
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
"char_filter_multiple": 50, "intermediate_dropout": 0.0,
"word_dropout": 0.2, "lstm_dropout": 0.2, "regularizer": 0.01
},
{
"in": ["x", "y_predicted"],
"out": ["y_prettified"],
"name": "tag_output_prettifier",
"end": "\n"
}
],
"out": ["y_prettified"]
},
"predict": {
"batch_size": 32,
"outfile": "results/ud_ru_syntagrus_test.res"
},
"metadata": {
"download": [
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz",
{
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/ru_syntagrus.tar.gz",
"subdir": "UD2.0_source/ru_syntagrus"
}
],
"telegram_utils": "MorphoTaggerModel"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"dataset_reader": {
"name": "morphotagger_dataset_reader",
"data_path": "UD2.0_source",
"language": "ru_syntagrus", "data_types": ["train", "dev", "test"]
},
"dataset_iterator": {
"name": "morphotagger_dataset"
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"id": "lowercase_preprocessor",
"name": "lowercase_preprocessor",
"in": ["x"],
"out": ["x_processed"]
},
{
"id": "tag_vocab",
"name": "default_vocab",
"fit_on": ["y"],
"level": "token",
"special_tokens": ["PAD", "BEGIN", "END"],
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict"
},
{
"id": "char_vocab",
"name": "default_vocab",
"min_freq": 3,
"fit_on": ["x_processed"],
"special_tokens": ["PAD", "BEGIN", "END"],
"level": "char",
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict"
},
{
"in": ["x_processed"],
"in_y": ["y"],
"out": ["y_predicted"],
"name": "morpho_tagger",
"main": true,
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/model.hdf5",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/model.hdf5",
"tags": "#tag_vocab",
"symbols": "#char_vocab",
"verbose": 1,
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
"char_filter_multiple": 50, "intermediate_dropout": 0.0, "word_dropout": 0.2,
"lstm_dropout": 0.2, "regularizer": 0.01
}
],
"out": ["y_predicted"]
},
"train": {
"epochs": 50,
"batch_size": 32,

"metrics": ["per_token_accuracy"],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1
},
"metadata": {
"download": [
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz",
{
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/ru_syntagrus.tar.gz",
"subdir": "UD2.0_source/ru_syntagrus"
}
],
"telegram_utils": "MorphoTaggerModel"
}
}

0 comments on commit 9d495b7

Please sign in to comment.