-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: morphological tagging model which uses Pymorphy as an additional input
* feature: dictionary-based vectorizer * feature: dictionary-based vectorizer * feature: dictionary-based vectorizer * feature: Pymorphy-based vectorizer for Russian * feature: Pymorphy-based vectorizer for Russian * test: update tests and requirements for morpho_tagger * fix: fix pymorphy_vectorizer bug in interaction * test: uncomment tests * test: uncomment tests * update: update registry * docs: update morpho_tagger readme, configs and requirements * refactor: move requirements to dp_requirements * fix: change requirements to dp_requirements in morpho_tagger config * refactor: introduce basic class WordIndexVectorizer for word-level vectorizers * doc: update requirements for morpho_tagger
- Loading branch information
1 parent
3aa7ca0
commit 64252de
Showing
13 changed files
with
582 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
87 changes: 87 additions & 0 deletions
87
deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_train_dict.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
{ | ||
"dataset_reader": { | ||
"name": "morphotagger_dataset_reader", | ||
"data_path": "UD2.0_source", | ||
"language": "hu", "data_types": ["train", "dev", "test"] | ||
}, | ||
"dataset_iterator": { | ||
"name": "morphotagger_dataset" | ||
}, | ||
"chainer": { | ||
"in": ["x"], | ||
"in_y": ["y"], | ||
"pipe": [ | ||
{ | ||
"id": "lowercase_preprocessor", | ||
"name": "lowercase_preprocessor", | ||
"in": ["x"], | ||
"out": ["x_processed"] | ||
}, | ||
{ | ||
"id": "tag_vocab", | ||
"name": "default_vocab", | ||
"fit_on": ["y"], | ||
"level": "token", | ||
"special_tokens": ["PAD", "BEGIN", "END"], | ||
"save_path": "morpho_tagger/UD2.0/hu/tag.dict", | ||
"load_path": "morpho_tagger/UD2.0/hu/tag.dict" | ||
}, | ||
{ | ||
"id": "char_vocab", | ||
"name": "default_vocab", | ||
"min_freq": 3, | ||
"fit_on": ["x_processed"], | ||
"special_tokens": ["PAD", "BEGIN", "END"], | ||
"level": "char", | ||
"save_path": "morpho_tagger/UD2.0/hu/char.dict", | ||
"load_path": "morpho_tagger/UD2.0/hu/char.dict" | ||
}, | ||
{ | ||
"id": "dictionary_vectorizer", | ||
"name": "dictionary_vectorizer", | ||
"save_path": "/home/alexeysorokin/data/DeepPavlov/download/UD2.0_dict", | ||
"load_path": "/home/alexeysorokin/data/DeepPavlov/download/UD2.0_dict", | ||
"in": ["x"], | ||
"out": ["x_possible_tags"] | ||
}, | ||
{ | ||
"in": ["x_processed", "x_possible_tags"], | ||
"in_y": ["y"], | ||
"out": ["y_predicted"], | ||
"name": "morpho_tagger", | ||
"main": true, | ||
"save_path": "morpho_tagger/UD2.0/hu/model_dict.hdf5", | ||
"load_path": "morpho_tagger/UD2.0/hu/model_dict.hdf5", | ||
"tags": "#tag_vocab", | ||
"symbols": "#char_vocab", | ||
"verbose": 1, | ||
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7], | ||
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1, | ||
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1, | ||
"char_filter_multiple": 50, "intermediate_dropout": 0.0, "word_dropout": 0.2, | ||
"lstm_dropout": 0.3, "regularizer": 0.01, "lm_dropout": 0.3, | ||
"word_vectorizers": [["#dictionary_vectorizer.dim", 128]] | ||
} | ||
], | ||
"out": ["y_predicted"] | ||
}, | ||
"train": { | ||
"epochs": 2, | ||
"batch_size": 32, | ||
| ||
"metrics": ["per_token_accuracy"], | ||
"validation_patience": 10, | ||
"val_every_n_epochs": 1, | ||
"log_every_n_epochs": 1 | ||
}, | ||
"metadata": { | ||
"download": [ | ||
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz", | ||
{ | ||
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/hu.tar.gz", | ||
"subdir": "UD2.0_source/hu" | ||
} | ||
], | ||
"telegram_utils": "MorphoTaggerModel" | ||
} | ||
} |
90 changes: 90 additions & 0 deletions
90
deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_predict_pymorphy.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
{ | ||
"dataset_reader": { | ||
"name": "morphotagger_dataset_reader", | ||
"data_path": "UD2.0_source", | ||
"language": "ru_syntagrus", "data_types": ["test"] | ||
}, | ||
"dataset_iterator": { | ||
"name": "morphotagger_dataset" | ||
}, | ||
"chainer": { | ||
"in": ["x"], | ||
"in_y": ["y"], | ||
"pipe": [ | ||
{ | ||
"id": "lowercase_preprocessor", | ||
"name": "lowercase_preprocessor", | ||
"in": ["x"], | ||
"out": ["x_processed"] | ||
}, | ||
{ | ||
"id": "tag_vocab", | ||
"name": "default_vocab", | ||
"level": "token", | ||
"special_tokens": ["PAD", "BEGIN", "END"], | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict" | ||
}, | ||
{ | ||
"id": "char_vocab", | ||
"name": "default_vocab", | ||
"min_freq": 3, | ||
"special_tokens": ["PAD", "BEGIN", "END"], | ||
"level": "char", | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict" | ||
}, | ||
{ | ||
"id": "pymorphy_vectorizer", | ||
"name": "pymorphy_vectorizer", | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt", | ||
"max_pymorphy_variants": 5, | ||
"in": ["x"], | ||
"out": ["x_possible_tags"] | ||
}, | ||
{ | ||
"in": ["x_processed", "x_possible_tags"], | ||
"in_y": ["y"], | ||
"out": ["y_predicted"], | ||
"name": "morpho_tagger", | ||
"main": true, | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5", | ||
"tags": "#tag_vocab", | ||
"symbols": "#char_vocab", | ||
"verbose": 1, | ||
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7], | ||
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1, | ||
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1, | ||
"char_filter_multiple": 50, "intermediate_dropout": 0.0, | ||
"word_dropout": 0.2, "lstm_dropout": 0.2, "regularizer": 0.01, | ||
"word_vectorizers": [["#pymorphy_vectorizer.dim", 128]] | ||
}, | ||
{ | ||
"in": ["x", "y_predicted"], | ||
"out": ["y_prettified"], | ||
"name": "tag_output_prettifier", | ||
"end": "\n" | ||
} | ||
], | ||
"out": ["y_prettified"] | ||
}, | ||
"predict": { | ||
"batch_size": 32, | ||
"outfile": "results/ud_ru_syntagrus_test.res" | ||
}, | ||
"metadata": { | ||
"requirements": [ | ||
"../dp_requirements/tf.txt", "../dp_requirements/morpho_tagger.txt" | ||
], | ||
"download": [ | ||
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz", | ||
{ | ||
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/ru_syntagrus.tar.gz", | ||
"subdir": "UD2.0_source/ru_syntagrus" | ||
} | ||
], | ||
"telegram_utils": "MorphoTaggerPymorphyModel" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
91 changes: 91 additions & 0 deletions
91
deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_train_pymorphy.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
{ | ||
"dataset_reader": { | ||
"name": "morphotagger_dataset_reader", | ||
"language": "ru_syntagrus", | ||
"data_types": ["train", "dev", "test"] | ||
}, | ||
"dataset_iterator": { | ||
"name": "morphotagger_dataset" | ||
}, | ||
"chainer": { | ||
"in": ["x"], | ||
"in_y": ["y"], | ||
"pipe": [ | ||
{ | ||
"id": "lowercase_preprocessor", | ||
"name": "lowercase_preprocessor", | ||
"in": ["x"], | ||
"out": ["x_processed"] | ||
}, | ||
{ | ||
"id": "tag_vocab", | ||
"name": "default_vocab", | ||
"fit_on": ["y"], | ||
"level": "token", | ||
"special_tokens": ["PAD", "BEGIN", "END"], | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict" | ||
}, | ||
{ | ||
"id": "char_vocab", | ||
"name": "default_vocab", | ||
"min_freq": 3, | ||
"fit_on": ["x_processed"], | ||
"special_tokens": ["PAD", "BEGIN", "END"], | ||
"level": "char", | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict" | ||
}, | ||
{ | ||
"id": "pymorphy_vectorizer", | ||
"name": "pymorphy_vectorizer", | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt", | ||
"max_pymorphy_variants": 5, | ||
"in": ["x"], | ||
"out": ["x_possible_tags"] | ||
}, | ||
{ | ||
"in": ["x_processed", "x_possible_tags"], | ||
"in_y": ["y"], | ||
"out": ["y_predicted"], | ||
"name": "morpho_tagger", | ||
"main": true, | ||
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5", | ||
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5", | ||
"tags": "#tag_vocab", | ||
"symbols": "#char_vocab", | ||
"verbose": 1, | ||
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7], | ||
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1, | ||
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1, | ||
"char_filter_multiple": 50, "intermediate_dropout": 0.0, "word_dropout": 0.2, | ||
"lstm_dropout": 0.2, "regularizer": 0.01, | ||
"word_vectorizers": [["#pymorphy_vectorizer.dim", 128]] | ||
} | ||
], | ||
"out": ["y_predicted"] | ||
}, | ||
"train": { | ||
"epochs": 75, | ||
"batch_size": 32, | ||
| ||
"metrics": ["per_token_accuracy", "accuracy"], | ||
"validation_patience": 10, | ||
"val_every_n_epochs": 1, | ||
"log_every_n_epochs": 1 | ||
}, | ||
"metadata": { | ||
"requirements": [ | ||
"../dp_requirements/tf.txt", "../dp_requirements/morpho_tagger.txt" | ||
], | ||
"download": [ | ||
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz", | ||
{ | ||
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/ru_syntagrus.tar.gz", | ||
"subdir": "UD2.0_source/ru_syntagrus" | ||
} | ||
], | ||
"telegram_utils": "MorphoTaggerPymorphyModel" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.