Skip to content

Commit

Permalink
refactor: ODQA models improvement (#1635)
Browse files: browse the repository at this point in the history
Co-authored-by: Fedor Ignatov <ignatov.fedor@gmail.com>
  • Loading branch information
dmitrijeuseew and IgnatovFedor committed Jun 16, 2023
1 parent 852f5be commit 75e0473
Show file tree
Hide file tree
Showing 24 changed files with 438 additions and 365 deletions.
Expand Up @@ -8,56 +8,39 @@
"dataset_iterator": {
"class_name": "sqlite_iterator",
"shuffle": false,
"load_path": "{DOWNLOADS_PATH}/odqa/enwiki.db"
"load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
},
"chainer": {
"in": [
"docs"
],
"in_y": [
"doc_ids",
"doc_nums"
],
"out": [
"pop_doc_ids"
],
"in": ["docs"],
"in_y": ["doc_ids", "doc_nums"],
"out": ["pop_doc_ids"],
"pipe": [
{
"class_name": "hashing_tfidf_vectorizer",
"id": "vectorizer",
"fit_on": [
"docs",
"doc_ids",
"doc_nums"
],
"save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix.npz",
"load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix.npz",
"fit_on": ["docs", "doc_ids", "doc_nums"],
"save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
"load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
"tokenizer": {
"class_name": "stream_spacy_tokenizer",
"lemmas": true,
"ngram_range": [
1,
2
]
"lowercase": true,
"filter_stopwords": true,
"ngram_range": [1, 3]
}
},
{
"class_name": "tfidf_ranker",
"top_n": 20,
"in": [
"docs"
],
"out": [
"tfidf_doc_ids",
"tfidf_doc_scores"
],
"top_n": 100,
"in": ["docs"],
"out": ["tfidf_doc_ids", "tfidf_doc_scores"],
"vectorizer": "#vectorizer"
},
{
"class_name": "pop_ranker",
"pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki20180211_popularities.json",
"pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki_popularities.json",
"load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib",
"top_n": 10,
"top_n": 100,
"in": ["tfidf_doc_ids", "tfidf_doc_scores"],
"out": ["pop_doc_ids", "pop_doc_scores"]
}
Expand All @@ -76,21 +59,21 @@
},
"download": [
{
"url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki.tar.gz",
"subdir": "{DOWNLOADS_PATH}"
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz",
"subdir": "{DOWNLOADS_PATH}/odqa"
},
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/en_odqa.tar.gz",
"subdir": "{MODELS_PATH}"
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz",
"subdir": "{MODELS_PATH}/odqa"
},
{
"url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki20180211_popularities.tar.gz",
"subdir": "{DOWNLOADS_PATH}"
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz",
"subdir": "{DOWNLOADS_PATH}/odqa"
},
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib",
"subdir": "{MODELS_PATH}/odqa"
}
]
}
}
}
49 changes: 15 additions & 34 deletions deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json
Expand Up @@ -8,51 +8,32 @@
"dataset_iterator": {
"class_name": "sqlite_iterator",
"shuffle": false,
"load_path": "{DOWNLOADS_PATH}/odqa/enwiki.db"
"load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
},
"chainer": {
"in": [
"docs"
],
"in_y": [
"doc_ids",
"doc_nums"
],
"out": [
"tfidf_doc_ids"
],
"in": ["docs"],
"in_y": ["doc_ids", "doc_nums"],
"out": ["tfidf_doc_ids"],
"pipe": [
{
"class_name": "hashing_tfidf_vectorizer",
"id": "vectorizer",
"fit_on": [
"docs",
"doc_ids",
"doc_nums"
],
"save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix.npz",
"load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix.npz",
"fit_on": ["docs", "doc_ids", "doc_nums"],
"save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
"load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
"tokenizer": {
"class_name": "stream_spacy_tokenizer",
"lemmas": true,
"lowercase": true,
"filter_stopwords": true,
"ngram_range": [
1,
2
]
"ngram_range": [1, 3]
}
},
{
"class_name": "tfidf_ranker",
"top_n": 12,
"in": [
"docs"
],
"out": [
"tfidf_doc_ids",
"tfidf_doc_scores"
],
"top_n": 100,
"in": ["docs"],
"out": ["tfidf_doc_ids", "tfidf_doc_scores"],
"vectorizer": "#vectorizer"
}
]
Expand All @@ -70,12 +51,12 @@
},
"download": [
{
"url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki.tar.gz",
"subdir": "{DOWNLOADS_PATH}"
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz",
"subdir": "{DOWNLOADS_PATH}/odqa"
},
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/en_odqa.tar.gz",
"subdir": "{MODELS_PATH}"
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz",
"subdir": "{MODELS_PATH}/odqa"
}
]
}
Expand Down
51 changes: 16 additions & 35 deletions deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json
Expand Up @@ -2,58 +2,39 @@
"dataset_reader": {
"class_name": "odqa_reader",
"data_path": "{DOWNLOADS_PATH}/odqa/ruwiki",
"save_path": "{DOWNLOADS_PATH}/odqa/ruwiki.db",
"save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db",
"dataset_format": "wiki"
},
"dataset_iterator": {
"class_name": "sqlite_iterator",
"shuffle": false,
"load_path": "{DOWNLOADS_PATH}/odqa/ruwiki.db"
"load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db"
},
"chainer": {
"in": [
"docs"
],
"in_y": [
"doc_ids",
"doc_nums"
],
"out": [
"tfidf_doc_ids"
],
"in": ["docs"],
"in_y": ["doc_ids", "doc_nums"],
"out": ["tfidf_doc_ids"],
"pipe": [
{
"class_name": "hashing_tfidf_vectorizer",
"id": "vectorizer",
"fit_on": [
"docs",
"doc_ids",
"doc_nums"
],
"save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix.npz",
"load_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix.npz",
"fit_on": ["docs", "doc_ids", "doc_nums"],
"save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz",
"load_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz",
"tokenizer": {
"class_name": "stream_spacy_tokenizer",
"spacy_model": "ru_core_news_sm",
"lemmas": true,
"lowercase": true,
"filter_stopwords": true,
"ngram_range": [
1,
2
]
"ngram_range": [1, 3]
}
},
{
"class_name": "tfidf_ranker",
"top_n": 1,
"in": [
"docs"
],
"out": [
"tfidf_doc_ids",
"tfidf_doc_scores"
],
"top_n": 100,
"in": ["docs"],
"out": ["tfidf_doc_ids", "tfidf_doc_scores"],
"vectorizer": "#vectorizer"
}
]
Expand All @@ -71,12 +52,12 @@
},
"download": [
{
"url": "http://files.deeppavlov.ai/datasets/wikipedia/ruwiki.tar.gz",
"subdir": "{DOWNLOADS_PATH}"
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_par_page_compr.tar.gz",
"subdir": "{DOWNLOADS_PATH}/odqa"
},
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/ru_odqa.tar.gz",
"subdir": "{MODELS_PATH}"
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_tfidf_matrix_compr.tar.gz",
"subdir": "{MODELS_PATH}/odqa"
}
]
}
Expand Down
38 changes: 25 additions & 13 deletions deeppavlov/configs/odqa/en_odqa_infer_wiki.json
Expand Up @@ -8,32 +8,40 @@
"in": ["question_raw"],
"out": ["tfidf_doc_ids"]
},
{
"class_name": "bpr",
"load_path": "{MODELS_PATH}/bpr/eng",
"query_encoder_file": "query_encoder_en.pth.tar",
"bpr_index": "bpr_finetuned_nq_adv.idx",
"pretrained_model": "bert-base-uncased",
"top_n": 100,
"in": ["question_raw"],
"out": ["bpr_doc_ids"]
},
{
"class_name": "concat_lists",
"in": ["tfidf_doc_ids", "bpr_doc_ids"],
"out": ["doc_ids"]
},
{
"class_name": "wiki_sqlite_vocab",
"in": ["tfidf_doc_ids"],
"out": ["tfidf_doc_text"],
"in": ["doc_ids"],
"out": ["doc_text"],
"join_docs": false,
"shuffle": false,
"load_path": "{DOWNLOADS_PATH}/odqa/enwiki.db"
},
{
"class_name": "document_chunker",
"in": ["tfidf_doc_text"],
"out": ["chunks"],
"flatten_result": true,
"paragraphs": true
"load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
},
{
"class_name": "string_multiplier",
"in": ["question_raw", "chunks"],
"in": ["question_raw", "doc_text"],
"out":["questions"]
},
{
"class_name": "logit_ranker",
"batch_size": 64,
"squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_squad2_bert.json"},
"squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"},
"sort_noans": true,
"in": ["chunks", "questions"],
"in": ["doc_text", "questions"],
"out": ["answer", "answer_score", "answer_place"]
}
]
Expand All @@ -46,6 +54,10 @@
"CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
},
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz",
"subdir": "{MODELS_PATH}/bpr/eng"
}
]
}
}

0 comments on commit 75e0473

Please sign in to comment.