Skip to content

Commit

Permalink
fix: many minor fixes
Browse files Browse the repository at this point in the history
* fix: fix mark_done data_path

* refactor: rename ranking_dataset to ranking_iterator.py and move it to the dataset_iterators folder

* fix: fix embedding matrix construction, change epochs num
default parameter value

* refactor: rename registered name and name of the class

* refactor: rename files and classes

* refactor: change dataset download

* feat: add insurance embeddings and datasets in urls.py
  • Loading branch information
puleon authored and seliverstov committed Mar 20, 2018
1 parent 4d09dae commit 32c9f36
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 21 deletions.
4 changes: 2 additions & 2 deletions deeppavlov/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
import deeppavlov.skills.go_bot.network
import deeppavlov.skills.go_bot.tracker
import deeppavlov.vocabs.typos
import deeppavlov.dataset_readers.ranking_dataset_reader
import deeppavlov.datasets.ranking_dataset
import deeppavlov.dataset_readers.insurance_reader
import deeppavlov.dataset_iterators.ranking_iterator
import deeppavlov.models.ranking.ranking_model
import deeppavlov.models.ranking.metrics

Expand Down
8 changes: 4 additions & 4 deletions deeppavlov/configs/ranking/insurance_config.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "insurance_dataset_reader",
"data_path": "insuranceQA-master/V1"
"name": "insurance_reader",
"data_path": "./"
},
"dataset_iterator": {
"name": "insurance_dataset",
"name": "ranking_iterator",
"seed": 243,
"sample_candidates": "global",
"sample_candidates_valid": "pool",
Expand Down Expand Up @@ -44,7 +44,7 @@
"out": ["y_predicted"]
},
"train": {
"epochs": 100,
"epochs": 150,
"batch_size": 256,
"metrics": ["r@1", "rank_response"],
"validation_patience": 5,
Expand Down
7 changes: 5 additions & 2 deletions deeppavlov/core/data/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
'http://lnsigo.mipt.ru/export/deeppavlov_data/ner_conll2003.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/error_model.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/ranking.tar.gz',
'http://lnsigo.mipt.ru/export/embeddings/insurance_v1_word2vec',
'http://lnsigo.mipt.ru/export/deeppavlov_data/vocabs.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/slots.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/embeddings/dstc2_fastText_model.bin',
Expand All @@ -35,9 +36,11 @@

EMBEDDING_URLS = {
'http://lnsigo.mipt.ru/export/deeppavlov_data/embeddings/wiki.en.bin',
'http://lnsigo.mipt.ru/export/deeppavlov_data/embeddings/dstc2_fastText_model.bin'
'http://lnsigo.mipt.ru/export/deeppavlov_data/embeddings/dstc2_fastText_model.bin',
'http://lnsigo.mipt.ru/export/embeddings/insurance_v1_word2vec'
}

DATA_URLS = {
'http://lnsigo.mipt.ru/export/datasets/dstc2.tar.gz'
'http://lnsigo.mipt.ru/export/datasets/dstc2.tar.gz',
'http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip'
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import numpy as np


@register('insurance_dataset')
class InsuranceDataset:
@register('ranking_iterator')
class RankingIterator:

def __init__(self, data,
sample_candidates, sample_candidates_valid, sample_candidates_test,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,28 @@
from pathlib import Path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import download_decompress, mark_done, is_done
from deeppavlov.core.commands.utils import get_deeppavlov_root
from deeppavlov.core.commands.utils import get_deeppavlov_root, expand_path

@register('insurance_dataset_reader')
class InsuranceDatasetReader(DatasetReader):

@register('insurance_reader')
class InsuranceReader(DatasetReader):

def read(self, data_path):
data_path = expand_path(data_path)
self.download_data(data_path)
dataset = {'train': None, 'valid': None, 'test': None}
train_fname = Path(data_path) / 'question.train.token_idx.label'
train_fname = Path(data_path) / 'insuranceQA-master/V1/question.train.token_idx.label'
dataset["train"] = self.preprocess_data_train(train_fname)
valid_fname = Path(data_path) / 'question.dev.label.token_idx.pool'
valid_fname = Path(data_path) / 'insuranceQA-master/V1/question.dev.label.token_idx.pool'
dataset["valid"] = self.preprocess_data_valid_test(valid_fname)
test_fname = Path(data_path) / 'question.test1.label.token_idx.pool'
test_fname = Path(data_path) / 'insuranceQA-master/V1/question.test1.label.token_idx.pool'
dataset["test"] = self.preprocess_data_valid_test(test_fname)
return dataset

def download_data(self, data_path):
if not is_done(Path(data_path)):
download_decompress(url="https://github.com/shuzi/insuranceQA/archive/master.zip",
download_path=get_deeppavlov_root())
download_decompress(url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
download_path=data_path)
mark_done(data_path)

def preprocess_data_train(self, fname):
Expand Down
9 changes: 6 additions & 3 deletions deeppavlov/models/ranking/emb_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ def create_emb_matrix(self, tok2int_vocab):
dummy_emb = list(np.zeros(self.embedding_dim))
self.emb_matrix = np.zeros((len(tok2int_vocab), self.embedding_dim))
for tok, i in tok2int_vocab.items():
try:
self.emb_matrix[i] = self.embeddings_model[tok]
except:
if tok == '<UNK>':
self.emb_matrix[i] = dummy_emb
else:
try:
self.emb_matrix[i] = self.embeddings_model[tok]
except:
self.emb_matrix[i] = dummy_emb

0 comments on commit 32c9f36

Please sign in to comment.