Rename dataset to dataset_iterator and other renames (#103)
* refactor: rename 'dataset' to 'dataset_iterator'

* refactor: rename dataset readers and iterators

* refactor: classification iterator and reader

* fix: dialog_iterator
seliverstov committed Mar 13, 2018
1 parent 887a09b commit 4f1a08a
Showing 27 changed files with 85 additions and 122 deletions.
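For reference, the top-level config layout changes as follows: the section formerly keyed "dataset" is now keyed "dataset_iterator", and the registered names lose their "dataset" suffix (e.g. "typos_dataset" becomes "typos_iterator"). A minimal sketch of the new layout, written as an annotated Python dict; the "typos_iterator" values are taken from the error_model configs below, everything else is illustrative:

config = {
    "dataset_reader": {
        "name": "typos_kartaslov_reader"       # reads the raw data
    },
    "dataset_iterator": {                      # was "dataset" before this commit
        "name": "typos_iterator",              # was "typos_dataset"
        "test_ratio": 0.02
    },
    # "chainer": {...}                         # model pipeline, untouched by the rename
}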
23 changes: 11 additions & 12 deletions deeppavlov/__init__.py
@@ -4,19 +4,18 @@


import deeppavlov.core.models.keras_model
import deeppavlov.core.data.dataset
import deeppavlov.core.data.dataset_iterator
import deeppavlov.core.data.vocab
import deeppavlov.dataset_readers.babi_dataset_reader
import deeppavlov.dataset_readers.dstc2_dataset_reader
import deeppavlov.dataset_readers.basic_ner_dataset_reader
import deeppavlov.dataset_readers.typos
import deeppavlov.dataset_readers.classification_dataset_reader
import deeppavlov.datasets.dialog_dataset
import deeppavlov.datasets.dstc2_datasets
import deeppavlov.datasets.hcn_dataset
import deeppavlov.datasets.intent_dataset
import deeppavlov.datasets.typos_dataset
import deeppavlov.datasets.classification_dataset
import deeppavlov.dataset_readers.babi_reader
import deeppavlov.dataset_readers.dstc2_reader
import deeppavlov.dataset_readers.conll2003_reader
import deeppavlov.dataset_readers.typos_reader
import deeppavlov.dataset_readers.csv_classification_reader
import deeppavlov.dataset_iterators.dialog_iterator
import deeppavlov.dataset_iterators.dstc2_ner_iterator
import deeppavlov.dataset_iterators.dstc2_intents_iterator
import deeppavlov.dataset_iterators.typos_iterator
import deeppavlov.dataset_iterators.basic_classification_iterator
import deeppavlov.models.classifiers.intents.intent_model
import deeppavlov.models.commutators.random_commutator
import deeppavlov.models.embedders.fasttext_embedder
4 changes: 2 additions & 2 deletions deeppavlov/configs/error_model/brillmoore_kartaslov_ru.json
@@ -2,8 +2,8 @@
"dataset_reader": {
"name": "typos_kartaslov_reader"
},
"dataset": {
"name": "typos_dataset",
"dataset_iterator": {
"name": "typos_iterator",
"test_ratio": 0.02
},
"chainer":{
@@ -2,8 +2,8 @@
"dataset_reader": {
"name": "typos_kartaslov_reader"
},
"dataset": {
"name": "typos_dataset",
"dataset_iterator": {
"name": "typos_iterator",
"test_ratio": 0.02
},
"chainer":{
4 changes: 2 additions & 2 deletions deeppavlov/configs/error_model/brillmoore_wikitypos_en.json
@@ -2,8 +2,8 @@
"dataset_reader": {
"name": "typos_wikipedia_reader"
},
"dataset": {
"name": "typos_dataset",
"dataset_iterator": {
"name": "typos_iterator",
"test_ratio": 0.05
},
"chainer":{
6 changes: 3 additions & 3 deletions deeppavlov/configs/go_bot/gobot_dstc2.json
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "dstc2_datasetreader",
"name": "dstc2_reader",
"data_path": "dstc2"
},
"dataset": {
"name": "dialog_dataset"
"dataset_iterator": {
"name": "dialog_iterator"
},
"chainer": {
"in": ["x"],
6 changes: 3 additions & 3 deletions deeppavlov/configs/go_bot/gobot_dstc2_all.json
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "dstc2_datasetreader",
"name": "dstc2_reader",
"data_path": "dstc2"
},
"dataset": {
"name": "dialog_dataset"
"dataset_iterator": {
"name": "dialog_iterator"
},
"chainer": {
"in": ["x"],
6 changes: 3 additions & 3 deletions deeppavlov/configs/go_bot/gobot_dstc2_minimal.json
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "dstc2_datasetreader",
"name": "dstc2_reader",
"data_path": "dstc2"
},
"dataset": {
"name": "dialog_dataset"
"dataset_iterator": {
"name": "dialog_iterator"
},
"chainer": {
"in": ["x"],
6 changes: 3 additions & 3 deletions deeppavlov/configs/intents/intents_dstc2.json
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "dstc2_datasetreader",
"name": "dstc2_reader",
"data_path": "dstc2"
},
"dataset": {
"name": "intent_dataset",
"dataset_iterator": {
"name": "dstc2_intents_iterator",
"seed": 42,
"fields_to_merge": [
"train",
9 changes: 5 additions & 4 deletions deeppavlov/configs/intents/intents_snips.json
@@ -1,10 +1,11 @@
{
"dataset_reader": {
"name": "classification_datasetreader",
"data_path": "snips"
"name": "csv_classification_reader",
"data_path": "snips",
"url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.csv"
},
"dataset": {
"name": "classification_dataset",
"dataset_iterator": {
"name": "basic_classification_iterator",
"seed": 42,
"field_to_split": "train",
"split_fields": [
6 changes: 3 additions & 3 deletions deeppavlov/configs/ner/ner_conll2003.json
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "ner_dataset_reader",
"name": "conll2003_reader",
"data_path": "conll2003/"
},
"dataset": {
"name": "basic_dataset"
"dataset_iterator": {
"name": "basic_dataset_iterator"
},
"chainer": {
"in": ["x"],
6 changes: 3 additions & 3 deletions deeppavlov/configs/ner/ner_dstc2.json
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "dstc2_datasetreader",
"name": "dstc2_reader",
"data_path": "dstc2"
},
"dataset": {
"name": "dstc2_ner_dataset",
"dataset_iterator": {
"name": "dstc2_ner_iterator",
"dataset_path": "dstc2"
},
"chainer": {
6 changes: 3 additions & 3 deletions deeppavlov/configs/ner/slotfill_dstc2.json
@@ -1,10 +1,10 @@
{
"dataset_reader": {
"name": "dstc2_datasetreader",
"name": "dstc2_reader",
"data_path": "dstc2"
},
"dataset": {
"name": "dstc2_ner_dataset",
"dataset_iterator": {
"name": "dstc2_ner_iterator",
"dataset_path": "dstc2"
},
"chainer": {
19 changes: 11 additions & 8 deletions deeppavlov/core/commands/train.py
@@ -28,7 +28,7 @@
from deeppavlov.core.common.registry import model as get_model
from deeppavlov.core.common.metrics_registry import get_metrics_by_names
from deeppavlov.core.common.params import from_params
from deeppavlov.core.data.dataset import Dataset
from deeppavlov.core.data.dataset_iterator import BasicDatasetIterator
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.core.models.nn_model import NNModel
@@ -38,14 +38,14 @@
log = get_logger(__name__)


def _fit(model: Estimator, dataset: Dataset, train_config={}):
def _fit(model: Estimator, dataset: BasicDatasetIterator, train_config={}):
x, y = dataset.iter_all('train')
model.fit(x, y)
model.save()
return model


def fit_chainer(config: dict, dataset: Dataset):
def fit_chainer(config: dict, dataset: BasicDatasetIterator):

chainer_config: dict = config['chainer']
chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
@@ -76,10 +76,13 @@ def train_model_from_config(config_path: str):
reader_config = config['dataset_reader']
reader = get_model(reader_config['name'])()
data_path = expand_path(reader_config.get('data_path', ''))
data = reader.read(data_path)
kwargs = reader_config.copy()
if "name" in kwargs: del kwargs["name"]
if "data_path" in kwargs: del kwargs["data_path"]
data = reader.read(data_path, **kwargs)

dataset_config = config['dataset']
dataset: Dataset = from_params(dataset_config, data=data)
dataset_config = config['dataset_iterator']
dataset: BasicDatasetIterator = from_params(dataset_config, data=data)

if 'chainer' in config:
model = fit_chainer(config, dataset)
@@ -137,7 +140,7 @@ def train_model_from_config(config_path: str):


def _test_model(model: Component, metrics_functions: List[Tuple[str, Callable]],
dataset: Dataset, batch_size=-1, data_type='valid', start_time=None):
dataset: BasicDatasetIterator, batch_size=-1, data_type='valid', start_time=None):
if start_time is None:
start_time = time.time()

@@ -158,7 +161,7 @@ def _test_model(model: Component, metrics_functions: List[Tuple[str, Callable]],
return report


def _train_batches(model: NNModel, dataset: Dataset, train_config: dict,
def _train_batches(model: NNModel, dataset: BasicDatasetIterator, train_config: dict,
metrics_functions: List[Tuple[str, Callable]]):

default_train_config = {
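The new kwargs handling above means that any extra keys in the "dataset_reader" config section (such as the "url" added to intents_snips.json) are now forwarded to the reader's read() call. A rough, self-contained sketch of that behaviour, using a hypothetical DemoReader that is not part of DeepPavlov:

# Hypothetical reader, for illustrating the forwarding only.
class DemoReader:
    def read(self, data_path, url=None, **kwargs):
        # 'url' arrives here because train_model_from_config strips 'name'
        # and 'data_path' from the reader config and passes the rest through.
        print(f"reading {data_path}, downloading from {url}")
        return []

reader_config = {
    "name": "csv_classification_reader",   # resolved via the registry in the real code
    "data_path": "snips",
    "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.csv",
}
kwargs = reader_config.copy()
del kwargs["name"], kwargs["data_path"]
data = DemoReader().read(reader_config["data_path"], **kwargs)   # url is forwarded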
@@ -20,8 +20,8 @@
from deeppavlov.core.common.registry import register


@register('basic_dataset')
class Dataset:
@register('basic_dataset_iterator')
class BasicDatasetIterator:
def split(self, *args, **kwargs):
pass

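The @register('basic_dataset_iterator') decorator is what ties config names to classes: a config's "name" value is looked up in the registry, so renaming the registered string is what actually changes the config contract. A simplified sketch of the pattern; the real implementation lives in deeppavlov.core.common.registry:

# Simplified registry sketch -- not the actual DeepPavlov implementation.
_REGISTRY = {}

def register(name):
    def decorator(cls):
        _REGISTRY[name] = cls
        return cls
    return decorator

@register('basic_dataset_iterator')        # the new name used in configs
class BasicDatasetIterator:
    def split(self, *args, **kwargs):
        pass

iterator_cls = _REGISTRY['basic_dataset_iterator']   # roughly what from_params does with the config's "name"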
2 changes: 1 addition & 1 deletion deeppavlov/core/data/dataset_reader.py
@@ -22,7 +22,7 @@ class DatasetReader:
A ``DatasetReader`` reads data from some location and constructs a dataset.
"""

def read(self, file_path: str) -> List:
def read(self, data_path: str, *args, **kwargs) -> List:
"""
Read a file from a path and returns data as list with training instances.
"""
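With the broadened read(self, data_path, *args, **kwargs) signature, custom readers can accept reader-specific options straight from their config section. A hypothetical subclass for illustration only (not shipped with DeepPavlov):

from typing import List

from deeppavlov.core.data.dataset_reader import DatasetReader


class TsvReader(DatasetReader):
    def read(self, data_path: str, *args, delimiter: str = "\t", **kwargs) -> List:
        # Any "delimiter" key in the "dataset_reader" config section would land here.
        with open(data_path) as f:
            return [line.rstrip("\n").split(delimiter) for line in f]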
File renamed without changes.
@@ -17,17 +17,17 @@
from sklearn.model_selection import train_test_split

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset import Dataset
from deeppavlov.core.data.dataset_iterator import BasicDatasetIterator
from deeppavlov.core.common.log import get_logger


log = get_logger(__name__)


@register('classification_dataset')
class ClassificationDataset(Dataset):
@register('basic_classification_iterator')
class BasicClassificationDatasetIterator(BasicDatasetIterator):
"""
Class gets data dictionary from ClassificationDatasetReader instance,
Class gets data dictionary from DatasetReader instance,
merge fields if necessary,
split a field if necessary
"""
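The merging and splitting mentioned in the docstring are driven by config keys such as "seed", "field_to_split" and "split_fields" (visible in intents_snips.json above). A hypothetical "dataset_iterator" section illustrating a split; the key names come from the configs, the split targets are made up:

dataset_iterator_config = {
    "name": "basic_classification_iterator",
    "seed": 42,
    "field_to_split": "train",            # take the 'train' field ...
    "split_fields": ["train", "valid"],   # ... and split it into (made-up) parts
}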
@@ -16,11 +16,11 @@
from overrides import overrides

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset import Dataset
from deeppavlov.core.data.dataset_iterator import BasicDatasetIterator


@register('dialog_dataset')
class DialogDataset(Dataset):
@register('dialog_iterator')
class DialogDatasetIterator(BasicDatasetIterator):
@staticmethod
def _dialogs(data):
dialogs = []
@@ -17,15 +17,15 @@
from sklearn.model_selection import train_test_split

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset import Dataset
from deeppavlov.core.data.dataset_iterator import BasicDatasetIterator
from deeppavlov.core.common.log import get_logger


log = get_logger(__name__)


@register('intent_dataset')
class IntentDataset(Dataset):
@register('dstc2_intents_iterator')
class Dstc2IntentsDatasetIterator(BasicDatasetIterator):
"""
Class gets data dictionary from DSTC2DatasetReader instance,
construct intents from act and slots,
@@ -20,14 +20,14 @@

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset import Dataset
from deeppavlov.core.data.dataset_iterator import BasicDatasetIterator
from deeppavlov.core.data.utils import download

logger = logging.getLogger(__name__)


@register('dstc2_ner_dataset')
class DstcNerDataset(Dataset):
@register('dstc2_ner_iterator')
class Dstc2NerDatasetIterator(BasicDatasetIterator):

def __init__(self, data, dataset_path, seed=None, shuffle=False):
r""" Dataset takes a dict with fields 'train', 'test', 'valid'. A list of samples (pairs x, y) is stored
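As the docstring above notes, iterators are constructed from a dict with 'train', 'valid' and 'test' fields, each holding a list of (x, y) sample pairs (the constructor additionally takes dataset_path, seed and shuffle, as shown in __init__). A minimal sketch of that shape with made-up values:

# Shape of the data dict an iterator expects; the sample itself is illustrative only.
data = {
    "train": [(["cheap", "food"], ["B-pricerange", "O"])],   # x: tokens, y: tags (made up)
    "valid": [],
    "test": [],
}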
@@ -15,11 +15,11 @@
"""

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset import Dataset
from deeppavlov.core.data.dataset_iterator import BasicDatasetIterator


@register('typos_dataset')
class TyposDataset(Dataset):
@register('typos_iterator')
class TyposDatasetIterator(BasicDatasetIterator):
def split(self, test_ratio=0., *args, **kwargs):
"""Split all data into train and test
@@ -27,7 +27,7 @@
logger = logging.getLogger(__name__)


@register('babi')
@register('babi_reader')
class BabiDatasetReader(DatasetReader):
def __init__(self):
pass
@@ -6,8 +6,8 @@
from deeppavlov.core.common.registry import register


@register('ner_dataset_reader')
class NerDatasetReader(DatasetReader):
@register('conll2003_reader')
class Conll2003DatasetReader(DatasetReader):

def download_conll(self, dir_path):
download_decompress('http://lnsigo.mipt.ru/export/datasets/conll2003.tar.gz', dir_path)