Skip to content

Commit

Permalink
Merge pull request #2216 from melismat/master
Browse files Browse the repository at this point in the history
Support for UD Estonian (PoS) added
  • Loading branch information
alanakbik committed Apr 16, 2021
2 parents fd8e42c + 9b91403 commit e4e78c7
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 0 deletions.
1 change: 1 addition & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
from .treebanks import UniversalDependenciesCorpus
from .treebanks import UniversalDependenciesDataset
from .treebanks import UD_ENGLISH
from .treebanks import UD_ESTONIAN
from .treebanks import UD_GERMAN
from .treebanks import UD_GERMAN_HDT
from .treebanks import UD_DUTCH
Expand Down
27 changes: 27 additions & 0 deletions flair/datasets/treebanks.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,33 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, s
super(UD_ENGLISH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)


class UD_ESTONIAN(UniversalDependenciesCorpus):
    """Universal Dependencies corpus for Estonian (UD_Estonian-EDT treebank).

    On construction, the dev, test and train ``.conllu`` files are requested
    from the UD_Estonian-EDT GitHub repository via ``cached_path`` and the
    corpus is then initialised from ``base_path / "ud_estonian"``.
    """

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """Fetch the treebank files if needed and initialise the corpus.

        :param base_path: Parent folder of the dataset folder. When falsy
            (e.g. ``None``), ``flair.cache_root / "datasets"`` is used.
        :param in_memory: Forwarded unchanged to ``UniversalDependenciesCorpus``.
        :param split_multiwords: Forwarded unchanged to
            ``UniversalDependenciesCorpus``.
        """
        # Accept plain strings (including str subclasses) as well as Path objects.
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # this dataset name (lower-cased class name, i.e. "ud_estonian")
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # NOTE(review): the cache target below is the relative path
        # datasets/<dataset_name> and does not depend on base_path; confirm
        # that cached_path places the files where data_folder points when a
        # custom base_path is supplied.
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Estonian-EDT/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/et_edt-ud-{split}.conllu", Path("datasets") / dataset_name)

        super(UD_ESTONIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)


class UD_GERMAN(UniversalDependenciesCorpus):
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):

Expand Down

0 comments on commit e4e78c7

Please sign in to comment.