Skip to content

Commit

Permalink
GH-1193: add parameter to choose encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
aakbik committed Oct 8, 2019
1 parent bf7d536 commit 3ae2e7e
Showing 1 changed file with 36 additions and 17 deletions.
53 changes: 36 additions & 17 deletions flair/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
tag_to_bioes=None,
comment_symbol: str = None,
in_memory: bool = True,
encoding: str = "utf-8",
document_separator_token: str = None,
):
"""
Expand All @@ -51,6 +52,8 @@ def __init__(
:param tag_to_bioes: whether to convert to BIOES tagging scheme
:param comment_symbol: if set, lines that begin with this symbol are treated as comments
:param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
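:param encoding: file encoding that is passed on to the train, test and dev datasets (default is "utf-8")
:param document_separator_token: If provided, multiple sentences are read into one object. Provide the string token
that indicates that a new document begins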
:return: a Corpus with annotated train, dev and test data
"""

Expand Down Expand Up @@ -98,6 +101,7 @@ def __init__(
train_file,
column_format,
tag_to_bioes,
encoding=encoding,
comment_symbol=comment_symbol,
in_memory=in_memory,
document_separator_token=document_separator_token,
Expand All @@ -109,6 +113,7 @@ def __init__(
test_file,
column_format,
tag_to_bioes,
encoding=encoding,
comment_symbol=comment_symbol,
in_memory=in_memory,
document_separator_token=document_separator_token,
Expand All @@ -126,6 +131,7 @@ def __init__(
dev_file,
column_format,
tag_to_bioes,
encoding=encoding,
comment_symbol=comment_symbol,
in_memory=in_memory,
document_separator_token=document_separator_token,
Expand Down Expand Up @@ -626,6 +632,7 @@ def __init__(
comment_symbol: str = None,
in_memory: bool = True,
document_separator_token: str = None,
encoding: str = "utf-8",
):
"""
Instantiates a column dataset (typically used for sequence labeling or word-level prediction).
Expand All @@ -635,6 +642,8 @@ def __init__(
:param tag_to_bioes: whether to convert to BIOES tagging scheme
:param comment_symbol: if set, lines that begin with this symbol are treated as comments
:param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
:param document_separator_token: If provided, multiple sentences are read into one object. Provide the string token
that indicates that a new document begins
:param encoding: encoding used when opening the column file (default is "utf-8")
"""
assert path_to_column_file.exists()
self.path_to_column_file = path_to_column_file
Expand All @@ -659,21 +668,10 @@ def __init__(
self.text_column = column

# determine encoding of text file
encoding = "utf-8"
try:
lines: List[str] = open(str(path_to_column_file), encoding="utf-8").read(
10
).strip().split("\n")
except:
log.info(
'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(
path_to_column_file
)
)
encoding = "latin1"
self.encoding = encoding

sentence: Sentence = Sentence()
with open(str(self.path_to_column_file), encoding=encoding) as f:
with open(str(self.path_to_column_file), encoding=self.encoding) as f:

line = f.readline()
position = 0
Expand All @@ -685,7 +683,9 @@ def __init__(
continue

if self.__line_completes_sentence(line):

if len(sentence) > 0:

sentence.infer_space_after()
if self.in_memory:
if self.tag_to_bioes is not None:
Expand Down Expand Up @@ -744,7 +744,7 @@ def __getitem__(self, index: int = 0) -> Sentence:
sentence = self.sentences[index]

else:
with open(str(self.path_to_column_file), encoding="utf-8") as file:
with open(str(self.path_to_column_file), encoding=self.encoding) as file:
file.seek(self.indices[index])
line = file.readline()
sentence: Sentence = Sentence()
Expand Down Expand Up @@ -1298,6 +1298,7 @@ def __init__(
:param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
POS tags or chunks respectively
:param in_memory: If True, keeps dataset in memory giving speedups in training.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
if type(base_path) == str:
base_path: Path = Path(base_path)
Expand Down Expand Up @@ -1337,6 +1338,7 @@ def __init__(
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = True,
document_as_sequence: bool = False,
):
"""
Initialize the CoNLL-03 corpus for German. This is only possible if you've manually downloaded it to your machine.
Expand All @@ -1346,6 +1348,7 @@ def __init__(
:param tag_to_bioes: NER by default, need not be changed, but you could also select 'lemma', 'pos' or 'np' to predict
word lemmas, POS tags or chunks respectively
:param in_memory: If True, keeps dataset in memory giving speedups in training.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
if type(base_path) == str:
base_path: Path = Path(base_path)
Expand All @@ -1371,7 +1374,11 @@ def __init__(
log.warning("-" * 100)

super(CONLL_03_GERMAN, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
data_folder,
columns,
tag_to_bioes=tag_to_bioes,
in_memory=in_memory,
document_separator_token=None if not document_as_sequence else "-DOCSTART-",
)


Expand All @@ -1381,6 +1388,7 @@ def __init__(
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = True,
document_as_sequence: bool = False,
):
"""
Initialize the CoNLL-03 corpus for Dutch. The first time you call this constructor it will automatically
Expand All @@ -1390,6 +1398,7 @@ def __init__(
:param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
POS tags instead
:param in_memory: If True, keeps dataset in memory giving speedups in training.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
if type(base_path) == str:
base_path: Path = Path(base_path)
Expand All @@ -1412,7 +1421,12 @@ def __init__(
cached_path(f"{conll_02_path}ned.train", Path("datasets") / dataset_name)

super(CONLL_03_DUTCH, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
data_folder,
columns,
tag_to_bioes=tag_to_bioes,
encoding="latin-1",
in_memory=in_memory,
document_separator_token=None if not document_as_sequence else "-DOCSTART-",
)


Expand All @@ -1430,6 +1444,7 @@ def __init__(
to point to a different folder but typically this should not be necessary.
:param tag_to_bioes: NER by default, should not be changed
:param in_memory: If True, keeps dataset in memory giving speedups in training.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
if type(base_path) == str:
base_path: Path = Path(base_path)
Expand All @@ -1452,7 +1467,11 @@ def __init__(
cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name)

super(CONLL_03_SPANISH, self).__init__(
data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
data_folder,
columns,
tag_to_bioes=tag_to_bioes,
encoding="latin-1",
in_memory=in_memory,
)


Expand Down

0 comments on commit 3ae2e7e

Please sign in to comment.