GH-1193: add parameter to choose encoding

flairNLP · Oct 8, 2019 · 3ae2e7e · 3ae2e7e
1 parent bf7d536
commit 3ae2e7e
Showing 1 changed file with 36 additions and 17 deletions.
diff --git a/flair/datasets.py b/flair/datasets.py
@@ -38,6 +38,7 @@ def __init__(
         tag_to_bioes=None,
         comment_symbol: str = None,
         in_memory: bool = True,
+        encoding: str = "utf-8",
         document_separator_token: str = None,
     ):
         """
@@ -51,6 +52,8 @@ def __init__(
         :param tag_to_bioes: whether to convert to BIOES tagging scheme
         :param comment_symbol: if set, lines that begin with this symbol are treated as comments
         :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
+        :param document_separator_token: If provided, multiple sentences are read into one object. Provide the string token
+        that indicates that a new document begins
         :return: a Corpus with annotated train, dev and test data
         """
 
@@ -98,6 +101,7 @@ def __init__(
             train_file,
             column_format,
             tag_to_bioes,
+            encoding=encoding,
             comment_symbol=comment_symbol,
             in_memory=in_memory,
             document_separator_token=document_separator_token,
@@ -109,6 +113,7 @@ def __init__(
                 test_file,
                 column_format,
                 tag_to_bioes,
+                encoding=encoding,
                 comment_symbol=comment_symbol,
                 in_memory=in_memory,
                 document_separator_token=document_separator_token,
@@ -126,6 +131,7 @@ def __init__(
                 dev_file,
                 column_format,
                 tag_to_bioes,
+                encoding=encoding,
                 comment_symbol=comment_symbol,
                 in_memory=in_memory,
                 document_separator_token=document_separator_token,
@@ -626,6 +632,7 @@ def __init__(
         comment_symbol: str = None,
         in_memory: bool = True,
         document_separator_token: str = None,
+        encoding: str = "utf-8",
     ):
         """
         Instantiates a column dataset (typically used for sequence labeling or word-level prediction).
@@ -635,6 +642,8 @@ def __init__(
         :param tag_to_bioes: whether to convert to BIOES tagging scheme
         :param comment_symbol: if set, lines that begin with this symbol are treated as comments
         :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
+        :param document_separator_token: If provided, multiple sentences are read into one object. Provide the string token
+        that indicates that a new document begins
         """
         assert path_to_column_file.exists()
         self.path_to_column_file = path_to_column_file
@@ -659,21 +668,10 @@ def __init__(
                 self.text_column = column
 
         # determine encoding of text file
-        encoding = "utf-8"
-        try:
-            lines: List[str] = open(str(path_to_column_file), encoding="utf-8").read(
-                10
-            ).strip().split("\n")
-        except:
-            log.info(
-                'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(
-                    path_to_column_file
-                )
-            )
-            encoding = "latin1"
+        self.encoding = encoding
 
         sentence: Sentence = Sentence()
-        with open(str(self.path_to_column_file), encoding=encoding) as f:
+        with open(str(self.path_to_column_file), encoding=self.encoding) as f:
 
             line = f.readline()
             position = 0
@@ -685,7 +683,9 @@ def __init__(
                     continue
 
                 if self.__line_completes_sentence(line):
+
                     if len(sentence) > 0:
+
                         sentence.infer_space_after()
                         if self.in_memory:
                             if self.tag_to_bioes is not None:
@@ -744,7 +744,7 @@ def __getitem__(self, index: int = 0) -> Sentence:
             sentence = self.sentences[index]
 
         else:
-            with open(str(self.path_to_column_file), encoding="utf-8") as file:
+            with open(str(self.path_to_column_file), encoding=self.encoding) as file:
                 file.seek(self.indices[index])
                 line = file.readline()
                 sentence: Sentence = Sentence()
@@ -1298,6 +1298,7 @@ def __init__(
         :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
         POS tags or chunks respectively
         :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
         """
         if type(base_path) == str:
             base_path: Path = Path(base_path)
@@ -1337,6 +1338,7 @@ def __init__(
         base_path: Union[str, Path] = None,
         tag_to_bioes: str = "ner",
         in_memory: bool = True,
+        document_as_sequence: bool = False,
     ):
         """
         Initialize the CoNLL-03 corpus for German. This is only possible if you've manually downloaded it to your machine.
@@ -1346,6 +1348,7 @@ def __init__(
         :param tag_to_bioes: NER by default, need not be changed, but you could also select 'lemma', 'pos' or 'np' to predict
         word lemmas, POS tags or chunks respectively
         :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
         """
         if type(base_path) == str:
             base_path: Path = Path(base_path)
@@ -1371,7 +1374,11 @@ def __init__(
             log.warning("-" * 100)
 
         super(CONLL_03_GERMAN, self).__init__(
-            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
+            data_folder,
+            columns,
+            tag_to_bioes=tag_to_bioes,
+            in_memory=in_memory,
+            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
         )
 
 
@@ -1381,6 +1388,7 @@ def __init__(
         base_path: Union[str, Path] = None,
         tag_to_bioes: str = "ner",
         in_memory: bool = True,
+        document_as_sequence: bool = False,
     ):
         """
         Initialize the CoNLL-03 corpus for Dutch. The first time you call this constructor it will automatically
@@ -1390,6 +1398,7 @@ def __init__(
         :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict
         POS tags instead
         :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
         """
         if type(base_path) == str:
             base_path: Path = Path(base_path)
@@ -1412,7 +1421,12 @@ def __init__(
         cached_path(f"{conll_02_path}ned.train", Path("datasets") / dataset_name)
 
         super(CONLL_03_DUTCH, self).__init__(
-            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
+            data_folder,
+            columns,
+            tag_to_bioes=tag_to_bioes,
+            encoding="latin-1",
+            in_memory=in_memory,
+            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
         )
 
 
@@ -1430,6 +1444,7 @@ def __init__(
         to point to a different folder but typically this should not be necessary.
         :param tag_to_bioes: NER by default, should not be changed
         :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
         """
         if type(base_path) == str:
             base_path: Path = Path(base_path)
@@ -1452,7 +1467,11 @@ def __init__(
         cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name)
 
         super(CONLL_03_SPANISH, self).__init__(
-            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
+            data_folder,
+            columns,
+            tag_to_bioes=tag_to_bioes,
+            encoding="latin-1",
+            in_memory=in_memory,
         )