flairNLP · alanakbik · Jun 28, 2022 · Jun 25, 2022
diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py
@@ -113,7 +113,7 @@ def main():
     logger.info(corpus)
 
     tag_type: str = "ner"
-    tag_dictionary = corpus.make_label_dictionary(tag_type)
+    tag_dictionary = corpus.make_label_dictionary(tag_type, add_unk=False)
     logger.info(tag_dictionary)
 
     embeddings = TransformerWordEmbeddings(

diff --git a/flair/data.py b/flair/data.py
@@ -1415,12 +1415,15 @@ def __str__(self) -> str:
             _len_dataset(self.test) if self.test else 0,
         )
 
-    def make_label_dictionary(self, label_type: str, min_count: int = -1) -> Dictionary:
+    def make_label_dictionary(self, label_type: str, min_count: int = -1, add_unk: bool = True) -> Dictionary:
         """
         Creates a dictionary of all labels assigned to the sentences in the corpus.
         :return: dictionary of labels
         """
-        label_dictionary: Dictionary = Dictionary(add_unk=True)
+        if min_count > 0 and not add_unk:
+            raise ValueError("Cannot require a minimum count if no unk-token is created.")
+
+        label_dictionary: Dictionary = Dictionary(add_unk=add_unk)
         label_dictionary.span_labels = False
 
         assert self.train

diff --git a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md
@@ -19,7 +19,7 @@ corpus = NCBI_DISEASE()
 print(corpus)
 
 # 2. make the tag dictionary from the corpus
-tag_dictionary = corpus.make_label_dictionary(label_type="ner")
+tag_dictionary = corpus.make_label_dictionary(label_type="ner", add_unk=False)
 
 # 3. initialize embeddings
 from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
@@ -134,7 +134,7 @@ embedding_types = [
 embeddings = StackedEmbeddings(embeddings=embedding_types)
 
 # 3. initialize sequence tagger
-tag_dictionary = corpus.make_label_dictionary(label_type="ner")
+tag_dictionary = corpus.make_label_dictionary(label_type="ner", add_unk=False)
 
 tagger = SequenceTagger(
     hidden_size=256,

diff --git a/resources/docs/KOR_docs/TUTORIAL_7_TRAINING_A_MODEL.md b/resources/docs/KOR_docs/TUTORIAL_7_TRAINING_A_MODEL.md
@@ -90,7 +90,7 @@ print(corpus)
 # 2. 어떤 레이블을 예측하고 싶으신가요?
 label_type = 'ner'
 # 3. 말뭉치에서 레이블 사전 만들기
-label_dict = corpus.make_label_dictionary(label_type=label_type)
+label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
 print(label_dict)
 # 4. Flair 및 GloVe로 임베딩 스택 초기화하기
 embedding_types = [
@@ -137,7 +137,7 @@ print(corpus)
 # 2. 어떤 레이블을 예측하고 싶으신가요?
 label_type = 'ner'
 # 3. 말뭉치에서 레이블 사전 만들기
-label_dict = corpus.make_label_dictionary(label_type=label_type)
+label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
 print(label_dict)
 # 4. 문서 컨텍스트로 미세 조정 가능한 변환기 임베딩 초기화
 embeddings = TransformerWordEmbeddings(
@@ -311,7 +311,7 @@ corpus: Corpus = WNUT_17().downsample(0.1)
 # 2. 어떤 레이블을 예측하고 싶으신가요?
 label_type = 'ner'
 # 3. 말뭉치에서 레이블 사전 만들기
-label_dict = corpus.make_label_dictionary(label_type=label_type)
+label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
 # 4. 임베딩 초기화하기
 embedding_types: List[TokenEmbeddings] = [
     WordEmbeddings('glove')

diff --git a/resources/docs/KOR_docs/TUTORIAL_8_MODEL_OPTIMIZATION.md b/resources/docs/KOR_docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
@@ -92,7 +92,7 @@ print(corpus)
 # 2. 우리는 예측하고 싶은 태그는 무엇인가요?
 tag_type = 'ner'
 # 3. 말뭉치에서 태그 사전 만들기
-tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
+tag_dictionary = corpus.make_label_dictionary(label_type=tag_type, add_unk=False)
 print(tag_dictionary.idx2item)
 # 4. 임베딩 초기화하기
 embedding_types: List[TokenEmbeddings] = [

diff --git a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md
@@ -104,7 +104,7 @@ print(corpus)
 label_type = 'ner'
 
 # 3. make the label dictionary from the corpus
-label_dict = corpus.make_label_dictionary(label_type=label_type)
+label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
 print(label_dict)
 
 # 4. initialize embedding stack with Flair and GloVe
@@ -159,7 +159,7 @@ print(corpus)
 label_type = 'ner'
 
 # 3. make the label dictionary from the corpus
-label_dict = corpus.make_label_dictionary(label_type=label_type)
+label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
 print(label_dict)
 
 # 4. initialize fine-tuneable transformer embeddings WITH document context

diff --git a/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md b/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
@@ -168,7 +168,7 @@ print(corpus)
 tag_type = 'ner'
 
 # 3. make the tag dictionary from the corpus
-tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
+tag_dictionary = corpus.make_label_dictionary(label_type=tag_type, add_unk=False)
 print(tag_dictionary.idx2item)
 
 # 4. initialize embeddings

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -28,7 +28,7 @@ def test_sequence_tagger_no_crf(results_base_path, tasks_base_path):
         data_folder=tasks_base_path / "trivial" / "trivial_bioes",
         column_format={0: "text", 1: "ner"},
     )
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(
@@ -78,7 +78,7 @@ def test_sequence_tagger_with_crf(results_base_path, tasks_base_path):
         data_folder=tasks_base_path / "trivial" / "trivial_bioes",
         column_format={0: "text", 1: "ner"},
     )
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(
@@ -128,7 +128,7 @@ def test_sequence_tagger_stacked(results_base_path, tasks_base_path):
         data_folder=tasks_base_path / "trivial" / "trivial_bioes",
         column_format={0: "text", 1: "ner"},
     )
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(
@@ -178,7 +178,7 @@ def test_sequence_tagger_transformer_finetune(results_base_path, tasks_base_path
         data_folder=tasks_base_path / "trivial" / "trivial_bioes",
         column_format={0: "text", 1: "ner"},
     )
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(

diff --git a/tests/test_sequence_tagger.py b/tests/test_sequence_tagger.py
@@ -79,7 +79,7 @@ def test_all_tag_proba_embedding():
 @pytest.mark.integration
 def test_train_load_use_tagger(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -117,7 +117,7 @@ def test_train_load_use_tagger(results_base_path, tasks_base_path):
 @pytest.mark.integration
 def test_train_load_use_tagger_empty_tags(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"})
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -157,7 +157,7 @@ def test_train_load_use_tagger_disjunct_tags(results_base_path, tasks_base_path)
         data_folder=tasks_base_path / "fashion_disjunct",
         column_format={0: "text", 3: "ner"},
     )
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -220,7 +220,7 @@ def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
 @pytest.mark.integration
 def test_train_load_use_tagger_flair_embeddings(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -257,7 +257,7 @@ def test_train_load_use_tagger_flair_embeddings(results_base_path, tasks_base_pa
 @pytest.mark.integration
 def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -298,7 +298,7 @@ def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
     corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path).downsample(0.1)
 
     corpus = MultiCorpus([corpus_1, corpus_2])
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -340,7 +340,7 @@ def test_train_resume_tagger(results_base_path, tasks_base_path):
     corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path).downsample(0.1)
 
     corpus = MultiCorpus([corpus_1, corpus_2])
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     model: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -367,7 +367,7 @@ def test_train_resume_tagger(results_base_path, tasks_base_path):
 @pytest.mark.integration
 def test_find_learning_rate(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
-    tag_dictionary = corpus.make_label_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,