Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make add_unk optional and don't use it for NER #2839

Merged
merged 1 commit into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/ner/run_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def main():
logger.info(corpus)

tag_type: str = "ner"
tag_dictionary = corpus.make_label_dictionary(tag_type)
tag_dictionary = corpus.make_label_dictionary(tag_type, add_unk=False)
logger.info(tag_dictionary)

embeddings = TransformerWordEmbeddings(
Expand Down
7 changes: 5 additions & 2 deletions flair/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1415,12 +1415,15 @@ def __str__(self) -> str:
_len_dataset(self.test) if self.test else 0,
)

def make_label_dictionary(self, label_type: str, min_count: int = -1) -> Dictionary:
def make_label_dictionary(self, label_type: str, min_count: int = -1, add_unk: bool = True) -> Dictionary:
"""
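Creates a dictionary of all labels assigned to the sentences in the corpus.
:param label_type: the label type for which the dictionary is created
:param min_count: minimum count required for a label; a value greater than 0 requires add_unk=True
:param add_unk: if True, an unk-token is created in the dictionary
:return: dictionary of labels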
"""
label_dictionary: Dictionary = Dictionary(add_unk=True)
if min_count > 0 and not add_unk:
raise ValueError("Cannot require a minimum count if no unk-token is created.")

label_dictionary: Dictionary = Dictionary(add_unk=add_unk)
label_dictionary.span_labels = False

assert self.train
Expand Down
4 changes: 2 additions & 2 deletions resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ corpus = NCBI_DISEASE()
print(corpus)

# 2. make the tag dictionary from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type="ner")
tag_dictionary = corpus.make_label_dictionary(label_type="ner", add_unk=False)

# 3. initialize embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
Expand Down Expand Up @@ -134,7 +134,7 @@ embedding_types = [
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 3. initialize sequence tagger
tag_dictionary = corpus.make_label_dictionary(label_type="ner")
tag_dictionary = corpus.make_label_dictionary(label_type="ner", add_unk=False)

tagger = SequenceTagger(
hidden_size=256,
Expand Down
6 changes: 3 additions & 3 deletions resources/docs/KOR_docs/TUTORIAL_7_TRAINING_A_MODEL.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ print(corpus)
# 2. 어떤 레이블을 예측하고 싶으신가요?
label_type = 'ner'
# 3. 말뭉치에서 레이블 사전 만들기
label_dict = corpus.make_label_dictionary(label_type=label_type)
label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)
# 4. Flair 및 GloVe로 임베딩 스택 초기화하기
embedding_types = [
Expand Down Expand Up @@ -137,7 +137,7 @@ print(corpus)
# 2. 어떤 레이블을 예측하고 싶으신가요?
label_type = 'ner'
# 3. 말뭉치에서 레이블 사전 만들기
label_dict = corpus.make_label_dictionary(label_type=label_type)
label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)
# 4. 문서 컨텍스트로 미세 조정 가능한 변환기 임베딩 초기화
embeddings = TransformerWordEmbeddings(
Expand Down Expand Up @@ -311,7 +311,7 @@ corpus: Corpus = WNUT_17().downsample(0.1)
# 2. 어떤 레이블을 예측하고 싶으신가요?
label_type = 'ner'
# 3. 말뭉치에서 레이블 사전 만들기
label_dict = corpus.make_label_dictionary(label_type=label_type)
label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
# 4. 임베딩 초기화하기
embedding_types: List[TokenEmbeddings] = [
WordEmbeddings('glove')
Expand Down
2 changes: 1 addition & 1 deletion resources/docs/KOR_docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ print(corpus)
# 2. 우리는 예측하고 싶은 태그는 무엇인가요?
tag_type = 'ner'
# 3. 말뭉치에서 태그 사전 만들기
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type, add_unk=False)
print(tag_dictionary.idx2item)
# 4. 임베딩 초기화하기
embedding_types: List[TokenEmbeddings] = [
Expand Down
4 changes: 2 additions & 2 deletions resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ print(corpus)
label_type = 'ner'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)

# 4. initialize embedding stack with Flair and GloVe
Expand Down Expand Up @@ -159,7 +159,7 @@ print(corpus)
label_type = 'ner'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)

# 4. initialize fine-tuneable transformer embeddings WITH document context
Expand Down
2 changes: 1 addition & 1 deletion resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ print(corpus)
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type, add_unk=False)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
Expand Down
8 changes: 4 additions & 4 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_sequence_tagger_no_crf(results_base_path, tasks_base_path):
data_folder=tasks_base_path / "trivial" / "trivial_bioes",
column_format={0: "text", 1: "ner"},
)
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

# tagger without CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down Expand Up @@ -78,7 +78,7 @@ def test_sequence_tagger_with_crf(results_base_path, tasks_base_path):
data_folder=tasks_base_path / "trivial" / "trivial_bioes",
column_format={0: "text", 1: "ner"},
)
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

# tagger with CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down Expand Up @@ -128,7 +128,7 @@ def test_sequence_tagger_stacked(results_base_path, tasks_base_path):
data_folder=tasks_base_path / "trivial" / "trivial_bioes",
column_format={0: "text", 1: "ner"},
)
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

# tagger without CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down Expand Up @@ -178,7 +178,7 @@ def test_sequence_tagger_transformer_finetune(results_base_path, tasks_base_path
data_folder=tasks_base_path / "trivial" / "trivial_bioes",
column_format={0: "text", 1: "ner"},
)
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

# tagger without CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down
16 changes: 8 additions & 8 deletions tests/test_sequence_tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_all_tag_proba_embedding():
@pytest.mark.integration
def test_train_load_use_tagger(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -117,7 +117,7 @@ def test_train_load_use_tagger(results_base_path, tasks_base_path):
@pytest.mark.integration
def test_train_load_use_tagger_empty_tags(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"})
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -157,7 +157,7 @@ def test_train_load_use_tagger_disjunct_tags(results_base_path, tasks_base_path)
data_folder=tasks_base_path / "fashion_disjunct",
column_format={0: "text", 3: "ner"},
)
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -220,7 +220,7 @@ def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
@pytest.mark.integration
def test_train_load_use_tagger_flair_embeddings(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -257,7 +257,7 @@ def test_train_load_use_tagger_flair_embeddings(results_base_path, tasks_base_pa
@pytest.mark.integration
def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -298,7 +298,7 @@ def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path).downsample(0.1)

corpus = MultiCorpus([corpus_1, corpus_2])
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -340,7 +340,7 @@ def test_train_resume_tagger(results_base_path, tasks_base_path):
corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path).downsample(0.1)

corpus = MultiCorpus([corpus_1, corpus_2])
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

model: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand All @@ -367,7 +367,7 @@ def test_train_resume_tagger(results_base_path, tasks_base_path):
@pytest.mark.integration
def test_find_learning_rate(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"})
tag_dictionary = corpus.make_label_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner", add_unk=False)

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down