In [1]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/ner-french")



2023-03-21 15:27:20,607 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, B-LOC, E-LOC, S-LOC, B-MISC, E-MISC, B-PER, E-PER, S-PER, I-MISC, I-PER, I-LOC, S-MISC, B-ORG, E-ORG, I-ORG, S-ORG, <START>, <STOP>


In [6]:
sentence = Sentence("The grass is green .'")

In [16]:
tagger.predict(sentence)

In [17]:
print(sentence.to_tagged_string())

Sentence[6]: "George Washington went to Washington ." → ["George Washington"/PER, "Washington"/LOC]


In [18]:
for entity in sentence.get_spans('ner'):
    print(entity)

Span[0:2]: "George Washington" → PER (0.7136)
Span[4:5]: "Washington" → LOC (0.6779)


### We can get confidence score of each of the predicted entities

In [20]:
print(sentence.to_dict(tag_type='ner')['ner'])

[{'value': 'PER', 'confidence': 0.7136043012142181}, {'value': 'LOC', 'confidence': 0.677897036075592}]


In [37]:
import urllib.request
from pathlib import Path
import pandas as pd

def download_file(url, output_file):
  Path(output_file).parent.mkdir(parents=True, exist_ok=True)
  urllib.request.urlretrieve (url, output_file)

conllpp_train = pd.read_csv('https://raw.githubusercontent.com/ZihanWangKi/CrossWeigh/master/data/conllpp_train.txt', sep =" ", header=None)


In [26]:
conllpp_dev = pd.read_csv('https://raw.githubusercontent.com/ZihanWangKi/CrossWeigh/master/data/conllpp_dev.txt',  sep =" ", header=None)

In [27]:
conllpp_test = pd.read_csv('https://raw.githubusercontent.com/ZihanWangKi/CrossWeigh/master/data/conllpp_test.txt', sep = " ", header=None)

In [45]:
conllpp_train.to_csv('conllpp_train.csv', sep= " ")
conllpp_dev.to_csv('conllpp_dev.csv', sep= " ")
conllpp_test.to_csv('conllpp_test.csv', sep= " ")

In [47]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
columns = {0: 'text', 3: 'ner'}
corpus: Corpus = ColumnCorpus('data/', columns,
                              train_file='conllpp_train.txt',
                              test_file='conllpp_test.txt',
                              dev_file='conllpp_dev.txt')

2023-03-21 16:31:44,609 Reading data from data
2023-03-21 16:31:44,611 Train: data/conllpp_train.txt
2023-03-21 16:31:44,612 Dev: data/conllpp_dev.txt
2023-03-21 16:31:44,613 Test: data/conllpp_test.txt


In [48]:
import pandas as pd
data = [[len(corpus.train), len(corpus.test), len(corpus.dev)]]
# Prints out the dataset sizes of train test and development in a table.
pd.DataFrame(data, columns=["Train", "Test", "Development"])

Unnamed: 0,Train,Test,Development
0,14987,3684,3466


In [51]:
import flair
from typing import List
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings

tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# For faster training and smaller models, we can comment out the flair embeddings.
# This will significantly affect the performance though.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('model/conllpp',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=50,
              embeddings_storage_mode='gpu')

  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)


2023-03-21 16:34:18,569 SequenceTagger predicts: Dictionary with 3 tags: O, <START>, <STOP>
2023-03-21 16:34:18,800 ----------------------------------------------------------------------------------------------------
2023-03-21 16:34:18,802 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, b