In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.data import Sentence
from flair.models import SequenceTagger

Because training was carried out across several computers and virtual machines, this notebook does not contain the training outputs; they are stored in the `training_outputs` folder.

## LOAD DATASETS

In [None]:
# Column layout handed to ColumnCorpus: index 0 is named 'text', index 1 is named 'ner'.
columns = {
    0: 'text',
    1: 'ner',
}

# Short tag codes -> readable label names; passed to every ColumnCorpus as label_name_map.
label_name_map = dict(
    FUND='Funding Agency',
    IND='Person',
    COR='Corporation',
    GRNB='Grant Number',
    UNI='University',
    MISC='Miscellaneous',
)

# Label type used when building label dictionaries (and the Flair taggers).
label_type = 'ner'

### Corpus 1

In [None]:
# Corpus 1: read the train/dev/test column files from data/corpus1.
data_folder = r'data/corpus1'
corpus1: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    label_name_map=label_name_map,
)

# Label dictionary built from corpus 1 for the configured label type.
label_dict1 = corpus1.make_label_dictionary(label_type=label_type)

### Corpus 2

In [None]:
# Corpus 2: read the train/dev/test column files from data/corpus2.
data_folder = r'data/corpus2'
corpus2: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    label_name_map=label_name_map,
)

# Label dictionary built from corpus 2 for the configured label type.
label_dict2 = corpus2.make_label_dictionary(label_type=label_type)

### Corpus 3

In [None]:
# Corpus 3: read the train/dev/test column files from data/corpus3.
data_folder = r'data/corpus3'
corpus3: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    label_name_map=label_name_map,
)

# Label dictionary built from corpus 3 for the configured label type.
label_dict3 = corpus3.make_label_dictionary(label_type=label_type)

### Corpus 4

In [None]:
# Corpus 4: read the train/dev/test column files from data/corpus4.
data_folder = r'data/corpus4'
corpus4: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    label_name_map=label_name_map,
)

# Label dictionary built from corpus 4 for the configured label type.
label_dict4 = corpus4.make_label_dictionary(label_type=label_type)

### Corpus 1 with silver dataset

In [None]:
# Corpus 1 with the silver dataset: read the column files from data/corpus1_silver.
data_folder = r'data/corpus1_silver'
corpus1_silver: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    label_name_map=label_name_map,
)

# Label dictionary built from the silver-augmented corpus 1.
label_dict1_silver = corpus1_silver.make_label_dictionary(label_type=label_type)

### Corpus 4 with silver dataset

In [None]:
# Corpus 4 with the silver dataset: read the column files from data/corpus4_silver.
data_folder = r'data/corpus4_silver'
corpus4_silver: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    label_name_map=label_name_map,
)

# Label dictionary built from the silver-augmented corpus 4.
label_dict4_silver = corpus4_silver.make_label_dictionary(label_type=label_type)

# Flair model (for each corpus)

In [None]:
# Embedding stack: 'glove' word embeddings plus the 'news-forward' and
# 'news-backward' Flair embeddings.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])

# Sequence tagger over the stacked embeddings, using the label dictionary of corpus 1.
model = SequenceTagger(
    embeddings=stacked_embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict1,
)

# Train on corpus 1; 'resources/taggers/flair_1' is the base path given to the trainer.
trainer = ModelTrainer(model, corpus1)
trainer.train(
    'resources/taggers/flair_1',
    max_epochs=100,
    learning_rate=0.1,
    mini_batch_size=32,
    mini_batch_chunk_size=4,
)

In [None]:
# Embedding stack: 'glove' word embeddings plus the 'news-forward' and
# 'news-backward' Flair embeddings.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])

# Sequence tagger over the stacked embeddings, using the label dictionary of corpus 2.
model = SequenceTagger(
    embeddings=stacked_embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict2,
)

# Train on corpus 2; 'resources/taggers/flair_2' is the base path given to the trainer.
trainer = ModelTrainer(model, corpus2)
trainer.train(
    'resources/taggers/flair_2',
    max_epochs=100,
    learning_rate=0.1,
    mini_batch_size=32,
    mini_batch_chunk_size=4,
)

In [None]:
# Embedding stack: 'glove' word embeddings plus the 'news-forward' and
# 'news-backward' Flair embeddings.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])

# Sequence tagger over the stacked embeddings, using the label dictionary of corpus 3.
model = SequenceTagger(
    embeddings=stacked_embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict3,
)

# Train on corpus 3; 'resources/taggers/flair_3' is the base path given to the trainer.
trainer = ModelTrainer(model, corpus3)
trainer.train(
    'resources/taggers/flair_3',
    max_epochs=100,
    learning_rate=0.1,
    mini_batch_size=32,
    mini_batch_chunk_size=4,
)

In [None]:
# Embedding stack: 'glove' word embeddings plus the 'news-forward' and
# 'news-backward' Flair embeddings.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])

# Sequence tagger over the stacked embeddings, using the label dictionary of corpus 4.
model = SequenceTagger(
    embeddings=stacked_embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict4,
)

# Train on corpus 4; 'resources/taggers/flair_4' is the base path given to the trainer.
trainer = ModelTrainer(model, corpus4)
trainer.train(
    'resources/taggers/flair_4',
    max_epochs=100,
    learning_rate=0.1,
    mini_batch_size=32,
    mini_batch_chunk_size=4,
)

# Flair model (corpus 1 with silver dataset)

In [None]:
# Embedding stack: 'glove' word embeddings plus the 'news-forward' and
# 'news-backward' Flair embeddings.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])

# Sequence tagger over the stacked embeddings, using the label dictionary of the
# silver-augmented corpus 1.
model = SequenceTagger(
    embeddings=stacked_embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict1_silver,
)

# Train on corpus 1 with silver data; 'resources/taggers/flair_1_silver' is the
# base path given to the trainer.
trainer = ModelTrainer(model, corpus1_silver)
trainer.train(
    'resources/taggers/flair_1_silver',
    max_epochs=100,
    learning_rate=0.1,
    mini_batch_size=32,
    mini_batch_chunk_size=4,
)

# BERT model (corpus4)

In [None]:
# Fine-tunable 'bert-base-uncased' word embeddings: last layer only ("-1"),
# first-subtoken pooling, with use_context enabled.
embeddings = TransformerWordEmbeddings(
    model='bert-base-uncased',
    fine_tune=True,
    layers='-1',
    subtoken_pooling='first',
    use_context=True,
)

# Tagger placed directly on the transformer: CRF, RNN and embedding reprojection
# are all switched off.
# NOTE(review): hidden_size is presumably unused when use_rnn=False - confirm against Flair docs.
model = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict4,
    tag_type='ner',
    hidden_size=256,
    use_rnn=False,
    use_crf=False,
    reproject_embeddings=False,
)

# Train on corpus 4; 'resources/taggers/bert' is the base path given to the trainer.
# NOTE(review): learning_rate=0.01 looks high for transformer fine-tuning - confirm it is intended.
trainer = ModelTrainer(model, corpus4)
trainer.train(
    'resources/taggers/bert',
    max_epochs=20,
    learning_rate=0.01,
    mini_batch_size=4,
    mini_batch_chunk_size=2,
    weight_decay=0.,
    embeddings_storage_mode='none',
)

# XLNet - base (corpus4)

In [None]:
# Fine-tunable 'xlnet-base-cased' word embeddings: last layer only ("-1"),
# first-subtoken pooling, with use_context enabled.
embeddings = TransformerWordEmbeddings(
    model='xlnet-base-cased',
    fine_tune=True,
    layers='-1',
    subtoken_pooling='first',
    use_context=True,
)

# Tagger placed directly on the transformer: CRF, RNN and embedding reprojection
# are all switched off.
# NOTE(review): hidden_size is presumably unused when use_rnn=False - confirm against Flair docs.
model = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict4,
    tag_type='ner',
    hidden_size=256,
    use_rnn=False,
    use_crf=False,
    reproject_embeddings=False,
)

# Train on corpus 4; 'resources/taggers/xlnet-base' is the base path given to the trainer.
# NOTE(review): learning_rate=0.01 looks high for transformer fine-tuning - confirm it is intended.
trainer = ModelTrainer(model, corpus4)
trainer.train(
    'resources/taggers/xlnet-base',
    max_epochs=20,
    learning_rate=0.01,
    mini_batch_size=4,
    mini_batch_chunk_size=2,
    weight_decay=0.,
    embeddings_storage_mode='none',
)

# XLNet - large (corpus4)

In [None]:
# Fine-tunable 'xlnet-large-cased' word embeddings: last layer only ("-1"),
# first-subtoken pooling, with use_context enabled.
embeddings = TransformerWordEmbeddings(
    model='xlnet-large-cased',
    fine_tune=True,
    layers='-1',
    subtoken_pooling='first',
    use_context=True,
)

# Tagger placed directly on the transformer: CRF, RNN and embedding reprojection
# are all switched off.
# NOTE(review): hidden_size is presumably unused when use_rnn=False - confirm against Flair docs.
model = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict4,
    tag_type='ner',
    hidden_size=256,
    use_rnn=False,
    use_crf=False,
    reproject_embeddings=False,
)

# Train on corpus 4; 'resources/taggers/xlnet-large' is the base path given to the trainer.
# NOTE(review): learning_rate=0.01 looks high for transformer fine-tuning - confirm it is intended.
trainer = ModelTrainer(model, corpus4)
trainer.train(
    'resources/taggers/xlnet-large',
    max_epochs=20,
    learning_rate=0.01,
    mini_batch_size=4,
    mini_batch_chunk_size=2,
    weight_decay=0.,
    embeddings_storage_mode='none',
)

# XLNet - large (corpus4 with silver)

In [None]:
# Fine-tunable 'xlnet-large-cased' word embeddings: last layer only ("-1"),
# first-subtoken pooling, with use_context enabled.
embeddings = TransformerWordEmbeddings(
    model='xlnet-large-cased',
    fine_tune=True,
    layers='-1',
    subtoken_pooling='first',
    use_context=True,
)

# Tagger placed directly on the transformer: CRF, RNN and embedding reprojection
# are all switched off. Uses the label dictionary of the silver-augmented corpus 4.
# NOTE(review): hidden_size is presumably unused when use_rnn=False - confirm against Flair docs.
model = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict4_silver,
    tag_type='ner',
    hidden_size=256,
    use_rnn=False,
    use_crf=False,
    reproject_embeddings=False,
)

# Train on corpus 4 with silver data; 'resources/taggers/xlnet-large-silver' is
# the base path given to the trainer.
# NOTE(review): learning_rate=0.01 looks high for transformer fine-tuning - confirm it is intended.
trainer = ModelTrainer(model, corpus4_silver)
trainer.train(
    'resources/taggers/xlnet-large-silver',
    max_epochs=20,
    learning_rate=0.01,
    mini_batch_size=4,
    mini_batch_chunk_size=2,
    weight_decay=0.,
    embeddings_storage_mode='none',
)