In [1]:
import csv
import os
import shutil

from flair.data import Sentence
from flair.models import SequenceTagger

In [2]:
csv_file_path = 'data/corpus_silver/new_acknowledgements.csv'
text_column = 2  # zero-based index of the CSV column that holds the text to tag

# newline='' hands newline handling to the csv module, so a quoted field that
# contains a line break is parsed as one field instead of being split.
with open(csv_file_path, 'r', encoding='ISO-8859-1', newline='') as file:
    csv_reader = csv.reader(file, delimiter=';')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    header = next(csv_reader, None)

    # csv.reader yields [] for blank lines, so rows that lack the text column
    # (or whose text is only whitespace) are skipped instead of raising
    # IndexError or producing empty sentences.
    silver_text = [
        Sentence(row[text_column])
        for row in csv_reader
        if len(row) > text_column and row[text_column].strip()
    ]

Because of its size, the model is stored on Google Drive: https://drive.google.com/file/d/1zayPmVqdljP2176SvnsliraFAEabArdZ/view?usp=sharing

To run the next cell, download the model and replace the path passed to `SequenceTagger.load` with the location of the downloaded file.

In [3]:
# Path of the tagger checkpoint on disk; change it if the file lives elsewhere.
tagger_checkpoint_path = "models/flair_corp4.pt"
model = SequenceTagger.load(tagger_checkpoint_path)

2023-12-09 16:30:33,646 SequenceTagger predicts: Dictionary with 27 tags: O, S-Funding Agency, B-Funding Agency, E-Funding Agency, I-Funding Agency, S-Grant Number, B-Grant Number, E-Grant Number, I-Grant Number, S-Person, B-Person, E-Person, I-Person, S-University, B-University, E-University, I-University, S-Miscellaneous, B-Miscellaneous, E-Miscellaneous, I-Miscellaneous, S-Corporation, B-Corporation, E-Corporation, I-Corporation, <START>, <STOP>


In [4]:
def create_silver_standard_txt(path, sentences=None, tagger=None, encoding='utf-8'):
    """Tag sentences with the NER model and write them as a two-column text file.

    Every token is written as ``<token text> <label>`` on its own line, and a
    blank line follows each sentence. Tokens inside a predicted entity span get
    a ``B-``/``I-`` prefixed short label (first token ``B-``, the rest ``I-``);
    all other tokens are written with whatever ``token.get_label('ner')``
    returns for an unlabelled token (presumably ``O`` — verify against the
    installed flair version).

    Args:
        path: Output file path. Missing parent directories are created.
        sentences: Sentences to tag. Defaults to the notebook-level
            ``silver_text`` list, which keeps the original one-argument call
            working.
        tagger: Object with a ``predict(sentences)`` method. Defaults to the
            notebook-level ``model``.
        encoding: Encoding of the output file. Fixed to UTF-8 by default so
            the non-ASCII characters of the Latin-1 source text are written
            the same way on every platform instead of depending on the locale.

    Raises:
        KeyError: If a predicted span carries a tag that is not in the label
            map below.
    """
    if sentences is None:
        sentences = silver_text
    if tagger is None:
        tagger = model

    # Annotates the sentences in place with 'ner' spans.
    tagger.predict(sentences)

    label_map = {'Funding Agency':'FUND',
                  'Person':'IND',
                  'Corporation':'COR',
                  'Grant Number':'GRNB',
                  'University':'UNI',
                  'Miscellaneous':'MISC'
                  }

    # open() does not create directories; make sure the target folder exists.
    output_dir = os.path.dirname(path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(path, "w", encoding=encoding) as f:
        for sentence in sentences:
            # Project each span label onto its tokens in BIO form.
            for entity in sentence.get_spans('ner'):
                prefix = 'B-'
                for token in entity:
                    token.set_label('ner', prefix + label_map[entity.tag], entity.score)
                    prefix = 'I-'

            f.writelines(
                f"{token.text} {token.get_label('ner').value}\n" for token in sentence
            )
            # A blank line separates sentences.
            f.write("\n")


In [5]:
# Run the tagging step and write the result into the corpus1 silver directory.
silver_set_output_path = 'data/corpus1_silver/silver_set.txt'
create_silver_standard_txt(silver_set_output_path)

In [6]:
corpus1_path = 'data/corpus1'
corpus_silver_path = 'data/corpus1_silver'
file_names = ['train.txt', 'dev.txt', 'test.txt']

# Each run starts from fresh copies of the original splits, so re-running this
# cell does not append the silver set a second time.
for file_name in file_names:
    source_path = f'{corpus1_path}/{file_name}'
    destination_path = f'{corpus_silver_path}/{file_name}'
    shutil.copyfile(source_path, destination_path)

silver_set_path = f'{corpus_silver_path}/silver_set.txt'
train_path = f'{corpus_silver_path}/train.txt'

# Append the silver set byte-for-byte. Binary mode avoids decoding and
# re-encoding the text with the platform's default encoding, which can fail or
# alter non-ASCII characters on machines whose locale is not UTF-8.
# NOTE(review): assumes train.txt already ends with a blank line so the first
# silver sentence is not merged into the last original sentence — confirm.
with open(train_path, 'ab') as train_file, open(silver_set_path, 'rb') as silver_set_file:
    shutil.copyfileobj(silver_set_file, train_file)


In [9]:
# NOTE(review): no training step is visible in this notebook before this cell,
# so this appears to write the tagger loaded earlier under a new file name —
# confirm that this is intended.
silver_model_path = 'models/flair_corp1_silver4.pt'
model.save(silver_model_path)

In [None]:
corpus4_path = 'data/corpus4'
corpus_silver_path = 'data/corpus4_silver'
file_names = ['train.txt', 'dev.txt', 'test.txt']

# shutil.copyfile does not create directories; make sure the target exists.
os.makedirs(corpus_silver_path, exist_ok=True)

# Each run starts from fresh copies of the original splits, so re-running this
# cell does not append the silver set a second time.
for file_name in file_names:
    source_path = f'{corpus4_path}/{file_name}'
    destination_path = f'{corpus_silver_path}/{file_name}'
    shutil.copyfile(source_path, destination_path)

# Derive both paths from corpus_silver_path so that changing the directory
# above is enough to retarget the whole cell.
silver_set_path = f'{corpus_silver_path}/silver_set.txt'
train_path = f'{corpus_silver_path}/train.txt'

# Reuse the silver set that was generated for corpus1.
shutil.copyfile('data/corpus1_silver/silver_set.txt', silver_set_path)

# Append the silver set byte-for-byte. Binary mode avoids decoding and
# re-encoding the text with the platform's default encoding, which can fail or
# alter non-ASCII characters on machines whose locale is not UTF-8.
with open(train_path, 'ab') as train_file, open(silver_set_path, 'rb') as silver_set_file:
    shutil.copyfileobj(silver_set_file, train_file)