In [None]:
#!/usr/bin/env python3

"""
- Script name: spacy_models_train
- Author: Dan Bright, cosmoid@tuta.io
- Description: A script to train spaCy 
  language models for NER
"""


!pip install spacy spacy-transformers  # install spaCy and the spacy-transformers package
#!python -m spacy download en_core_web_lg  # (disabled) install base spaCy CNN model
!python -m spacy download en_core_web_trf  # install base spaCy transformer model

In [None]:
# imports
import spacy, json, glob, os, random
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
import locale

# Fix Colab locale bug by forcing UTF-8. The replacement keeps the optional
# ``do_setlocale`` argument of the real function, so callers that pass it
# (e.g. ``locale.getpreferredencoding(False)``) do not raise TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# setup

def get_annotation_file_handles(train_url, ext, print_output=True):
    """Collect the annotation files found directly inside ``train_url``.

    The sorted list of paths is stored in the module-level
    ``annotated_files`` global and is also returned, so the function can be
    used (and tested) without relying on global state.

    Args:
        train_url: Directory to search; a trailing slash is optional.
        ext: File extension to match, without the leading dot (e.g. ``'json'``).
        print_output: When true, print how many files were found.

    Returns:
        list[str]: Matching file paths, sorted by file name.
    """
    global annotated_files

    # os.path.join copes with a present or missing trailing slash and keeps an
    # empty directory string relative to the current working directory
    # (plain concatenation turned '' into the filesystem root '/').
    pattern = os.path.join(train_url, f'*.{ext}')
    # sort on the file name only, so the order does not depend on the directory
    annotated_files = sorted(glob.glob(pattern), key=os.path.basename)
    # print counted files to demonstrate success
    if print_output:
        print(f'Number of annotated files: {len(annotated_files)}')
    return annotated_files

def json_to_doc(print_output=False):
    """Parse every file listed in the global ``annotated_files`` as JSON.

    The parsed objects are stored, in file order, in the global
    ``annotations`` list.

    Args:
        print_output: When true, print the number of parsed files and the
            first parsed object as a sanity check.
    """
    global annotations

    parsed_docs = []
    for json_path in annotated_files:
        with open(json_path, 'r', encoding='utf-8') as handle:
            parsed_docs.append(json.load(handle))

    if print_output:
        # print count of annotation dicts to verify success
        print(f'Number of annotations in files: {len(parsed_docs)}')
        # print first element (document), to verify
        print(f'Annotation sample: {parsed_docs[:1]}')

    annotations = parsed_docs

def test_train_split(print_output=False):
    global annotations_training, annotations_dev
    random.shuffle(annotations)
    annotations_training = annotations[0:int(len(annotations)*0.8)]
    annotations_dev = annotations[len(annotations_training):]
    print(f'\nTraining data is {len(annotations_training)} documents, dev data is {len(annotations_dev)} documents.\n') if print_output else None
    print(f'\nFirst training document (to test randomisation): {annotations_training[0]}\n') if print_output else None

In [None]:
def setup(print_output=0, colab=0):
    """globals set here"""
    global annotated_files, labels_of_interest, docbin_object_path, docbin_object_training_filename, docbin_object_dev_filename
    google_drive_path = '/content/drive/MyDrive/'
    annotations_data_path = f'{google_drive_path if colab else "./"}data/train/json'
    annotations_data_filetype = 'json'
    docbin_object_path = f'{google_drive_path if colab else "./"}data/docbin/'  # important: remember trailing slash
    docbin_object_training_filename = 'training_data.spacy'
    docbin_object_dev_filename = 'dev_data.spacy'
    # define entity labels of interest
    labels_of_interest = ['ATC_CITY', 'ATC_STATE', 'ICDT_DATE', 'ICDT_TIME', 'ICDT_LOC', 'UAS_COLOR', 'UAS_SHAPE',
                          'UAS_HEADING', 'UAS_SIZE', 'UAS_REL_ALT', 'UAS_ACT_ALT', 'AC_ALT', 'AC_TYPE', 'AC_HEADING', 'FT_NAME','FL_OPTOR','FT_ROUTE']
    # run setup functions
    get_annotation_file_handles(annotations_data_path, annotations_data_filetype, print_output)
    json_to_doc(print_output)
    # split into train and dev data
    test_train_split(print_output)

In [None]:
def count_training_samples():
    """Return the number of samples (paragraphs) in the training corpus.

    Adds up the length of the ``'annotations'`` list of every document in the
    global ``annotations_training``.
    """
    total = 0
    for training_doc in annotations_training:
        total += len(training_doc['annotations'])
    return total

def count_dev_samples():
    """Return the number of samples (paragraphs) in the dev corpus.

    Adds up the length of the ``'annotations'`` list of every document in the
    global ``annotations_dev``.
    """
    total = 0
    for dev_doc in annotations_dev:
        total += len(dev_doc['annotations'])
    return total

def get_annotated_entities(annotations):
    """Return the ``'entities'`` value of every non-empty annotated line.

    Each line is indexed as ``line[1]['entities']``; falsy lines
    (e.g. ``None`` or an empty list) are skipped.
    """
    entities_per_line = []
    for line in annotations:
        if not line:
            continue
        entities_per_line.append(line[1]['entities'])
    return entities_per_line

def get_text(annotations):
    """Return the first element (the raw text) of every non-empty annotated line.

    Falsy lines (e.g. ``None`` or an empty list) are skipped.
    """
    texts = []
    for line in annotations:
        if not line:
            continue
        texts.append(line[0])
    return texts

def get_annotations(print_output=False):
    """Extract per-line entities and text from the training and dev documents.

    Results are stored in four globals, each a list of per-document lists:

    - ``annotated_entities_training`` / ``annotated_entities_dev``: the
      entities of every line of every document.
    - ``annotated_text_training`` / ``annotated_text_dev``: the text of every
      line of every document, i.e. ``[[line1, line2], [line1, line2]]``.

    Args:
        print_output: When true, print document/sample/line/entity counts and
            a few sample lines. A sample is skipped (instead of raising
            ``IndexError``) when the requested document or line does not exist.
    """
    global annotated_entities_training, annotated_entities_dev, annotated_text_training, annotated_text_dev

    def print_sample(label, docs, doc_idx):
        # show line 1 of the requested document, but only if it exists
        if doc_idx < len(docs) and docs[doc_idx]:
            print(f'{label}: {docs[doc_idx][0]}\n')

    # get all entities from all lines in all the hand-annotated docs
    annotated_entities_training = [get_annotated_entities(doc['annotations']) for doc in annotations_training]
    annotated_entities_dev = [get_annotated_entities(doc['annotations']) for doc in annotations_dev]
    # get all text from all lines in all the hand-annotated docs
    annotated_text_training = [get_text(doc['annotations']) for doc in annotations_training]
    annotated_text_dev = [get_text(doc['annotations']) for doc in annotations_dev]

    if print_output:
        # print total counts of annotated documents, lines and entities
        print(f'Number of training documents: {len(annotated_entities_training)}')
        print(f'Number of training samples (paragraphs) in training documents: {count_training_samples()}')
        print(f'Number of dev documents: {len(annotated_entities_dev)}')
        print(f'Number of dev samples (paragraphs) in dev documents: {count_dev_samples()}')
        print(f'Number of training lines: {sum([len(x) for x in annotated_entities_training])}')
        print(f'Number of dev lines: {sum([len(x) for x in annotated_entities_dev])}')
        print(f'Number of training entities: {sum([sum(len(y) for y in x ) for x in annotated_entities_training])}\n')
        print(f'Number of dev entities: {sum([sum(len(y) for y in x ) for x in annotated_entities_dev])}\n')
        # print the entities of the first line of sample docs, to verify entities
        print_sample('Annotated entities training sample (doc 4, line 1)', annotated_entities_training, 3)
        print_sample('Annotated entities dev sample (doc 1, line 1)', annotated_entities_dev, 0)
        # print sample of annotated text to verify
        print_sample('Annotated text training sample (doc 2, line 1)', annotated_text_training, 1)
        print_sample('Annotated text dev sample (doc 1, line 1)', annotated_text_dev, 0)


In [None]:
def _flatten_annotations(entities_by_doc, text_by_doc):
    """Pair each line's text with its entities, flattened across all documents."""
    flattened = []
    for doc_idx, doc in enumerate(entities_by_doc):
        for line_idx, line in enumerate(doc):
            # keep only (start, end, label) of each entity, as a tuple
            ents = [(ent[0], ent[1], ent[2]) for ent in line]
            flattened.append({'entities': ents, 'text': text_by_doc[doc_idx][line_idx]})
    return flattened


def compile_training_data(print_output):
    """Build the global ``training_data`` and ``dev_data`` dictionaries.

    Each dictionary has two keys: ``'classes'`` (the global
    ``labels_of_interest``) and ``'annotations'`` (a flat list of
    ``{'entities': [(start, end, label), ...], 'text': line_text}`` dicts, one
    per annotated line, built from the ``annotated_entities_*`` and
    ``annotated_text_*`` globals).

    Args:
        print_output: When true, print the third compiled annotation of each
            set as a sample; a set with fewer than three annotations is
            skipped instead of raising ``IndexError``.
    """
    global training_data, dev_data

    training_data = {
        'classes': labels_of_interest,
        'annotations': _flatten_annotations(annotated_entities_training, annotated_text_training),
    }
    dev_data = {
        'classes': labels_of_interest,
        'annotations': _flatten_annotations(annotated_entities_dev, annotated_text_dev),
    }

    if print_output:
        sample_idx = 2
        # print sample of compiled annotation training data
        if len(training_data['annotations']) > sample_idx:
            print(f'Training data sample (annotation {sample_idx + 1}): {training_data["annotations"][sample_idx]}\n')
        # print sample of compiled annotation dev data
        if len(dev_data['annotations']) > sample_idx:
            print(f'Dev data sample (annotation {sample_idx + 1}): {dev_data["annotations"][sample_idx]}\n')


In [None]:
def prepare_training(print_output):
    """Serialise the compiled training and dev annotations to ``.spacy`` files.

    Reads the globals ``training_data`` / ``dev_data`` (their
    ``'annotations'`` lists) and writes one DocBin file each into the global
    ``docbin_object_path``, named by ``docbin_object_training_filename`` and
    ``docbin_object_dev_filename``. The output directory is created when it
    does not exist yet.

    Args:
        print_output: When true, print per-file entity counts.
    """
    nlp = spacy.blank("en")  # blank English pipeline, used here only to make docs

    def create_spacy_file(path, filename, data):
        """Convert ``data`` (dicts with 'text' and 'entities') into a DocBin file."""
        doc_bin = DocBin()
        skipped_ent_count = 0   # entities for which char_span returned None
        filtered_ents_count = 0  # entities remaining after filter_spans
        # iterate the list itself (not enumerate) so tqdm can show the total
        for training_line in tqdm(data):
            doc = nlp.make_doc(training_line['text'])
            ents = []
            for start, end, label in training_line['entities']:
                span = doc.char_span(start, end, label=label, alignment_mode="contract")
                if span is None:
                    skipped_ent_count += 1
                else:
                    ents.append(span)
            filtered_ents = filter_spans(ents)
            filtered_ents_count += len(filtered_ents)
            doc.ents = filtered_ents
            doc_bin.add(doc)
        if print_output:
            print(f'Number of skipped entities: {skipped_ent_count}')
            print(f'Number of filtered entities: {filtered_ents_count}')
        # make sure the target directory exists before writing
        if path:
            os.makedirs(path, exist_ok=True)
        # os.path.join works whether or not ``path`` ends with a slash
        doc_bin.to_disk(os.path.join(path, filename))

    # create training data
    create_spacy_file(docbin_object_path, docbin_object_training_filename, data=training_data['annotations'])
    # create dev data
    create_spacy_file(docbin_object_path, docbin_object_dev_filename, data=dev_data['annotations'])

In [None]:
def mount_google_drive():
    """Mount Google Drive at ``/content/drive`` using the Colab helper."""
    # imported inside the function, so the import only runs when this is called
    from google.colab import drive as colab_drive

    colab_drive.mount('/content/drive')

In [None]:
# run the data-preparation steps that must precede training
colab = 1  # boolean, True if using Google Colab, else False

# Google Drive is mounted only when running on Colab
if colab:
    mount_google_drive()

setup(1, colab)  # arguments: print output? (boolean), running on Colab? (boolean)
get_annotations(1)  # argument: print output? (boolean)
compile_training_data(1)  # argument: print output? (boolean)
prepare_training(1)  # argument: print output? (boolean)

In [None]:
# Train using CPU
# add defaults to base config created at https://spacy.io/usage/training#quickstart
#!python -m spacy init fill-config /content/drive/MyDrive/data/base_config_cnn_accuracy.cfg /content/drive/MyDrive/data/config.cfg

# train the model (CPU)
#!python -m spacy train /content/drive/MyDrive/data/config.cfg --output /content/drive/MyDrive/data/ --paths.train /content/drive/MyDrive/data/docbin/training_data.spacy --paths.dev /content/drive/MyDrive/data/docbin/dev_data.spacy

In [None]:
# Train using GPU
# add defaults to base config created at https://spacy.io/usage/training#quickstart

!python -m spacy init fill-config /content/drive/MyDrive/config/base_config_transformer_accuracy.cfg /content/drive/MyDrive/config/config.cfg

# train the model (GPU)
!python -m spacy train /content/drive/MyDrive/config/config.cfg --output /content/drive/MyDrive/models/ --paths.train /content/drive/MyDrive/data/docbin/training_data.spacy --paths.dev /content/drive/MyDrive/data/docbin/training_data.spacy --gpu-id 0