## Summary

This code is the runnable python notebook version of `bio_tagging_mergeA.py`

This file applies BIO tagging to the pure texts for the mergeModelA, which covers information including names,
phone numbers and fax numbers.

The output files (`train.spacy` and `dev.spacy`) will be in the spaCy DocBin format.

special tricks applied: 
1. use regex to tag the phone numbers and fax numbers written in multiple formats.
   the spaCy ruler supports tagging with regex, but its behavior is not exactly the same as vanilla regex,
   so vanilla regex (the `re` module) is used to solve the problem.


special dependencies:<br>
NONE

## Import

In [1]:
import os
import json
import spacy
from spacy.util import filter_spans
from spacy.tokens import Doc, DocBin
import collections
from tqdm import tqdm
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

## Load Data

In [None]:
# load the Cicero data
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` keeps the old behavior of `error_bad_lines=False`:
# malformed rows are skipped and a warning is emitted for each of them.
cicero_df = pd.read_csv(
    '/content/drive/MyDrive/Cicero/cicero_officials_sample_2022-09-08.csv',
    on_bad_lines='warn',
)

In [4]:
# read the pure texts of the webpages from the processed JSON dump
raw_content_path = Path("/content/drive/MyDrive/Cicero/cicero_officials_sample_2022-09-08_processed_raw_content.json")
pure_text_dict = json.loads(raw_content_path.read_text())

## Bio tagging

In [5]:
# directory (relative to the working directory) that receives the .spacy files
output = Path(".") / "mergeA"

In [6]:
# create the output directory (and any missing parents) if it does not exist yet;
# `exist_ok=True` makes the call a no-op for an existing directory and avoids
# the gap between a separate existence check and the creation
os.makedirs(output, exist_ok=True)

In [39]:
# helper function
def regex_search(web_content, unit, search_attribute, label, pattern_list, error_log):
    '''
    Add the bio tagging patterns for one phone/fax attribute to pattern_list.

    Two patterns may be added, both tagged with `label`:
      1. the attribute value exactly as it is stored in `unit`;
      2. the first piece of `web_content` matched by a regex built from the
         attribute value, so that the number is also tagged when the page
         writes it in a different format.

    The regex is built from the items 1, 3 and 4 of the attribute value split
    on non-word characters. For a value shaped like "(202) 555-1234" the split
    gives ['', '202', '', '555', '1234'], i.e. the three digit groups. Values
    that split into fewer than five items cannot be converted and are recorded
    in error_log instead of raising.

    Args:
      web_content: string. the content where we want to search information.
      unit: dictionary-like (dict or pandas Series). contains all the
        attributes of one politician; 'id' is required for error logging.
      search_attribute: string. the attribute's name.
      label: string. the name of the label for the attribute.
      pattern_list: list. the pattern list which will be added to ruler;
        it is modified in place.
      error_log: defaultdict(dictionary). the dictionary to store the error
        instance, keyed by the politician id. A later failure for the same
        politician overwrites the earlier entry.

    Returns:
      None
    '''
    attribute_value = unit[search_attribute]

    # always tag the value exactly as it is stored in the record
    pattern_list.append({"label": label, 'pattern': attribute_value})

    # the parts only contain word characters, so they are safe to insert
    # into the regex without escaping
    part_list = re.split(r'[^\w]', attribute_value)
    try:
        first_part, second_part, third_part = part_list[1], part_list[3], part_list[4]
        # optional "(first)" + optional whitespace, or "first" + one separator,
        # followed by "second" + one separator + "third".
        # NOTE(review): the separator `(-|.)` contains an unescaped dot, so it
        # accepts any single character (space, dot, dash, ...); kept as is.
        regex_pattern = r"((\(({})\)(\s){{0,1}}?)|(({})(-|.)))?({})(-|.)({})".format(
            first_part, first_part, second_part, third_part)
        search_result = re.search(regex_pattern, web_content)
        if search_result:
            # use the caller's label so that fax numbers are not tagged as PHONE
            pattern_list.append({"label": label, 'pattern': search_result.group()})
    except Exception as e:
        # deliberate best effort: a value that cannot be converted into a regex
        # must not stop the tagging, it is only recorded for later inspection
        politician_id = unit['id']
        # `.get` keeps the logging itself from failing when the url is missing
        error_log[politician_id]['url'] = unit.get('url_1')
        error_log[politician_id]['search_attribute'] = search_attribute
        error_log[politician_id]['error'] = str(e)

In [41]:
# bio tagging
# the tagged docs are collected in a list first and saved as spaCy DocBin files later
nlp = spacy.load('en_core_web_sm')
tagged_data = []

# create the error log
error_log = collections.defaultdict(dict)

# number of tokens of the page text that go into one output doc
CHUNK_SIZE = 100

# (attribute name, label) of every contact attribute passed to regex_search
CONTACT_ATTRIBUTES = (
    ('primary_phone_1', 'PHONE'),
    ('primary_phone_2', 'PHONE'),
    ('secondary_phone_1', 'PHONE'),
    ('secondary_phone_2', 'PHONE'),
    ('primary_fax_1', 'FAX'),
    ('primary_fax_2', 'FAX'),
    ('secondary_fax_1', 'FAX'),
    ('secondary_fax_2', 'FAX'),
)

for _, row in tqdm(cicero_df.iterrows(), total=len(cicero_df)):
    # drop the empty cells so that only the available attributes are tagged
    information_unit = row.dropna()
    politician_id = information_unit["id"]

    pure_text = pure_text_dict.get(str(politician_id), None)

    # skip the data point that we cannot find the pure text due to some reasons
    # for example, the url address is not valid
    if pure_text is None:
        continue

    bio_tag_pattern_list = []

    # tag the name, case insensitive by using the `LOWER` attribute
    # a name part that is missing (dropped by dropna) or not a string is left out
    name_tokens = [
        {'LOWER': name_part.lower()}
        for name_part in (information_unit.get('first_name'), information_unit.get('last_name'))
        if isinstance(name_part, str)
    ]
    # tag the full name only when both the first name and the last name are available
    if len(name_tokens) == 2:
        bio_tag_pattern_list.append({"label": 'NAME', 'pattern': name_tokens})
    # tag the first name and the last name on their own
    for name_token in name_tokens:
        bio_tag_pattern_list.append({"label": 'NAME', 'pattern': [name_token]})

    # tag the phone numbers and the fax numbers
    for attribute, label in CONTACT_ATTRIBUTES:
        if attribute in information_unit:
            regex_search(pure_text, information_unit, attribute, label, bio_tag_pattern_list, error_log)

    # a new ruler is initialized for each politician/data point
    ruler = nlp.add_pipe("span_ruler")
    try:
        ruler.add_patterns(bio_tag_pattern_list)

        # only the tokenization of the whole page is needed to cut it into chunks,
        # so the remaining pipeline components are not run on the full text
        doc = nlp.make_doc(pure_text)
        for start in range(0, len(doc), CHUNK_SIZE):
            # run the pipeline (including the ruler) on the text of one chunk
            sub_doc = nlp(doc[start:start + CHUNK_SIZE].text)
            # keep the non-overlapping ruler matches as the entities of the doc
            sub_doc.ents = filter_spans(sub_doc.spans["ruler"])
            tagged_data.append(sub_doc)
    finally:
        # remove the ruler even when an error occurs, so that a re-run of this
        # cell does not fail because 'span_ruler' is still in the pipeline
        nlp.remove_pipe('span_ruler')

100%|██████████| 1919/1919 [16:19<00:00,  1.96it/s]


In [40]:
# manual clean-up: drop a span ruler that an interrupted tagging run left on the pipeline.
# the check keeps the cell from raising when the ruler has already been removed
if 'span_ruler' in nlp.pipe_names:
    nlp.remove_pipe('span_ruler')

('span_ruler', <spacy.pipeline.span_ruler.SpanRuler at 0x7f9339899980>)

In [34]:
# keep only the docs that contain at least one token
tagged_data = [tagged_doc for tagged_doc in tagged_data if len(tagged_doc) != 0]

In [35]:
# hold out 10% of the tagged docs as the dev set; the fixed seed makes the split repeatable
train, dev = train_test_split(
    tagged_data,
    random_state=42,
    test_size=0.1,
)

In [36]:
# pack the training docs into one DocBin and write it to the output directory
train_db = DocBin(docs=train)
train_db.to_disk(output / "train.spacy")

In [37]:
# pack the dev docs into one DocBin and write it to the output directory
dev_db = DocBin(docs=dev)
dev_db.to_disk(output / "dev.spacy")