## Summary

This is the runnable Python notebook version of `bio_tagging_mergeB.py`.

This notebook applies BIO tagging to the pure texts for model B, which covers the following information: salutation,
party, state, county, and city.

The output files (`train.spacy` and `dev.spacy`) are saved in spaCy's binary `DocBin` format.

Special tricks applied:
1. Use the external file that stores the US state information to tag both the abbreviation and the full name of each state.
    The abbreviation is tagged as "STATE", and the full name is tagged as "STATE_F".


Special dependencies:
1. The external file `state.csv`, which stores the US state information.

## Import

In [None]:
import collections
import json
import os
import re
from pathlib import Path

import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.tokens import Doc, DocBin
from spacy.util import filter_spans
from tqdm import tqdm

## Load Data

In [None]:
# Fill in the three input locations below before running the notebook.

# CSV export of the Cicero data
CICERO_DATA_PATH = ""

# CSV file with the US state abbreviations and full names
STATE_FILE_PATH = ""

# JSON file holding the pure text of each webpage
PURE_TEXT_PATH = ""

In [None]:
# load the Cicero data from the configured path (the variable, not a literal
# file name); malformed rows are dropped with a warning instead of aborting
# the load.
# `on_bad_lines` replaces `error_bad_lines`, which was deprecated in pandas 1.3
# and removed in pandas 2.0.
cicero_df = pd.read_csv(CICERO_DATA_PATH, on_bad_lines="warn")

In [None]:
# load the external file that stores the US states information from the
# configured path (the variable, not a literal file name)
state_df = pd.read_csv(STATE_FILE_PATH)
state_abbr = state_df["Abbreviation"].tolist()
state_full = state_df["Full-Name"].tolist()
# lookup table: state abbreviation -> full state name
state_dict = dict(zip(state_abbr, state_full))

In [None]:
# load the pure texts of the webpages from the configured path (the variable,
# not a literal file name); the encoding is pinned so the result does not
# depend on the platform's default locale
with open(PURE_TEXT_PATH, "r", encoding="utf-8") as f:
    pure_text_dict = json.load(f)

## BIO Tagging

In [None]:
# directory that receives the serialized train/dev spaCy files
output = Path("./mergeB/")

In [None]:
# create the output directory (and any missing parents) if it is not there yet;
# `exist_ok=True` avoids the check-then-create race of a separate existence test
os.makedirs(output, exist_ok=True)

In [None]:
nlp = spacy.load('en_core_web_sm')

# number of spaCy tokens per chunk; the doc is split into multiple chunks since
# the input of the model should be less than 512 tokens
CHUNK_SIZE = 100

# (Cicero column, entity label) pairs whose cell value is used as the pattern.
# The order is the order in which the patterns are handed to the span ruler.
FIELD_LABELS = (
    ("salutation", "SALUTATION"),
    ("party", "PARTY"),
    ("primary_state", "STATE"),
    ("secondary_state", "STATE"),
    ("primary_county", "COUNTY"),
    ("secondary_county", "COUNTY"),
    ("primary_city", "CITY"),
    ("secondary_city", "CITY"),
)


def _state_full_name_pattern(state_full_name):
    """Return a case-insensitive token pattern for a state's full name.

    Each dict in a spaCy token pattern describes exactly one token, so one
    dict is emitted per whitespace-separated word. A single dict holding
    "new york" could never match the two tokens "New" and "York".
    """
    return [{"LOWER": word} for word in state_full_name.lower().split()]


def _build_patterns(information_unit):
    """Build the span ruler patterns for one politician.

    Args:
        information_unit: the politician's Cicero row with missing values
            already dropped.

    Returns:
        A list of ``{"label": ..., "pattern": ...}`` dicts. A state column
        whose value is a known abbreviation additionally yields a "STATE_F"
        pattern for the state's full name.
    """
    # reference: https://spacy.io/usage/rule-based-matching
    patterns = []
    for field, label in FIELD_LABELS:
        value = information_unit.get(field)
        if value is None:
            continue
        patterns.append({"label": label, "pattern": value})
        if label == "STATE":
            # get the full name of the state
            state_full_name = state_dict.get(value)
            if state_full_name is not None:
                patterns.append({
                    "label": "STATE_F",
                    "pattern": _state_full_name_pattern(state_full_name),
                })
    return patterns


# bio tagging
# the output will be stored in the list first and then save as the spaCy Doc file
tagged_data = []

for _, row in tqdm(cicero_df.iterrows(), total=len(cicero_df)):
    information_unit = row.dropna()
    politician_id = information_unit["id"]

    pure_text = pure_text_dict.get(str(politician_id))

    # skip the data point that we cannot find the pure text due to some reasons
    # for example, the url address is not valid
    if pure_text is None:
        continue

    # a fresh ruler is added for each politician/data point so that patterns of
    # one politician never tag the text of another
    ruler = nlp.add_pipe("span_ruler")
    try:
        ruler.add_patterns(_build_patterns(information_unit))
        doc = nlp(pure_text)
        # stepping by CHUNK_SIZE never produces an empty trailing chunk
        for start in range(0, len(doc), CHUNK_SIZE):
            sub_doc = nlp(str(doc[start:start + CHUNK_SIZE]))
            # the filter_spans function is used to remove the overlapping entities
            sub_doc.ents = filter_spans(sub_doc.spans["ruler"])
            tagged_data.append(sub_doc)
    finally:
        # always remove the ruler, even on failure; otherwise the next
        # add_pipe("span_ruler") call (or a re-run of this cell) would fail
        # because the component name is still registered
        nlp.remove_pipe("span_ruler")

100%|██████████| 1919/1919 [14:32<00:00,  2.20it/s]


In [None]:
# keep only the chunks that contain at least one token
tagged_data = list(filter(len, tagged_data))

In [None]:
# hold out 10% of the tagged chunks as the dev set; the fixed seed keeps the
# split reproducible between runs
train, dev = train_test_split(
    tagged_data,
    test_size=0.1,
    random_state=42,
)

In [None]:
# pack the training docs into a DocBin and write it to the output directory
train_db = DocBin(docs=train)
train_db.to_disk(output / "train.spacy")

In [None]:
# pack the dev docs into a DocBin and write it to the output directory
dev_db = DocBin(docs=dev)
dev_db.to_disk(output / "dev.spacy")