In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import spacy

from spacy.training import Example
from spacy.tokens import DocBin

from thinc.api import Config

In [2]:
# Environment switch: when True, the path constants defined further down point
# at the "gdrive/MyDrive/Work/quantify-news/" folder and the Colab-specific
# spaCy config; when False they point at the local "data/" and "./" folders.
COLAB = False

In [None]:
# Root folders: both resolve to the same gdrive/ folder when COLAB is set;
# otherwise data lives in data/ and project files next to the notebook.
DATA_DIR = "gdrive/MyDrive/Work/quantify-news/" if COLAB else "data/"
PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/" if COLAB else "./"

# Article full text, one parquet file per split.
FULL_TEXT_TRAIN_PATH = f"{DATA_DIR}newsarticles_article_train.parquet"
FULL_TEXT_DEV_PATH = f"{DATA_DIR}newsarticles_article_dev.parquet"
FULL_TEXT_TEST_PATH = f"{DATA_DIR}newsarticles_article_test.parquet"

# User-coded location labels, one CSV file per split.
USER_LABELS_TRAIN_PATH = f"{DATA_DIR}newsarticles_usercoding_train.csv"
USER_LABELS_DEV_PATH = f"{DATA_DIR}newsarticles_usercoding_dev.csv"
USER_LABELS_TEST_PATH = f"{DATA_DIR}newsarticles_usercoding_test.csv"

# Output locations for the serialized spaCy DocBin files written by this notebook.
DATA_TRAIN_BIN_PATH = f"{DATA_DIR}ner_train.spacy"
DATA_DEV_BIN_PATH = f"{DATA_DIR}ner_dev.spacy"
DATA_TEST_BIN_PATH = f"{DATA_DIR}ner_test.spacy"

# The spaCy base config has a separate Colab variant.
if COLAB:
    SPACY_CONFIG_PATH = PROJECT_DIR + "spacy_base_config_colab.cfg"
else:
    SPACY_CONFIG_PATH = PROJECT_DIR + "spacy_base_config.cfg"


# Read Data

In [None]:
# Load the article full-text tables (parquet), one frame per split.
# preproc() below reads their 'id' and 'bodytext' columns.
article_data_train = pd.read_parquet(FULL_TEXT_TRAIN_PATH)
article_data_dev = pd.read_parquet(FULL_TEXT_DEV_PATH)
article_data_test = pd.read_parquet(FULL_TEXT_TEST_PATH)

In [None]:
# Load the user-coded location labels (CSV), one frame per split.
# preproc() below reads their 'article_id', 'loc_start', 'loc_end' and
# 'loc_text' columns.
loc_data_train = pd.read_csv(USER_LABELS_TRAIN_PATH)
loc_data_dev = pd.read_csv(USER_LABELS_DEV_PATH)
loc_data_test = pd.read_csv(USER_LABELS_TEST_PATH)

In [None]:
# Parse the spaCy base config file with thinc's Config loader; only used in
# this notebook to look up the pipeline batch size.
spacy_config = Config().from_disk(SPACY_CONFIG_PATH)

In [None]:
# Batch size taken from the [nlp] section of the loaded config, coerced to int.
# Used both to size the DataFrame chunks and as the nlp.pipe batch size.
BATCH_SIZE = int(spacy_config['nlp']['batch_size'])

# Pre Process

## Merge Labels and Context

In [None]:
def preproc(article_data, loc_data):
    """Attach each article's location labels to its (whitespace-normalized) text.

    Newlines and non-breaking spaces in ``bodytext`` and ``loc_text`` are each
    replaced by a single regular space. Every replacement is one character for
    one character, so the ``loc_start`` / ``loc_end`` offsets stay valid.

    The inputs are left untouched: normalization happens on copies, so the
    function can be re-run on the same frames with the same result.

    Parameters
    ----------
    article_data : pandas.DataFrame
        Needs the columns ``id`` and ``bodytext``.
    loc_data : pandas.DataFrame
        Needs the columns ``article_id``, ``loc_start``, ``loc_end`` and
        ``loc_text`` (one row per labelled span).

    Returns
    -------
    pandas.DataFrame
        One row per article that has at least one label (inner join), with
        ``loc_start``, ``loc_end`` and ``loc_text`` collected into lists.
    """
    def _normalize_whitespace(text_col):
        # regex=False: these are literal characters, and the default of the
        # ``regex`` argument differs between pandas versions.
        return (text_col
                .str.replace('\n', ' ', regex=False)
                .str.replace(u'\xa0', u' ', regex=False))

    # assign() returns new frames instead of writing into the caller's objects.
    article_data = article_data.assign(
        bodytext=_normalize_whitespace(article_data['bodytext']))
    loc_data = loc_data.assign(
        loc_text=_normalize_whitespace(loc_data['loc_text']))

    # Collapse the per-span rows into one row per article holding parallel lists.
    loc_texts = loc_data.groupby('article_id', as_index=False).agg(
        {'loc_start': list, 'loc_end': list, 'loc_text': list})
    ner_data = article_data.merge(loc_texts, left_on='id', right_on='article_id', how='inner')
    return ner_data

In [None]:
# Build one merged text+labels frame per split.
ner_data_train = preproc(article_data_train, loc_data_train)
ner_data_dev = preproc(article_data_dev, loc_data_dev)
ner_data_test = preproc(article_data_test, loc_data_test)

In [None]:
# Drop the raw article frames; the cells below only use the merged ner_data_* frames.
del article_data_train, article_data_dev, article_data_test

## Batch by length for inference

**TODO**: I'm not sure whether this affects anything, because spaCy might shuffle
the data anyway, and its batcher already groups documents by similar word counts.

In [None]:
def batch_data(ner_data, df_batch_size=None):
    """Sort articles by body length and split them into similarly sized chunks.

    A boolean ``very_long`` column is added to the returned chunks. It is True
    for rows whose ``bodytext`` length is above ``Q3 + 1.5 * IQR`` of all body
    lengths (the usual upper-outlier fence). The input frame is not modified.

    Parameters
    ----------
    ner_data : pandas.DataFrame
        Needs a string column ``bodytext``.
    df_batch_size : int, optional
        Target number of rows per chunk. Defaults to ``BATCH_SIZE * 4``.
        The number of chunks is ``len(ner_data) // df_batch_size`` (at least
        one), so chunks hold at least this many rows whenever the frame does.

    Returns
    -------
    list of pandas.DataFrame
        Chunks of ``ner_data`` in ascending ``bodytext`` length order.
    """
    if df_batch_size is None:
        df_batch_size = BATCH_SIZE * 4

    text_len = ner_data['bodytext'].str.len()
    q1 = text_len.quantile(.25)
    q3 = text_len.quantile(.75)
    iqr = q3 - q1
    # Only the upper fence matters: unusually short texts are not a problem.
    flagged = ner_data.assign(very_long=text_len > (q3 + 1.5 * iqr))

    ner_data_sorted = flagged.sort_values('bodytext', key=lambda col: col.str.len())

    # A frame smaller than one batch used to yield 0 sections, which makes
    # np.array_split raise ValueError; always produce at least one chunk.
    n_chunks = max(1, len(ner_data_sorted) // df_batch_size)
    # Split row positions rather than the frame itself: same chunk boundaries,
    # but without routing a DataFrame through numpy's array_split.
    row_groups = np.array_split(np.arange(len(ner_data_sorted)), n_chunks)
    return [ner_data_sorted.iloc[rows] for rows in row_groups]

In [None]:
# Length-sorted chunks per split; each is a list of DataFrames carrying a 'very_long' flag.
batches_train = batch_data(ner_data_train)
batches_dev = batch_data(ner_data_dev)
batches_test = batch_data(ner_data_test)

## Save batches to binary format 

In [None]:
# Blank English pipeline; binarize_data() uses it to turn article text into Doc objects.
nlp_blank = spacy.blank("en")

In [None]:
def binarize_data(batches, output):
    """Serialize labelled articles to a spaCy ``DocBin`` file.

    For every DataFrame in ``batches`` the ``bodytext`` column is run through
    ``nlp_blank`` and each row's ``loc_start`` / ``loc_end`` pairs are attached
    as ``NEWS_LOC`` entity annotations. The annotated reference docs are
    collected in a single ``DocBin`` (user data included) and written to
    ``output``.

    Parameters
    ----------
    batches : iterable of pandas.DataFrame
        Frames with the columns ``bodytext``, ``loc_start``, ``loc_end`` and
        ``very_long``.
    output : str
        Destination path of the ``.spacy`` file.
    """
    collected = DocBin(store_user_data=True)
    for frame in tqdm(batches):
        # Chunks that contain an outlier-length text go through the pipeline one at a time.
        if frame['very_long'].any():
            pipe_batch_size = 1
        else:
            pipe_batch_size = BATCH_SIZE

        # One list of (start, end, label) triples per row.
        spans_per_row = frame.apply(
            lambda rec: list(zip(rec.loc_start, rec.loc_end, ['NEWS_LOC'] * len(rec.loc_start))),
            axis=1,
        )
        parsed = nlp_blank.pipe(frame.bodytext, batch_size=pipe_batch_size, disable=['ner'])
        annotated = [
            Example.from_dict(parsed_doc, {"entities": spans})
            for parsed_doc, spans in zip(parsed, spans_per_row)
        ]
        for example in annotated:
            collected.add(example.reference)
    collected.to_disk(output)

In [None]:
# Write one DocBin file per split (paths are defined in the configuration cell).
binarize_data(batches_train, DATA_TRAIN_BIN_PATH)
binarize_data(batches_dev, DATA_DEV_BIN_PATH)
binarize_data(batches_test, DATA_TEST_BIN_PATH)