In [1]:
import pandas as pd
import numpy as np

from transformers import pipeline, AutoTokenizer

from tqdm import tqdm
import spacy

In [2]:
COLAB = False

In [None]:
DATA_DIR = "gdrive/MyDrive/Work/quantify-news/" if COLAB else "data/"

FULL_TEXT_TRAIN_PATH = DATA_DIR + "newsarticles_article_train.parquet"
FULL_TEXT_DEV_PATH = DATA_DIR + "newsarticles_article_dev.parquet"
FULL_TEXT_TEST_PATH = DATA_DIR + "newsarticles_article_test.parquet"

USER_LABELS_TRAIN_PATH = DATA_DIR + "newsarticles_usercoding_train.csv"
USER_LABELS_DEV_PATH = DATA_DIR + "newsarticles_usercoding_dev.csv"
USER_LABELS_TEST_PATH = DATA_DIR + "newsarticles_usercoding_test.csv"    

GEOCODED_PATH = DATA_DIR + "newsarticles_trainedlocation.parquet"

# Read Data

In [4]:
article_data = pd.read_parquet(FULL_TEXT_TRAIN_PATH)

In [5]:
loc_data = pd.read_csv(USER_LABELS_TRAIN_PATH)

In [6]:
loc_texts = loc_data.groupby('article_id',as_index=False).agg({'loc_start':list, 'loc_end':list, 'loc_text':list})
ner_data = article_data.merge(loc_texts, left_on='id', right_on='article_id', how='inner')

# Test Pretrained Model

In [7]:
from dataclasses import dataclass

@dataclass
class RowScore:
    true_pos: int
    false_pos: int
    false_neg: int

    def __init__(self, true_pos, false_pos, false_neg):
        if true_pos + false_pos == 0:
            self.precision = 0
        else:
            self.precision = true_pos / (true_pos + false_pos)
        
        if true_pos + false_neg == 0:
            self.recall = 0
        else:
            self.recall = true_pos / (true_pos + false_neg)
        
        if self.precision == 0 or self.recall == 0:
            self.f1 = 0
        else:
            self.f1 = 2 / (1 / self.precision + 1 / self.recall)
    
    def __str__(self):
        return f"RowScore(precision={self.precision:.2e}, recall={self.recall:.2e}, f1={self.f1:.2e})"

    def __add__(self, other):
        if not isinstance(other, RowScore):
            raise TypeError("other must be RowScore")
        return RowScore(self.true_pos + other.true_pos, 
                        self.false_pos + other.false_pos, 
                        self.false_neg + other.false_neg)
    
    def f1_strict(self):
        print(self.true_pos, self.false_pos, self.false_neg)
        print("Strict index and text matches:")
        print("Precision: {:.4f}".format(self.precision))
        print("Recall: {:.4f}".format(self.recall))
        print("F1: {:.4f}".format(self.f1))
        return self.f1

In [8]:
@dataclass
class Entity:
    label: str
    start: int
    end: int
    text: str

def predict_batch(batch, model, batch_size):
    batch_entities = model(batch.bodytext.to_list(), batch_size=batch_size)
    return dict(zip(batch.id, batch_entities))

def score_batch(batch, model, batch_size):
    total_score = RowScore(0,0,0)
    batch_entities = model(batch.bodytext.to_list(), batch_size=batch_size)
    for row, entities in zip(batch.itertuples(), batch_entities):
        total_score += score_row(row, entities)
    return total_score

def score_row(row, entities):
    y_pred = {e.text.strip() for e in entities if e.label == 'LOC'}
    y_true = {t.strip() for t in row.loc_text}
    true_pos = len(y_true & y_pred)
    false_pos = len(y_pred - y_true)
    false_neg = len(y_true - y_pred)
    return RowScore(true_pos, false_pos, false_neg)

## HuggingFace

In [9]:
# Play around with other models from https://huggingface.co/docs/transformers/v4.17.0/en/index#supported-models
BASE_MODEL_CLF = "albert-base-v2" # "distilbert-base-uncased"
BASE_MODEL_NER = "dbmdz/bert-large-cased-finetuned-conll03-english" # "distilbert-base-uncased"

# OUTPUT_MODEL_CLF = "albert-base-v2-chi-loc" # "distilbert-chi-loc"
# OUTPUT_MODEL_NER = "dbmz-bert-loc-ner" # "distilbert-chi-loc"

In [10]:
ner_pipe = pipeline("ner", model=BASE_MODEL_NER, aggregation_strategy='average')

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [11]:
PIPE_BATCH_SIZE = 32
def hgf_scorer(texts, batch_size):
    entities = ner_pipe(texts, batch_size=batch_size)
    entities = [Entity(e['entity_group'], e['start'], e['end'], e['word']) for e in entities]
    return entities

#### Optimize Batch Size

In [12]:
import time
batch_sizes = [1, 2, 4, 8, 16, 32]
texts = ner_data.bodytext.iloc[:512].to_list()

In [13]:
# Commented out because batching didn't speed anything up.
# for batch_size in batch_sizes:
#     start = time.time()
#     _ = ner_pipe(texts, batch_size=batch_size)
#     end = time.time()
#     print(f"Batch size {batch_size}: {end - start:.4f} sec")

In [14]:
# Commented out because use_cache didn't speed anything up
# ner_pipe.model.config.use_cache = True  # May speed up generation tasks
# for batch_size in batch_sizes:
#     start = time.time()
#     _ = ner_pipe(texts, batch_size=batch_size)
#     end = time.time()
#     print(f"Batch size {batch_size}: {end - start:.4f} sec")

In [15]:
# Commented out because padding didn't speed anything up.
# ner_pipe.model.config.use_cache = False
# tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NER)
# start = time.time()
# tokens = tokenizer(texts, padding="longest", truncation=True, return_tensors="pt")
# _ = ner_pipe.model(**tokens)#, batch_size=batch_size)
# end = time.time()
# print(f"Batch size (?): {end - start:.4f} sec")

#### Actually Run

In [16]:
# Commented out because it's too damn slow > 1sec per article.
# total_score = RowScore(0,0,0,0)
# DF_BATCH_SIZE = PIPE_BATCH_SIZE * 4
# ner_data_sorted = ner_data.sort_values('bodytext', key=lambda x: x.str.len())
# batches = np.array_split(ner_data_sorted, len(ner_data) // DF_BATCH_SIZE)
# for batch in tqdm(batches):
#     total_score += score_batch(batch, hgf_scorer)
# total_score.f1_strict()
# total_score.f1_relax()

## Spacy

In [17]:
nlp = spacy.load("en_core_web_md")

In [18]:
def spacy_ner(doc):
    loc_labels = ['FAC', # Buildings, airports, highways, bridges, etc.
                  'ORG', # Companies, agencies, institutions, etc.
                  'GPE', # Countries, cities, states
                  'LOC' # Non-GPE locations, mountain ranges, bodies of water
                  'EVENT'] # Named events (e.g., "World Cup")

    matches = []
    for token in doc:
        if token.ent_type_ in loc_labels:
            match = Entity("LOC", token.idx, token.idx + len(token.text), token.text)
            matches.append(match)
    return matches

def spacy_scorer(texts, **kwargs):
    docs = nlp.pipe(texts, 
                    **kwargs,
                    disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
    entities = [spacy_ner(d) for d in docs]
    return entities

#### Optimize Batch Size

In [19]:
# 128 is the winner and 64 is 2nd.
# for batch_size in batch_sizes:
#     start = time.time()
#     _ = spacy_scorer(texts, batch_size=batch_size)
#     end = time.time()
#     print(f"Batch size {batch_size}: {end - start:.4f} sec")

#### Optimize nlp.pipe


In [20]:
# Commented out because multiprocessing didnt speed anything up.
# for batch_size in [16,32,64,128]:
#     start = time.time()
#     _ = spacy_scorer(texts, batch_size=batch_size, n_process=2)
#     end = time.time()
#     print(f"Batch size {batch_size}: {end - start:.4f} sec")

#### Actually Run

In [21]:
q3 = ner_data.bodytext.str.len().quantile(.75)
q1 = ner_data.bodytext.str.len().quantile(.25)
iqr = q3 - q1
left_outliers = ner_data.bodytext.str.len() < (q1 - 1.5 * iqr)
right_outliers = ner_data.bodytext.str.len() > (q3 + 1.5 * iqr)
outliers = left_outliers | right_outliers
ner_data['very_long'] = right_outliers

In [22]:
PIPE_BATCH_SIZE = 64
DF_BATCH_SIZE = PIPE_BATCH_SIZE * 4
ner_data_sorted = ner_data.sort_values('bodytext', key=lambda x: x.str.len())
ner_data_sorted = ner_data_sorted.iloc[:1000] # XXX: SAMPLING FOR TESTING!
batches = np.array_split(ner_data_sorted, len(ner_data) // DF_BATCH_SIZE)
entities = dict()
for batch in tqdm(batches):
    batch_size = 1 if batch['very_long'].any() else PIPE_BATCH_SIZE
    entities |= predict_batch(batch, spacy_scorer, batch_size=batch_size)

  return bound(*args, **kwds)
100%|██████████| 12/12 [00:05<00:00,  2.06it/s]


In [23]:
ner_data_sorted['ents'] = ner_data_sorted['id'].map(entities)
ner_data_sorted['ent_text'] = ner_data_sorted['ents'].apply(lambda xs: [x.text for x in xs])
ner_data_sorted['score'] = ner_data_sorted.apply(lambda x: score_row(x, x.ents), axis=1)

In [24]:
ner_data_sorted[['loc_text','ent_text','score']]

Unnamed: 0,loc_text,ent_text,score
1873,[Chicago],[Chicago],"RowScore(precision=1.00e+00, recall=1.00e+00, ..."
333,[first block of North Lorel Avenue],"[Tyler, LaRiviere]","RowScore(precision=0.00e+00, recall=0.00e+00, ..."
2200,[2100 block of South Keeler.],[Chicago],"RowScore(precision=0.00e+00, recall=0.00e+00, ..."
1872,[5200 block of West Grand Avenue.],[Chicago],"RowScore(precision=0.00e+00, recall=0.00e+00, ..."
884,[Chicago],"[Chicago, Times]","RowScore(precision=5.00e-01, recall=1.00e+00, ..."
...,...,...,...
3123,[Mount Sinai Hospital.],"[CHICAGO, Chicago, Amanda, Morris, the, Chicag...","RowScore(precision=0.00e+00, recall=0.00e+00, ..."
700,"[100 block of South State Street, Monroe stop,...","[CTA, Red, Line, Loop, South, State, Street, C...","RowScore(precision=0.00e+00, recall=0.00e+00, ..."
760,[33rd St. and Damen.],"[CHICAGO, McKinley, Park, Target]","RowScore(precision=0.00e+00, recall=0.00e+00, ..."
1646,[2000 block of South Michigan Avenue],"[Chicago, Northwestern, Memorial, Hospital, Co...","RowScore(precision=0.00e+00, recall=0.00e+00, ..."


This is missing all the "[NUM] block of [STREET]" locations.