In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import spacy

In [None]:
# Environment switch: when True, DATA_DIR is set to the "gdrive/MyDrive/Work/qjn/"
# folder; when False, to the local "data/" folder.
COLAB = False

In [None]:
# Root folder for every input file; it already ends with "/" so file names are
# appended directly.
DATA_DIR = "gdrive/MyDrive/Work/qjn/" if COLAB else "data/"

# Article tables, one parquet file per split.
FULL_TEXT_TRAIN_PATH = f"{DATA_DIR}newsarticles_article_train.parquet"
FULL_TEXT_DEV_PATH = f"{DATA_DIR}newsarticles_article_dev.parquet"
FULL_TEXT_TEST_PATH = f"{DATA_DIR}newsarticles_article_test.parquet"

# User-coded label tables, one CSV file per split.
USER_LABELS_TRAIN_PATH = f"{DATA_DIR}newsarticles_usercoding_train.csv"
USER_LABELS_DEV_PATH = f"{DATA_DIR}newsarticles_usercoding_dev.csv"
USER_LABELS_TEST_PATH = f"{DATA_DIR}newsarticles_usercoding_test.csv"

# Trained-location table (not split).
GEOCODED_PATH = f"{DATA_DIR}newsarticles_trainedlocation.parquet"

# Read Data

In [None]:
# Training-split article table; later cells read its `id` and `bodytext` columns.
article_data = pd.read_parquet(FULL_TEXT_TRAIN_PATH)

In [None]:
# Training-split user labels; the next cell groups them by `article_id` and
# collects `loc_start`, `loc_end` and `loc_text` per article.
loc_data = pd.read_csv(USER_LABELS_TRAIN_PATH)

In [None]:
# Collapse the label rows into one row per article, gathering each label column
# into a list, then keep only the articles that have at least one label row.
label_columns = ['loc_start', 'loc_end', 'loc_text']
loc_texts = (
    loc_data
    .groupby('article_id', as_index=False)
    .agg(dict.fromkeys(label_columns, list))
)
ner_data = pd.merge(article_data, loc_texts, how='inner', left_on='id', right_on='article_id')

# NER

## Test Pretrained Model

In [None]:
from dataclasses import dataclass

@dataclass
class RowScore:
    """Match counts for one or more rows, plus the metrics derived from them.

    Args:
        true_pos: number of predicted items that are also in the labels.
        false_pos: number of predicted items that are not in the labels.
        false_neg: number of labelled items that were not predicted.

    After construction the instance also carries ``precision``, ``recall`` and
    ``f1``. Each of them is 0 when its denominator would be zero.
    """
    true_pos: int
    false_pos: int
    false_neg: int

    def __post_init__(self):
        # Derived metrics live in __post_init__ (not a hand-written __init__) so
        # that the dataclass-generated __init__ still stores the three counts;
        # __add__, f1_strict, repr and == all read them.
        if self.true_pos + self.false_pos == 0:
            self.precision = 0
        else:
            self.precision = self.true_pos / (self.true_pos + self.false_pos)

        if self.true_pos + self.false_neg == 0:
            self.recall = 0
        else:
            self.recall = self.true_pos / (self.true_pos + self.false_neg)

        if self.precision == 0 or self.recall == 0:
            self.f1 = 0
        else:
            # Harmonic mean of precision and recall.
            self.f1 = 2 / (1 / self.precision + 1 / self.recall)

    def __str__(self):
        return f"RowScore(precision={self.precision:.2e}, recall={self.recall:.2e}, f1={self.f1:.2e})"

    def __add__(self, other):
        """Return a new RowScore whose counts are the element-wise sums.

        Raises:
            TypeError: if ``other`` is not a RowScore.
        """
        if not isinstance(other, RowScore):
            raise TypeError("other must be RowScore")
        return RowScore(self.true_pos + other.true_pos,
                        self.false_pos + other.false_pos,
                        self.false_neg + other.false_neg)

    def f1_strict(self):
        """Print the raw counts and the three metrics, then return ``f1``."""
        print(self.true_pos, self.false_pos, self.false_neg)
        print("Strict index and text matches:")
        print("Precision: {:.4f}".format(self.precision))
        print("Recall: {:.4f}".format(self.recall))
        print("F1: {:.4f}".format(self.f1))
        return self.f1

In [None]:
@dataclass
class Entity:
    """A labelled span of text: a label, two integer offsets and the span's text."""
    label: str  # entity label; score_row only counts entities whose label is 'LOC'
    start: int  # start offset; spacy_ner fills this with token.idx
    end: int  # end offset; spacy_ner fills this with token.idx + len(token.text)
    text: str  # the span's text; score_row strips it before comparing

def predict_batch(batch, model, batch_size):
    """Run ``model`` over a batch's body texts and key the results by article id.

    Args:
        batch: frame-like object exposing ``bodytext`` and ``id`` columns.
        model: callable taking a list of texts plus a ``batch_size`` keyword and
            returning one result per text, in order.
        batch_size: forwarded to ``model`` unchanged.

    Returns:
        dict mapping each value of ``batch.id`` to the model result for that row.
    """
    texts = batch.bodytext.to_list()
    predictions = model(texts, batch_size=batch_size)
    return {article_id: result for article_id, result in zip(batch.id, predictions)}

def score_batch(batch, model, batch_size):
    """Score every row of ``batch`` against the model output and sum the scores.

    Args:
        batch: frame-like object exposing ``bodytext`` and supporting ``itertuples()``.
        model: callable taking a list of texts plus a ``batch_size`` keyword and
            returning one entity collection per text, in order.
        batch_size: forwarded to ``model`` unchanged.

    Returns:
        The sum of ``score_row`` results, starting from ``RowScore(0, 0, 0)``.
    """
    empty_score = RowScore(0, 0, 0)
    predictions = model(batch.bodytext.to_list(), batch_size=batch_size)
    row_scores = (score_row(row, found) for row, found in zip(batch.itertuples(), predictions))
    return sum(row_scores, empty_score)

def score_row(row, entities):
    """Compare one row's labelled location texts with the predicted 'LOC' entities.

    Both sides are whitespace-stripped and de-duplicated into sets, so the
    comparison is on exact text only; offsets are not considered.

    Args:
        row: object with a ``loc_text`` iterable of strings.
        entities: iterable of objects with ``label`` and ``text`` attributes.

    Returns:
        RowScore(true_pos, false_pos, false_neg) for this row.
    """
    predicted = set()
    for ent in entities:
        if ent.label == 'LOC':
            predicted.add(ent.text.strip())

    labelled = set()
    for text in row.loc_text:
        labelled.add(text.strip())

    hits = len(labelled & predicted)
    return RowScore(hits, len(predicted) - hits, len(labelled) - hits)

### Spacy

In [None]:
# Shared spaCy pipeline; spacy_scorer, spacy_doccer and spacy_ner_sr all read this global.
nlp = spacy.load("en_core_web_md")

In [None]:
def spacy_ner(doc):
    """Collect every token whose spaCy entity type counts as a location.

    Args:
        doc: iterable of spaCy tokens (objects with ``ent_type_``, ``idx`` and ``text``).

    Returns:
        list of Entity, one per matching token (not per multi-token span), each
        labelled "LOC" with offsets ``token.idx`` and ``token.idx + len(token.text)``.
    """
    # Every element needs its own trailing comma: adjacent string literals are
    # silently concatenated, which would turn 'LOC' 'EVENT' into 'LOCEVENT'.
    loc_labels = (
        'FAC',    # Buildings, airports, highways, bridges, etc.
        'ORG',    # Companies, agencies, institutions, etc.
        'GPE',    # Countries, cities, states
        'LOC',    # Non-GPE locations, mountain ranges, bodies of water
        'EVENT',  # Named events (e.g., "World Cup")
    )

    return [
        Entity("LOC", token.idx, token.idx + len(token.text), token.text)
        for token in doc
        if token.ent_type_ in loc_labels
    ]

def spacy_scorer(texts, **kwargs):
    """Pipe ``texts`` through the global ``nlp`` and extract location entities.

    Args:
        texts: iterable of strings.
        **kwargs: forwarded to ``nlp.pipe`` (e.g. ``batch_size``); must not
            contain ``disable``, which is set here.

    Returns:
        list with one ``spacy_ner`` result per input text, in order.
    """
    skipped_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
    piped_docs = nlp.pipe(texts, **kwargs, disable=skipped_components)
    return list(map(spacy_ner, piped_docs))

def spacy_doccer(texts, **kwargs):
    """Pipe ``texts`` through the global ``nlp`` with the "ner" component disabled.

    Args:
        texts: iterable of strings.
        **kwargs: forwarded to ``nlp.pipe``; must not contain ``disable``.

    Returns:
        Whatever ``nlp.pipe`` returns, unmodified.
    """
    return nlp.pipe(texts, **kwargs, disable=["ner"])

#### Optimize Batch Size

In [None]:
# Result of the timing loop below: batch size 128 was the fastest and 64 the second fastest.
# for batch_size in batch_sizes:
#     start = time.time()
#     _ = spacy_scorer(texts, batch_size=batch_size)
#     end = time.time()
#     print(f"Batch size {batch_size}: {end - start:.4f} sec")

#### Optimize nlp.pipe


In [None]:
### Optimize nlp.pipe
# for batch_size in [16,32,64,128]:
#     start = time.time()
#     _ = spacy_scorer(texts, batch_size=batch_size, n_process=2)
#     end = time.time()
#     print(f"Batch size {batch_size}: {end - start:.4f} sec")

#### Actually Run

In [None]:
# Flag articles whose body length lies outside the 1.5 * IQR fences.
# Only the upper-fence flag is stored on the frame, as `very_long`.
body_lengths = ner_data.bodytext.str.len()
q1, q3 = body_lengths.quantile(.25), body_lengths.quantile(.75)
iqr = q3 - q1
left_outliers = body_lengths < (q1 - 1.5 * iqr)
right_outliers = body_lengths > (q3 + 1.5 * iqr)
outliers = left_outliers | right_outliers
ner_data['very_long'] = right_outliers

In [None]:
PIPE_BATCH_SIZE = 64
DF_BATCH_SIZE = PIPE_BATCH_SIZE * 4

# Sort by body length so each batch holds texts of similar size.
ner_data_sorted = ner_data.sort_values('bodytext', key=lambda x: x.str.len())
# .copy() so that columns added later are written to an independent frame and
# not to a slice of the sorted frame.
ner_data_sorted = ner_data_sorted.iloc[:1000].copy() # XXX: SAMPLING FOR TESTING!

# Chunk the frame that is actually iterated (the sample, not the full ner_data)
# into consecutive slices of at most DF_BATCH_SIZE rows. An empty frame yields
# no batches instead of raising.
batches = [ner_data_sorted.iloc[start:start + DF_BATCH_SIZE]
           for start in range(0, len(ner_data_sorted), DF_BATCH_SIZE)]

# article id -> list of predicted entities
entities = dict()
for batch in tqdm(batches):
    # Any batch containing a very long article is piped one text at a time.
    batch_size = 1 if batch['very_long'].any() else PIPE_BATCH_SIZE
    entities |= predict_batch(batch, spacy_scorer, batch_size=batch_size)

In [None]:
# Attach, per article: the predicted entities (looked up by article id in the
# `entities` dict), their text strings, and the score_row result comparing
# them with the labelled loc_text.
ner_data_sorted['ents'] = ner_data_sorted['id'].map(entities)
ner_data_sorted['ent_text'] = ner_data_sorted['ents'].apply(lambda xs: [x.text for x in xs])
ner_data_sorted['score'] = ner_data_sorted.apply(lambda x: score_row(x, x.ents), axis=1)

In [None]:
# Side-by-side view of labelled texts, predicted texts and the per-row score.
ner_data_sorted[['loc_text','ent_text','score']]

The pretrained spaCy model misses all of the "[NUM] block of [STREET]" locations.

## Fine-Tune Spacy

In [None]:
from spacy.training import Example
from spacy.util import minibatch
import random
from spacy.tokens import DocBin, Doc

In [None]:
# Same batching loop as the entity prediction run, but through spacy_doccer
# (pipeline with "ner" disabled), so `docs` maps article id -> piped document.
docs = dict()
for batch in tqdm(batches):
    # Any batch containing a very long article is piped one text at a time.
    batch_size = 1 if batch['very_long'].any() else PIPE_BATCH_SIZE
    docs |= predict_batch(batch, spacy_doccer, batch_size=batch_size)

In [None]:
OFFSET = 9 # IDK what is causing this.
# NOTE(review): OFFSET is not applied anywhere in this cell.

# NOTE(review): this rebinds `docs` from the id -> doc dict to a Series aligned
# with ner_data_sorted, so the cell is not safe to re-run without rebuilding `docs`.
docs = ner_data_sorted['id'].map(docs)

# spaCy's "entities" annotation is a list of (start_char, end_char, label)
# tuples. The third element must be the entity label, not the span text, so
# every labelled span is tagged "LOC" (the label the scorer uses).
entities = ner_data_sorted.apply(
    lambda r: [(start, end, "LOC") for start, end in zip(r.loc_start, r.loc_end)],
    axis=1)
examples = [Example.from_dict(d, {'entities': e}) for d,e in zip(docs, entities)]


In [None]:
# TODO: I am fixing the indexes which are jank. 
# Might as well push this to the data processing ipynb actually.
# Currently OK-ish but need to handle when keyword appears multiple times.
# str.find returns only the first occurrence (and -1 when the text is absent).
# NOTE(review): `ner_data_wide` is not defined in any visible cell — confirm
# which frame this should read.

loc_starts = ner_data_wide.apply(lambda row: [row.bodytext.find(t) for t in row.loc_text], axis=1)

In [None]:
# Distribution of how many repeated entries each row's loc_text list contains.
# NOTE(review): `ner_data_wide` is not defined in any visible cell.
ner_data_wide.loc_text.apply(lambda x: len(x) - len(set(x))).value_counts()

In [None]:
# NOTE(review): str has no attribute `fin`, so this cell raises AttributeError;
# it looks like a leftover scratch cell — consider removing it.
'abc'.fin

In [None]:
# Inspect the first row.
# NOTE(review): `ner_data_wide` is not defined in any visible cell.
ner_data_wide.iloc[0]

In [None]:
# Sanity check: every labelled location text must occur verbatim in its article body.
# NOTE(review): `ner_data_long` is not defined in any visible cell.
for row in ner_data_long.itertuples():
    for txt in row.loc_text:
        assert txt in row.bodytext

In [None]:
# Convert the first article's entity offsets to BILUO tags on a freshly
# tokenized doc. Uses `.iloc`, so `entities` must be the per-row Series here,
# not the earlier id -> entities dict.
spacy.training.offsets_to_biluo_tags(nlp.make_doc(ner_data_sorted.iloc[0].bodytext), entities.iloc[0])

In [None]:
# Build one Example per doc, using the doc's own `ents` as the annotation.
# NOTE(review): no visible cell assigns a `doc` column to ner_data_sorted.
train_examples = [Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in ner_data_sorted.doc]

# NER + SR

**TODO:**
* The current spaCy model does not resolve coreferences across sentence boundaries.
* Keyword similarity produces far too many false positives.

In [None]:
from spacyspanbert.spanbert import SpanBERT 
# Instantiate SpanBERT from the local "./pretrained_spanbert" path.
# NOTE(review): `spanbert` is not referenced by any other visible cell.
spanbert = SpanBERT("./pretrained_spanbert")  

In [None]:
def dep_tree(token):
    """Return the text of ``token``'s dependency root, then its subtree texts.

    Follows ``head`` links upward until a token is its own head (or has a falsy
    head). The result is the root's text followed by the text of every token in
    ``root.subtree``, in the order that iterable yields them.
    """
    root = token
    while root.head and root.head != root:
        root = root.head
    return [root.text, *(node.text for node in root.subtree)]

In [None]:
def spacy_ner_sr(text):
    """Find location tokens that share a sentence with a crime-related word.

    For every token whose entity type is in ``loc_labels``, each vector-bearing
    token in the same sentence is compared with the topic keywords; a match is
    recorded when the mean similarity exceeds 0.5.

    Args:
        text: raw article text, processed with the global ``nlp`` pipeline.

    Returns:
        list of dicts with keys "keyword" (the similar token's text),
        "similarity" (mean similarity to the topic keywords), "location" (the
        location token's text) and "context" (non-whitespace token texts of the
        sentence).
    """
    # Process text
    doc = nlp(text)

    # Every element needs its own trailing comma: adjacent string literals are
    # silently concatenated, which would turn 'LOC' 'EVENT' into 'LOCEVENT'.
    loc_labels = ['FAC',    # Buildings, airports, highways, bridges, etc.
                  'ORG',    # Companies, agencies, institutions, etc.
                  'GPE',    # Countries, cities, states
                  'LOC',    # Non-GPE locations, mountain ranges, bodies of water
                  'EVENT']  # Named events (e.g., "World Cup")
    topic_keywords = ['crime','arrest','police']
    # First token of each keyword, used as the similarity reference.
    topics = [nlp(t)[0] for t in topic_keywords]

    matches = []
    for token in doc:
        # Look for location entity first because fewer of these than word embeddings.
        if token.ent_type_ not in loc_labels:
            continue
        # Check for crime-related word in sentence.
        sent = [t for t in token.sent if t.has_vector]
        for tok in sent:
            sim = np.mean([t.similarity(tok) for t in topics])
            if sim > .5:
                match = {"keyword": tok.text,
                         "similarity": sim,
                         "location": token.text,
                         "context": [x.text for x in token.sent if not x.text.isspace()]}
                matches.append(match)
    return matches

In [None]:
matches = []
# Runs spacy_ner_sr over every article body; per the original note this takes
# about 3 minutes.
for text in tqdm(article_data.bodytext):
    matches.extend(spacy_ner_sr(text))

In [None]:
# Rebinds `matches` from the list of match dicts to a DataFrame (one row per match).
matches = pd.DataFrame.from_records(matches)

In [None]:
# Display the match table.
matches

In [None]:
# Mean similarity per matched keyword.
matches.groupby(['keyword'])['similarity'].mean()

In [None]:
# Top 20 (keyword, location) pairs by mean similarity.
matches.groupby(['keyword','location'])['similarity'].mean().sort_values(ascending=False).head(20)

In [None]:
# Top 20 keywords by mean similarity.
matches.groupby(['keyword'])['similarity'].mean().sort_values(ascending=False).head(20)

## Fine Tune

**TODO:**
* Split train/dev/test
* Fine-tune DistilBERT (or a similar model) to extract just the location text
* Do EDA on the other trained model to try to check how many locations were geocodable.
* DON'T CONFUSE THE NEIGHBORHOOD CLASSIFIER WITH THE NER!