In [1]:
import pandas as pd
from zipfile import ZipFile
import pyarrow.parquet as pq
import gzip
import json
import pyarrow as pa
import os

In [11]:
# Environment switch read by the cells below: when True the notebook uses the
# gdrive/MyDrive/... paths, GPU id "0" and mounts Google Drive; when False it
# uses the local relative paths and GPU id "-1".
COLAB = True

In [16]:
# Environment-dependent settings, selected by the COLAB flag.
if COLAB:
    # Relative paths below the Drive mount point.
    PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/"
    DATA_DIR = "gdrive/MyDrive/Work/quantify-news/data/"
    GPU_ID = "0"
else:
    # Paths relative to the current working directory.
    PROJECT_DIR = "./"
    DATA_DIR = "data/"
    GPU_ID = "-1"  # NOTE(review): presumably "-1" means "no GPU" -- confirm where GPU_ID is consumed

# Files located directly under DATA_DIR.
FULL_TEXT_DEV_PATH = DATA_DIR + "newsarticles_article_dev.parquet"
PRED_BIN__PATH = DATA_DIR + "newsarticles_ner_pred_dev.spacy"
DATA_TRAIN_BIN_PATH = DATA_DIR + "ner_train.spacy"

# Model directory for the optuna outer loop.
BEST_MODEL_OPT_PATH = PROJECT_DIR + "models/model-best/"


In [19]:
if COLAB:
    import locale
    from google.colab import drive

    # Make the Drive contents available under /content/gdrive.
    drive.mount('/content/gdrive')
    # The spacy GPU assertion stays disabled here: spacy is only imported in a later cell.

    # Show the encoding reported before the override below is installed.
    print(locale.getpreferredencoding())

    def getpreferredencoding(do_setlocale=True):
        """Replacement for locale.getpreferredencoding that always reports UTF-8."""
        return 'UTF-8'

    # From here on every caller of locale.getpreferredencoding gets 'UTF-8'.
    locale.getpreferredencoding = getpreferredencoding

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
UTF-8


In [23]:
# Load the dev-set articles from the parquet file at FULL_TEXT_DEV_PATH into a DataFrame.
# Later cells read its 'bodytext' and 'relevant' columns.
article_data = pd.read_parquet(FULL_TEXT_DEV_PATH)

In [24]:
# Flatten each article body onto a single line: newlines and non-breaking
# spaces (U+00A0) are both replaced by an ordinary space.
# regex=False pins literal matching explicitly, because the default of the
# `regex` argument differs between pandas 1.x (True) and pandas 2.x (False).
article_data['bodytext'] = (
    article_data['bodytext']
    .str.replace('\n', ' ', regex=False)
    .str.replace('\xa0', ' ', regex=False)
)


In [25]:
# Keep only the rows selected by the `relevant` column
# (assumes it holds boolean flags -- TODO confirm against the parquet schema).
article_data = article_data[article_data['relevant']]

In [31]:
import sys
from itertools import islice

# itertools.batched was added in Python 3.12. Compare the interpreter's
# version tuple directly rather than parsing the free-form sys.version string,
# which needs a third-party parser and can fail on build strings like "3.11.4+".
if sys.version_info >= (3, 12):
    from itertools import batched
else:
    def batched(iterable, n):
        """Batch data into tuples of length n. The last batch may be shorter.

        Example: batched('ABCDEFG', 3) --> ABC DEF G

        Args:
            iterable: Any iterable; it is consumed lazily.
            n: Maximum batch size, must be at least 1.

        Yields:
            Tuples of up to ``n`` consecutive items.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError('n must be at least one')
        it = iter(iterable)
        # islice returns an empty tuple once the iterator is exhausted,
        # which ends the loop.
        while batch := tuple(islice(it, n)):
            yield batch


In [32]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
# Load the spaCy pipeline stored at BEST_MODEL_OPT_PATH (PROJECT_DIR + "models/model-best/").
# NOTE(review): GPU_ID is defined in the config cell but not applied before loading -- confirm this is intended.
nlp = spacy.load(BEST_MODEL_OPT_PATH)

In [33]:
# Walk through the article bodies in batches of 64 and render the entities of
# the first batch that contains at least one document with a NEWS_LOC entity.
# Missing body texts are dropped up front because nlp.pipe rejects non-string
# values such as NaN/None.
for batch in batched(article_data['bodytext'].dropna(), 64):
    # Keep only the documents in which the model predicted a NEWS_LOC entity.
    docs = [
        doc
        for doc in nlp.pipe(batch)
        if any(ent.label_ == 'NEWS_LOC' for ent in doc.ents)
    ]
    if docs:
        displacy.render(docs, style="ent")
        break
else:
    # Reached only when no batch produced a match (or there were no texts at all).
    print("No article with a NEWS_LOC entity was found.")