In [1]:
import multiprocessing

print("Number of cpu : ", multiprocessing.cpu_count())

Number of cpu :  16


In [2]:
import psycopg2
import spacy
from tqdm import tqdm
import csv
from dotenv import load_dotenv
import os

# the following code must be run only once in the environment
# python -m spacy download en_core_web_trf

In [3]:
nlp = spacy.load("en_core_web_trf")

In [4]:
load_dotenv(os.path.join("..", "settings", "local.env"))

db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")
db_username = os.getenv("DB_USERNAME")
db_password = os.getenv("DB_PASSWORD")
db_batch_size = 950 # Maximum of postgresql is 1000, some articles have up to 700 named entities.

In [5]:
conn = psycopg2.connect(database=db_name, user=db_username, password=db_password, host=db_host, port=db_port)
conn.autocommit=True
cur = conn.cursor()

In [6]:
#make sure to change date to get required records
year = 2019
cur.execute("SELECT entryId, article FROM localnews.articles WHERE date_part('year',parseddate) = {}"
    " and entryId NOT IN (SELECT DISTINCT articleId FROM localnews.named_entities) LIMIT 1000".format(year))
records = cur.fetchall()

In [29]:
# Create texts for single processing
texts = []

for record in records:
    texts.append(record[1])

In [30]:
# Create texts for multi-processing
from spacy.tokens import Doc

if not Doc.has_extension("articleID"):
    Doc.set_extension("articleID", default=None)

text_tuples = []
for record in records:
    text_tuples.append((record[1], {"articleID": record[0]}))

Start tests

In [31]:
import timeit

timeit.timeit(lambda: nlp(texts[0]), number=10)

5.039042699965648

In [32]:
timeit.timeit(lambda: nlp(texts[0], disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]), number=10)

4.78611860005185

In [36]:
timeit.timeit(lambda: nlp.pipe(text_tuples, n_process=8), number=1000)

0.00028639996889978647

In [37]:
timeit.timeit(lambda: nlp.pipe(text_tuples, n_process=8, disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]), number=1000)

0.0003212000010535121

In [47]:
for entryId, article in tqdm(records, desc="Performing NER"):
    doc = nlp(article, disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

Performing NER: 100%|██████████| 100/100 [01:03<00:00,  1.57it/s]


In [48]:
doc_tuples = nlp.pipe(text_tuples, as_tuples=True, n_process=8, disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

Single processing solution without SQL 

In [64]:
values = []
for entryId, article in tqdm(records, desc="Performing Single-processing NER"):
    doc = nlp(article, disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

    for ent in doc.ents:
        if ent.label_ not in ['DATE', 'CARDINAL', 'ORDINAL', 'QUANTITY', 'MONEY', 'TIME', 'PERCENT']:
            values.append((entryId, ent.text, ent.label_))

        if values.__len__() >= db_batch_size:
            #cur.executemany("INSERT INTO localnews.named_entities (articleId, entity, entityType) VALUES (%s, %s, %s)", values)
            values.clear()

if values.__len__() > 0:
    print()
    #cur.executemany("INSERT INTO localnews.named_entities (articleId, entity, entityType) VALUES (%s, %s, %s)", values)

Performing Single-processing NER: 100%|██████████| 100/100 [01:00<00:00,  1.66it/s]


Multi-processing NER

In [66]:
doc_tuples = nlp.pipe(text_tuples, as_tuples=True, n_process=4, batch_size=8, disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

values = []
for doc, context in tqdm(doc_tuples, desc="Performing Multi-processing NER"):    
    for ent in doc.ents:
        if ent.label_ not in ['DATE', 'CARDINAL', 'ORDINAL', 'QUANTITY', 'MONEY', 'TIME', 'PERCENT']:
            values.append((context["articleID"], ent.text, ent.label_))

    if values.__len__() >= db_batch_size:
        #cur.executemany("INSERT INTO localnews.named_entities (articleId, entity, entityType) VALUES (%s, %s, %s)", values)
        values.clear()

if values.__len__() > 0:
    #cur.executemany("INSERT INTO localnews.named_entities (articleId, entity, entityType) VALUES (%s, %s, %s)", values)
    values.clear()

Performing Multi-processing NER: 100it [01:06,  1.50it/s]
