In [1]:
import psycopg2
import spacy
from tqdm import tqdm
import csv
from dotenv import load_dotenv
import os

# One-time setup per environment: run the following command in a shell to download the spaCy model
# python -m spacy download en_core_web_trf

In [2]:
# Name of the spaCy pipeline package to load; it has to be installed in the environment beforehand.
SPACY_MODEL_NAME = "en_core_web_trf"
nlp = spacy.load(SPACY_MODEL_NAME)

In [4]:
# Load environment variables from the project's local settings file
# (path is relative to the current working directory).
env_file = os.path.join("..", "settings", "local.env")
load_dotenv(env_file)

# Database connection settings; each one is None when the variable is not set.
db_host = os.environ.get("DB_HOST")
db_port = os.environ.get("DB_PORT")
db_name = os.environ.get("DB_NAME")
db_username = os.environ.get("DB_USERNAME")
db_password = os.environ.get("DB_PASSWORD")

# Number of buffered rows that triggers an INSERT flush.
# Original author's note: the PostgreSQL maximum is 1000 and some articles
# have up to 700 named entities, hence a threshold just below that.
db_batch_size = 950

In [5]:
# Open the database connection using the settings read from the environment.
conn = psycopg2.connect(
    database=db_name,
    user=db_username,
    password=db_password,
    host=db_host,
    port=db_port,
)
# Autocommit: every statement is committed as soon as it executes,
# so no explicit conn.commit() calls are needed later on.
conn.autocommit = True
cur = conn.cursor()

In [11]:
# Create the table that stores the extracted named entities.
# IF NOT EXISTS makes this cell safe to run on every execution of the notebook:
# it is a no-op when localnews.named_entities is already present.
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS localnews.named_entities (
        entryid integer PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
        articleId INTEGER NOT NULL REFERENCES localnews.articles(entryid),
        entity VARCHAR(2054) NOT NULL,
        entityType VARCHAR(500) NOT NULL
    )
    """
)

In [8]:
# Year of the articles to process; change it to select a different set of records.
year = 2020

# Fetch the articles of that year that have no rows in localnews.named_entities yet,
# so articles whose entities were already stored are not processed again.
# The year is passed as a bound parameter instead of being formatted into the SQL text.
cur.execute(
    "SELECT entryId, article FROM localnews.articles WHERE date_part('year',parseddate) = %s"
    " and entryId NOT IN (SELECT DISTINCT articleId FROM localnews.named_entities)",
    (year,),
)
records = cur.fetchall()

In [9]:
# Entity labels that are not stored (numeric and temporal expressions).
EXCLUDED_ENTITY_LABELS = frozenset(
    {'DATE', 'CARDINAL', 'ORDINAL', 'QUANTITY', 'MONEY', 'TIME', 'PERCENT'}
)
# Pipeline components switched off for each call; presumably none of them is needed
# to obtain doc.ents -- TODO confirm against the loaded model's pipeline.
DISABLED_COMPONENTS = ["tagger", "parser", "attribute_ruler", "lemmatizer"]
# Single definition of the INSERT statement used by both the in-loop and the final flush.
INSERT_ENTITY_SQL = (
    "INSERT INTO localnews.named_entities (articleId, entity, entityType) VALUES (%s, %s, %s)"
)

# Buffer of (articleId, entity text, entity label) rows waiting to be inserted.
values = []
for entryId, article in tqdm(records, desc="Performing NER"):
    # Skip NULL/empty article bodies: there is nothing to analyse and
    # passing None to the pipeline would abort the whole run.
    if not article:
        continue

    doc = nlp(article, disable=DISABLED_COMPONENTS)
    for ent in doc.ents:
        if ent.label_ in EXCLUDED_ENTITY_LABELS:
            continue
        values.append((entryId, ent.text, ent.label_))

        # Flush as soon as the buffer reaches the batch size so it never grows beyond it.
        if len(values) >= db_batch_size:
            cur.executemany(INSERT_ENTITY_SQL, values)
            values.clear()

# Insert whatever is left over after the last full batch.
if values:
    cur.executemany(INSERT_ENTITY_SQL, values)

Performing NER: 100%|██████████| 24026/24026 [3:06:37<00:00,  2.15it/s]   


In [None]:
# with open('ner_results_{}.csv'.format(year), 'w', newline='') as csvfile:
#     fieldnames = ['entryId', 'type', 'entity']
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()

#     # Perform NER and store the results in the CSV file
#     for entryId, article in tqdm(records, desc="Performing NER"):
#         doc = nlp(article)
#         for ent in doc.ents:
#             if ent.label_ not in ['DATE', 'CARDINAL', 'ORDINAL', 'QUANTITY', 'MONEY', 'TIME', 'PERCENT']:
#                 writer.writerow({'entryId': entryId, 'entity': ent.text, 'type': ent.label_})

# # Close the cursor and the connection
# cur.close()
# conn.close()

In [None]:
# conn = psycopg2.connect(database="decodeMT", user="postgres", password="123", host="localhost", port="5432")

# cur = conn.cursor()

# cur.execute("CREATE TABLE localnews.named_entities (entry_id INTEGER, entity VARCHAR(2048), entity_type VARCHAR(2048))")

# # read in the CSV file and insert the data into the table
# with open("./ner_results.csv", "r", encoding="utf-8") as f:
#     reader = csv.reader(f)
#     total_rows = sum(1 for row in reader)
#     f.seek(0)
#     next(reader) # skip header row
#     for i, row in tqdm(enumerate(reader, start=1), total=total_rows, desc="Inserting data"):
#         cur.execute("INSERT INTO localnews.named_entities (entry_id, entity_type, entity) VALUES (%s, %s, %s)", (row[0], row[1], row[2]))
    

# # commit the changes and close the cursor and connection objects
# conn.commit()
# cur.close()
# conn.close()