In [1]:
from tqdm.notebook import tqdm
import fullmetalalchemy as fa
import pandas as pd
import spacy

# Configuration: the spaCy pipeline to load and the SQLite database to write to.
SPACY_MODEL = "en_core_web_trf"
DB_URL = 'sqlite:///data3.db'

# Shared handles used by every later cell.
nlp = spacy.load(SPACY_MODEL)
engine = fa.create.create_engine(DB_URL)

In [2]:
# Keep only the raw text column; after reset_index the frame carries a fresh
# 0..n-1 index that doubles as the integer row id.
texts = pd.read_csv('ner.csv').reset_index(drop=False)[['text']]

# Build the final frame in one chain instead of assigning columns onto a slice.
# `text_id` is a content hash of the text. The builtin `hash` is salted per
# process for strings, so its values change on every kernel restart;
# `hash_pandas_object` is deterministic, which keeps ids stable across runs.
df = texts.assign(
    id=texts.index,
    text_id=pd.util.hash_pandas_object(texts['text'], index=False).astype(str),
)[['id', 'text_id', 'text']]

# (Re)create the `text` table from the frame; `id` is the primary key.
text_table = fa.create.create_table_from_records(
    table_name='text',
    records=df.to_dict('records'),
    primary_key=['id'],
    engine=engine,
    if_exists='replace',
    autoincrement=True
)

In [3]:
# Schema of the `entities` table: column name -> Python type, in column order.
entity_columns = {
    'id': int,
    'entity': str,
    'label': str,
    'text_id': str,
}

# (Re)create an empty `entities` table keyed by an auto-incrementing `id`.
entities_table = fa.create.create_table(
    table_name='entities',
    column_names=list(entity_columns.keys()),
    column_types=list(entity_columns.values()),
    primary_key=['id'],
    engine=engine,
    autoincrement=True,
    if_exists='replace',
)

In [4]:
# Number of rows in the `text` table; used later to size the progress bar.
total = fa.features.get_row_count(text_table)
# Bare expression so the notebook displays the count as the cell output.
total

47959

In [5]:
def insert_entities(ents, text_id: str, session) -> None:
    """Stage one record per entity in the `entities` table.

    Args:
        ents: Iterable of entity objects exposing `.text` and `.label_`
            (the caller in this notebook passes `doc.ents`).
        text_id: Identifier of the source text, stored in the `text_id`
            column of every record.
        session: Database session the records are inserted through. This
            function does not commit; the caller owns the transaction.

    Returns:
        None.
    """
    records = [
        {'entity': ent.text, 'label': ent.label_, 'text_id': text_id}
        for ent in ents
    ]
    # Texts with no entities are common; skip the database call entirely
    # rather than issuing an insert with an empty batch.
    if not records:
        return
    fa.insert.insert_records_session(entities_table, records, session)

In [None]:
CHUNK_SIZE = 1000  # rows pulled from the `text` table per batch

# tqdm expects an integer number of iterations; total / 1000 produced a float
# (the bar rendered as "0/47.959"). Ceiling division counts the final partial chunk.
n_chunks = -(-total // CHUNK_SIZE)

with fa.create.create_session(engine) as session:
    # Two parallel chunk streams over the same table; they are zipped below,
    # so both must yield rows in the same order.
    text_chunks = fa.select.select_column_values_chunks(text_table, 'text', CHUNK_SIZE, session)
    # NOTE(review): this reads the integer `id` column, not the hashed `text_id`
    # column, so `entities.text_id` ends up referencing `text.id` — confirm intent.
    text_id_chunks = fa.select.select_column_values_chunks(text_table, 'id', CHUNK_SIZE, session)

    for text_chunk, id_chunk in tqdm(zip(text_chunks, text_id_chunks), total=n_chunks):
        # as_tuples=True carries each text's id through the pipeline as context.
        pipe = nlp.pipe(zip(text_chunk, id_chunk), as_tuples=True, n_process=-1)
        for doc, text_id in pipe:
            insert_entities(doc.ents, text_id, session)

    # Nothing above commits, and a session closed by the context manager drops
    # uncommitted work, so persist the staged inserts explicitly. Committing
    # once after the loop avoids disturbing the still-open chunk streams.
    session.commit()

  0%|          | 0/47.959 [00:00<?, ?it/s]