In [1]:

# Toggle IPython's pretty printing. %pprint flips the current state rather than setting it,
# so re-running this cell in the same kernel switches it back ON (the recorded output shows OFF).
%pprint
import sys
# Add the ../py folder (relative to the current working directory) to the module search path;
# presumably that is where the project-local modules imported below live -- TODO confirm
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from datetime import timedelta
# from edstan_data import EdstanData
from frvrs_utils import FRVRSUtilities
from notebook_utils import NotebookUtilities
from pandas import DataFrame
import humanize
import matplotlib.pyplot as plt
import numpy as np
import os
import os.path as osp
import pandas as pd
import re

# Both utility classes are given the same data and saves folders, so resolve them once
folder_kwargs = dict(
    data_folder_path=osp.abspath('../data'),
    saves_folder_path=osp.abspath('../saves'),
)
nu = NotebookUtilities(**folder_kwargs)
fu = FRVRSUtilities(**folder_kwargs)

# The web scraping helper receives the notebook utilities object and the secrets file path
from scrape_utils import WebScrapingUtilities
secrets_path = osp.abspath(osp.join(nu.data_folder, 'secrets', 'itm_secrets.json'))
wsu = WebScrapingUtilities(s=nu, secrets_json_path=secrets_path)


# Parse Domain Documents for Entities

All documents were downloaded from https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents, converted to PDF files, and stored in the data folder (`../data/Domain_Knowledge`).

In [3]:

import logging

# Send INFO-and-above log records to stdout so they appear in the cell output.
# basicConfig installs exactly one stdout StreamHandler on the root logger, so no separate
# addHandler call is needed: adding a second stdout handler prints every record twice and
# stacks up one more handler each time the cell is re-run.
# force=True removes any handlers already attached to the root logger (from an earlier run
# of this cell or from the host environment) so the stream and level always take effect
# and the cell stays idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)


## Option 1: Use a Hugging Face NER model

In [4]:

# List the names available in the llama_index postprocessor package
import llama_index.postprocessor as postprocessor

dir(postprocessor)

['AutoPrevNextNodePostprocessor', 'CohereRerank', 'EmbeddingRecencyPostprocessor', 'FixedRecencyPostprocessor', 'KeywordNodePostprocessor', 'LLMRerank', 'LongContextReorder', 'LongLLMLinguaPostprocessor', 'MetadataReplacementPostProcessor', 'NERPIINodePostprocessor', 'PIINodePostprocessor', 'PrevNextNodePostprocessor', 'SentenceEmbeddingOptimizer', 'SentenceTransformerRerank', 'SimilarityPostprocessor', 'TimeWeightedPostprocessor', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'cohere_rerank', 'llm_rerank', 'longllmlingua', 'metadata_replacement', 'node', 'node_recency', 'optimizer', 'pii', 'sbert_rerank', 'types']

In [5]:

from llama_index import ServiceContext
from llama_index.postprocessor import NERPIINodePostprocessor
import PyPDF2
from llama_index.schema import TextNode
from llama_index.schema import NodeWithScore
import spacy

# Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization
# (the key is read from the secrets JSON held by wsu and exported through the environment)
os.environ['OPENAI_API_KEY'] = wsu.secrets_json['OPENAI_API_KEY']
service_context = ServiceContext.from_defaults()
processor = NERPIINodePostprocessor(service_context=service_context)

# Load the spaCy model once, downloading it first if it is not installed yet
model_name = 'en_core_web_sm'
try:
    nlp = spacy.load(model_name)
except OSError as e:
    print(str(e).strip())
    
    # Run the download as an argument list instead of an interpolated shell string, so an
    # interpreter path containing spaces is passed through intact; check=True surfaces a
    # failed download here instead of at the spacy.load call below
    import subprocess
    command_list = [sys.executable, '-m', 'spacy', 'download', model_name, '--quiet']
    print(' '.join(command_list))
    subprocess.run(command_list, check=True)
    nlp = spacy.load(model_name)

# The model package import is kept so the module name stays bound, but the pipeline is not
# loaded a second time from it: nlp above is already the loaded en_core_web_sm pipeline
import en_core_web_sm

# Folder that is walked for PDF files, and path fragments that exclude a directory from the walk
pdf_folder = '../data/Domain_Knowledge'
black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']


## Option 2: Use spaCy

In [6]:

# load documents
entities = []
for sub_directory, directories_list, files_list in os.walk(pdf_folder):
    if all(map(lambda x: x not in sub_directory, black_list)):
        for file_name in files_list:
            if file_name.endswith('.pdf'):
                file_path = osp.join(sub_directory, file_name)
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    text = ''
                    for page_number in range(len(pdf_reader.pages)):
                        page = pdf_reader.pages[page_number]
                        text += page.extract_text()
                
                # Process the text
                node = TextNode(text=text)
                new_nodes = processor.postprocess_nodes([NodeWithScore(node=node)])
                doc = nlp(text)
                
                # Get mapping in metadata and add it as a row dictionary to the entities rows list
                for k, v in new_nodes[0].node.metadata['__pii_node_info__'].items():
                    metadata_dict = {
                        'file_path': file_path,
                        'type': k[0].split('_')[0],
                        'text': v
                    }
                    entities.append(metadata_dict)
                
                # Extract named entities
                for ent in doc.ents:
                    entities.append({
                        'file_path': file_path,
                        'type': ent.label_,
                        'text': ent.text, # Or lemma_, orth_, text_with_ws
                        'start_pos': ent.start_char, # Or start
                        'end_pos': ent.end_char # Or end
                    })
domain_doc_ners_df = DataFrame(entities)
nu.store_objects(domain_doc_ners_df=domain_doc_ners_df)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted

ValueError: [E088] Text of length 1156393 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.