In [2]:
import pandas as pd
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid
import json
import ollama.client as client



splitter = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap  = 100,
    length_function = len,
    is_separator_regex = False,
)

In [3]:
from transformers import pipeline

## Roberta based NER
#ner = pipeline("token-classification", model="2rtl3/mn-xlm-roberta-base-named-entity", aggregation_strategy="simple")
ner = pipeline("token-classification", model="dslim/bert-large-NER", aggregation_strategy="simple")


print("Number of parameters ->", ner.model.num_parameters()/1000000, "Mn")


  from .autonotebook import tqdm as notebook_tqdm
2024-04-28 21:09:30.327661: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-28 21:09:30.344830: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-28 21:09:30.978152: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-28 21:09:34.183049: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at dslim/b

Number of parameters -> 332.538889 Mn


In [4]:
def row2NamedEntities(row):
    # print(row)
    ner_results = ner(row['text'])
    metadata = {'chunk_id': row['chunk_id']}
    entities = []
    for result in ner_results:
        entities = entities + [{'name': result['word'], 'entity': result['entity_group'], **metadata}]
        
    return entities

def dfText2DfNE(dataframe):
    ## Takes a dataframe from the parsed data and returns dataframe with named entities. 
    ## The input dataframe must have a text and a chunk_id column. 

    ## Using swifter for parallelism
    ## 1. Calculate named entities for each row of the dataframe. 
    results = dataframe.apply(row2NamedEntities, axis=1)

    ## Flatten the list of lists to one single list of entities. 
    entities_list = np.concatenate(results).ravel().tolist()

    ## Remove all NaN entities
    entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## Count the number of occurances per chunk id
    entities_dataframe = entities_dataframe.groupby(['name', 'entity', 'chunk_id']).size().reset_index(name='count')

    return entities_dataframe

In [5]:
## Temp chunk to use the same loader 
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

# ## Input data directory
data_dir = "cureus"
inputdirectory = Path(f"./data_input/{data_dir}")
# ## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")


## Dir PDF Loader
#loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
#loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
print(pages[3].page_content)


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:39<00:00, 39.36s/it]

Number of chunks =  23
An extensive literature search was performed, and 56 articles published in peer-reviewed journals between 2005 and 2021 were selected and analyzed. The corresponding authors' experiential knowledge served as the foundation for the analysis.





In [6]:
loader = PyPDFLoader("./data_input/cureus-0015-00000040274.pdf")
# loader = PyPDFDirectoryLoader("./data/kesy1dd")
#loader = DirectoryLoader(inputdirectory, show_progress=True)
pages = loader.load_and_split(text_splitter=splitter)
len(pages)


23

In [7]:

rows = []
for page in pages:
    row = {'text': page.page_content, **page.metadata, 'chunk_id': uuid.uuid4().hex}
    rows += [row]

df = pd.DataFrame(rows)


In [8]:
dfne = dfText2DfNE(df)


In [9]:
df_ne = dfne.groupby(['name', 'entity']).agg({'count': 'sum', 'chunk_id': ','.join}).reset_index()
df_ne.sort_values(by='count', ascending=False).head(100).reset_index()

Unnamed: 0,index,name,entity,count,chunk_id
0,42,India,LOC,30,"0866429890ad4076a2244db6ff07d754,41f7aae7ac074..."
1,64,Saxena,PER,5,"787a2469005b4cc1bb21228f2939f8ef,cffb568e9afd4..."
2,43,Indian,MISC,4,"652fa0ee84574a39aefe68cff71c51b9,66bf3788ce324..."
3,14,AS,MISC,4,"652fa0ee84574a39aefe68cff71c51b9,9c172d9214244..."
4,67,US,LOC,4,"487e9f79d79c4d3a90428cb5d3c77c24,652fa0ee84574..."
...,...,...,...,...,...
71,35,Government of,ORG,1,f720d5714d524c538ff64f44e9ca22a0
72,37,Health,ORG,1,9c172d92142443d796a5f923153adb31
73,1,##I,LOC,1,e528057150114290a74f0cb23460b534
74,39,Health Provide,ORG,1,cef148c6b82d4fbb9106f1728e5576dc


In [10]:
pages[1].page_content

"be addressed if India is to accelerate itsprogress toward achieving universal health coverage and its sustainable development goals (SDGs).The recent increase in the federal health budget offers an unprecedented opportunity to do this. This articleutilizes the ready materials, extract and analyze data, distill findings (READ) approach to adding to theauthors' experiential learning to analyze the health system in India. The growing divide between the publicand the burgeoning private health sector systems, with the latter's booming medical tourism industry andmedical schools, are analyzed along with the newly minted National Medical Council, to recommend policiesthat would help India achieve its SDGs.Categories: Public Health, Epidemiology/Public Health, Health PolicyKeywords: working conditions, indian public health standards, auxiliary nurse midwives, human resource, healthsector reform, india, health policyIntroduction And BackgroundIndia’s health indicators have improved in recent t

In [11]:


def extractConcepts(prompt: str, model='mistral-openorca:latest'):
    SYS_PROMPT = (
        "Your task is to extract the key entities mentioned in the users input.\n"
        "Entities may include - event, concept, person, place, object, document, organisation, artifact, misc, etc.\n"
        "Format your output as a list of json with the following structure.\n"
        "[{\n"
        "   \"entity\": The Entity string\n"
        "   \"importance\": How important is the entity given the context on a scale of 1 to 5, 5 being the highest.\n"
        "   \"type\": Type of entity\n"
        "}, { }]"
    )
    response, context = client.generate(model_name=model, system=SYS_PROMPT, prompt=prompt)
    return json.loads(response)


In [12]:
res = extractConcepts(prompt = pages[1].page_content)


An error occurred: 404 Client Error: Not Found for url: http://localhost:11434/api/generate


TypeError: the JSON object must be str, bytes or bytearray, not NoneType

In [None]:
res

[{'entity': 'infectious diseases', 'importance': 4},
 {'entity': 'disasters', 'importance': 3},
 {'entity': 'EMS', 'importance': 3},
 {'entity': 'RRA reports', 'importance': 2},
 {'entity': 'EIS bulletins', 'importance': 1.5},
 {'entity': 'DON reports', 'importance': 1},
 {'entity': 'WHO Regions', 'importance': 2},
 {'entity': 'IHR (2005) framework', 'importance': 3}]