In [35]:
import pandas as pd
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid
import json
import ollama.client as client



splitter = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap  = 100,
    length_function = len,
    is_separator_regex = False,
)

In [36]:
from transformers import pipeline

## Roberta based NER
ner = pipeline("token-classification", model="2rtl3/mn-xlm-roberta-base-named-entity", aggregation_strategy="simple")
#ner = pipeline("token-classification", model="dslim/bert-large-NER", aggregation_strategy="simple")


print("Number of parameters ->", ner.model.num_parameters()/1000000, "Mn")


Number of parameters -> 277.456901 Mn


In [38]:
def row2NamedEntities(row):
    # print(row)
    ner_results = ner(row['text'])
    metadata = {'chunk_id': row['chunk_id']}
    entities = []
    for result in ner_results:
        entities = entities + [{'name': result['word'], 'entity': result['entity_group'], **metadata}]
        
    return entities

def dfText2DfNE(dataframe):
    ## Takes a dataframe from the parsed data and returns dataframe with named entities. 
    ## The input dataframe must have a text and a chunk_id column. 

    ## Using swifter for parallelism
    ## 1. Calculate named entities for each row of the dataframe. 
    results = dataframe.apply(row2NamedEntities, axis=1)

    ## Flatten the list of lists to one single list of entities. 
    entities_list = np.concatenate(results).ravel().tolist()

    ## Remove all NaN entities
    entities_dataframe = pd.DataFrame(entities_list).replace(' ', np.nan)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## Count the number of occurances per chunk id
    entities_dataframe = entities_dataframe.groupby(['name', 'entity', 'chunk_id']).size().reset_index(name='count')

    return entities_dataframe

In [39]:
## Temp chunk to use the same loader 
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

# ## Input data directory
data_dir = "cureus"
inputdirectory = Path(f"./data_input/{data_dir}")
# ## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")


## Dir PDF Loader
#loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
#loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
print(pages[3].page_content)


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.94it/s]

Number of chunks =  21
Review

Overview of the public and private health sectors

The government-funded health sector, which is the provider of healthcare to vulnerable populations, has been chronically underfunded with 1.28% of the GDP. This translates to a healthcare expenditure of $2.7 per citizen per year. As a consequence, India has 0.7 public hospital beds per 100,000 people [2] now and 0.576 physicians per 1,000 population in 2000 [4], compared to the World Health Organization's recommended doctor-to-population ratio of 1:1,000 [5]. Since the inception of the National Health Mission (NHM) in 2005, the government has aimed to increase the quantum of services provided, but a lack of focus on quality has failed to make a dent in healthcare indicators [6]. At best, 37% of the population had any health insurance coverage in 2018 [7].





In [40]:
# #loader = PyPDFLoader("./data/GlobalPublicHealth2022.pdf")
# loader = PyPDFDirectoryLoader("./data/kesy1dd")
loader = DirectoryLoader(inputdirectory, show_progress=True)
pages = loader.load_and_split(text_splitter=splitter)
len(pages)


100%|██████████| 1/1 [00:00<00:00, 13.78it/s]


21

In [42]:

rows = []
for page in pages:
    row = {'text': page.page_content, **page.metadata, 'chunk_id': uuid.uuid4().hex}
    rows += [row]

df = pd.DataFrame(rows)


In [43]:
dfne = dfText2DfNE(df)


In [44]:
df_ne = dfne.groupby(['name', 'entity']).agg({'count': 'sum', 'chunk_id': ','.join}).reset_index()
df_ne.sort_values(by='count', ascending=False).head(100).reset_index()

Unnamed: 0,index,name,entity,count,chunk_id
0,25,India,LOC,30,"00234c790deb443d80e70b2e2317a99c,0271cd9765f84..."
1,4,ASHAs,MISC,5,"040cfeb81a1e4c56b6a194f433df9ecf,aee27861364c4..."
2,13,COVID-19,MISC,4,"1e19b3ed270d4e58a1e3c631c2106a78,aee27861364c4..."
3,34,National Medical Council,ORG,4,"010055a5231246e2a66f85ac6479d836,df327e30f04e4..."
4,3,ANMs,MISC,3,040cfeb81a1e4c56b6a194f433df9ecf
...,...,...,...,...,...
69,36,Nottingham,LOC,1,a78edd364a4e46d8ae611114241959c4
70,1,-Income Count,MISC,1,1e19b3ed270d4e58a1e3c631c2106a78
71,38,Online Training Management Information Systems,MISC,1,636c8a5239374716bcae64548e066470
72,39,Out-Of-Pocket (OOP),MISC,1,5de6b6a64ae649ad9c93f5a39b0c0675


In [45]:
pages[12].page_content

'Improve Efficiency\n\nThe public health system is currently unable to utilize even the 1% of the GDP that has traditionally been available to it. With a doubling of the federal government’s allocation in 2022, its capacity to do so must be increased. Increasing the financial allocation and its utilization capacity will enhance the health infrastructure and workforce capability. This requires institutional capacity building and competency-based training in the government healthcare system [37]. This will allow the country to align its health services to local priorities as India exhibits its diversity in its population and disease profiles. The recently started National Health Protection Mission has limited uptake The majority of Indians (up to 80.9% in urban\n\nand 85.9% in rural) still do not have health insurance [38].\n\nAccredit Health Facilities and in Practice Regulate the Private Health Sector\n\nThe public health sector has revised standards [39] for healthcare infrastructure,

In [46]:


def extractConcepts(prompt: str, model='mistral-openorca:latest'):
    SYS_PROMPT = (
        "Your task is to extract the key entities mentioned in the users input.\n"
        "Entities may include - event, concept, person, place, object, document, organisation, artifact, misc, etc.\n"
        "Format your output as a list of json with the following structure.\n"
        "[{\n"
        "   \"entity\": The Entity string\n"
        "   \"importance\": How important is the entity given the context on a scale of 1 to 5, 5 being the highest.\n"
        "   \"type\": Type of entity\n"
        "}, { }]"
    )
    response, context = client.generate(model_name=model, system=SYS_PROMPT, prompt=prompt)
    return json.loads(response)


In [48]:
res = extractConcepts(prompt = pages[1].page_content)


An error occurred: 404 Client Error: Not Found for url: http://localhost:11434/api/generate


TypeError: the JSON object must be str, bytes or bytearray, not NoneType

In [168]:
res

[{'entity': 'infectious diseases', 'importance': 4},
 {'entity': 'disasters', 'importance': 3},
 {'entity': 'EMS', 'importance': 3},
 {'entity': 'RRA reports', 'importance': 2},
 {'entity': 'EIS bulletins', 'importance': 1.5},
 {'entity': 'DON reports', 'importance': 1},
 {'entity': 'WHO Regions', 'importance': 2},
 {'entity': 'IHR (2005) framework', 'importance': 3}]