In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
!apt-get install poppler-utils
!apt-get install tesseract-ocr-all
# unstructured 0.11.5
# unstructured-inference 0.7.19
!pip install unstructured[all-docs] unstructured-inference

!pip install --upgrade nltk

!pip install pdfplumber

In [None]:
from unstructured.partition.pdf import partition_pdf
import pdfplumber

In [3]:
import json
import re
import os
import pathlib
from pathlib import Path
import statistics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
# Define parameters for Unstructured's library

## include_page_breaks
# include page breaks (default is False)
include_page_breaks = True

## strategy
# The strategy to use for partitioning the PDF. Valid strategies are "hi_res", "ocr_only", and "fast".
# When using the "hi_res" strategy, the function uses a layout detection model to identify document elements.
# hi_res" is used for analyzing PDFs and extracting table structure (default is "auto")
strategy = "hi_res"

## infer_table_structure
# Only applicable if `strategy=hi_res`.
# If True, any Table elements that are extracted will also have a metadata field named "text_as_html" where the table's text content is rendered into an html string.
# I.e., rows and cells are preserved.
# Whether True or False, the "text" field is always present in any Table element and is the text content of the table (no structure).

if strategy == "hi_res": infer_table_structure = True
else: infer_table_structure = False

## extract_element_types
# Get images of tables
if infer_table_structure == True: extract_element_types=['Table']
else: extract_element_types=None

## max_characters
# The maximum number of characters to include in a partition (document element)
# If None is passed, no maximum is applied.
# Only applies to the "ocr_only" strategy (default is 1500)
if strategy != "ocr_only": max_characters = None

## languages
# The languages to use for the Tesseract agent.
# To use a language, you'll first need to install the appropriate Tesseract language pack.
languages = ["eng"] # example if more than one "eng+por" (default is "eng")

## model_name
# @requires_dependencies("unstructured_inference")
# yolox: best model for table extraction. Other options are yolox_quantized, detectron2_onnx and chipper depending on file layout
# source: https://docs.unstructured.io/welcome
hi_res_model_name = "yolox"

In [None]:
path = "/content/drive/MyDrive/Colab Notebooks/IR/Documents"
file_names = os.listdir(path)
file_names

In [None]:
def num_percentage(text):
  numbers = 0
  for char in text:
    if char.isnumeric():
      numbers += 1
  return numbers / len(text)

In [None]:
def check_false_type(item):
  return len(item['text']) <= max_title_len and item['font_size'] > avg_nar_text_size

def find_previous_item(item_types, item_index):
  for i in range(item_index, -1, -1):
    if data[i]['type'] in item_types:
      return i
  return -1

def check_table_exist(item_index):
  if data[item_index-1]['type'] == 'Table':
    if 'text_as_html' in data[item_index-1]['metadata']:
      if data[item_index]['text'] in data[item_index-1]['metadata']['text_as_html']:
        return True
  if data[item_index+1]['type'] == 'Table':
    if 'text_as_html' in data[item_index+1]['metadata']:
      if data[item_index]['text'] in data[item_index+1]['metadata']['text_as_html']:
        return True
  return False

In [None]:
def check_list_item_parent(item_index):
  temp_parent_id = data[item_index]['metadata']['parent_id']
  last_list_item = find_previous_item('ListItem', item_index-1)
  new_item_index = item_index
  while new_item_index and (0 < (new_item_index - last_list_item) < 4):
    if data[item_index]['metadata']['coordinates']['points'][0][0] > threshold and data[last_list_item]['type'] == 'ListItem':
      temp_parent_id = data[last_list_item]['element_id']
    new_item_index = last_list_item
    last_list_item = find_previous_item('ListItem', new_item_index-1)
  return temp_parent_id

In [None]:
for file_name in file_names:
  filename = '/content/drive/MyDrive/Colab Notebooks/IR/Documents/' + file_name

  elements = partition_pdf(
          filename=filename,
          include_page_breaks=include_page_breaks,
          strategy=strategy,
          infer_table_structure=infer_table_structure,
          extract_element_types=extract_element_types,
          max_characters=max_characters,
          languages=languages,
          hi_res_model_name=hi_res_model_name,
          )

  from unstructured.staging.base import elements_to_json
  elements_to_json(elements, filename=f"{filename}.json")
  with open(f"{filename}.json", "r") as file:
    data = json.load(file)


  characters = ''
  char_sizes = []
  with pdfplumber.open(filename) as pdf:
    for page in pdf.pages:
      for char in page.chars:
        if char['text'] != ' ':
          characters = characters + char['text']
          char_sizes.append(char['size'])
  for element in data:
    element_temp = element['text'].replace(" ", "")
    sizes = []
    index = characters.find(element_temp)
    for i in range(index, index + len(element_temp)):
      sizes.append(char_sizes[i])
    element['font_size'] = round(statistics.median(sizes)) if sizes else 0
    characters = characters[:index] + characters[index + len(element_temp):]
    char_sizes = char_sizes[:index] + char_sizes[index + len(element_temp):]


  key = 'type'
  unique_elements = set()
  for item in data:
    unique_elements.add(item[key])
  print(unique_elements)
  import math
  max_title_len = -1
  total_num_percentage = 0
  nar_text_sizes = []
  title_sizes = []
  indentations = []
  for i, item in enumerate(data):
    if 'coordinates' in item['metadata']:
      indentations.append({'indent': item['metadata']['coordinates']['points'][0][0], 'index': i})
    if not 'parent_id' in item['metadata']:
      item['metadata']['parent_id'] = 'None'
    if item['type'] == 'Title':
      max_title_len = max(max_title_len, len(item['text']))
      total_num_percentage += num_percentage(item['text'])
      title_sizes.append(item['font_size'])
    elif item['type'] == 'NarrativeText':
      nar_text_sizes.append(item['font_size'])
  nar_text_sizes.sort()
  avg_nar_text_size = statistics.median(nar_text_sizes)
  avg_title_size = statistics.median(title_sizes)
  indent_indexes = [item['indent'] for item in indentations]
  normal_indent = statistics.median(indent_indexes)
  avg_title_num_percentage = math.ceil(100 * total_num_percentage / len(data)) / 100

  indented_list_item = []
  for i in indentations:
    if i['indent'] > normal_indent and data[i['index']]['type'] == 'ListItem':
      indented_list_item.append(i['indent'])
  if indented_list_item:
    indented = statistics.median(indented_list_item)
  else:
    indented = 0

  print(max_title_len)
  print(avg_nar_text_size)
  print(avg_title_size)
  print(normal_indent)
  print(indented)
  print(avg_title_num_percentage)

  threshold = (normal_indent + indented) / 2

  for index, item in enumerate(data):

    if len(item['text']) != 0 and num_percentage(item['text']) > 0.5:
      item['type'] = 'Number'

    if item['type'] == 'Title' and check_table_exist(index):
      item['type'] = 'Table'

    if item['type'] in ['NarrativeText', 'ListItem', 'Table']:

      if check_false_type(item):
        item['type'] = 'Title'

      if item['type'] in ['NarrativeText', 'ListItem', 'Table']:
        parent_index = find_previous_item(['Title', 'Image'], index)

        if parent_index == -1:
          item['metadata']['parent_id'] = 'None'

        else:
          item['metadata']['parent_id'] = data[parent_index]['element_id']

    if item['type'] == 'ListItem':
      item['metadata']['parent_id'] = check_list_item_parent(index)

    if item['type'] == 'Title':
      temp_parent_index = find_previous_item(['Title', 'Image'], index-1)

      while True:
        if temp_parent_index == -1:
          item['metadata']['parent_id'] = 'None'
          break

        elif data[temp_parent_index]['font_size'] <= item['font_size']:
          temp_parent_index = find_previous_item(['Title', 'Image'], temp_parent_index-1)

        else:
          item['metadata']['parent_id'] = data[temp_parent_index]['element_id']
          break

  name = f'{file_name}.json'

  with open(name, 'w') as file:
      json.dump(data, file, indent=4)


{'ListItem', 'NarrativeText', 'Header', 'Title', 'PageBreak'}
47
9.0
14
191.7
0
0.0
{'UncategorizedText', 'ListItem', 'NarrativeText', 'Title', 'PageBreak'}
38
9
17
262.4
265.0
0.06


In [4]:
path = "../../json_data"
json_names = os.listdir(path)
len(json_names)

153

In [5]:
documents = []
for name in json_names:

  with open(os.path.join(path, name), 'r') as f:
      data = json.load(f)
  document = ' '.join(item['text'] for item in data if 'text' in item)
  documents.append({'name': name, 'document': document})

In [6]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def normalize_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\d+", " ", text)
    words = text.split()
    text = " ".join([word for word in words if word not in stop_words])
    return text

sample_text = "This is an Example of NORMALIZATION with punctuation!"
normalized_text = normalize_text(sample_text)
print("Normalized Text:", normalized_text)

Normalized Text: example normalization punctuation


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
normalized_documents = []
for doc in documents:
  normalized_documents.append({'name': doc['name'], 'normalized_document': normalize_text(doc['document'])})

In [8]:
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def lemmatize_text_with_pos(text):
    words = text.split()
    pos_tags = pos_tag(words)
    lemmatized_words = [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in pos_tags
    ]
    return " ".join(lemmatized_words)

text = "medication"
lemmatized_text = lemmatize_text_with_pos(text)
print("Lemmatized Text:", lemmatized_text)

Lemmatized Text: medication


In [12]:
lemmatized_documents = []
for doc in normalized_documents:
  lemmatized_documents.append({'name': doc['name'], 'lemmatized_document': lemmatize_text_with_pos(doc['normalized_document'])})

In [13]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

normalized_text = "running runs easil comparission"
stemmed_text = stem_text(normalized_text)
print("Stemmed Text:", stemmed_text)

Stemmed Text: run run easil compariss


In [14]:
stemmed_documents = []
for doc in normalized_documents:
  stemmed_documents.append({'name': doc['name'], 'stemmed_document': stem_text(doc['normalized_document'])})

In [15]:
print(stemmed_documents[0])

{'name': 'Acute Stress Disorder.pdf.json', 'stemmed_document': 'trauma stressor relat disord psychot featur delirium substanc medic induc disord psychot di order due anoth medic condit traumat brain injuri brain injuri occur context traumat event e g traumat accid bomb blast acceler deceler trauma symptom ptsd may appear event caus head trauma may also constitut psycholog traumat event tramaut brain injuri tbi relat neurocognit symptom mutual exclus may occur concurr symptom previous term postconcuss e g headach dizzi sensit light sound irrit concentr deficit occur brain injur non brain injur popul includ individu ptsd symp tom ptsd tbi relat neurocognit symptom overlap differenti diagnosi ptsd neurocognit disord symptom attribut tbi may possibl base presenc symptom distinct present wherea reexp rienc avoid characterist ptsd effect tbi persist disori entat confus specif tbi neurocognit effect ptsd comorbid individu ptsd like without ptsd symptom meet diagnost criteria least one mental 

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# Custom tokenizer to preserve compound terms
def custom_tokenizer(text):
    # Match hyphenated terms or alphanumeric words
    tokens = re.findall(r'\b\w+(?:-\w+)*\b', text.lower())
    return tokens

# Recreate the boolean vectorizer and matrix using the custom tokenizer
bool_vectorizer = CountVectorizer(binary=True, stop_words='english', tokenizer=custom_tokenizer)
bool_matrix = bool_vectorizer.fit_transform([doc['document'] for doc in documents])

# Print the updated vocabulary to confirm compound terms are preserved
print("Updated Vocabulary:", bool_vectorizer.vocabulary_)






In [18]:
import pandas as pd
# Get feature names (terms)
terms = bool_vectorizer.get_feature_names_out()

# Calculate the co-occurrence matrix
co_occurrence_matrix = bool_matrix.T @ bool_matrix  # Sparse matrix multiplication

# Convert to a dense format for readability
co_occurrence_dense = co_occurrence_matrix.toarray()

# Convert to pandas DataFrame for better readability
co_occurrence_df = pd.DataFrame(co_occurrence_dense, index=terms, columns=terms)

print("Co-Occurrence Matrix:")
print(co_occurrence_df)

Co-Occurrence Matrix:
            0  00  000  002  003  005  009  01  010  011  ...  ziness  \
0          82  14    6    2    1    1    0  11    1    1  ...       1   
00         14  20    1    1    0    0    0   7    1    1  ...       0   
000         6   1   10    0    0    0    0   1    0    0  ...       0   
002         2   1    0    2    1    1    0   0    0    0  ...       0   
003         1   0    0    1    1    1    0   0    0    0  ...       0   
...        ..  ..  ...  ...  ...  ...  ...  ..  ...  ...  ...     ...   
zones       1   1    0    0    0    0    0   0    0    0  ...       0   
zoophilia   0   0    0    0    0    0    0   0    0    0  ...       0   
zurelike    1   0    0    0    0    0    0   0    0    0  ...       0   
zures       4   1    1    0    0    0    0   1    0    0  ...       0   
é           2   2    0    0    0    0    0   2    0    0  ...       0   

           zippers  zlement  zodiazepines  zolpidem  zones  zoophilia  \
0                1        1 

In [19]:
n_bool_vectorizer = CountVectorizer(binary=True, stop_words='english', tokenizer= custom_tokenizer)
normalized_bool_matrix = n_bool_vectorizer.fit_transform([doc['normalized_document'] for doc in normalized_documents])



In [20]:
l_bool_vectorizer = CountVectorizer(binary=True, stop_words='english', tokenizer= custom_tokenizer)
lemmatized_bool_matrix = l_bool_vectorizer.fit_transform([doc['lemmatized_document'] for doc in lemmatized_documents])

In [21]:
s_bool_vectorizer = CountVectorizer(binary=True, stop_words='english', tokenizer= custom_tokenizer)
stemmed_bool_matrix = s_bool_vectorizer.fit_transform([doc['stemmed_document'] for doc in stemmed_documents])

In [22]:
# Convert the sparse matrix to a dense format
dense_matrix = bool_matrix.toarray()

# Get the first row
first_row = dense_matrix[0]

# Map column indices to terms using the vectorizer's vocabulary
index_to_term = {index: term for term, index in bool_vectorizer.vocabulary_.items()}

# Create a detailed view: include the column index, value, and term (if value is 1)
detailed_first_row = [
    {
        "index": i,
        "value": first_row[i],
        "term": index_to_term.get(i, None) if first_row[i] == 1 else None
    }
    for i in range(len(first_row))
]

# Print the entire row
print("First Row (All 1s and 0s):", first_row)

# Print detailed entries where the value is 1
print("Detailed Entries for 1s:")
for entry in detailed_first_row:
    if entry["value"] == 1:
        print(entry)

First Row (All 1s and 0s): [1 0 0 ... 0 0 0]
Detailed Entries for 1s:
{'index': 0, 'value': 1, 'term': '0'}
{'index': 28, 'value': 1, 'term': '1'}
{'index': 31, 'value': 1, 'term': '1-month'}
{'index': 33, 'value': 1, 'term': '10'}
{'index': 47, 'value': 1, 'term': '11'}
{'index': 61, 'value': 1, 'term': '12'}
{'index': 78, 'value': 1, 'term': '13'}
{'index': 90, 'value': 1, 'term': '14'}
{'index': 151, 'value': 1, 'term': '19'}
{'index': 167, 'value': 1, 'term': '2'}
{'index': 171, 'value': 1, 'term': '20'}
{'index': 194, 'value': 1, 'term': '21'}
{'index': 279, 'value': 1, 'term': '280'}
{'index': 280, 'value': 1, 'term': '281'}
{'index': 281, 'value': 1, 'term': '282'}
{'index': 282, 'value': 1, 'term': '283'}
{'index': 283, 'value': 1, 'term': '284'}
{'index': 284, 'value': 1, 'term': '285'}
{'index': 285, 'value': 1, 'term': '286'}
{'index': 307, 'value': 1, 'term': '3'}
{'index': 320, 'value': 1, 'term': '308'}
{'index': 426, 'value': 1, 'term': '4'}
{'index': 496, 'value': 1, 't

In [24]:
def retrieve_boolean_query(query, bool_matrix, bool_vectorizer):
    def process_term(term):
        """Returns the set of document indices where the term is present."""
        term_index = bool_vectorizer.vocabulary_.get(term.lower(), None)
        if term_index is not None:
            return set(bool_matrix[:, term_index].nonzero()[0])
        return set()

    def evaluate(expression):
        """Evaluates the parsed Boolean expression recursively."""
        if isinstance(expression, str):
            # Single term
            if expression.startswith("NOT "):
                term = expression[4:]
                return all_docs - process_term(term)
            return process_term(expression)
        elif isinstance(expression, list):
            operator = expression[0]
            if operator == "AND":
                return evaluate(expression[1]) & evaluate(expression[2])
            elif operator == "OR":
                return evaluate(expression[1]) | evaluate(expression[2])
            elif operator == "NOT":
                return all_docs - evaluate(expression[1])
        return set()

    def parse_query(query):
        """Parses a Boolean query into a nested structure."""
        tokens = query.replace("(", " ( ").replace(")", " ) ").split()
        def helper(tokens):
            stack = []
            while tokens:
                token = tokens.pop(0)
                if token == "(":
                    stack.append(helper(tokens))
                elif token == ")":
                    break
                elif token.upper() in {"AND", "OR", "NOT"}:
                    stack.append(token.upper())
                else:
                    stack.append(token)
            if "NOT" in stack:
                not_index = stack.index("NOT")
                # for example : [NOT, empty] = [NOT empty]
                stack[not_index:not_index+2] = [["NOT", stack[not_index+1]]]
            while "AND" in stack or "OR" in stack:
                for op in ("AND", "OR"):
                    if op in stack:
                        op_index = stack.index(op)
                        stack[op_index-1:op_index+2] = [[op, stack[op_index-1], stack[op_index+1]]]
                        break
            return stack[0]
        return helper(tokens)

    # Initialize set of all documents
    all_docs = set(range(bool_matrix.shape[0]))

    # Parse and evaluate the query
    parsed_query = parse_query(query)
    result_set = evaluate(parsed_query)

    result_docs = [documents[index]['name'][:-5] for index in result_set]

    return result_docs

query = "ICD-9-CM AND NOT CNS"
retrieved_docs = retrieve_boolean_query(query, bool_matrix, bool_vectorizer)
print(retrieved_docs)

['Substance Medication-Induced Depressive Disorder.pdf', 'Substance Medication-Induced Major or Mild Neurocognitive Disorder.pdf', 'Substance Medication-Induced Obsessive-Compulsive and Related Disorder.pdf', 'Alcohol-Related Disorders.pdf', 'Anorexia Nervosa.pdf', 'Substance Medication-Induced Psychotic Disorder.pdf', 'Substance Medication-Induced Sleep Disorder.pdf', 'Substance Medication-Induced.pdf', 'Cannabis-Related Disorders.pdf', 'Hallucinogen-Related Disorders.pdf', 'Inhalant-Related Disorders.pdf', 'Bipolar and Related Disorder Due to Another Medical Condition.pdf', 'Other (or Unknown) Substance–Related Disorders.pdf', 'Sedative-, Hypnotic-, or Anxiolytic-Related Disorders.pdf', 'Stimulant-Related Disorders.pdf', 'Tobacco-Related Disorders.pdf', 'Circadian Rhythm Sleep-Wake Disorders.pdf', 'Opioid-Related Disorders.pdf', 'Conversion Disorder (Functional Neurological Symptom Disorder).pdf', 'Delirium.pdf', 'Depressive Disorder Due to Another Medical Condition.pdf', 'Excoriatio

In [25]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform([doc['document'] for doc in documents])

In [26]:
n_vectorizer = TfidfVectorizer(stop_words='english')
n_tfidf_matrix = n_vectorizer.fit_transform([doc['normalized_document'] for doc in normalized_documents])

In [27]:
l_vectorizer = TfidfVectorizer(stop_words='english')
l_tfidf_matrix = l_vectorizer.fit_transform([doc['lemmatized_document'] for doc in lemmatized_documents])

In [28]:
s_vectorizer = TfidfVectorizer(stop_words='english')
s_tfidf_matrix = s_vectorizer.fit_transform([doc['stemmed_document'] for doc in stemmed_documents])

In [29]:
doc_names = [doc['name'] for doc in documents]
def retrieve_tfidf_query(query, tfidf_matrix, vectorizer, top_index):
  query_tfidf = vectorizer.transform([query])
  cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

  top_indices = cosine_similarities.argsort()[-top_index:][::-1]  # Top documents
  top_doc_names = [doc_names[i][:-5] for i in top_indices]
  return top_doc_names

In [30]:
from nltk.corpus import wordnet

# Function to get synonyms of a term
def get_synonyms(term):
    synonyms = set()
    for synset in wordnet.synsets(term):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms

# Build a thesaurus for all terms in the vocabulary
vocabulary = list(bool_vectorizer.vocabulary_.keys())
thesaurus = {term: get_synonyms(term) for term in vocabulary}

## replace with thesaurus

In [31]:
def filter_synonyms(term, synonyms, bool_matrix, vocabulary):
    term_index = vocabulary.get(term, None)
    if term_index is None:
        return []

    valid_synonyms = []
    for synonym in synonyms:
        synonym_index = vocabulary.get(synonym, None)
        if synonym_index is not None:
            # Calculate co-occurrence as cosine similarity
            term_vector = bool_matrix[:, term_index].toarray().flatten()
            synonym_vector = bool_matrix[:, synonym_index].toarray().flatten()
            similarity = cosine_similarity([term_vector], [synonym_vector])[0, 0]
            if similarity > 0.1:  # Threshold for meaningful relationships
                valid_synonyms.append((synonym, similarity))

    # Sort synonyms by relevance
    valid_synonyms.sort(key=lambda x: -x[1])
    return [synonym for synonym, _ in valid_synonyms]

In [32]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [33]:
import spacy
def filter_synonyms_with_embeddings(term, synonyms):
    nlp = spacy.load("en_core_web_md")
    term_vector = nlp(term).vector
    valid_synonyms = []

    for synonym in synonyms:
        synonym_vector = nlp(synonym).vector
        similarity = cosine_similarity([term_vector], [synonym_vector])[0, 0]
        if similarity > 0.6:  # Threshold for semantic similarity
            valid_synonyms.append((synonym, similarity))

    # Sort synonyms by similarity
    valid_synonyms.sort(key=lambda x: -x[1])
    return [synonym for synonym, _ in valid_synonyms]

In [34]:
from itertools import product

def expand_query(query, method, boolean_matrix, boolean_vectorizer):
    expanded_queries = set()  # Use a set to avoid duplicates
    logical_operators = {"AND", "OR", "NOT"}

    # Split the query into tokens while preserving logical operators
    query_tokens = query.split()

    # For each token, get its synonyms (or the token itself if no synonyms exist)
    token_synonyms = []
    for token in query_tokens:
        if token in logical_operators:
            # Keep logical operators as is
            token_synonyms.append([token])
        else:
            if method == 'replace_with_thesaurus':
                synonyms = thesaurus.get(token, [token])
            elif method == 'filtered_synonyms':
                synonyms = filter_synonyms(token, thesaurus.get(token, [token]), boolean_matrix, boolean_vectorizer.vocabulary_)
            elif method == 'filtered_embeddings':
                synonyms = filter_synonyms_with_embeddings(token, thesaurus.get(token, [token]))
            elif method == 'co-occurrence':
              co_occurrence_matrix = boolean_matrix.T @ boolean_matrix
              co_occurrence_dense = co_occurrence_matrix.toarray()  # Convert to dense for easier indexing
              vocab = boolean_vectorizer.get_feature_names_out()
              term_to_index = {term: idx for idx, term in enumerate(vocab)}
              if token in term_to_index:
                  token_idx = term_to_index[token]
                  co_occurring_terms = np.argsort(-co_occurrence_dense[token_idx])  # Descending order
                  synonyms = [vocab[idx] for idx in co_occurring_terms if idx != token_idx][:2]  # Top 5 related terms
              else:
                  synonyms = [token]
            else:
                synonyms = [token]
            token_synonyms.append(synonyms)

    # Generate all combinations of synonyms for the query
    for combination in product(*token_synonyms):
        expanded_queries.add(" ".join(combination))

    return list(expanded_queries)

In [38]:
with open("../data/evaluation/updated-queries.json", 'r') as f:
    standard_queries = json.load(f)

retrieved_results = {}
standard_results = {}

# Populate the dictionaries
for query in standard_queries:
    query_id = query["query_id"]
    retrieved_doc_names = retrieve_tfidf_query(query["query_text"], s_tfidf_matrix, s_vectorizer, 20)

    # Update the dictionaries
    retrieved_results[query_id] = retrieved_doc_names
    standard_results[query_id] = [response["doc_name"] for response in query["responses"]]

# Print results
print("Standard Results:", standard_results)
print("##############")
print("Retrieved Results:", retrieved_results)


Standard Results: {1: ['Acute Stress Disorder.pdf', 'Adjustment Disorders.pdf', 'Generalized Anxiety Disorder.pdf', 'Cyclothymic Disorder.pdf', 'Schizotypal Personality Disorder.pdf', 'Schizoaffective Disorder.pdf', 'Anxiety Disorder Due to Another Medical Condition.pdf', 'Conversion Disorder (Functional Neurological Symptom Disorder).pdf', 'Disruptive Mood Dysregulation Disorder.pdf', 'Schizoid Personality Disorder.pdf', 'Other Specified & UnSpecified Dissociative Disorder.pdf', 'Paranoid Personality Disorder.pdf', 'Antisocial Personality Disorder (Personality Disorders).pdf', 'Oppositional Defiant Disorder.pdf', 'Depersonalization Derealization Disorder.pdf', 'Illness Anxiety Disorder.pdf', 'Schizophreniform Disorder.pdf', 'Persistent Depressive Disorder (Dysthymia).pdf', 'Body Dysmorphic Disorder.pdf', 'Substance Medication-Induced Depressive Disorder.pdf'], 2: ['Agoraphobia.pdf', 'Premenstrual Dysphoric Disorder.pdf', 'Somatic Symptom Disorder.pdf', 'Other Specified Depressive Diso

In [39]:
def calculate_precision_recall(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    true_positives = len(retrieved_set & relevant_set)

    precision = true_positives / len(retrieved_set) if retrieved_set else 0
    recall = true_positives / len(relevant_set) if relevant_set else 0
    return precision, recall

In [40]:
query_ids = list(range(1, len(standard_queries) + 1))

In [41]:
def calculate_precision_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    relevant_set = set(relevant)
    true_positives = len(set(retrieved_k) & relevant_set)
    return true_positives / k


In [42]:
import math
def calculate_dcg(retrieved, relevant, relevance, k):
    dcg = sum((relevance.get(doc, 0) / math.log2(i + 2)) for i, doc in enumerate(retrieved[:k]))
    return dcg

def calculate_idcg(relevant, relevance, k):
    sorted_relevant = sorted(relevant, key=lambda x: relevance.get(x, 0), reverse=True)
    return sum((relevance.get(sorted_relevant[i], 0) / math.log2(i + 2)) for i in range(min(k, len(sorted_relevant))))

def calculate_ndcg(retrieved, relevant, relevance, k):
    dcg = calculate_dcg(retrieved, relevant, relevance, k)
    idcg = calculate_idcg(relevant, relevance, k)
    return dcg / idcg if idcg > 0 else 0

In [43]:
def calculate_map(retrieved, relevant):
    relevant_set = set(relevant)
    precisions = []
    for i, doc in enumerate(retrieved):
        if doc in relevant_set:
            precisions.append(len(set(retrieved[:i+1]) & relevant_set) / (i+1))
    return sum(precisions) / len(relevant_set) if relevant_set else 0

In [44]:
def calculate_reciprocal_rank(retrieved, relevant):
    relevant_set = set(relevant)
    for i, doc in enumerate(retrieved):
        if doc in relevant_set:
            return 1 / (i + 1)
    return 0

In [279]:
precisions = []
recalls = []
p_at_5 = []
p_at_10 = []
p_at_15 = []
ndcg = []
map = []
rr_sum = 0

for query_id in standard_results:
    retrieved = retrieved_results.get(query_id, [])
    relevant = standard_results[query_id]

    # Calculate precision and recall
    precision, recall = calculate_precision_recall(retrieved, relevant)
    precisions.append(precision)
    recalls.append(recall)

    # Precision@K
    p_at_5.append(calculate_precision_at_k(retrieved, relevant, 5))
    p_at_10.append(calculate_precision_at_k(retrieved, relevant, 10))
    p_at_15.append(calculate_precision_at_k(retrieved, relevant, 15))

    # Create the relevance dictionary
    relevance = {doc: (len(relevant) - i) for i, doc in enumerate(relevant)}

    # Calculate NDCG
    ndcg.append(calculate_ndcg(retrieved, relevant, relevance, 20))

    # Calculate MAP
    map.append(calculate_map(retrieved, relevant))

    # Calculate RR
    rr_sum += calculate_reciprocal_rank(retrieved, relevant)

# Calculate MRR
mrr = rr_sum / len(standard_results)
print(min(ndcg[:26]))
print(max(ndcg[:26]))
print(min(map[:26]))
print(max(map[:26]))

0.0
0.9219657088738421
0.0
0.766483059424236


In [280]:
from tabulate import tabulate

# Generate query IDs dynamically based on the number of queries
query_ids = list(range(1, len(standard_results) + 1))

# Combine all metrics into a single table
table_data = []
for i, query_id in enumerate(query_ids):
    table_data.append([
        query_id,
        precisions[i],
        recalls[i],
        p_at_5[i],
        p_at_10[i],
        p_at_15[i],
        ndcg[i],
        map[i]
    ])

# Define the table headers
headers = ["Query ID", "Precision", "Recall", "P@5", "P@10", "P@15", "NDCG", "MAP"]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))

# Print MRR as a separate metric since it's an aggregate across all queries
print(f"\nMean Reciprocal Rank (MRR): {mrr:.4f}")


+------------+-------------+----------+-------+--------+-----------+-----------+------------+
|   Query ID |   Precision |   Recall |   P@5 |   P@10 |      P@15 |      NDCG |        MAP |
|          1 |        0.4  | 0.4      |   0.4 |    0.5 | 0.533333  | 0.575642  | 0.24895    |
+------------+-------------+----------+-------+--------+-----------+-----------+------------+
|          2 |        0.25 | 0.25     |   0.6 |    0.5 | 0.333333  | 0.386487  | 0.182381   |
+------------+-------------+----------+-------+--------+-----------+-----------+------------+
|          3 |        0.35 | 0.35     |   0.6 |    0.5 | 0.4       | 0.547279  | 0.233943   |
+------------+-------------+----------+-------+--------+-----------+-----------+------------+
|          4 |        0.5  | 0.5      |   1   |    0.8 | 0.6       | 0.759049  | 0.467225   |
+------------+-------------+----------+-------+--------+-----------+-----------+------------+
|          5 |        0.7  | 0.7      |   1   |    1   | 0.9

In [294]:
retrieved_results = {}

# Populate the dictionaries
for query in standard_queries[25:-1]:
    query_id = query["query_id"]
    retrieved_doc_names = retrieve_boolean_query(query["query_text"], stemmed_bool_matrix, s_bool_vectorizer)

    # Update the dictionaries
    retrieved_results[query_id] = retrieved_doc_names

{26: [], 27: ['Motor Disorders.pdf', 'Posttraumatic Stress Disorder.pdf', 'Excoriation (Skin-Picking) Disorder.pdf', 'Trichotillomania (Hair-Pulling Disorder).pdf', 'Obsessive-Compulsive Disorder.pdf', 'Obsessive-Compulsive and Related Disorders.pdf', 'Body Dysmorphic Disorder.pdf', 'Hoarding Disorder.pdf'], 28: [], 29: ['Intellectual Disabilities.pdf', 'Communication Disorders.pdf'], 30: [], 31: [], 32: [], 33: [], 34: [], 35: ['Acute Stress Disorder.pdf', 'Agoraphobia.pdf', 'Substance Medication-Induced.pdf', 'Anxiety Disorder Due to Another Medical Condition.pdf', 'Anxiety Disorders.pdf', 'Unspecified Bipolar and Related Disorder.pdf', 'Bipolar I Disorder.pdf', 'Bipolar II Disorder.pdf', 'Hallucinogen-Related Disorders.pdf', 'Stimulant-Related Disorders.pdf', 'Caffeine-Related Disorders.pdf', 'Conversion Disorder (Functional Neurological Symptom Disorder).pdf', 'Depersonalization Derealization Disorder.pdf', 'Generalized Anxiety Disorder.pdf', 'Illness Anxiety Disorder.pdf', 'Nightm

In [295]:
# Initialize metrics lists
precisions = []
recalls = []
p_at_5 = []
p_at_10 = []
p_at_15 = []
ndcg = []
map = []
rr_sum = 0

# Get a list of query IDs
query_ids = list(standard_results.keys())  # Get all keys as a list

# Process only queries starting from the 26th
for query_id in query_ids[25:-1]:  # Adjust for zero-based index
    retrieved = retrieved_results.get(query_id, [])

    relevant = standard_results[query_id]

    # Calculate precision and recall
    precision, recall = calculate_precision_recall(retrieved, relevant)
    precisions.append(precision)
    recalls.append(recall)

    # Precision@K
    p_at_5.append(calculate_precision_at_k(retrieved, relevant, 5))
    p_at_10.append(calculate_precision_at_k(retrieved, relevant, 10))
    p_at_15.append(calculate_precision_at_k(retrieved, relevant, 15))

    # Create the relevance dictionary
    relevance = {doc: (len(relevant) - i) for i, doc in enumerate(relevant)}

    # Calculate NDCG
    ndcg.append(calculate_ndcg(retrieved, relevant, relevance, 20))

    # Calculate MAP
    map.append(calculate_map(retrieved, relevant))

    # Calculate RR
    rr_sum += calculate_reciprocal_rank(retrieved, relevant)

# Calculate MRR for the subset of queries
mrr = rr_sum / len(query_ids[25:-1])

set()
{'Anxiety Disorder Due to Another Medical Condition.pdf', 'Generalized Anxiety Disorder.pdf', 'Circadian Rhythm Sleep-Wake Disorders.pdf', 'Panic Disorder.pdf', 'Other Specified & UnSpecified Somatic Symptom and Related Disorder.pdf', 'Somatic Symptom and Related Disorders.pdf', 'Conversion Disorder (Functional Neurological Symptom Disorder).pdf', 'Obsessive-Compulsive and Related Disorder Due to Another Medical Condition.pdf', 'Somatic Symptom Disorder.pdf', 'Separation Anxiety Disorder.pdf', 'Other Specified & UnSpecified Obsessive-Compulsive and Related Disorder.pdf', 'Factitious Disorder.pdf', 'Panic Attack Specifier.pdf', 'Depersonalization Derealization Disorder.pdf', 'Body Dysmorphic Disorder.pdf', 'Psychological Factors Affecting Other Medical Conditions.pdf', 'Obsessive-Compulsive Disorder.pdf', 'Illness Anxiety Disorder.pdf'}
set()
{'Bulimia Nervosa.pdf', 'Conversion Disorder (Functional Neurological Symptom Disorder).pdf', 'Anorexia Nervosa.pdf', 'Obsessive-Compulsive 

In [296]:
from tabulate import tabulate

# Generate the query IDs for the subset you processed
processed_query_ids = query_ids[25:-1]  # Same subset as used in the earlier loop

# Combine all metrics into a single table
table_data = []
for i, query_id in enumerate(processed_query_ids):
    table_data.append([
        query_id,
        precisions[i],
        recalls[i],
        map[i]
    ])

# Define the table headers
headers = ["Query ID", "Precision", "Recall", "MAP"]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))

# Print MRR as a separate metric since it's an aggregate across the subset of queries
print(f"\nMean Reciprocal Rank (MRR): {mrr:.4f}")


+------------+-------------+----------+----------+
|   Query ID |   Precision |   Recall |      MAP |
|         26 |    0        | 0        | 0        |
+------------+-------------+----------+----------+
|         27 |    1        | 0.615385 | 0.615385 |
+------------+-------------+----------+----------+
|         28 |    0        | 0        | 0        |
+------------+-------------+----------+----------+
|         29 |    1        | 0.666667 | 0.666667 |
+------------+-------------+----------+----------+
|         30 |    0        | 0        | 0        |
+------------+-------------+----------+----------+
|         31 |    0        | 0        | 0        |
+------------+-------------+----------+----------+
|         32 |    0        | 0        | 0        |
+------------+-------------+----------+----------+
|         33 |    0        | 0        | 0        |
+------------+-------------+----------+----------+
|         34 |    0        | 0        | 0        |
+------------+-------------+---

In [47]:
expanded_queries1 = expand_query("Serum HIV testing", "replace_with_thesaurus", bool_matrix, bool_vectorizer)
expanded_queries2 = expand_query("serum hiv testing", "filtered_synonyms", bool_matrix, bool_vectorizer)
expanded_queries3 = expand_query("Serum HIV testing", "filtered_embeddings", bool_matrix, bool_vectorizer)
expanded_queries4 = expand_query("Serum HIV testing", "co-occurrence", bool_matrix, bool_vectorizer)

# Store the expanded queries in a list for processing
expanded_queries = [expanded_queries1, expanded_queries2, expanded_queries3, expanded_queries4]
expanded_query_names = ["replace_with_thesaurus", "filtered_synonyms", "filtered_embeddings", "co-occurrence"]

# Step 2: Initialize storage for metrics
all_ndcg = []
query_results = []

# Step 3: Process each expanded query
for i, query_list in enumerate(expanded_queries):
    # Combine all metrics for each expanded query
    precisions = []
    recalls = []
    p_at_5 = []
    p_at_10 = []
    p_at_15 = []
    ndcg = []
    map = []
    rr_sum = 0

    # Retrieve and evaluate for each query in the expanded list
    for query_text in query_list:
        retrieved = retrieve_tfidf_query(query_text, tfidf_matrix, vectorizer, 20)
        relevant = standard_results[5]  # Replace with your ground-truth matching for the query

        # Calculate precision and recall
        precision, recall = calculate_precision_recall(retrieved, relevant)
        precisions.append(precision)
        recalls.append(recall)

        # Precision@K
        p_at_5.append(calculate_precision_at_k(retrieved, relevant, 5))
        p_at_10.append(calculate_precision_at_k(retrieved, relevant, 10))
        p_at_15.append(calculate_precision_at_k(retrieved, relevant, 15))

        # Create relevance dictionary
        relevance = {doc: (len(relevant) - i) for i, doc in enumerate(relevant)}

        # Calculate NDCG
        ndcg_value = calculate_ndcg(retrieved, relevant, relevance, 20)
        ndcg.append(ndcg_value)

        # Calculate MAP
        map_value = calculate_map(retrieved, relevant)
        map.append(map_value)

        # Calculate RR
        rr_sum += calculate_reciprocal_rank(retrieved, relevant)

    # Store NDCG values and results for each expanded query
    all_ndcg.append((max(ndcg), expanded_query_names[i], precisions, recalls, p_at_5, p_at_10, p_at_15, ndcg, map))
    query_results.append({
        "query_name": expanded_query_names[i],
        "precisions": precisions,
        "recalls": recalls,
        "p_at_5": p_at_5,
        "p_at_10": p_at_10,
        "p_at_15": p_at_15,
        "ndcg": ndcg,
        "map": map
    })

# Step 4: Find the expanded query with the highest NDCG value
best_query = max(all_ndcg, key=lambda x: x[0])
print(f"Best query expansion: {best_query[1]} with NDCG = {best_query[0]}")

# Step 5: Generate the table for the best expanded query
from tabulate import tabulate

# Extract results for the best query
best_query_results = next(item for item in query_results if item["query_name"] == best_query[1])

# Combine all metrics into a single table
table_data = []
for i in range(len(best_query_results["ndcg"])):
    table_data.append([
        i + 1,  # Query ID
        best_query_results["precisions"][i],
        best_query_results["recalls"][i],
        best_query_results["p_at_5"][i],
        best_query_results["p_at_10"][i],
        best_query_results["p_at_15"][i],
        best_query_results["ndcg"][i],
        best_query_results["map"][i]
    ])

# Define the table headers
headers = ["Query ID", "Precision", "Recall", "P@5", "P@10", "P@15", "NDCG", "MAP"]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))

Best query expansion: replace_with_thesaurus with NDCG = 0.8876211235337029
+------------+-------------+----------+-------+--------+----------+----------+----------+
|   Query ID |   Precision |   Recall |   P@5 |   P@10 |     P@15 |     NDCG |      MAP |
|          1 |        0.65 |     0.65 |   1   |    1   | 0.8      | 0.841157 | 0.613496 |
+------------+-------------+----------+-------+--------+----------+----------+----------+
|          2 |        0.7  |     0.7  |   1   |    1   | 0.933333 | 0.874084 | 0.7      |
+------------+-------------+----------+-------+--------+----------+----------+----------+
|          3 |        0.7  |     0.7  |   1   |    0.9 | 0.733333 | 0.864632 | 0.622672 |
+------------+-------------+----------+-------+--------+----------+----------+----------+
|          4 |        0.7  |     0.7  |   0.8 |    0.8 | 0.8      | 0.887621 | 0.602453 |
+------------+-------------+----------+-------+--------+----------+----------+----------+
|          5 |        0.