In [None]:
import numpy as np
import pandas as pd
import re

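Let's define a dictionary with our thesaurus: each key is a composite term and its value is the list of synonyms and abbreviations that count as a match for it.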

In [None]:
# Thesaurus: each composite term maps to the abbreviations and synonyms
# that are searched for alongside the term itself.
thesaurus_dic = {
    "blood": [
        "bld",
        "plasma",
        "plas",
        "serum",
        "ser",
    ],
    "plasma": ["plas"],
    "white blood cells": [
        "wbc",
        "leukocytes",
        "lymphocytes",
        "monocytes",
    ],
}

# Quick sanity check of one entry
wbc_synonyms = thesaurus_dic["white blood cells"]
print(wbc_synonyms)

Load the data from the LOINC dataset CSV file (`RWD_loinc_dataset_wbc.csv`).

In [None]:
# Load the LOINC dataset that every later cell works on
df = pd.read_csv('RWD_loinc_dataset_wbc.csv')

# Fail early, with a clear message, if a column read by the later cells is missing
required_columns = ['loinc_num', 'long_common_name', 'component', 'system', 'relevance']
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise KeyError("RWD_loinc_dataset_wbc.csv is missing required columns: {}".format(missing_columns))

# Preview the first rows (rich display, consistent with the rest of the notebook)
display(df.head())

We define our query and the list of composites (the sub-terms of the query) that we will search for.

In [None]:
# Free-text query, the identifier derived from it, and the composites searched for
querry = "white blood cells count"
query_id = "_".join(querry.split())
composites = [
    "white blood cells",
    "blood",
    "cells",
    "count",
]

## First Dataset
We need to calculate our elementary clues X for all composite/document pairs

For each composite, we look for the documents in which the composite (or one of its thesaurus synonyms) appears in the name, component, or system, and we build one row per matching composite/document pair. X1 (`TF`) is the number of times it appears in `long_common_name`; X3 (`is_in_component`) and X4 (`is_in_system`) are binary indicators that are 1 when it appears in `component` and `system`, respectively. X2 is the IDF (which I will program myself), our Y is the `relevance` field, `query_id` is always "white_blood_cells_count", and `composite_clue` is the composite used.

In [None]:
# Build the first dataset: one row per (document, composite) pair in which the
# composite, or one of its thesaurus synonyms, occurs as a whole word in
# long_common_name, component or system.
elementary_clue_columns = ['loinc_num', 'query_id', 'composite_clue', 'TF', 'IDF',
                           'is_in_component', 'is_in_system', 'relevance']

# Total number of documents
N = len(df)

# Terms to look for per composite: the composite itself plus its thesaurus synonyms
terms_to_count_map = {
    composite: [composite.lower()] + [synonym.lower() for synonym in thesaurus_dic.get(composite, [])]
    for composite in composites
}

# Compile each whole-word pattern once instead of once per row, term and field
term_patterns_map = {
    composite: [re.compile(r'\b' + re.escape(term) + r'\b') for term in terms]
    for composite, terms in terms_to_count_map.items()
}

# Single pass over the documents: collect the per-document clues and, at the
# same time, the document frequency of every composite (needed for the IDF).
doc_freq_composites = {composite: 0 for composite in composites}
pending_rows = []  # (loinc_num, relevance, composite, TF, is_in_component, is_in_system)
for _, row in df.iterrows():
    loinc_num = row['loinc_num']
    relevance = row['relevance']  # This will be our Y
    long_common_name = str(row['long_common_name']).lower()
    component = str(row['component']).lower()
    system = str(row['system']).lower()

    # Clues of this document, computed once per distinct composite
    row_clues = {}
    for composite, patterns in term_patterns_map.items():
        # X1: total occurrences of the composite and its synonyms in the name
        tf = sum(len(pattern.findall(long_common_name)) for pattern in patterns)
        # X3 / X4: presence (not count) in component and system
        in_component = any(pattern.search(component) for pattern in patterns)
        in_system = any(pattern.search(system) for pattern in patterns)

        # Only documents where the composite appears in at least one field count
        if tf or in_component or in_system:
            row_clues[composite] = (tf, int(in_component), int(in_system))
            doc_freq_composites[composite] += 1

    # Emit the rows in the order of the composites list
    for composite in composites:
        if composite in row_clues:
            pending_rows.append((loinc_num, relevance, composite) + row_clues[composite])

# X2 (IDF) depends only on the composite, so compute it once per composite.
# Composites that match no document produce no rows and need no IDF, which
# also keeps the division safe.
idf_by_composite = {
    composite: np.log10(N / doc_freq)
    for composite, doc_freq in doc_freq_composites.items()
    if doc_freq > 0
}

# Build the DataFrame in one go; appending row by row with .loc is quadratic
elementary_clue_records = [
    {
        'loinc_num': loinc_num,
        'query_id': query_id,
        'composite_clue': composite,
        'TF': tf,
        'IDF': idf_by_composite[composite],
        'is_in_component': in_component,
        'is_in_system': in_system,
        'relevance': relevance
    }
    for loinc_num, relevance, composite, tf, in_component, in_system in pending_rows
]
elementary_clues_df = pd.DataFrame(elementary_clue_records, columns=elementary_clue_columns)

display(elementary_clues_df)

elementary_clues_df.to_csv('elementary_clues.csv', index=False, decimal=',')

## Second Dataset

Our second dataset will have 5 columns

1.   loinc_num
2.   query_id
3.   Z → the initial relevance (calculated in the first logistic regression; set to 0 as a placeholder in the code below)
4.   N → the number of composites found in a document
5.   relevance (Y)






In [None]:
# Build the second dataset: one row per document with the number of distinct
# composites (or their thesaurus synonyms) found in it.
second_dataset_columns = ['loinc_num', 'query_id', 'Z', 'N', 'relevance']

# Whole-word patterns for each composite and its synonyms, compiled once
# instead of being rebuilt for every document.
composite_patterns = {
    composite: [
        re.compile(r'\b' + re.escape(term) + r'\b')
        for term in [composite.lower()] + [synonym.lower() for synonym in thesaurus_dic.get(composite, [])]
    ]
    for composite in composites
}

second_dataset_records = []
for _, row in df.iterrows():
    long_common_name = str(row['long_common_name']).lower()
    component = str(row['component']).lower()
    system = str(row['system']).lower()
    document_fields = (long_common_name, component, system)

    # N: number of distinct composites with at least one term present in any field
    n_count = sum(
        1
        for patterns in composite_patterns.values()
        if any(pattern.search(field) for pattern in patterns for field in document_fields)
    )

    second_dataset_records.append({
        'loinc_num': row['loinc_num'],
        # Reuse the query_id defined with the query instead of a second hard-coded copy
        'query_id': query_id,
        'Z': 0,  # Z is always 0
        'N': n_count,
        'relevance': row['relevance']
    })

# Build the DataFrame in one go; appending row by row with .loc is quadratic
second_dataset_df = pd.DataFrame(second_dataset_records, columns=second_dataset_columns)

# Display the second dataset DataFrame
display(second_dataset_df)
second_dataset_df.to_csv('second_ds.csv', index=False, decimal=',')