In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
from sentence_transformers import SentenceTransformer
from adarank import AdaRank
from adarankv2 import AdaRankv2
from metrics import NDCGScorer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import save_npz, load_npz


In [None]:


#define file path
val_file_path = 'Input/loinc_query_terms_testing.xlsx'
train_file_path = 'Input/loinc_query_terms_training.xlsx'

# Load pretrained model vector embedding model
# We are using a medical specific model from huggingface. https://huggingface.co/ls-da3m0ns/bge_large_medical 'ls-da3m0ns/bge_large_medical'
# To improve performance, model can be switched to a more general & lightwight model (replace with "all-MiniLM-L6-v2")
embedder = SentenceTransformer('ls-da3m0ns/bge_large_medical')
#embedder = SentenceTransformer('all-MiniLM-L6-v2') # lightweight model

In [35]:
# Function to safely calculate similarity between a query and a field
def calculate_embedding_similarity(query, field):
    if pd.isna(query) or pd.isna(field):
        return 0
    query_embedding = embedder.encode([str(query)])[0]
    field_embedding = embedder.encode([str(field)])[0]
    return cosine_similarity([query_embedding], [field_embedding])[0][0]


In [None]:
#Read the file and merge all sheets into one dataframe
xls = pd.ExcelFile(train_file_path)
dataframes = []
for sheet_name in xls.sheet_names:
    temp_df = pd.read_excel(xls, sheet_name=sheet_name)
    temp_df['query'] = sheet_name  # Each sheet name is used as the query text
    dataframes.append(temp_df)
merged_df = pd.concat(dataframes, ignore_index=True)

# ------------------------------
# Compute lexical similarity for long_common_name

# We want to compute, for each row, the similarity between the query and the long_common_name.
# First, build a TF-IDF vectorizer fitted on the union of all queries and names.
corpus = pd.concat([merged_df['query'], merged_df['long_common_name']])
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

# Transform the query and long_common_name columns
X_query = vectorizer.transform(merged_df['query'])
X_name_tfidf = vectorizer.transform(merged_df['long_common_name'])

# Compute cosine similarity for each row
cosine_sim = np.array([cosine_similarity(X_query[i], X_name_tfidf[i])[0, 0] 
                         for i in range(X_query.shape[0])])
merged_df['name_cosine_sim'] = cosine_sim

# Now use the computed cosine similarity as the feature for the name field.
X_name_lexical_similarity = csr_matrix(merged_df[['name_cosine_sim']].values)

# ------------------------------
# Compute lexical similarity for features property, system, component, long_common_name

# Create similarity scores for features, loinc name measurement, system, and component
# Apply a similarity function to each row, based on vector embeddings 
merged_df['name_similarity'] = merged_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['long_common_name']), axis=1)
merged_df['property_similarity'] = merged_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['property']), axis=1)
merged_df['system_similarity'] = merged_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['system']), axis=1)
merged_df['component_similarity'] = merged_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['component']), axis=1)

# Convert to sparse matrices
X_name_semantic_similarity = csr_matrix(merged_df[['name_similarity']].values)
X_property_similarity = csr_matrix(merged_df[['property_similarity']].values)
X_system_similarity = csr_matrix(merged_df[['system_similarity']].values)
X_component_similarity = csr_matrix(merged_df[['component_similarity']].values)

# ------------------------------
# Process numerical feature: 'rank' using StandardScaler
# Treat 0 (NaN) values as very high ranks by replacing them with the largest number 
max_rank = merged_df['rank'].max()
merged_df['rank'] = merged_df['rank'].replace(0, max_rank + 1)

scaler = StandardScaler()
X_rank = scaler.fit_transform(merged_df[['rank']])
X_rank_sparse = csr_matrix(X_rank)

# ------------------------------
# Combine all features into one sparse matrix.
X = hstack(
    [   X_rank_sparse,
        X_name_semantic_similarity, 
        X_name_lexical_similarity,
        X_system_similarity, 
        X_component_similarity
        ])

# Labels and query identifiers
y = merged_df['relevant'].values
# convert query strings to integers -> required for AdaRank
merged_df['qid_numeric'] = pd.factorize(merged_df['query'])[0]
qid = merged_df['qid_numeric'].values

In [None]:
merged_df.to_csv('idxdata/indexed_dataset.csv', index=False)
# Save the feature matrix X to a file for later retrieval
save_npz('idxdata/X_sparse.npz', X)
# Save the labels y to a file for later retrieval
np.save('idxdata/y.npy', y)
# Save the query identifiers qid to a file for later retrieval
np.save('idxdata/qid.npy', qid)

In [None]:

X = load_npz('idxdata/X_sparse.npz')
merged_df = pd.read_csv('idxdata/indexed_dataset.csv')
y = np.load('idxdata/y.npy')
qid = np.load('idxdata/qid.npy') 
# Create a string array capturing the name of the features -> used later during evaluation
feature_names = ['rank',
                 'name_semantic_sim', 
                 'name_lexical_sim',
                 'system_semantic_similarity', 
                 'component_semantic_similarity', 
                 ]

In [None]:
# Query-aware train/test splitting
# we disbale te random state due to the small dataset
splitter = GroupShuffleSplit(test_size=0.2, random_state=None)
train_idx, test_idx = next(splitter.split(X, y, groups=qid))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
qid_train, qid_test = qid[train_idx], qid[test_idx]

# Create DataFrames for the train and test sets using the respective indices
train_df = merged_df.iloc[train_idx].copy()
test_df = merged_df.iloc[test_idx].copy()

#Debugging prints
print("Unique queries in train:", np.unique(qid[train_idx]))
print("Unique queries in test:", np.unique(qid[test_idx]))
print("Unique queries and their frequencies:", np.unique(qid, return_counts=True))

#print("X_train:\n", X_train.toarray())
#print("y_train:", y_train)

#Check basic statistics 
#We can confirm that all selected features plus label have a meaningful variance/std dev so we dont need to apply a feature selection before 
print("X_train summary stats (mean, std):", np.mean(X_train.toarray(), axis=0), np.std(X_train.toarray(), axis=0))
#print("y_train distribution:", np.unique(y_train, return_counts=True))
print("y_train summary stats (mean, std):", np.mean(y_train, axis=0), np.std(y_train, axis=0))

# ------------------------------
# Train and evaluate AdaRank
model = AdaRankv2(max_iter=100, estop=10, scorer=NDCGScorer(k=5))
#model = AdaRank(max_iter=100, estop=100, scorer=NDCGScorer(k=5))
model.fit(X_train, y_train, qid_train)

# test NNDCG for different values of k
for k in (1, 2, 3, 4, 5, 10, 20):
    y_pred = model.predict(X_test, qid_test) 
    score = NDCGScorer(k=k)(y_test, y_pred, qid_test).mean()   
    print(f"NDCG Score {score}, K {k}")

Unique queries in train: [0 2 3 4]
Unique queries in test: [1]
Unique queries and their frequencies: (array([0, 1, 2, 3, 4]), array([ 97,  97,  97, 100,  99]))
X_train summary stats (mean, std): [0.13366907 0.42985444 0.0666258  0.30803767 0.3964655 ] [0.97326114 0.12833275 0.11768371 0.04521437 0.14848953]
y_train summary stats (mean, std): 0.11959287531806616 0.3244848524834767
NDCG Score 1.0, K 1
NDCG Score 1.0, K 2
NDCG Score 1.0, K 3
NDCG Score 1.0, K 4
NDCG Score 1.0, K 5
NDCG Score 1.0, K 10
NDCG Score 1.0, K 20


In [None]:
# Showcasing some predictions from the test set

# Create a DataFrame for the test set using the test indices
test_df = merged_df.iloc[test_idx].copy()
test_df['y_true'] = y_test
test_df['y_pred'] = y_pred

#Check how many distinct queries are in the test set
unique_test_queries = np.unique(qid_test)
print("Number of unique queries in test:", len(unique_test_queries))
print(unique_test_queries)

all_queries = merged_df['query'].unique()
print("Number of distinct queries overall:", len(all_queries))
print(all_queries)

# for each query in the test set, print the top 10 predictions (sorted by predicted score)
# Model seems to predcit well, however could be due to the small dataset and overfitting 
print("Top Predictions per Query:")
for query, group in test_df.groupby('query'):
    sorted_group = group.sort_values(by='y_pred', ascending=False)
    print(f"Query: {query}")
    print(sorted_group[['long_common_name', 'y_true', 'y_pred']].head(10))
    print("-" * 40)

Number of unique queries in test: 1
[1]
Number of distinct queries overall: 5
['glucose in blood' 'bilirubin in plasma' 'White blood cells count'
 'cholesterol in blood' 'PrThr calcium oxalate crystals']
Top Predictions per Query:
Query: bilirubin in plasma
                                      long_common_name  y_true    y_pred
172     Bilirubin.direct [Presence] in Serum or Plasma       1  2.425374
106  Bilirubin.direct [Mass/volume] in Serum or Plasma       1  2.336588
111   Bilirubin.total [Mass/volume] in Serum or Plasma       1  2.322472
171  Bilirubin.direct/Bilirubin.total in Serum or P...       1  2.315887
125  Bilirubin.indirect [Mass/volume] in Serum or P...       1  2.270974
168  Bilirubin.indirect [Mass/volume] in Serum or P...       1  2.270974
173  Bilirubin.conjugated/Bilirubin.total in Serum ...       1  2.267906
170  Bilirubin.indirect [Moles/volume] in Serum or ...       1  2.253523
129  Bilirubin.indirect [Mass or Moles/volume] in S...       1  2.237082
169  Bilirub

In [None]:
# Check how the model weights the features 
coef_zip = model.coef_
# Print feature coefficients explicitly
print(coef_zip)
for name, coef in zip(feature_names, coef_zip):
    print(f"Feature: {name}, Importance (coef): {coef:.4f}")

[0.         2.65165245 0.         0.         0.        ]
Feature: rank, Importance (coef): 0.0000
Feature: name_semantic_sim, Importance (coef): 2.6517
Feature: name_lexical_sim, Importance (coef): 0.0000
Feature: system_semantic_similarity, Importance (coef): 0.0000
Feature: component_semantic_similarity, Importance (coef): 0.0000


In [None]:
#Refactor code to prepare data 
xls_val = pd.ExcelFile(val_file_path)
dataframes_val = []
for sheet_name in xls_val.sheet_names:
    temp_df = pd.read_excel(xls_val, sheet_name=sheet_name)
    temp_df['query'] = sheet_name  # Each sheet name is the query text
    dataframes_val.append(temp_df)
val_df = pd.concat(dataframes_val, ignore_index=True)

# ------------------------------
# Compute Lexical Similarity for 'long_common_name'

corpus = pd.concat([val_df['query'], val_df['long_common_name']])
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
X_query_val = vectorizer.transform(val_df['query'])
X_name_tfidf_val = vectorizer.transform(val_df['long_common_name'])
cosine_sim_val = np.array([
    cosine_similarity(X_query_val[i], X_name_tfidf_val[i])[0, 0] 
    for i in range(X_query_val.shape[0])
])
val_df['name_lexical_similarity'] = cosine_sim_val
X_name_lexical_similarity_val = csr_matrix(val_df[['name_lexical_similarity']].values)

# ------------------------------
# Compute Semantic Similarities

val_df['name_semantic_similarity'] = val_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['long_common_name']), axis=1)
val_df['property_semantic_similarity'] = val_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['property']), axis=1)
val_df['system_semantic_similarity'] = val_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['system']), axis=1)
val_df['component_semantic_similarity'] = val_df.apply(
    lambda row: calculate_embedding_similarity(row['query'], row['component']), axis=1)

X_name_semantic_similarity_val = csr_matrix(val_df[['name_semantic_similarity']].values)
X_property_similarity_val = csr_matrix(val_df[['property_semantic_similarity']].values)
X_system_similarity_val = csr_matrix(val_df[['system_semantic_similarity']].values)
X_component_similarity_val = csr_matrix(val_df[['component_semantic_similarity']].values)

# ------------------------------
# Process Numerical Feature: 'rank'
# Replace zeros (or NaN) with a high rank value as before
max_rank_val = val_df['rank'].max()
val_df['rank'] = val_df['rank'].replace(0, max_rank_val + 1)
# Use the same scaler from training.
scaler_val = StandardScaler().fit(merged_df[['rank']])
X_rank_val = scaler_val.transform(val_df[['rank']])
X_rank_sparse_val = csr_matrix(X_rank_val)

# ------------------------------
# Combine All Features into One Matrix
# Use the same ordering as during training.
X_val = hstack([
    X_rank_sparse_val,
    X_name_semantic_similarity_val, 
    X_name_lexical_similarity_val,
    X_system_similarity_val, 
    X_component_similarity_val
])

In [None]:
val_df['qid_numeric'] = pd.factorize(val_df['query'])[0]
qid_val = val_df['qid_numeric'].values

# Predict Using the Fitted AdaRank Model
# Assume your AdaRank model has already been trained and is available as 'model'
y_val_pred = model.predict(X_val, qid_val)

# Add the predictions to the DataFrame for inspection.
val_df['y_pred'] = y_val_pred
""" # Print Example Predictions Grouped by Query
print("Validation Predictions:")
for query, group in val_df.groupby('query'):
    print(f"\nQuery: {query}")
    # Adjust column names as needed; here we print the document 'long_common_name' and its prediction.
    print(group[['long_common_name', 'y_pred']]) """

Validation Predictions:

Query: Carbon volume
                                      long_common_name    y_pred
200        Creatinine [Mass/volume] in Serum or Plasma  1.390240
201           Calcium [Mass/volume] in Serum or Plasma  1.421104
202  Carbon dioxide, total [Moles/volume] in Serum ...  1.726032
203                                   Respiratory rate  1.150598
204   Bilirubin.total [Mass/volume] in Serum or Plasma  1.406505
..                                                 ...       ...
395                  Calcium [Mass/volume] in Specimen  1.550551
396  Base excess standard in Arterial blood by calc...  1.203031
397  Bacteria identified in Bone by Anaerobe+Aerobe...  0.765750
398            Creatinine [Mass/time] in 24 hour Urine  1.057399
399         Blasts/Leukocytes in Blood by Manual count  0.938901

[200 rows x 2 columns]

Query: Creatinine in Blood or Urine
                                      long_common_name    y_pred
0          Creatinine [Mass/volume] in Serum or 

In [None]:
# Function to assign ranking within each query group
def assign_ranking(group):
    group = group.sort_values(by='y_pred', ascending=False).copy()
    group['AdaRank Ranking'] = range(1, len(group) + 1)
    return group

# Create a dictionary where each key is a query and value is the ranked DataFrame for that query
query_groups = {query: assign_ranking(group) for query, group in val_df.groupby('query')}

# Write each query's results to a separate sheet in an Excel file
output_file = 'Results/validation_results_by_query.xlsx'
with pd.ExcelWriter(output_file) as writer:
    for query, df_group in query_groups.items():
        # Excel sheet names can have a maximum of 31 characters, so we truncate if necessary
        sheet_name = query if len(query) <= 31 else query[:31]
        df_group.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Validation results saved to {output_file}")

Validation results saved to validation_results_by_query.xlsx
