In [1]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from scipy.sparse import save_npz, load_npz
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from adarank import AdaRank
from adarankv2 import AdaRankv2
from metrics import NDCGScorer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Path to the Excel workbook; it is read sheet by sheet, with each sheet name used as the query text
file_path = 'loinc_ranks_query_moreterms_v2.xlsx'

# Load the pretrained sentence-embedding model.
# A medical-specific model is available on Hugging Face: https://huggingface.co/ls-da3m0ns/bge_large_medical ('ls-da3m0ns/bge_large_medical').
# To improve performance, the model can be switched to a more general and lightweight one ("all-MiniLM-L6-v2"), which is the one active below.
# embedder = SentenceTransformer('ls-da3m0ns/bge_large_medical')
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
# Cache of text -> embedding, tied to the model that produced the vectors. The same
# query text repeats on every candidate row, so without a cache it is re-encoded
# once per row and per compared field.
_embedding_cache = {"model": None, "vectors": {}}


def _embed_text(text):
    """Return the embedding of ``text`` from the global ``embedder``, encoding each distinct string once per model."""
    # Drop cached vectors if the notebook has swapped in a different embedder since the last call.
    if _embedding_cache["model"] is not embedder:
        _embedding_cache["model"] = embedder
        _embedding_cache["vectors"] = {}
    vectors = _embedding_cache["vectors"]
    if text not in vectors:
        vectors[text] = embedder.encode([text])[0]
    return vectors[text]


# Function to safely calculate similarity between a query and a field
def calculate_embedding_similarity(query, field):
    """Cosine similarity between the embeddings of ``query`` and ``field``.

    Parameters
    ----------
    query, field : scalar
        Values to compare; each is converted with ``str()`` before encoding.

    Returns
    -------
    float
        ``0.0`` when either value is missing (``pd.isna``), otherwise the cosine
        similarity of the two embedding vectors.
    """
    if pd.isna(query) or pd.isna(field):
        return 0.0
    query_embedding = _embed_text(str(query))
    field_embedding = _embed_text(str(field))
    return cosine_similarity([query_embedding], [field_embedding])[0][0]


In [4]:
# Read the workbook and merge all sheets into one dataframe.
# The context manager closes the workbook's file handle once every sheet has been read.
with pd.ExcelFile(file_path) as xls:
    # Each sheet name is used as the query text for all rows of that sheet
    dataframes = [
        pd.read_excel(xls, sheet_name=sheet_name).assign(query=sheet_name)
        for sheet_name in xls.sheet_names
    ]
merged_df = pd.concat(dataframes, ignore_index=True)

# ------------------------------
# Compute lexical similarity for long_common_name

# For each row, compute the TF-IDF cosine similarity between the query and the long_common_name.
# Missing names are treated as empty text (similarity 0) instead of failing inside the vectorizer.
name_text = merged_df['long_common_name'].fillna('')

# Build a TF-IDF vectorizer fitted on the union of all queries and names.
corpus = pd.concat([merged_df['query'], name_text])
# norm='l2' is the default; it is spelled out because the row-wise dot product below
# equals the cosine similarity only when every row has unit length.
vectorizer = TfidfVectorizer(norm='l2')
vectorizer.fit(corpus)

# Transform the query and long_common_name columns
X_query = vectorizer.transform(merged_df['query'])
X_name_tfidf = vectorizer.transform(name_text)

# Row-wise cosine similarity in a single sparse operation: element-wise product of the
# two L2-normalised matrices, summed per row (replaces one cosine_similarity call per row).
cosine_sim = np.asarray(X_query.multiply(X_name_tfidf).sum(axis=1)).ravel()
merged_df['name_cosine_sim'] = cosine_sim

# Now use the computed cosine similarity as the feature for the name field.
X_name_lexical_similarity = csr_matrix(merged_df[['name_cosine_sim']].values)

# ------------------------------
# Compute semantic (embedding-based) similarity between the query and the
# long_common_name, property, system and component fields

# Output column -> source column whose text is compared against the query
similarity_sources = {
    'name_similarity': 'long_common_name',
    'property_similarity': 'property',
    'system_similarity': 'system',
    'component_similarity': 'component',
}
# Score every row with the embedding-based similarity function, one field at a time
for similarity_col, source_col in similarity_sources.items():
    merged_df[similarity_col] = merged_df.apply(
        lambda row: calculate_embedding_similarity(row['query'], row[source_col]),
        axis=1,
    )

# Wrap each score column as a single-column sparse matrix
X_name_semantic_similarity = csr_matrix(merged_df[['name_similarity']].values)
X_property_similarity = csr_matrix(merged_df[['property_similarity']].values)
X_system_similarity = csr_matrix(merged_df[['system_similarity']].values)
X_component_similarity = csr_matrix(merged_df[['component_similarity']].values)

# ------------------------------
# Process numerical feature: 'rank' using StandardScaler
# Treat both 0 and missing (NaN) ranks as very high ranks by replacing them with
# the largest observed rank + 1. Previously only 0 was replaced, so NaN values
# would have passed through the scaler into the feature matrix.
max_rank = merged_df['rank'].max()
merged_df['rank'] = merged_df['rank'].fillna(0).replace(0, max_rank + 1)

# NOTE(review): the scaler is fitted on every row, including rows that the later
# train/test split assigns to the test set.
scaler = StandardScaler()
X_rank = scaler.fit_transform(merged_df[['rank']])
X_rank_sparse = csr_matrix(X_rank)

# ------------------------------
# Combine all features into one sparse matrix.
# The format is requested explicitly as CSR because later cells select rows with
# X[train_idx] / X[test_idx]; hstack's default only promises "an appropriate format".
# X_property_similarity is computed earlier but is not part of this feature set.
X = hstack(
    [
        X_name_lexical_similarity,
        X_name_semantic_similarity,
        X_system_similarity,
        X_component_similarity,
        X_rank_sparse,
    ],
    format='csr',
)
# Names of the columns of X, in the same order -> used later during evaluation
feature_names = [
    'name_lexical_sim',
    'name_semantic_sim',
    'system_semantic_similarity',
    'component_semantic_similarity',
    'rank',
]
assert X.shape[1] == len(feature_names), "feature_names must list one name per column of X"

# Relevance labels, one per row
y = merged_df['relevant'].values
# AdaRank needs integer query ids: map every distinct query string to an integer code
query_codes, _ = pd.factorize(merged_df['query'])
merged_df['qid_numeric'] = query_codes
qid = merged_df['qid_numeric'].values

In [5]:
# Make sure the output directory exists; the writes below fail when it is missing
os.makedirs('idxdata', exist_ok=True)

# Save the enriched dataframe for later retrieval
merged_df.to_csv('idxdata/indexed_dataset.csv', index=False)
# Save the feature matrix X to a file for later retrieval
save_npz('idxdata/X_sparse.npz', X)
# Save the labels y to a file for later retrieval
np.save('idxdata/y.npy', y)
# Save the query identifiers qid to a file for later retrieval
np.save('idxdata/qid.npy', qid)

In [6]:
# Reload the artefacts written by the previous cell
merged_df = pd.read_csv('idxdata/indexed_dataset.csv')
# Names of the columns of X, in order
feature_names = [
    'name_lexical_sim',
    'name_semantic_sim',
    'system_semantic_similarity',
    'component_semantic_similarity',
    'rank',
]
# load_npz returns the matrix in whatever sparse format it was stored in;
# convert to CSR because the following cells select rows with X[train_idx].
X = load_npz('idxdata/X_sparse.npz').tocsr()
y = np.load('idxdata/y.npy')
qid = np.load('idxdata/qid.npy')

# Guard against stale or mismatched files on disk
assert X.shape[0] == len(y) == len(qid) == len(merged_df), "saved artefacts are not row-aligned"
assert X.shape[1] == len(feature_names), "feature_names must list one name per column of X"

In [8]:
# Query-aware train/test splitting: rows are assigned to train or test by query group
# The random state is left unset (None) on purpose because the dataset is small,
# so every run draws a different split
splitter = GroupShuffleSplit(test_size=0.2, random_state=None)
train_idx, test_idx = next(splitter.split(X, y, groups=qid))
X_train, y_train, qid_train = X[train_idx], y[train_idx], qid[train_idx]
X_test, y_test, qid_test = X[test_idx], y[test_idx], qid[test_idx]

# DataFrames holding the rows of each partition
train_df = merged_df.iloc[train_idx].copy()
test_df = merged_df.iloc[test_idx].copy()

# Debugging prints
print("Unique queries in train:", np.unique(qid_train))
print("Unique queries in test:", np.unique(qid_test))
print("Unique queries and their frequencies:", np.unique(qid, return_counts=True))

#print("X_train:\n", X_train.toarray())
#print("y_train:", y_train)

# Basic statistics.
# These confirm that all selected features and the label have a meaningful variance/std dev,
# so no feature selection is applied beforehand.
X_train_dense = X_train.toarray()
print("X_train summary stats (mean, std):", X_train_dense.mean(axis=0), X_train_dense.std(axis=0))
#print("y_train distribution:", np.unique(y_train, return_counts=True))
print("y_train summary stats (mean, std):", y_train.mean(axis=0), y_train.std(axis=0))

# ------------------------------
# Train and evaluate AdaRank
model = AdaRankv2(max_iter=100, estop=10, verbose=True, scorer=NDCGScorer(k=5))
model.fit(X_train, y_train, qid_train)

# The predict call takes no k, so score the test set once instead of once per cutoff.
y_pred = model.predict(X_test, qid_test)

# Test NDCG for different values of k
for k in (1, 2, 3, 4, 5, 10, 20):
    score = NDCGScorer(k=k)(y_test, y_pred, qid_test).mean()
    print(f"NDCG Score {score}, K {k}")

Unique queries in train: [1 2 3 4]
Unique queries in test: [0]
Unique queries and their frequencies: (array([0, 1, 2, 3, 4]), array([ 97,  97,  97, 100,  99]))
X_train summary stats (mean, std): [0.07249098 0.23106312 0.0455172  0.16608415 0.13096125] [0.12270613 0.20418464 0.05759793 0.2104454  0.97543962]
y_train summary stats (mean, std): 0.1297709923664122 0.3360513084435899
Iteration 1: train 1.0000


ValueError: shapes (4,1) and (4,) not aligned: 1 (dim 1) != 4 (dim 0)

In [None]:
# Showcase some predictions from the test set

# Test-set rows with the true label and the model score attached
test_df = merged_df.iloc[test_idx].assign(y_true=y_test, y_pred=y_pred)

# How many distinct queries ended up in the test set
unique_test_queries = np.unique(qid_test)
print("Number of unique queries in test:", len(unique_test_queries))
print(unique_test_queries)

all_queries = merged_df['query'].unique()
print("Number of distinct queries overall:", len(all_queries))
print(all_queries)

# For each query in the test set, print the top 10 predictions (highest predicted score first).
# The model seems to predict well; however, this could be due to the small dataset and overfitting.
print("Top Predictions per Query:")
shown_columns = ['long_common_name', 'y_true', 'y_pred']
separator = "-" * 40
for query, group in test_df.groupby('query'):
    top_rows = group.sort_values(by='y_pred', ascending=False).head(10)
    print(f"Query: {query}")
    print(top_rows[shown_columns])
    print(separator)

Number of unique queries in test: 1
[4]
Number of distinct queries overall: 5
['glucose in blood' 'bilirubin in plasma' 'White blood cells count'
 'cholesterol in blood' 'PrThr calcium oxalate crystals']
Top Predictions per Query:
Query: PrThr calcium oxalate crystals
                                      long_common_name  y_true    y_pred
392  Calcium oxalate dihydrate crystals [Presence] ...       1  3.696418
391  Calcium oxalate crystals [Presence] in Urine s...       1  3.554758
393  Calcium oxalate crystals [Presence] in Urine b...       1  3.255568
394  Calcium oxalate crystals [Presence] in Urine s...       1  3.086415
395  Calcium oxalate dihydrate crystals [Presence] ...       1  3.081308
398  Calcium oxalate monohydrate crystals [Presence...       1  3.033698
396  Calcium oxalate dihydrate crystals [Presence] ...       1  3.001708
397  Calcium oxalate dihydrate crystals [Presence] ...       1  2.746387
399  Calcium oxalate monohydrate crystals [Presence...       1  2.712515
4

In [None]:
# Check how the model weights the features.
# model.coef_ is paired with feature_names by position; the former
# `coef = model.get_coef` line only stored an attribute without calling it and was
# overwritten by the loop variable right away, so it is dropped.
for name, coef in zip(feature_names, model.coef_):
    print(f"Feature: {name}, Importance (coef): {coef:.4f}")

Feature: name_lexical_sim, Importance (coef): 9.5569
Feature: name_semantic_sim, Importance (coef): 0.0000
Feature: system_semantic_similarity, Importance (coef): 0.0000
Feature: component_semantic_similarity, Importance (coef): 0.0000
Feature: rank, Importance (coef): 0.0000


In [None]:
# Additional free-text queries; presumably meant for manual validation of the model in
# later cells — TODO confirm, they are not used in the visible part of the notebook.
val_query_1 = 'thyroid in blood'
val_query_2 = 'MSCnc ser plasma'

