In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
from sentence_transformers import SentenceTransformer
from adarankv2 import AdaRank2
from adarank import AdaRank
from metrics import NDCGScorer
from sklearn.preprocessing import LabelEncoder


# Path to the Excel workbook that holds the ranking data.
# Each sheet is read later as one query (the sheet name is used as the query text).
file_path = 'loinc_ranks_query_moreterms.xlsx'
# Load the pretrained sentence-embedding model used for the similarity features.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:
# Function to safely calculate similarity between a query and a field
def calculate_embedding_similarity(query, field):
    """Return the cosine similarity between the embeddings of two texts.

    Parameters
    ----------
    query, field : object
        Values to compare. Each one is converted with ``str()`` and embedded
        with the module-level ``embedder``.

    Returns
    -------
    float
        Cosine similarity of the two embeddings, or ``0.0`` when either
        input is missing according to ``pd.isna``.
    """
    # A missing value has no text to embed; score it as "no similarity".
    # Return a float (not the int 0) so every path yields the same type.
    if pd.isna(query) or pd.isna(field):
        return 0.0
    query_embedding = embedder.encode([str(query)])[0]
    field_embedding = embedder.encode([str(field)])[0]
    # cosine_similarity returns a 1x1 matrix for two single vectors;
    # unwrap it to a plain Python float.
    return float(cosine_similarity([query_embedding], [field_embedding])[0][0])


In [6]:
# Read every sheet of the workbook and stack them into one DataFrame.
# The workbook is opened in a `with` block so its file handle is closed
# as soon as all sheets have been parsed.
dataframes = []
with pd.ExcelFile(file_path) as xls:
    for sheet_name in xls.sheet_names:
        temp_df = pd.read_excel(xls, sheet_name=sheet_name)
        temp_df['query'] = sheet_name  # Each sheet name is used as the query text
        dataframes.append(temp_df)
# ignore_index=True renumbers the rows 0..n-1 across all sheets.
merged_df = pd.concat(dataframes, ignore_index=True)

# ------------------------------
# Compute cosine similarity using TF-IDF
# For each row, compute the cosine similarity between the query and the long_common_name.
# Missing names are treated as empty text so the vectorizer does not fail on NaN;
# an empty text produces an all-zero vector and therefore a similarity of 0.
query_text = merged_df['query'].fillna('').astype(str)
name_text = merged_df['long_common_name'].fillna('').astype(str)

# Fit one TF-IDF vocabulary on the union of all queries and names so both
# columns are projected into the same vector space.
corpus = pd.concat([query_text, name_text])
vectorizer = TfidfVectorizer(norm='l2')  # 'l2' is the default; stated because the dot product below relies on it
vectorizer.fit(corpus)

# Transform the query and long_common_name columns
X_query = vectorizer.transform(query_text)
X_name_tfidf = vectorizer.transform(name_text)

# Row-wise cosine similarity in a single sparse operation: with L2-normalised
# rows the cosine similarity equals the dot product, i.e. the sum of the
# element-wise product of the two rows.
cosine_sim = np.asarray(X_query.multiply(X_name_tfidf).sum(axis=1)).ravel()
merged_df['name_cosine_sim'] = cosine_sim

# Now use the computed cosine similarity as the feature for the name field.
X_name = csr_matrix(merged_df[['name_cosine_sim']].values)

# ------------------------------
# Process additional features
# loinc_num is not used as a feature because the queries are natural-language text.

# Embedding-based similarity features. Each entry maps an output column to the
# text column that is compared against the row's query.
# NOTE(review): 'measurement_similarity' is computed from 'long_common_name';
# confirm that this is the intended source column.
embedding_feature_sources = {
    'measurement_similarity': 'long_common_name',
    'system_similarity': 'system',
    'component_similarity': 'component',
}
for feature_column, source_column in embedding_feature_sources.items():
    # Bind source_column as a default argument so each lambda keeps its own column.
    merged_df[feature_column] = merged_df.apply(
        lambda row, source=source_column: calculate_embedding_similarity(row['query'], row[source]),
        axis=1)

# Wrap each similarity column as a single-column sparse matrix
X_measurement_similarity, X_system_similarity, X_component_similarity = [
    csr_matrix(merged_df[[feature_column]].values)
    for feature_column in embedding_feature_sources
]

# ------------------------------
# Process numerical feature: 'rank' (scaled)
# NOTE(review): the scaler is fit on every row, before the train/test split
# made later in the notebook, so test rows influence the scaling statistics.
scaler = StandardScaler()
X_rank = scaler.fit_transform(merged_df[['rank']])
X_rank_sparse = csr_matrix(X_rank)

# ------------------------------
# Combine all features into one sparse matrix.
# Here, we use the computed cosine similarity (X_name) not the full TF-IDF matrix.
# format='csr' is requested explicitly because X is later sliced by row index
# (X[train_idx] / X[test_idx]); without it the output format is chosen by
# scipy and may be COO, which does not support indexing.
X = hstack([X_name, X_measurement_similarity, X_system_similarity,
            X_component_similarity, X_rank_sparse], format='csr')

# Labels
y = merged_df['inSearch'].values
# Query identifiers: pd.factorize maps each distinct query string to an
# integer code; the codes are stored on the frame and exposed as an array.
query_codes, _query_labels = pd.factorize(merged_df['query'])
merged_df['qid_numeric'] = query_codes
qid = merged_df['qid_numeric'].values

In [8]:

# ------------------------------
# Query-aware train/test splitting
# groups=qid keeps all rows of a query on the same side of the split.
# A fixed seed makes the split, and every metric computed from it,
# reproducible across kernel restarts.
SPLIT_SEED = 42
splitter = GroupShuffleSplit(test_size=0.2, random_state=SPLIT_SEED)
train_idx, test_idx = next(splitter.split(X, y, groups=qid))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
qid_train, qid_test = qid[train_idx], qid[test_idx]

# Create DataFrames for the train and test sets using the respective indices
train_df = merged_df.iloc[train_idx].copy()
test_df = merged_df.iloc[test_idx].copy()

print("Unique queries in train:", np.unique(qid[train_idx]))
print("Unique queries in test:", np.unique(qid[test_idx]))

print("Unique queries and their frequencies:", np.unique(qid, return_counts=True))

# Densify the training features once and reuse the array for the printouts below.
X_train_dense = X_train.toarray()
print("X_train:\n", X_train_dense)
print("y_train:", y_train)

# Basic statistics to help spot problems in the features or labels
feature_means = np.mean(X_train_dense, axis=0)
feature_stds = np.std(X_train_dense, axis=0)
print("X_train summary stats (mean, std):", feature_means, feature_stds)
label_values_and_counts = np.unique(y_train, return_counts=True)
print("y_train distribution:", label_values_and_counts)
print("y_train summary stats (mean, std):", np.mean(y_train, axis=0), np.std(y_train, axis=0))

# ------------------------------
# Train and evaluate AdaRank
model = AdaRank(max_iter=100, estop=10, verbose=True, scorer=NDCGScorer(k=5))
model.fit(X_train, y_train, qid_train)

# Score the held-out rows once, before the metric loop: the predict call takes
# no k, and y_pred must exist before the first NDCG computation.
y_pred = model.predict(X_test, qid_test)

# Report mean NDCG over the test queries at several cut-offs k.
for k in (1, 2, 3, 4, 5, 10, 20):
    score = NDCGScorer(k=k)(y_test, y_pred, qid_test).mean()
    print(f"NDCG Score {score}, K {k}")

Unique queries in train: [1 2 3]
Unique queries in test: [0]
Unique queries and their frequencies: (array([0, 1, 2, 3]), array([ 97,  97,  97, 100]))
X_train:
 [[ 0.          0.29758579  0.04382028  0.30065224 -0.37530678]
 [ 0.15084318  0.32261932 -0.00544647  0.18759102 -0.46530982]
 [ 0.03663183  0.20796832  0.13492452  0.08134238 -0.41042425]
 ...
 [ 0.01716352  0.18149579  0.01239809 -0.00329014 -0.47089141]
 [ 0.0167173   0.17618269  0.01239809 -0.0077756  -0.47089141]
 [ 0.01817638  0.18151717  0.01239809  0.02372467  0.27983161]]
y_train: [0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

NameError: name 'EPSILON' is not defined

In [None]:
# Build the test-set frame and attach the true labels and predicted scores
test_df = merged_df.iloc[test_idx].assign(y_true=y_test, y_pred=y_pred)

# Debugging checks: which queries ended up in the test split
unique_test_queries = np.unique(qid_test)
print("Number of unique queries in test:", len(unique_test_queries))
print(unique_test_queries)

# ...and which queries exist overall
all_queries = merged_df['query'].unique()
print("Number of distinct queries overall:", len(all_queries))
print(all_queries)

# For each query in the test set, print up to 50 rows ordered by predicted
# score, highest first
report_columns = ['loinc_num', 'long_common_name', 'y_true', 'y_pred']
print("Top Predictions per Query:")
for query, group in test_df.groupby('query'):
    sorted_group = group.sort_values(by='y_pred', ascending=False)
    print(f"Query: {query}")
    print(sorted_group[report_columns].head(50))
    print("-" * 40)

Number of unique queries in test: 1
['bilirubin in plasma']
Number of distinct queries overall: 4
['glucose in blood' 'bilirubin in plasma' 'White blood cells count'
 'cholesterol in blood']
Top Predictions per Query:
Query: bilirubin in plasma
    loinc_num                                   long_common_name  y_true  \
98     1742-6  Alanine aminotransferase [Enzymatic activity/v...       0   
99    20565-8      Carbon dioxide, total [Moles/volume] in Blood       0   
171   35672-5  Bilirubin.direct/Bilirubin.total in Serum or P...       1   
170   14630-8  Bilirubin.indirect [Moles/volume] in Serum or ...       1   
169   35192-4  Bilirubin.indirect [Mass or Moles/volume] in S...       1   
168    1971-1  Bilirubin.indirect [Mass/volume] in Serum or P...       1   
167   15153-0  Deprecated Indirect bilirubin [Mass/volume] in...       1   
166   50189-0  Neonatal bilirubin panel [Mass/volume] - Serum...       1   
165   33899-6  Bilirubin.conjugated+indirect [Moles/volume] i...       