In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score, classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2. Load the glossary
glossary_df = pd.read_csv("../data/1b_glossary_descriptions.csv")            # your glossary CSV

# Rows usable by every method below: both the term and its description are present.
# Filtering once (instead of calling dropna() on each derived list separately) keeps the
# three lists the same length and in the same order, so position i refers to the same
# glossary entry in all of them. glossary_df itself is left untouched.
glossary_complete = glossary_df.dropna(subset=['Glossary', 'Description'])

# Method 1: the bare glossary term
glossary_terms = glossary_complete['Glossary'].tolist()

# Method 2: term and description joined into one sentence
glossary_full = (
    glossary_complete['Glossary'] + ' can be defined as ' + glossary_complete['Description']
).tolist()

# Method 3: the description only
glossary_desc = glossary_complete['Description'].tolist()

print(glossary_full[0])

Assets can be defined as Resources owned by a company (e.g., cash, inventory, equipment).


In [3]:
def build_full_text(row):
    """Build the text that gets embedded for one glossary row.

    Args:
        row: Mapping-like row (for example a DataFrame row) with the keys
            'Glossary', 'Description' and 'Formulas, if any'.

    Returns:
        str: "<Glossary> can be defined as <Description>", followed by
        " Its Formula is:  <formula>" when the row has a non-blank formula.
        A missing or blank description is rendered as empty text instead of
        the literal "nan".
    """
    def _has_text(value):
        # Missing cells show up as NaN/None; whitespace-only strings carry no content either.
        return bool(pd.notnull(value)) and str(value).strip() != ''

    # The "<term> can be defined as " prefix is always kept, because the matching loop
    # recovers the term by splitting the text on ' can be defined as '.
    description = row['Description'] if _has_text(row['Description']) else ''
    text = f"{row['Glossary']} can be defined as {description}"

    formula = row['Formulas, if any']
    if _has_text(formula):
        text += f" Its Formula is:  {formula}"
    return text

# Build one text per glossary row as a plain list (not a pandas Series): positions in the
# list then match the rows of the embedding matrix computed from it, independent of
# glossary_df's index labels.
glossary_full = [build_full_text(row) for _, row in glossary_df.iterrows()]
print(glossary_full[0])


Assets can be defined as Resources owned by a company (e.g., cash, inventory, equipment).


In [38]:
# 3. Load your NL queries alongside their ground-truth glossary terms
#    Input CSV must have columns: 'NL_Query' and 'GT_Glossary'
#queries_df = pd.read_csv("../data/1a_simpler_dataset_noisy_nl_to_glossary_gt.csv")

queries_df = pd.read_csv("../data/1a_tougher_dataset_nl_to_glossary_gt.csv")

# Fail fast with a clear message if the CSV lacks the columns the matching loop reads,
# instead of surfacing later as a KeyError in the middle of the loop.
missing_columns = {'NL_Query', 'GT_Glossary'} - set(queries_df.columns)
if missing_columns:
    raise ValueError(f"Query CSV is missing required column(s): {sorted(missing_columns)}")

In [39]:
# 4. Initialize the embedding model
# Uncomment one of the alternative checkpoint names to compare bi-encoders.
embedding_model_name = 'BAAI/bge-large-en-v1.5'
#embedding_model_name = 'mixedbread-ai/mxbai-embed-large-v1'
#embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_name)

In [40]:
# 5. Pre-compute embeddings for all glossary terms
# list() hands encode() a positional list of texts even when glossary_full is a pandas
# Series, so row i of term_embeddings corresponds to the i-th text.
term_embeddings = model.encode(list(glossary_full), convert_to_tensor=True, normalize_embeddings=True)

In [48]:

# Cross-encoder that rescores the retrieved glossary candidates in the matching loop
reranker_path = "../models/stage1_cross_encoder_finetuned_MiniLM_noisyhardnegative_v3_withdesc"
reranker = CrossEncoder(reranker_path)

In [49]:
# 6. Iterate through each query and find the best matching glossary term.
#    Per query: retrieve the top-k glossary texts by cosine similarity, rescore them with
#    the cross-encoder, and keep the candidate with the highest combined score.
top_k = 5            # how many retrieved candidates to pass to the reranker
sim_weight = 0.5     # weight of the cosine similarity in the combined score
rerank_weight = 0.5  # weight of the cross-encoder score in the combined score

# Positional copy of the candidate texts, so the indices returned by torch.topk are valid
# lookups whether glossary_full is a list or a pandas Series.
candidate_texts = list(glossary_full)
if not candidate_texts:
    raise ValueError("glossary_full is empty: there are no glossary entries to match against.")
if len(candidate_texts) != len(term_embeddings):
    # Guards against stale notebook state (glossary_full rebuilt without re-encoding).
    raise ValueError(
        f"term_embeddings has {len(term_embeddings)} rows but glossary_full has "
        f"{len(candidate_texts)} texts; re-run the embedding cell."
    )

# torch.topk raises if k exceeds the number of candidates, so clamp it.
effective_k = min(top_k, len(candidate_texts))

results = []
y_true = []
y_pred = []
for _, row in queries_df.iterrows():
    query_text = row['NL_Query']
    original = row['GT_Glossary']

    # Embed the query and compare it with every glossary embedding
    q_emb = model.encode(query_text, convert_to_tensor=True, normalize_embeddings=True)
    sims = util.cos_sim(q_emb, term_embeddings)

    # Stage 1: keep the most similar candidates
    top_hits = torch.topk(sims[0], k=effective_k)
    top_k_indices = top_hits.indices.tolist()
    top_k_sims = top_hits.values.tolist()
    top_k_full = [candidate_texts[i] for i in top_k_indices]

    # Stage 2: rescore each (query, candidate text) pair with the cross-encoder
    pairs = [(query_text, term) for term in top_k_full]
    rerank_scores = reranker.predict(pairs)

    # Weighted combination of both signals, as plain Python floats.
    # NOTE(review): the combined scores recorded in this notebook's output are well above 1,
    # while cosine similarity is at most 1, so the cross-encoder term appears to dominate
    # this equal-weight sum. Consider mapping the reranker scores to a comparable range
    # before fusing — confirm against the reranker's output activation.
    final_scores = [
        sim_weight * sim + rerank_weight * float(rerank)
        for sim, rerank in zip(top_k_sims, rerank_scores)
    ]

    best_idx = torch.tensor(final_scores).argmax().item()
    # The term is the part of the candidate text before the ' can be defined as ' phrase.
    predicted = top_k_full[best_idx].split(' can be defined as ')[0]
    score = final_scores[best_idx]

    y_true.append(original)
    y_pred.append(predicted)

    results.append({
        'NL_Query': query_text,
        'GT_Glossary': original,
        'Predicted_Glossary': predicted,
        'Similarity_Score': round(score, 4)
    })


In [50]:
# 7. Collect the per-query records into a table and preview the first 50 rows
results_df = pd.DataFrame(data=results)
results_df.iloc[:50]

Unnamed: 0,NL_Query,GT_Glossary,Predicted_Glossary,Similarity_Score
0,How much money did we bring in from sales this...,Revenue,Revenue,2.9094
1,What did it cost us to make the stuff we sold ...,Cost of Goods Sold (COGS),Cost of Goods Sold (COGS),3.1195
2,"After paying for production, how much did we k...",Gross Profit,Earnings Per Share (EPS),2.6983
3,What are we spending to keep the business runn...,Operating Expenses,Operating Expenses,4.0959
4,How much did our main operations earn us in th...,Operating Profit (EBIT),Operating Profit Margin,3.646
5,"What’s our profit before taxes, interest, and ...",EBITDA,Operating Profit (EBIT),4.1577
6,"After all expenses, how much did we end up wit...",Net Profit (PAT),Net Profit (PAT),4.1849
7,How much profit did each share make last quarter?,Earnings Per Share (EPS),Earnings Per Share (EPS),4.3222
8,How much cash came in from our operations in A...,Operating Cash Flow,Operating Cash Flow,2.6542
9,What did we spend on new equipment or investme...,Investing Cash Flow,Investing Cash Flow,2.7964


In [51]:
# 8. Evaluate: exact-match accuracy and macro-averaged F1 over the predicted labels
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average="macro")

for metric_name, metric_value in (("Accuracy", accuracy), ("Macro F1", f1_macro)):
    print(f"{metric_name} (labels): {metric_value:.4f}")

# Optional per-class breakdown:
#print("\nClassification Report (labels):\n")
#print(classification_report(y_true, y_pred, digits=4))

Accuracy (labels): 0.4900
Macro F1 (labels): 0.4083


In [30]:
# Save the results to a CSV file
# NOTE(review): the file name says "easydata", but the query-loading cell currently reads
# the "tougher" dataset — confirm the name matches the loaded dataset before overwriting
# results from an earlier run.
output_path = Path('../results/25May_experiments/stage1_nl2glossary_easydata_rerankerfinetuned.csv')
# to_csv does not create missing folders, so make sure the target directory exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)