In [1]:
from pathlib import Path

import pandas as pd
import torch
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 1 & 2. Load the input tables ---
# The three CSVs are read in this order:
#   df               : glossary terms with their ground-truth grouping label
#   df_glossary_desc : glossary terms with a textual description
#   grouping_df      : master table of grouping labels (source of all candidates)
df, df_glossary_desc, grouping_df = map(
    pd.read_csv,
    (
        "../data/2_glossary_to_label_gt.csv",
        "../data/1b_glossary_descriptions.csv",
        "../data/fbi_grouping_master.csv",
    ),
)

# Every grouping_label in the master table is a candidate prediction.
all_labels = grouping_df["grouping_label"].to_list()

In [3]:
# Attach the description of each glossary term to the ground-truth rows.
# how="left" keeps every row of df; terms without a matching description get NaN.
description_lookup = df_glossary_desc.loc[:, ["Glossary", "Description"]]
df_merged = pd.merge(df, description_lookup, how="left", on="Glossary")
df_merged.head()

Unnamed: 0,Glossary,ground_label,grouping_id,Description
0,Assets,Total Assets,5300004,"Resources owned by a company (e.g., cash, inve..."
1,Liabilities,(Current liabilities) + (Non-current liablilit...,"(5300177),(5300146)",Obligations or debts the company must pay (e.g...
2,Equity,Equity,5300007,Owners’ interest in the company (Assets - Liab...
3,Current Assets,Current assets,5300033,Assets expected to be used or converted to cas...
4,Non-Current Assets,Non-current assets,5300031,"Long-term assets (e.g., property, plant, and e..."


In [4]:
# --- 3. Initialize embedding model ---
# Bi-encoder used to embed both the glossary queries and the grouping labels.
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Cross-encoder used to rerank the bi-encoder candidates.
# Active checkpoint: hard-negative fine-tune v2 (5 epochs, 120 samples).
# Other checkpoints that can be swapped in for comparison:
#   - "cross-encoder/ms-marco-MiniLM-L-6-v2"
#   - "../models/stage2_cross_encoder_finetuned_miniLM_hardnegative"
RERANKER_PATH = "../models/stage2_cross_encoder_finetuned_MiniLM_hardnegative_v2"
reranker = CrossEncoder(RERANKER_PATH)

In [6]:
# --- 4. Embed all grouping_labels once ---
# Computed a single time up front so that each query only needs one extra encode call.
label_embs = model.encode(
    all_labels,
    convert_to_tensor=True,
)

In [7]:
# --- 5. For each glossary term, find the best grouping_label ---
TOP_K = 5            # number of bi-encoder candidates handed to the reranker
SIM_WEIGHT = 0.6     # weight of the bi-encoder cosine similarity in the final score
RERANK_WEIGHT = 0.4  # weight of the cross-encoder score in the final score


def predict_grouping_label(query, top_k=TOP_K):
    """Return the grouping label that best matches ``query``.

    Two-stage matching:
      1. Retrieve the ``top_k`` labels whose embeddings have the highest cosine
         similarity to the query embedding.
      2. Score each (query, label) pair with the cross-encoder and pick the label
         with the highest ``SIM_WEIGHT * cosine + RERANK_WEIGHT * rerank`` score.

    Parameters
    ----------
    query : str
        Text to match (the glossary term).
    top_k : int, default TOP_K
        Maximum number of candidates to rerank; clamped to ``len(all_labels)``.

    Returns
    -------
    The winning element of ``all_labels``.

    Raises
    ------
    ValueError
        If ``all_labels`` is empty, since there is nothing to choose from.
    """
    if not all_labels:
        raise ValueError("all_labels is empty; cannot predict a grouping label")

    # Stage 1: cosine similarity between the query and every label embedding.
    q_emb = model.encode(query, convert_to_tensor=True)
    sims = util.cos_sim(q_emb, label_embs)[0]

    # torch.topk raises if k exceeds the number of labels, so clamp it.
    k = min(top_k, len(all_labels))
    top = torch.topk(sims, k=k)
    top_k_labels = [all_labels[i] for i in top.indices.tolist()]
    top_k_sims = top.values.tolist()

    # Stage 2: rerank the shortlisted candidates with the cross-encoder.
    pairs = [(query, label) for label in top_k_labels]
    rerank_scores = reranker.predict(pairs)

    # Weighted blend of both stages.
    # NOTE(review): this assumes the cross-encoder scores are on a scale comparable
    # to cosine similarity - confirm against the reranker's output activation.
    final_scores = [
        SIM_WEIGHT * sim + RERANK_WEIGHT * rerank
        for sim, rerank in zip(top_k_sims, rerank_scores)
    ]
    best_rerank_idx = torch.tensor(final_scores).argmax().item()
    return top_k_labels[best_rerank_idx]


# Only the glossary term itself is used as the query (the description is not appended).
predicted_labels = [predict_grouping_label(term) for term in df_merged["Glossary"]]
df_merged["predicted_label"] = predicted_labels

# Build the label -> grouping_id mapping once instead of rescanning grouping_df per row.
# setdefault keeps the first id seen for a label, i.e. the first matching row.
label_to_grouping_id = {}
for label, grouping_id in zip(grouping_df["grouping_label"], grouping_df["grouping_id"]):
    label_to_grouping_id.setdefault(label, grouping_id)

# Labels missing from the master table map to None.
df_merged["predicted_grouping_id"] = [
    label_to_grouping_id.get(label) for label in df_merged["predicted_label"]
]

In [8]:
# --- 6. Evaluate at label level ---
# A prediction counts as correct only when it equals the ground-truth label exactly.
y_true, y_pred = df_merged["ground_label"], df_merged["predicted_label"]

accuracy = accuracy_score(y_true, y_pred)
f1_macro, f1_micro = (
    f1_score(y_true, y_pred, average=averaging) for averaging in ("macro", "micro")
)

print(f"Accuracy (labels): {accuracy:.4f}")
print(f"Macro F1 (labels): {f1_macro:.4f}")
# f1_micro is computed but intentionally not printed; for a per-label breakdown use
# classification_report(y_true, y_pred, digits=4).

Accuracy (labels): 0.6207
Macro F1 (labels): 0.4530


In [9]:
# Show up to the first 50 rows to compare ground truth against predictions.
df_merged.iloc[:50]

Unnamed: 0,Glossary,ground_label,grouping_id,Description,predicted_label,predicted_grouping_id
0,Assets,Total Assets,5300004,"Resources owned by a company (e.g., cash, inve...",Total Assets,5300004.0
1,Liabilities,(Current liabilities) + (Non-current liablilit...,"(5300177),(5300146)",Obligations or debts the company must pay (e.g...,Current liabilities,5300177.0
2,Equity,Equity,5300007,Owners’ interest in the company (Assets - Liab...,Equity,5300007.0
3,Current Assets,Current assets,5300033,Assets expected to be used or converted to cas...,Current assets,5300033.0
4,Non-Current Assets,Non-current assets,5300031,"Long-term assets (e.g., property, plant, and e...",Non-current assets,5300031.0
5,Current Liabilities,Current liabilities,5300177,Obligations due within one year.,Current liabilities,5300177.0
6,Non-Current Liabilities,Non-current liabilities,5300146,"Long-term obligations (e.g., long-term loans).",Non-current liabilities,5300146.0
7,Working Capital,Current Assets - Current Liabilities.,"(5300033),(5300177)",Current Assets - Current Liabilities.,Add/Less: Decrease/(increase) in worki...,5301162.0
8,Net Worth,total assets - total liabilities,"(5300004),(5300177),(5300146)",Another term for equity,Net Value,5300030.0
9,Revenue,Revenue from operations,5300260,Income from sales of goods or services.,Revenue as per contrac...,5300266.0


In [10]:
# --- 7. Save enriched CSV ---
path = "../results/25May_1b/stage2_glossary_to_label_finetunedreranker_weighedscoring.csv"

# DataFrame.to_csv does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
Path(path).parent.mkdir(parents=True, exist_ok=True)

df_merged.to_csv(path, index=False)
print("✅ Saved " + path)

✅ Saved ../results/25May_1b/stage2_glossary_to_label_finetunedreranker_weighedscoring.csv
