In [None]:
!pip install rank_bm25
!pip install bert-score
!pip install rouge-score

In [7]:
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import re

# Additional Required Imports For Meteor Scores
import nltk
from nltk.translate.meteor_score import meteor_score

# Additional Required Import For BERTScores
from bert_score import score

# Additional Required Import For ROUGE scores
from rouge_score import rouge_scorer

In [3]:
# Load the merged descriptions table into `df` from an absolute path under /content.
# NOTE(review): the file's first column has no header, so pandas names it "Unnamed: 0"
# (visible in the df.head() preview) — it looks like a saved row index; confirm.
df = pd.read_csv('/content/final_merged_datasets.csv')

In [None]:
# The CSV carries an "Unnamed: 0" column that appears to be a saved row counter
# (0, 1, 2, ... in the preview). Comparing on it would make every row look unique,
# so duplicates are detected on the remaining (content) columns only.
content_cols = [c for c in df.columns if not str(c).startswith('Unnamed:')]

# Rebind instead of mutating in place so re-running the cell stays idempotent;
# fall back to all columns if, unexpectedly, no content column is left.
df = df.drop_duplicates(subset=content_cols or None).reset_index(drop=True)
df.head()

Unnamed: 0,dataset_id,few_shot_Description,original_description,autoddg_description,llama_ufd_no_shot_descriptions,search_query
0,nfkx-wd79,This dataset contains metric values from the 2...,Metric values contained in Pre-kindergarten Sc...,This dataset contains information about pre-ki...,This dataset contains metric values from the 2...,education school quality report nyc
1,3279-pp7v,"This dataset, provided by the Mayor's Office o...","Properties remediated under OER's oversight, b...",This dataset contains information about enviro...,"This dataset, OER Cleanup Sites, contains info...",oer oer cleanup sites
2,ht7m-2uh6,This dataset contains the 2010 Census Tracts f...,Census Tracts from the 2010 US Census for New ...,This dataset contains spatial and temporal inf...,This dataset contains the 2010 US Census Tract...,2010 census tracts (water the_geom
3,pjgc-h7uv,This dataset contains case-related information...,Case-related information.,This dataset contains information on civil lit...,This dataset contains case-related information...,city government law nyc
4,n55z-cx8y,This dataset contains information on Single Us...,Parking Permits issued for a particular purpos...,This dataset contains information on NYC parki...,"This dataset, titled Single Use Parking Permit...",dot single use parking permit-


In [None]:
# Number of rows remaining after de-duplication.
len(df)

500

In [None]:
def clean_text(text):
    """Strip boilerplate LLM preambles from a generated description.

    Missing values (anything ``pd.isna`` reports as NA) become an empty string.
    Otherwise the value is converted to ``str`` and a leading
    "Here is a rewritten ...:" and/or "Based on the ...:" preamble
    (case-insensitive, up to the first colon on that line) is removed.

    Args:
        text: Raw cell value; may be a string, NaN/None, or any scalar.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    if pd.isna(text):
        return ""

    # Strip first: the patterns are anchored with ``^``, so a preamble preceded
    # by a stray space or newline would otherwise never be matched.
    cleaned = str(text).strip()

    # Applied in this order so "Here is a rewritten ...: Based on the ...: <body>"
    # is reduced all the way down to <body>.
    preamble_patterns = (
        r"^Here is a rewritten.*?:\s*",
        r"^Based on the.*?:\s*",
    )
    for pattern in preamble_patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=re.IGNORECASE)

    return cleaned.strip()

In [None]:
# Columns holding model-generated descriptions that may start with an LLM preamble.
cols_to_clean = [
    'autoddg_description',
    'llama_ufd_no_shot_descriptions',
    'few_shot_Description',
]

for col in cols_to_clean:
    # Tolerate inputs where one of the generated columns is absent.
    if col not in df.columns:
        continue
    df[col] = df[col].map(clean_text)

# Any value still missing in the remaining columns becomes an empty string.
df = df.fillna("")

In [None]:
def run_benchmark(name, corpus_col, data=None):
    """Evaluate BM25 retrieval of each row's own description via NDCG@k.

    A BM25 index is built over the whitespace-tokenised, lower-cased texts in
    ``corpus_col``. Each row's ``search_query`` is then scored against the whole
    corpus; the only relevant document for query ``i`` is row ``i``. With a
    single relevant document the ideal DCG is 1, so NDCG@k is
    ``1 / log2(rank + 1)`` when the row is ranked within the top ``k`` and 0
    otherwise.

    Args:
        name: Label stored under ``'Model'`` in the result and printed as progress.
        corpus_col: Column whose texts form the BM25 corpus.
        data: Optional DataFrame to evaluate; defaults to the notebook-level ``df``.

    Returns:
        ``None`` if ``corpus_col`` is missing, otherwise a dict with ``'Model'``
        and the mean ``'NDCG@5'``, ``'NDCG@10'``, ``'NDCG@15'``, ``'NDCG@20'``.
    """
    frame = df if data is None else data
    if corpus_col not in frame.columns:
        return None

    print(f"Testing: {name}")
    corpus = frame[corpus_col].tolist()
    queries = frame['search_query'].tolist()

    tokenized_corpus = [doc.lower().split() for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)

    cutoffs = (5, 10, 15, 20)
    deepest = max(cutoffs)
    metrics = {k: [] for k in cutoffs}

    for i, query in enumerate(queries):
        tokenized_query = str(query).lower().split()
        doc_scores = np.asarray(bm25.get_scores(tokenized_query), dtype=float)

        # A score of exactly 0 is what a document gets when it shares no token with
        # the query (e.g. an empty query scores every document 0). Such a row must
        # not be credited: ties are broken by row position, so rows 0..19 would
        # otherwise land in the top 20 without actually having been retrieved.
        rank = None
        if doc_scores[i] != 0:
            # Stable descending order == sorted(..., reverse=True) tie behaviour.
            top_docs = np.argsort(-doc_scores, kind="stable")[:deepest]
            position = np.flatnonzero(top_docs == i)
            if position.size:
                rank = int(position[0]) + 1

        # The rank is the same for every cutoff, so compute it once and compare.
        for k in cutoffs:
            if rank is not None and rank <= k:
                metrics[k].append(1.0 / np.log2(rank + 1))
            else:
                metrics[k].append(0.0)

    return {
        'Model': name,
        'NDCG@5': np.mean(metrics[5]),
        'NDCG@10': np.mean(metrics[10]),
        'NDCG@15': np.mean(metrics[15]),
        'NDCG@20': np.mean(metrics[20])
    }

In [None]:
# Fetch the NLTK data packages, in the same order as before.
for resource in ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def calculate_meteor_avg(df, model_name, candidate_col, reference_col='original_description'):
    """Average the METEOR score of a candidate column against a reference column.

    Both texts are lower-cased and split on whitespace; every candidate is scored
    against the single reference found in the same row.

    Args:
        df: DataFrame holding both columns.
        model_name: Label stored under ``'Model'`` in the result.
        candidate_col: Column with the generated descriptions.
        reference_col: Column with the reference descriptions.

    Returns:
        ``None`` if ``candidate_col`` is missing, otherwise a dict with
        ``'Model'`` and the mean ``'METEOR Score'``.
    """
    if candidate_col not in df.columns:
        return None

    def _tokens(value):
        # Same normalisation for references and candidates.
        return str(value).lower().split()

    pair_scores = [
        meteor_score([_tokens(reference)], _tokens(candidate))
        for reference, candidate in zip(df[reference_col], df[candidate_col])
    ]

    return {'Model': model_name, 'METEOR Score': np.mean(pair_scores)}

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
def calculate_bertscore_avg(df, model_name, candidate_col, reference_col='original_description'):
    """Average the BERTScore F1 of a candidate column against a reference column.

    Args:
        df: DataFrame holding both columns.
        model_name: Label stored under ``'Model'`` in the result.
        candidate_col: Column with the generated descriptions.
        reference_col: Column with the reference descriptions.

    Returns:
        ``None`` if ``candidate_col`` is missing, otherwise a dict with
        ``'Model'`` and the mean F1 under ``'BERTScore'``.
    """
    if candidate_col not in df.columns:
        return None

    def _as_text(column):
        # Cast every cell to str before handing the list to the scorer.
        return df[column].astype(str).tolist()

    candidate_texts = _as_text(candidate_col)
    reference_texts = _as_text(reference_col)

    # Only the F1 tensor is reported; precision and recall are discarded.
    _precision, _recall, f1 = score(candidate_texts, reference_texts, lang="en", verbose=True)
    mean_f1 = f1.mean().item()

    return {'Model': model_name, 'BERTScore': mean_f1}

In [None]:
def calculate_rouge_avg(df, model_name, candidate_col, reference_col='original_description'):
    """Average the ROUGE-L F-measure of a candidate column against a reference column.

    Each row's reference is passed as the scorer's first argument and the
    candidate as its second; stemming is enabled.

    Args:
        df: DataFrame holding both columns.
        model_name: Label stored under ``'Model'`` in the result.
        candidate_col: Column with the generated descriptions.
        reference_col: Column with the reference descriptions.

    Returns:
        ``None`` if ``candidate_col`` is missing, otherwise a dict with
        ``'Model'`` and the mean ``'ROUGE-L Score'``.
    """
    if candidate_col not in df.columns:
        return None

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    f_measures = [
        rouge.score(str(reference), str(candidate))['rougeL'].fmeasure
        for reference, candidate in zip(df[reference_col], df[candidate_col])
    ]

    return {'Model': model_name, 'ROUGE-L Score': np.mean(f_measures)}

In [None]:
# One BM25 retrieval run per description source: (display label, corpus column).
benchmark_runs = [
    ("Original Metadata", 'original_description'),
    ("AutoDDG Baseline", 'autoddg_description'),
    ("Llama (Zero-Shot)", 'llama_ufd_no_shot_descriptions'),
    ("Llama (Few-Shot)", 'few_shot_Description'),
]
results = [run_benchmark(label, column) for label, column in benchmark_runs]

# Runs whose column was missing return None and are left out of the table.
results_df = pd.DataFrame(list(filter(None, results))).set_index('Model')

print("\nFinal Results:")
print(results_df.round(4))

Testing: Original Metadata
Testing: AutoDDG Baseline
Testing: Llama (Zero-Shot)
Testing: Llama (Few-Shot)

Final Results:
                   NDCG@5  NDCG@10  NDCG@15  NDCG@20
Model                                               
Original Metadata  0.5367   0.5511   0.5569   0.5612
AutoDDG Baseline   0.3526   0.3734   0.3857   0.3914
Llama (Zero-Shot)  0.6844   0.6990   0.7023   0.7041
Llama (Few-Shot)   0.6565   0.6698   0.6735   0.6773


In [None]:
# METEOR of each generated description column against the original description.
meteor_runs = [
    ("AutoDDG Baseline", 'autoddg_description'),
    ("Llama (Zero-Shot)", 'llama_ufd_no_shot_descriptions'),
    ("Llama (Few-Shot)", 'few_shot_Description'),
]
meteor_results = [calculate_meteor_avg(df, label, column) for label, column in meteor_runs]

# Entries for missing columns come back as None and are dropped here.
meteor_df = pd.DataFrame(list(filter(None, meteor_results))).set_index('Model')

print("\nFinal METEOR Results:")
print(meteor_df.round(4))


Final METEOR Results:
                   METEOR Score
Model                          
AutoDDG Baseline         0.1577
Llama (Zero-Shot)        0.3926
Llama (Few-Shot)         0.3587


In [5]:
# BERTScore F1 of each generated description column against the original description.
bert_runs = [
    ("AutoDDG Baseline", 'autoddg_description'),
    ("Llama (Zero-Shot)", 'llama_ufd_no_shot_descriptions'),
    ("Llama (Few-Shot)", 'few_shot_Description'),
]
bert_results = [calculate_bertscore_avg(df, label, column) for label, column in bert_runs]

# Entries for missing columns come back as None and are dropped here.
bert_df = pd.DataFrame(list(filter(None, bert_results))).set_index('Model')

print("\nFinal BERTScore Results:")
print(bert_df.round(4))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/15 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 26.63 seconds, 19.23 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/15 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 27.17 seconds, 18.84 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/15 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 29.20 seconds, 17.53 sentences/sec

Final BERTScore Results:
                   BERTScore
Model                       
AutoDDG Baseline      0.8299
Llama (Zero-Shot)     0.8703
Llama (Few-Shot)      0.8642


In [9]:
# ROUGE-L F-measure of each generated description column against the original description.
rouge_runs = [
    ("AutoDDG Baseline", 'autoddg_description'),
    ("Llama (Zero-Shot)", 'llama_ufd_no_shot_descriptions'),
    ("Llama (Few-Shot)", 'few_shot_Description'),
]
rouge_results = [calculate_rouge_avg(df, label, column) for label, column in rouge_runs]

# Entries for missing columns come back as None and are dropped here.
rouge_df = pd.DataFrame(list(filter(None, rouge_results))).set_index('Model')

print("\nFinal ROUGE-L Results:")
print(rouge_df.round(4))


Final ROUGE-L Results:
                   ROUGE-L Score
Model                           
AutoDDG Baseline          0.1237
Llama (Zero-Shot)         0.2826
Llama (Few-Shot)          0.2441
