### Imports


In [1]:
import pandas as pd
import os
import sys

### Set project root

In [2]:
# Resolve the repository root as three directory levels above the notebook's
# current working directory, then register it on sys.path exactly once.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../../'))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Project-local import, kept after the sys.path update above
# (presumably the `models` package lives under project_root -- confirm).
from models.src.evaluation.custom_evaluate import evaluate_dataset

### Get dataframes


In [3]:
# Directory holding the per-model "golden dataset" CSV exports.
GOLDEN_DATASET_DIR = os.path.join(project_root, 'models', 'src', 'generation', 'transformers')
# Number of leading ground-truth words used to build the `inputs` column.
INPUT_PROMPT_WORDS = 50


def _load_golden_dataset(csv_path, n_input_words=INPUT_PROMPT_WORDS):
    """Read one golden-dataset CSV and prepare its columns for evaluation.

    Args:
        csv_path: Path of the CSV file to read.
        n_input_words: How many leading whitespace-separated words of
            `ground_truth` make up the derived `inputs` column.

    Returns:
        A new DataFrame in which the `prediction` column (if present) is
        renamed to `predictions` and an `inputs` column is appended. Rows
        whose `ground_truth` is missing get a missing `inputs` value instead
        of raising, which the previous per-row `x.split()` lambda did.
    """
    golden_df = pd.read_csv(csv_path).rename(columns={'prediction': 'predictions'})
    # Vectorised equivalent of ' '.join(text.split()[:n_input_words]).
    golden_df['inputs'] = (
        golden_df['ground_truth'].str.split().str[:n_input_words].str.join(' ')
    )
    return golden_df


bart_dataset_path = os.path.join(GOLDEN_DATASET_DIR, 'golden_dataset_facebook_bart-base.csv')
t5_dataset_path = os.path.join(GOLDEN_DATASET_DIR, 'golden_dataset_t5-small.csv')
gpt2_dataset_path = os.path.join(GOLDEN_DATASET_DIR, 'golden_dataset_gpt2_old.csv')

# One call per model keeps the three frames prepared identically.
bart_df = _load_golden_dataset(bart_dataset_path)
gpt2_df = _load_golden_dataset(gpt2_dataset_path)
t5_df = _load_golden_dataset(t5_dataset_path)

display(bart_df.head())
display(gpt2_df.head())
display(t5_df.head())

Unnamed: 0,predictions,ground_truth,inputs
0,"As a Publix Pharmacy Associate, you ll see how...","As a Publix Pharmacy Associate, you ll see how...","As a Publix Pharmacy Associate, you ll see how..."
1,Job Title Release Manager CLT Duration 12 18 m...,Job Title Release Manager CLT Duration 12 18 m...,Job Title Release Manager CLT Duration 12 18 m...
2,About This Role Wells Fargo is seeking a Senio...,About This Role Wells Fargo is seeking a Senio...,About This Role Wells Fargo is seeking a Senio...
3,We ve made a lot of progress since opening the...,We ve made a lot of progress since opening the...,We ve made a lot of progress since opening the...
4,"At Whole Foods Market, we re committed to prov...","At Whole Foods Market, we re committed to prov...","At Whole Foods Market, we re committed to prov..."


Unnamed: 0,predictions,ground_truth,inputs
0,"As a Publix Pharmacy Associate, you ll see how...","As a Publix Pharmacy Associate, you ll see how...","As a Publix Pharmacy Associate, you ll see how..."
1,Job Title Release Manager CLT Duration 12 18 m...,Job Title Release Manager CLT Duration 12 18 m...,Job Title Release Manager CLT Duration 12 18 m...
2,About This Role Wells Fargo is seeking a Senio...,About This Role Wells Fargo is seeking a Senio...,About This Role Wells Fargo is seeking a Senio...
3,We ve made a lot of progress since opening the...,We ve made a lot of progress since opening the...,We ve made a lot of progress since opening the...
4,"At Whole Foods Market, we re committed to prov...","At Whole Foods Market, we re committed to prov...","At Whole Foods Market, we re committed to prov..."


Unnamed: 0,predictions,ground_truth,inputs
0,"As a Publix Pharmacy Associate, you ll see how...","As a Publix Pharmacy Associate, you ll see how...","As a Publix Pharmacy Associate, you ll see how..."
1,Job Title Release Manager CLT Duration 12 18 m...,Job Title Release Manager CLT Duration 12 18 m...,Job Title Release Manager CLT Duration 12 18 m...
2,About This Role Wells Fargo is seeking a Senio...,About This Role Wells Fargo is seeking a Senio...,About This Role Wells Fargo is seeking a Senio...
3,We ve made a lot of progress since opening the...,We ve made a lot of progress since opening the...,We ve made a lot of progress since opening the...
4,"At Whole Foods Market, we re committed to prov...","At Whole Foods Market, we re committed to prov...","At Whole Foods Market, we re committed to prov..."


### Sample 10 random rows from each dataframe

In [7]:
# Draw the same-sized, identically seeded random sample from every model's
# frame (10 rows, random_state=42).
small_t5_df, small_bart_df, small_gpt2_df = (
    frame.sample(n=10, random_state=42) for frame in (t5_df, bart_df, gpt2_df)
)

# Preview the first rows of each sample, in the order T5, BART, GPT-2.
display(small_t5_df.head(), small_bart_df.head(), small_gpt2_df.head())

Unnamed: 0,predictions,ground_truth,inputs
1501,RK K is hiring Transportation Engineers throug...,RK K is hiring Transportation Engineers throug...,RK K is hiring Transportation Engineers throug...
2586,Only Citizens GCs Preference for candidates lo...,Only Citizens GCs Preference for candidates lo...,Only Citizens GCs Preference for candidates lo...
2653,Ekeo Group is looking for a Senior Financial A...,Ekeo Group is looking for a Senior Financial A...,Ekeo Group is looking for a Senior Financial A...
1055,"At Belk, we have a vision to reimagine the dep...","At Belk, we have a vision to reimagine the dep...","At Belk, we have a vision to reimagine the dep..."
705,"Food Safety QA Technical Manager Princeton, N....","Food Safety QA Technical Manager Princeton, N....","Food Safety QA Technical Manager Princeton, N...."


Unnamed: 0,predictions,ground_truth,inputs
1501,RK K is hiring Transportation Engineers throug...,RK K is hiring Transportation Engineers throug...,RK K is hiring Transportation Engineers throug...
2586,Only Citizens GCs Preference for candidates lo...,Only Citizens GCs Preference for candidates lo...,Only Citizens GCs Preference for candidates lo...
2653,Ekeo Group is looking for a Senior Financial A...,Ekeo Group is looking for a Senior Financial A...,Ekeo Group is looking for a Senior Financial A...
1055,"At Belk, we have a vision to reimagine the dep...","At Belk, we have a vision to reimagine the dep...","At Belk, we have a vision to reimagine the dep..."
705,"Food Safety QA Technical Manager Princeton, N....","Food Safety QA Technical Manager Princeton, N....","Food Safety QA Technical Manager Princeton, N...."


Unnamed: 0,predictions,ground_truth,inputs
1501,RK K is hiring Transportation Engineers throug...,RK K is hiring Transportation Engineers throug...,RK K is hiring Transportation Engineers throug...
2586,Only Citizens GCs Preference for candidates lo...,Only Citizens GCs Preference for candidates lo...,Only Citizens GCs Preference for candidates lo...
2653,Ekeo Group is looking for a Senior Financial A...,Ekeo Group is looking for a Senior Financial A...,Ekeo Group is looking for a Senior Financial A...
1055,"At Belk, we have a vision to reimagine the dep...","At Belk, we have a vision to reimagine the dep...","At Belk, we have a vision to reimagine the dep..."
705,"Food Safety QA Technical Manager Princeton, N....","Food Safety QA Technical Manager Princeton, N....","Food Safety QA Technical Manager Princeton, N...."


In [9]:
# Run the project's evaluate_dataset helper on the sampled T5 rows and keep
# whatever it returns in `results`.
# NOTE(review): only the T5 sample is evaluated in this cell; small_bart_df and
# small_gpt2_df are not passed to evaluate_dataset here -- confirm whether they
# are evaluated elsewhere.
results = evaluate_dataset(small_t5_df)

2025/05/07 19:14:38 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
100%|██████████| 1/1 [00:01<00:00,  1.00s/it]
100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
100%|██████████| 10/10 [00:01<00:00,  7.94it/s]
100%|██████████| 10/10 [00:01<00:00,  6.47it/s]
100%|██████████| 10/10 [00:02<00:00,  4.95it/s]
100%|██████████| 10/10 [00:01<00:00,  5.91it/s]
100%|██████████| 10/10 [00:01<00:00,  6.70it/s]


🏃 View run respected-slug-961 at: http://localhost:5000/#/experiments/441364640883767191/runs/d4da66d90fe647109be3f9afb84dbad5
🧪 View experiment at: http://localhost:5000/#/experiments/441364640883767191
{'grammar_quality/v1/mean': np.float64(1.9), 'grammar_quality/v1/variance': np.float64(0.69), 'long_range_coherence_consistency/v1/mean': np.float64(3.3), 'long_range_coherence_consistency/v1/variance': np.float64(1.81), 'generation_novelty_non_repetitiveness/v1/mean': np.float64(1.0), 'generation_novelty_non_repetitiveness/v1/variance': np.float64(0.0), 'semantic_richness_nuance/v1/mean': np.float64(1.3), 'semantic_richness_nuance/v1/variance': np.float64(0.8100000000000002), 'professionalism/v1/mean': np.float64(2.4), 'professionalism/v1/variance': np.float64(0.8400000000000001), 'rougeL/v1/mean': np.float64(0.2205265702188736), 'rougeL/v1/variance': np.float64(0.00784821834966349), 'rougeL/v1/p90': np.float64(0.37689984787593944), 'bleu/v1/mean': np.float64(0.018082290560948813), 'b

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 188.55it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 233.19it/s]

                                              inputs  \
0  RK K is hiring Transportation Engineers throug...   
1  Only Citizens GCs Preference for candidates lo...   
2  Ekeo Group is looking for a Senior Financial A...   
3  At Belk, we have a vision to reimagine the dep...   
4  Food Safety QA Technical Manager Princeton, N....   
5  Sutro Biopharma, Inc. is seeking a driven and ...   
6  Description At Bath Body Works, everyone belon...   
7  Job Summary The Market and Growth Analyst is r...   
8  Service Center CSCS TX Job Summary Contacts po...   
9  Job Description Crowell Moring LLP is an inter...   

                                        ground_truth  \
0  RK K is hiring Transportation Engineers throug...   
1  Only Citizens GCs Preference for candidates lo...   
2  Ekeo Group is looking for a Senior Financial A...   
3  At Belk, we have a vision to reimagine the dep...   
4  Food Safety QA Technical Manager Princeton, N....   
5  Sutro Biopharma, Inc. is seeking a driven an


