In [1]:
import os
import sys
import pandas as pd
import numpy as np
import re
import json
from pathlib import Path
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

from thesis_project.preprocessing.normalizer import norm
from thesis_project.scoring.similarity import exact_match_rate, token_match_rate, last_token_match_rate, substring_match_rate

# Notebook-wide plotting defaults. The seaborn theme is applied first and the
# matplotlib figure size / resolution overrides are set afterwards, in the
# same order as before, so the overrides are layered on top of the theme.
sns.set_theme(style="whitegrid", palette="colorblind")
plt.rcParams.update({
    'figure.figsize': (12, 5),
    'figure.dpi': 120,
})

In [2]:
# Resolve the project root first so that every path below -- including the
# config file itself -- is derived from one anchor. The notebook is expected
# to be launched from a directory one level below the project root (the
# previous '../configs/...' relative path relied on the same assumption).
PROJECT_ROOT = Path.cwd().parent
CONFIG_PATH = PROJECT_ROOT / 'configs' / '_default_configs.yaml'

# Load config
# An explicit UTF-8 encoding keeps decoding independent of the platform's
# default locale encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Define paths
# Each entry under config['paths'] is joined onto the project root.
DATA_DIR = PROJECT_ROOT / config['paths']['data_dir']
PROCESSED_DATA_DIR = PROJECT_ROOT / config['paths']['processed_dir']
NOTEBOOKS_DIR = PROJECT_ROOT / config['paths']['notebooks_dir']
MODEL_DIR = PROJECT_ROOT / config['paths']['model_dir']

# Define dataset paths
# One Excel workbook per dataset (BNT, FAS, SVF), all located in DATA_DIR.
BNT_PATH = DATA_DIR / 'BNT-syntheticData_v2.xlsx'
FAS_PATH = DATA_DIR / 'FAS-syntheticData_v1.xlsx'
SVF_PATH = DATA_DIR / 'SVF-syntheticData_v1.xlsx'

In [3]:
# Read the three workbooks (BNT, then SVF, then FAS) into their own raw
# DataFrames, then report each frame's (rows, columns) shape.
bnt_raw, svf_raw, fas_raw = (
    pd.read_excel(workbook) for workbook in (BNT_PATH, SVF_PATH, FAS_PATH)
)
print(
    f"BNT raw shape: {bnt_raw.shape}",
    f"SVF raw shape: {svf_raw.shape}",
    f"FAS raw shape: {fas_raw.shape}",
    sep="\n",
)

BNT raw shape: (35, 102)
SVF raw shape: (29, 101)
FAS raw shape: (20, 101)


In [4]:
df = pd.read_excel(BNT_PATH)

# Response columns are the ones whose header starts with "User-".
user_cols = [col for col in df.columns if str(col).startswith("User-")]

# Keep only "real" BNT items:
#   - drop rows with no Gold value (blank rows);
#   - drop rows whose Gold text ends in a colon, optionally followed by
#     whitespace (the "Gender:" / "Age:" rows).
has_gold = df["Gold"].notna()
ends_with_colon = df["Gold"].astype(str).str.contains(r":\s*$")
mask = has_gold & ~ends_with_colon
items = df.loc[mask].copy()

# Gold answers passed through the project's normalizer; the same `norm`
# callable is also handed to each rate function below.
gold = norm(items["Gold"])

# Every rate function is called with the same four arguments; print one
# labelled line per metric, in the original order.
rate_functions = (
    ("Exact:", exact_match_rate),
    ("Token contains gold:", token_match_rate),
    ("Last token == gold:", last_token_match_rate),
    ("Substring contains gold:", substring_match_rate),
)
for label, rate_fn in rate_functions:
    print(label, rate_fn(user_cols, items, gold, norm))


Exact: 0.4603333333333333
Token contains gold: 0.5473333333333333
Last token == gold: 0.5196666666666667
Substring contains gold: 0.671
