# Mini Selector

In [None]:
import gymnasium as gym
import pandas as pd
import numpy as np
import bdikit as bdi
from dateutil import parser

In [3]:
# Load the ground-truth pairs; the preview shows the columns id_l, title_l, id_r, title_r.
raw_dataset = pd.read_csv('data/Hospital/gt.csv')
raw_dataset.head()

Unnamed: 0,id_l,title_l,id_r,title_r
0,5,Barts Health NHS Trust,0,Barts and The London NHS Trust
1,8,Klinikum Aachen,1,Uniklinikum Aachen
2,16,Moorfields Eye Hospital NHS Foundation Trust,2,Moorfields Eye Hospital
3,31,Hospital for Sick Children,3,The Hospital for Sick Children
4,47,Princess Margaret Cancer Centre,4,Princess Margaret Hospital (Toronto)


In [4]:
# Candidate pool: every distinct right-hand title, in order of first appearance
all_targets = raw_dataset['title_r'].unique().tolist()

# One record per ground-truth pair. Every record references the SAME `all_targets`
# list object (no copy), so mutating it through one record affects all of them.
dataset = [
    {'source': left_title, 'gold': right_title, 'targets': all_targets}
    for left_title, right_title in zip(raw_dataset['title_l'], raw_dataset['title_r'])
]

In [5]:
# Inspect the first record: its source string, gold target, and the shared candidate list
dataset[0]

{'source': 'Barts Health NHS Trust',
 'gold': 'Barts and The London NHS Trust',
 'targets': ['Barts and The London NHS Trust',
  'Uniklinikum Aachen',
  'Moorfields Eye Hospital',
  'The Hospital for Sick Children',
  'Princess Margaret Hospital (Toronto)',
  'Queen Mary Hospital (Hong Kong)',
  'Mount Sinai Hospital (Manhattan)',
  "Evelina London Children's Hospital",
  'James Paget University Hospitals NHS Foundation Trust',
  'LAC+USC Medical Center',
  'TYKS',
  'NYC Health + Hospitals/Bellevue',
  'Royal Stoke University Hospital',
  'Oslo University Hospital, Rikshospitalet',
  'WellStar Kennestone Regional Medical Center',
  "Children's Medical Center Dallas",
  'St John of God Hospital Geelong',
  'Prince of Wales Hospital (Sydney)',
  "St Vincent's Private Hospital",
  'Aalborg University Hospital',
  'Wagga Wagga Rural Referral Hospital',
  'Florida Hospital',
  'University of California, Irvine Medical Center',
  'UC San Diego Health',
  'Lankenau Hospital',
  'Queen Elizab

In [6]:
from rapidfuzz.distance import Levenshtein
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pandas as pd
import random

# Seeds Python's `random` module only (ValueMatchingEnv.reset samples with random.choice)
random.seed(42)
# NOTE(review): max_length is not referenced in any of the visible cells — confirm before removing
max_length = 10

# Sentence-embedding model used by get_semantic_features
model = SentenceTransformer('all-MiniLM-L6-v2')


def get_semantic_features(target_name, source_name, target_values, source_value):
    """
    Embedding-based similarity summary of one source value against a list of candidates.

    `target_name` and `source_name` are accepted for signature parity with the other
    feature extractors and are not used here.

    Returns [max, min, mean] of the cosine similarities between the embedding of
    `str(source_value)` and the embeddings of each `str(target)`; [0.0, 0.0, 0.0]
    when `target_values` is empty.
    """
    if len(target_values) == 0:
        return [0.0, 0.0, 0.0]

    candidate_texts = [str(candidate) for candidate in target_values]

    # Encode the source first, then the candidates, using the module-level `model`
    source_vec = model.encode([str(source_value)])
    candidate_vecs = model.encode(candidate_texts)

    # Shape (1, n_candidates): one row for the single source value
    sims = cosine_similarity(source_vec, candidate_vecs)

    return [sims.max(), sims.min(), sims.mean()]


def get_lexical_features(target_name, source_name, target_values, source_value):
    """
    Edit-distance similarity summary of one source value against a list of candidates.

    `target_name` and `source_name` are accepted for signature parity with the other
    feature extractors and are not used here.

    Returns [max, min, mean] of Levenshtein.normalized_similarity(source_value, target)
    over all targets; [0.0, 0.0, 0.0] when `target_values` is empty.
    """
    if len(target_values) == 0:
        return [0.0, 0.0, 0.0]

    scores = []
    for candidate in target_values:
        scores.append(Levenshtein.normalized_similarity(source_value, candidate))

    return [max(scores), min(scores), sum(scores) / len(scores)]


def get_statistic_features(target_name, source_name, target_values, source_value):
    """
    Size-based statistic feature for a single source value.

    Only `target_values` is used; the other parameters exist for signature parity
    with the other feature extractors.

    Returns [1 / len(target_values)], or [0.0] when `target_values` is empty.
    """
    n_targets = len(target_values)

    # One source value against n candidates: the ratio is simply 1/n
    return [1.0 / n_targets] if n_targets else [0.0]


In [19]:
from dataclasses import dataclass
from typing import Optional, Tuple

@dataclass
class DateDetectResult:
    """Outcome of trying several datetime-parsing strategies on one column."""
    # Series produced by the winning parse strategy (unparseable entries are coerced to NaT)
    parsed: pd.Series
    # Fraction of entries that parsed to a non-null datetime under that strategy
    success_rate: float
    # dayfirst / yearfirst flags of the winning strategy
    dayfirst: bool
    yearfirst: bool

def _try_parse(col: pd.Series, *, dayfirst=False, yearfirst=False, to_date=False, utc=True) -> pd.Series:
    dt = pd.to_datetime(
        col, errors="coerce", infer_datetime_format=True,
        dayfirst=dayfirst, yearfirst=yearfirst, utc=utc
    )
    return dt.dt.normalize() if to_date else dt

def _best_datetime_parse(col: pd.Series, to_date=False, utc=True) -> DateDetectResult:
    """
    Parse `col` under four (dayfirst, yearfirst) combinations and keep the best one.

    "Best" is the strategy with the highest fraction of non-null parsed values;
    on ties the earliest strategy in the list wins. An empty column yields a
    success rate of 0.0 together with the (empty) parse of the first strategy,
    so `parsed` is always a Series.

    Parameters
    ----------
    col : values to parse.
    to_date, utc : forwarded to `_try_parse`.

    Returns
    -------
    DateDetectResult with the winning parsed series, its success rate and flags.
    """
    # (dayfirst, yearfirst) combinations, tried in this order
    strategies = [
        (False, False),
        (True,  False),
        (False, True),
        (True,  True),
    ]
    best_rate = -1.0
    best_series = None
    best_flags = (False, False)

    for dayfirst_flag, yearfirst_flag in strategies:
        parsed = _try_parse(col, dayfirst=dayfirst_flag, yearfirst=yearfirst_flag,
                            to_date=to_date, utc=utc)
        # The mean of an empty boolean series is NaN, and NaN never compares greater
        # than the current best, which would leave `best_series` as None. Score an
        # empty column as 0.0 instead.
        rate = float(parsed.notna().mean()) if len(parsed) else 0.0
        if rate > best_rate:
            best_rate, best_series, best_flags = rate, parsed, (dayfirst_flag, yearfirst_flag)

    return DateDetectResult(parsed=best_series, success_rate=best_rate,
                            dayfirst=best_flags[0], yearfirst=best_flags[1])

def detect_and_normalize_datetime(col: pd.Series, *, min_success=0.55, to_date=False, utc=True) -> Tuple[bool, pd.Series]:
    """
    Decide whether `col` is datetime-like and return its best parse.

    The column counts as datetime-like when the best parsing strategy found by
    `_best_datetime_parse` succeeds on at least `min_success` of the values.

    Returns
    -------
    (is_datetime_like, parsed_series)
    """
    detection = _best_datetime_parse(col, to_date=to_date, utc=utc)
    is_datetime_like = detection.success_rate >= min_success
    return is_datetime_like, detection.parsed

def datetime_row_match(
    df: pd.DataFrame,
    source_col: str,
    target_col: str,
    *,
    date_only: bool = True,
    tolerance: Optional[pd.Timedelta] = None,
    min_success: float = 0.55,
    require_both_datetime: bool = True,
    utc: bool = True,
) -> pd.Series:
    """
    Parse two columns of `df` as datetimes and compare them row by row.

    Parameters
    ----------
    df : frame holding both columns.
    source_col, target_col : names of the columns to compare.
    date_only : when True, timestamps are truncated to midnight before comparing,
        so only the calendar date matters.
    tolerance : None for exact equality; otherwise rows match when the absolute
        difference is at most `tolerance` (most useful with date_only=False).
    min_success : minimum parse success rate for a column to count as datetime-like.
    require_both_datetime : when True, return all False unless both columns are
        datetime-like.
    utc : forwarded to the parser.

    Returns
    -------
    Boolean Series; rows where either side failed to parse (NaT) never match.
    """
    parse_options = dict(to_date=date_only, utc=utc, min_success=min_success)
    source_ok, source_dt = detect_and_normalize_datetime(df[source_col], **parse_options)
    target_ok, target_dt = detect_and_normalize_datetime(df[target_col], **parse_options)

    # At least one column does not look like dates: report no matches at all
    if require_both_datetime and (not source_ok or not target_ok):
        return pd.Series(False, index=df.index)

    # From here on a non-datetime-like column is still compared; its rows are mostly NaT
    if tolerance is not None:
        both_present = source_dt.notna() & target_dt.notna()
        gap = (source_dt - target_dt).abs()
        return both_present & (gap <= tolerance)

    # Exact equality at the chosen granularity
    return source_dt.eq(target_dt)


In [None]:
# --- New imports (top of file) ---
import pandas as pd
import numpy as np
from dateutil import parser
from datetime import timedelta
import re

# =========================
# 1) Helper: datetime normalize (vectorized)
# =========================
def _normalize_datetime_series(series, *, to_date=True, utc=True):
    """
    Vectorized parse to datetime. Returns datetime64[ns, UTC] if utc=True,
    and normalizes to midnight if to_date=True (calendar-date equality).
    """
    dt = pd.to_datetime(series, errors="coerce", infer_datetime_format=True, utc=utc)
    return dt.dt.normalize() if to_date else dt

def _looks_like_datetime(series, min_success=0.55):
    """Heuristic: consider a series datetime-like if parse success rate >= min_success."""
    parsed = pd.to_datetime(series, errors="coerce", infer_datetime_format=True, utc=True)
    return parsed.notna().mean() >= min_success

# =========================
# 2) Algorithms
# =========================

def _match_single_value(source, targets, method):
    """
    Match one source value against `targets` with `bdi.match_values`.

    Wraps the source value and the targets in single-column DataFrames
    ('source' / 'target'), runs the matcher with the given `method`, and returns
    the "target_value" of the first row of the result, or None when the result
    is empty.
    """
    source_column = 'source'
    target_column = 'target'
    source_dataset = pd.DataFrame({source_column: [source]})
    target_dataset = pd.DataFrame({target_column: targets})
    matches = bdi.match_values(
        source_dataset,
        target_dataset,
        attribute_matches=(source_column, target_column),
        method=method,
    )
    return matches["target_value"].iloc[0] if len(matches) else None


def lexical_algorithm(source, targets):
    """Match `source` to one of `targets` using bdikit's "edit_distance" method."""
    return _match_single_value(source, targets, "edit_distance")


def semantic_algorithm(source, targets):
    """Match `source` to one of `targets` using bdikit's "embedding" method."""
    return _match_single_value(source, targets, "embedding")


def llm_reasoning_algorithm(source, targets):
    """Match `source` to one of `targets` using bdikit's "llm" method."""
    return _match_single_value(source, targets, "llm")

# --- Datetime matching (normalize both cols, then row-wise compare) ---
def date_algorithm(source, targets, *, tolerance: timedelta | None = None, date_only: bool = True):
    """
    Parse `source` and every target as a datetime and return the matching target.

    Parameters
    ----------
    source : single value to match.
    targets : candidate values (list-like, or a pandas Series with any index).
    tolerance : None for exact equality; otherwise the nearest target whose
        absolute difference from `source` is at most `tolerance` is chosen.
    date_only : when True, timestamps are truncated to midnight before comparing,
        so only the calendar date matters.

    Returns
    -------
    The original (unparsed) target value that matched, or None when the source
    does not parse, no target parses, or nothing matches.
    """
    # Build tiny dataframes so source and targets go through the same vectorized path
    s_df = pd.DataFrame({"source": [source]})
    t_df = pd.DataFrame({"target": targets})

    # Quick guard: bail only when NEITHER side looks like datetimes
    if not (_looks_like_datetime(s_df["source"]) or _looks_like_datetime(t_df["target"])):
        return None

    s_parsed = _normalize_datetime_series(s_df["source"], to_date=date_only)
    t_parsed = _normalize_datetime_series(t_df["target"], to_date=date_only)

    s_val = s_parsed.iloc[0]
    if pd.isna(s_val):
        return None

    # Exact equality on the chosen granularity
    if tolerance is None:
        mask = t_parsed.eq(s_val)
        if not mask.any():
            return None
        # idxmax() returns the index LABEL of the first True, so it must be looked
        # up with .loc. Positional .iloc only happens to work for a default
        # RangeIndex and picks the wrong row (or raises) when `targets` is a
        # Series with its own index.
        return t_df["target"].loc[mask.idxmax()]

    # Tolerance-based: nearest parsed target within the allowed distance
    valid = t_parsed.notna()
    if not valid.any():
        return None
    diffs = (t_parsed[valid] - s_val).abs()
    within = diffs <= pd.Timedelta(tolerance)
    if not within.any():
        return None
    best_idx = diffs[within].idxmin()
    return t_df["target"].loc[best_idx]

# --- Simple regex matcher baseline ---
def regex_algorithm(source, targets):
    """
    Token-overlap baseline matcher.

    Both the source and each target are lower-cased and split into alphanumeric
    tokens. Every target that has at least one token is scored with the Jaccard
    index of the two token sets, and the highest-scoring target is returned.
    The first target wins on ties, and a target is still returned when the best
    score is 0.

    Returns None when `targets` is empty, when the source has no tokens, or when
    no target has any token.
    """
    if not targets:
        return None

    def tokenize(text):
        return set(re.findall(r"[A-Za-z0-9]+", str(text).lower()))

    source_tokens = tokenize(source)
    if not source_tokens:
        return None

    # (position, score) for every target that produced at least one token
    scored = []
    for position, candidate in enumerate(targets):
        candidate_tokens = tokenize(candidate)
        if not candidate_tokens:
            continue
        combined = source_tokens | candidate_tokens
        shared = source_tokens & candidate_tokens
        scored.append((position, len(shared) / len(combined) if combined else 0.0))

    if not scored:
        return None

    # max() keeps the first of several equal maxima, so ties go to the earliest target
    winner_position, _ = max(scored, key=lambda pair: pair[1])
    return targets[winner_position]

# =========================
# 3) Actions + primitives
# =========================
# Primitive name -> implementation. Insertion order defines the action ids below.
primitives_dict = {
    'lexical': lexical_algorithm,
    'semantic': semantic_algorithm,
    'llm': llm_reasoning_algorithm,
    'date': date_algorithm,
    'regex': regex_algorithm,
}

# Action id -> primitive name: 0 lexical, 1 semantic, 2 llm, 3 date, 4 regex
actions_dict = dict(enumerate(primitives_dict))

num_algorithms = len(actions_dict)



class ValueMatchingEnv(gym.Env):
    """
    Episodic environment in which an agent chooses matching algorithms for one value.

    Each episode samples one dataset item (a dict with 'source', 'gold' and
    'targets'). Every step runs the primitive selected by the action on that item
    and yields reward 1.0 when the prediction equals the gold target, else 0.0.
    The episode ends on the first correct prediction or after `max_steps` steps.

    NOTE: `reset()` returns only the observation and `step()` returns the 4-tuple
    (observation, reward, done, info); the training and evaluation loops in this
    notebook unpack exactly that shape.
    """

    def __init__(self, dataset, max_steps=3):
        super().__init__()

        self.dataset = dataset
        self.max_steps = max_steps
        self.action_space = gym.spaces.Discrete(num_algorithms)

        # 3 lexical + 3 semantic + 1 statistic feature, plus one history slot per step
        feature_dim = 7 + max_steps
        # Cosine-similarity features can be negative, so the lower bound is -1
        # rather than 0; every other feature lies in [0, 1].
        self.observation_space = gym.spaces.Box(
            low=-1.0, high=1.0,
            shape=(feature_dim,),
            dtype=np.float32
        )

        # Episode-specific values (populated by reset)
        self.source = None
        self.targets = None
        self.gold = None
        self.state = None

        self.steps_taken = 0
        self.action_history = None
        # History-independent features of the current item, computed once per episode
        self._episode_features = None

    # ---- Convert action history to normalized vector ----
    def _encode_history(self):
        """Encode the action history as floats: 0.0 for an unused slot, else (action + 1) / num_algorithms."""
        encoded = []
        for action in self.action_history:
            if action == -1:
                encoded.append(0.0)  # unused slot
            else:
                encoded.append((action + 1) / num_algorithms)
        return encoded

    # ---- History-independent part of the observation ----
    def _static_features(self, source, targets):
        """Return the 7 lexical, semantic and statistic features as a plain list."""
        lexical_features = get_lexical_features('target', 'source', targets, source)
        semantic_features = get_semantic_features('target', 'source', targets, source)
        statistic_features = get_statistic_features('target', 'source', targets, source)
        return lexical_features + semantic_features + statistic_features

    # ---- Compute full feature vector ----
    def _compute_features(self, source, targets):
        """Compute the full observation for (source, targets) plus the current history."""
        all_features = self._static_features(source, targets) + self._encode_history()
        return np.array(all_features, dtype=np.float32)

    def _observation(self):
        """Build the observation from the cached per-episode features and the current history."""
        return np.array(self._episode_features + self._encode_history(), dtype=np.float32)

    # ---- Reset episode ----
    def reset(self):
        """Sample a random dataset item, clear the history and return the first observation."""
        sample = random.choice(self.dataset)
        self.source = sample['source']
        self.targets = sample['targets']
        self.gold = sample['gold']

        self.steps_taken = 0
        self.action_history = [-1] * self.max_steps

        # The similarity features depend only on (source, targets), which stay fixed
        # for the whole episode. Computing them once avoids re-encoding every target
        # on each step.
        self._episode_features = self._static_features(self.source, self.targets)
        self.state = self._observation()
        return self.state

    # ---- Step ----
    def step(self, action):
        """
        Run the primitive selected by `action` on the current item.

        Returns (observation, reward, done, info). `info` contains "predicted",
        "gold", "algorithm", "history" and "error"; "error" is None unless the
        primitive raised, in which case it holds repr() of the exception.
        After the episode ends the observation is all zeros.

        Raises RuntimeError when called before `reset()`.
        """
        if self.action_history is None:
            raise RuntimeError("reset() must be called before step()")

        # Record the chosen algorithm in the history
        if self.steps_taken < self.max_steps:
            self.action_history[self.steps_taken] = action
        self.steps_taken += 1

        alg_name = actions_dict[action]

        error = None
        try:
            if alg_name == 'date':
                # Policy: exact calendar-date equality. To match within one day while
                # keeping the time of day, pass tolerance=timedelta(days=1), date_only=False.
                predicted = primitives_dict[alg_name](self.source, self.targets, tolerance=None, date_only=True)
            else:
                predicted = primitives_dict[alg_name](self.source, self.targets)
        except Exception as exc:
            # Best effort: a failing primitive counts as "no prediction" so the policy
            # can try another action. The cause is kept in info["error"] so it is not lost.
            predicted = None
            error = repr(exc)

        reward = 1.0 if (predicted is not None and predicted == self.gold) else 0.0
        done = (reward == 1.0) or (self.steps_taken >= self.max_steps)

        if not done:
            self.state = self._observation()
        else:
            self.state = np.zeros(self.observation_space.shape, dtype=np.float32)

        info = {
            "predicted": predicted,
            "gold": self.gold,
            "algorithm": alg_name,
            "history": self.action_history.copy(),
            "error": error,
        }
        return self.state, reward, done, info



In [18]:
# Create environment
env = ValueMatchingEnv(dataset, max_steps=3)
num_actions = env.action_space.n

# Seed the RNGs that drive exploration so a re-run of this cell is repeatable:
# np.random feeds the epsilon test and the action space has its own sampler.
# (Item sampling in env.reset() uses Python's `random`, which is seeded separately.)
np.random.seed(42)
env.action_space.seed(42)

# Initialize Q-table: maps (state_key, action) -> estimated return
Q = {}

# Set hyperparameters
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.5  # exploration rate

# Helper: turn a numeric state vector into a hashable key for the Q-table
def state_to_key(state):
    """Return the entries of `state`, rounded to 3 decimals, as a tuple."""
    rounded = state.round(3)  # rounding reduces floating-point noise between equal states
    return tuple(rounded)

# Training loop: tabular Q-learning with an epsilon-greedy policy.
# Q is a dict keyed by (state_key, action); entries that were never written read as 0.
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        state_key = state_to_key(state)
        
        # Choose action: explore with probability epsilon, otherwise take the action
        # with the highest Q-value (max() keeps the lowest action id on ties)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            q_values = {a: Q.get((state_key, a), 0) for a in range(num_actions)}
            action = max(q_values, key=q_values.get)  # Exploit
        
        # Take action
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        
        # Best value reachable from the next state. Once the episode is done the
        # environment returns an all-zero observation; this loop never updates a
        # state after `done`, so that lookup is expected to stay at the default 0.
        next_state_key = state_to_key(next_state)
        next_q_values = {a: Q.get((next_state_key, a), 0) for a in range(num_actions)}
        max_next_q_value = max(next_q_values.values()) if next_q_values else 0
        
        # Update Q-table: Q <- Q + alpha * (reward + gamma * max_a' Q(next, a') - Q)
        current_q_value = Q.get((state_key, action), 0)
        updated_q_value = current_q_value + alpha * (reward + gamma * max_next_q_value - current_q_value)
        Q[(state_key, action)] = updated_q_value
        
        state = next_state

    # One progress line per episode
    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

# --- Evaluate trained agent ---
# Greedy rollouts (no exploration): always take the action with the highest Q-value.
total_rewards = []
num_eval_episodes = 3

for _ in range(num_eval_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        state_key = state_to_key(state)
        q_values = {a: Q.get((state_key, a), 0) for a in range(num_actions)}
        action = max(q_values, key=q_values.get)
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        state = next_state

    # `info` comes from the last step of the episode. ValueMatchingEnv.step emits
    # the keys "predicted" and "algorithm"; the previous lookups of
    # 'predicted_target' and 'algorithm_used' raised KeyError.
    print('Predicted target:', info['predicted'])
    print('Gold target:', info['gold'])
    print('Algorithm used:', info['algorithm'])
    print('Test reward:', total_reward)
    total_rewards.append(total_reward)

print(f"Average Total Reward: {np.mean(total_rewards)}")


Episode 1, Total Reward: 1.0


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 40701698-1f1f-4c3f-85bf-06c7245dfd3a)')' thrown while requesting HEAD https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


Episode 2, Total Reward: 1.0
Episode 3, Total Reward: 1.0

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.



AuthenticationError: litellm.AuthenticationError: AuthenticationError: OpenAIException - The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable