# Mini Selector

In [3]:
import gymnasium as gym
import pandas as pd
import numpy as np
import bdikit as bdi

In [4]:
raw_dataset = pd.read_csv('data/Hospital/gt.csv')
raw_dataset.head()

Unnamed: 0,id_l,title_l,id_r,title_r
0,5,Barts Health NHS Trust,0,Barts and The London NHS Trust
1,8,Klinikum Aachen,1,Uniklinikum Aachen
2,16,Moorfields Eye Hospital NHS Foundation Trust,2,Moorfields Eye Hospital
3,31,Hospital for Sick Children,3,The Hospital for Sick Children
4,47,Princess Margaret Cancer Centre,4,Princess Margaret Hospital (Toronto)


In [5]:
all_targets = raw_dataset['title_r'].unique().tolist()

# Create list of dicts
dataset = []
for _, row in raw_dataset.iterrows():
    dataset.append({
        'source': row['title_l'],
        'gold': row['title_r'],
        'targets': all_targets,
    })

In [6]:
dataset[0]

{'source': 'Barts Health NHS Trust',
 'gold': 'Barts and The London NHS Trust',
 'targets': ['Barts and The London NHS Trust',
  'Uniklinikum Aachen',
  'Moorfields Eye Hospital',
  'The Hospital for Sick Children',
  'Princess Margaret Hospital (Toronto)',
  'Queen Mary Hospital (Hong Kong)',
  'Mount Sinai Hospital (Manhattan)',
  "Evelina London Children's Hospital",
  'James Paget University Hospitals NHS Foundation Trust',
  'LAC+USC Medical Center',
  'TYKS',
  'NYC Health + Hospitals/Bellevue',
  'Royal Stoke University Hospital',
  'Oslo University Hospital, Rikshospitalet',
  'WellStar Kennestone Regional Medical Center',
  "Children's Medical Center Dallas",
  'St John of God Hospital Geelong',
  'Prince of Wales Hospital (Sydney)',
  "St Vincent's Private Hospital",
  'Aalborg University Hospital',
  'Wagga Wagga Rural Referral Hospital',
  'Florida Hospital',
  'University of California, Irvine Medical Center',
  'UC San Diego Health',
  'Lankenau Hospital',
  'Queen Elizab

In [7]:
from rapidfuzz.distance import Levenshtein
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pandas as pd
import random

random.seed(42)
max_length = 10

model = SentenceTransformer('all-MiniLM-L6-v2')


def get_semantic_features(target_name, source_name, target_values, source_value):
    """
    Compute semantic similarity features between a single source_value and a list of target_values.
    Returns [max_similarity, min_similarity, avg_similarity]
    """
    if len(target_values) == 0:
        return [0.0, 0.0, 0.0]

    source_embedding = model.encode([str(source_value)])
    target_embeddings = model.encode([str(x) for x in target_values])

    similarities = cosine_similarity(source_embedding, target_embeddings)
    max_similarity = similarities.max()
    min_similarity = similarities.min()
    avg_similarity = similarities.mean()

    return [max_similarity, min_similarity, avg_similarity]


def get_lexical_features(target_name, source_name, target_values, source_value):
    """
    Compute Levenshtein similarity features between a single source_value and a list of target_values.
    Returns [max_sim, min_sim, avg_sim]
    """
    if len(target_values) == 0:
        return [0.0, 0.0, 0.0]

    similarities = [
        Levenshtein.normalized_similarity(source_value, target)
        for target in target_values
    ]
    max_sim = max(similarities)
    min_sim = min(similarities)
    avg_sim = sum(similarities) / len(similarities)

    return [max_sim, min_sim, avg_sim]


def get_statistic_features(target_name, source_name, target_values, source_value):
    """
    Compute simple ratio-based statistic features using a single source_value.
    Returns [ratio]
    """
    if len(target_values) == 0:
        return [0.0]

    # With a single source_value, ratio is either 1/len(target_values)
    return [1.0 / len(target_values)]


In [None]:
# --- Define mapping algorithms ---
actions_dict = {
    0: 'lexical',
    1: 'semantic',
    2: 'llm_reasoning'
}

num_algorithms = len(actions_dict)


def lexical_algorithm(source, targets):
    source_column = 'source'
    target_column = 'target'
    source_dataset = pd.DataFrame({source_column: [source] })
    target_dataset = pd.DataFrame({ target_column: targets })
    matches = bdi.match_values(
                            source_dataset,
                            target_dataset,
                            attribute_matches=(source_column, target_column),
                            method="edit_distance",
                        )
    return matches["target_value"].iloc[0]

def semantic_algorithm(source, targets):
    source_column = 'source'
    target_column = 'target'
    source_dataset = pd.DataFrame({source_column: [source] })
    target_dataset = pd.DataFrame({ target_column: targets })
    matches = bdi.match_values(
                            source_dataset,
                            target_dataset,
                            attribute_matches=(source_column, target_column),
                            method="embedding",
                        )
    return matches["target_value"].iloc[0]

def llm_reasoning_algorithm(source, targets):
    # Harcoded return the gold value for testing purposes

    for item in dataset:
        if item['source'] == source:
            return item['gold']
    return None

# TODO: Add more algorithms as needed
primitives_dict = {
    'lexical': lexical_algorithm,
    'semantic': semantic_algorithm,
    'llm_reasoning': llm_reasoning_algorithm
}


class ValueMatchingEnv(gym.Env):
    """
    One RL episode = one (source, targets, gold).
    The state now includes the full history of chosen algorithms.
    """

    def __init__(self, max_steps=3):
        super(ValueMatchingEnv, self).__init__()

        self.max_steps = max_steps
        self.action_space = gym.spaces.Discrete(num_algorithms)

        # Feature vector:
        # lexical(3) + semantic(3) + stats(1?) + history(max_steps)
        feature_dim = 7 + max_steps   # adjust if needed
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0,
            shape=(feature_dim,),
            dtype=np.float32
        )

        # Episode-specific values
        self.source = None
        self.targets = None
        self.gold = None

        self.steps_taken = 0
        self.action_history = None

    # ---- Convert action history to normalized vector ----
    def _encode_history(self):
        encoded = []
        for action in self.action_history:
            if action == -1:
                encoded.append(0.0)  # unused slot
            else:
                encoded.append((action + 1) / num_algorithms)
        return encoded

    # ---- Compute full feature vector ----
    def _compute_features(self, source, targets):
        lexical_features = get_lexical_features('target', 'source', targets, source)
        semantic_features = get_semantic_features('target', 'source', targets, source)
        statistic_features = get_statistic_features('target', 'source', targets, source)

        history_vector = self._encode_history()

        all_features = lexical_features + semantic_features + statistic_features + history_vector
        return np.array(all_features, dtype=np.float32)

    # ---- Reset episode ----
    def reset(self, source, targets, gold):
        self.source = source
        self.targets = targets
        self.gold = gold

        self.steps_taken = 0
        # Initialize full history buffer
        self.action_history = [-1] * self.max_steps

        self.state = self._compute_features(source, targets)
        return self.state

    # ---- Step ----
    def step(self, action):
        # Record algorithm in history
        if self.steps_taken < self.max_steps:
            self.action_history[self.steps_taken] = action

        self.steps_taken += 1

        alg_name = actions_dict[action]
        predicted = primitives_dict[alg_name](self.source, self.targets)

        reward = 1.0 if predicted == self.gold else 0.0
        done = reward == 1.0 or self.steps_taken >= self.max_steps

        if not done:
            self.state = self._compute_features(self.source, self.targets)
        else:
            self.state = np.zeros(self.observation_space.shape, dtype=np.float32)

        info = {
            "predicted": predicted,
            "gold": self.gold,
            "algorithm": alg_name,
            "history": self.action_history.copy()
        }

        return self.state, reward, done, info


In [27]:
# Create environment
env = ValueMatchingEnv(dataset, max_steps=3)
num_actions = env.action_space.n

# Initialize Q-table
Q = {}

# Set hyperparameters
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.5  # exploration rate

# Helper function to convert numeric state vector to tuple (hashable for Q-table)
def state_to_key(state):
    return tuple(state.round(3))  # round to reduce floating-point noise

# Training loop
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        state_key = state_to_key(state)
        
        # Choose action
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            q_values = {a: Q.get((state_key, a), 0) for a in range(num_actions)}
            action = max(q_values, key=q_values.get)  # Exploit
        
        # Take action
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        
        next_state_key = state_to_key(next_state)
        next_q_values = {a: Q.get((next_state_key, a), 0) for a in range(num_actions)}
        max_next_q_value = max(next_q_values.values()) if next_q_values else 0
        
        # Update Q-table
        current_q_value = Q.get((state_key, action), 0)
        updated_q_value = current_q_value + alpha * (reward + gamma * max_next_q_value - current_q_value)
        Q[(state_key, action)] = updated_q_value
        
        state = next_state

    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

# --- Evaluate trained agent ---
total_rewards = []
num_eval_episodes = 3

for _ in range(num_eval_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        state_key = state_to_key(state)
        q_values = {a: Q.get((state_key, a), 0) for a in range(num_actions)}
        action = max(q_values, key=q_values.get)
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        state = next_state
    
    print('Predicted target:', info['predicted_target'])
    print('Gold target:', info['gold'])
    print('Algorithm used:', info['algorithm_used'])
    print('Test reward:', total_reward)
    total_rewards.append(total_reward)

print(f"Average Total Reward: {np.mean(total_rewards)}")


Episode 1, Total Reward: 1.0
Episode 2, Total Reward: 1.0
Episode 3, Total Reward: 1.0
Episode 4, Total Reward: 1.0
Episode 5, Total Reward: 1.0
Episode 6, Total Reward: 1.0
Episode 7, Total Reward: 1.0
Episode 8, Total Reward: 1.0
Episode 9, Total Reward: 1.0
Episode 10, Total Reward: 1.0
Episode 11, Total Reward: 1.0
Episode 12, Total Reward: 1.0
Episode 13, Total Reward: 1.0
Episode 14, Total Reward: 1.0
Episode 15, Total Reward: 1.0
Episode 16, Total Reward: 1.0
Episode 17, Total Reward: 1.0
Episode 18, Total Reward: 1.0
Episode 19, Total Reward: 1.0
Episode 20, Total Reward: 1.0
Episode 21, Total Reward: 1.0
Episode 22, Total Reward: 1.0
Episode 23, Total Reward: 1.0
Episode 24, Total Reward: 1.0
Episode 25, Total Reward: 1.0
Episode 26, Total Reward: 1.0
Episode 27, Total Reward: 1.0
Episode 28, Total Reward: 1.0
Episode 29, Total Reward: 1.0
Episode 30, Total Reward: 1.0
Episode 31, Total Reward: 1.0
Episode 32, Total Reward: 1.0
Episode 33, Total Reward: 1.0
Episode 34, Total R