In [3]:
import pandas as pd
from pathlib import Path
import os, yaml
if '_fixed' not in locals():
    _fixed = os.chdir(os.path.dirname(os.getcwd()))
from ludwig import repo_root
from ludwig.util import vllm_Client
from ludwig import LiveBenchReasoning
from collections import Counter
from tqdm.notebook import tqdm
import random
from tabulate import tabulate
import json
import networkx as nx
import io
import matplotlib.pyplot as plt

In [4]:

# Login using e.g. `huggingface-cli login` to access this dataset
# df = pd.read_parquet("hf://datasets/livebench/reasoning/data/test-00000-of-00001.parquet")
subtasks = ['zebra_puzzle', 'spatial', 'web_of_lies_v2']
task = LiveBenchReasoning(subtasks[0])
task.prepare()
task.total_questions

90

In [5]:
devqs = [task.ask_dev(i) for i in range(task.total_dev_questions)]
qs = devqs
len(devqs)

10

In [6]:
print(qs[0]['question'])

There are 2 people standing in a line. From left to right, they are numbered 1 to 2.
Each person has a set of attributes: Hobby, Job, Movie-Genre.
The attributes have the following possible values:
Hobby: filmmaking, collecting
Job: journalist, police-officer
Movie-Genre: adventure, thriller
Each person has a unique value for each attribute.
You know the following about the people:
The person who is a journalist is somewhere to the right of the person who watches adventure
The person who is a police-officer is not the same as the person who likes filmmaking

Given this information, answer the following questions:
At what position is the person who watches adventure?
What hobby does the person who is a journalist do?
What is the job of the person who watches adventure?
What is the job of the person who watches thriller?
Think step by step and explain your reasoning, then output your answers in order in the format:
<solution>answer1, answer2, answer3, ...</solution>
For instance, if ther

In [4]:
qs[0]['rationale']

["This puzzle's premises lead to a logical contradiction. However, the given answer can be reached by following one line of reasoning while ignoring the paradox.",
 'First, a direct fact: the person at the **theater (Nia) is a liar**.',
 "The person at the **art gallery (Jake)** says 'the person at the theater lies.' Since this is a true statement, the person at the **art gallery is a truth-teller**.",
 "The person at the **campground (Nadia)** says 'the person at the art gallery tells the truth.' This is also a true statement, so the **campground person is a truth-teller**.",
 "The person at the **aquarium (Ayaan)** says 'the person at the campground tells the truth.' This is true, so the **aquarium person is a truth-teller** (Answer: yes).",
 "The person at the **amusement park (Mateo)** says 'the person at the aquarium tells the truth.' This is true, so the **amusement park person is a truth-teller** (Answer: yes).",
 "Finally, the person at the **train station (Kehinde)** says 'the

In [34]:
# regqs = [task.ask(i) for i in range(task.total_questions)]
# qs = regqs
# len(regqs)

In [35]:
# qs = [task.ask(i, dev=True) for i in range(len(task._data))]
# len(qs)

In [36]:
def show(inds=None):
    q = enumerate(qs) if inds is None else [(i, qs[i]) for i in inds]
    print(tabulate([(i, q['level'], q['answer'], q['question']) for i, q in q]))
show()

-  ---  -------------  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [37]:
# # random.seed(53)
# inds = random.sample(range(len(qs)), k=10)
# show(inds)
# inds

In [38]:
qs[0].keys()

dict_keys(['question_id', 'category', 'ground_truth', 'turns', 'task', 'livebench_release_date', 'livebench_removal_date', 'level', 'question', 'answer', 'system'])

In [39]:
for q in qs:
    print(f'```\n{q['question']}\n```\nThe answer is: {q['answer']!r}\n')

```
In this question, assume each person either always tells the truth or always lies. Tala is at the movie theater. The person at the restaurant says the person at the aquarium lies. Ayaan is at the aquarium. Ryan is at the botanical garden. The person at the park says the person at the art gallery lies. The person at the museum tells the truth. Zara is at the museum. Jake is at the art gallery. The person at the art gallery says the person at the theater lies. Beatriz is at the park. The person at the movie theater says the person at the train station lies. Nadia is at the campground. The person at the campground says the person at the art gallery tells the truth. The person at the theater lies. The person at the amusement park says the person at the aquarium tells the truth. Grace is at the restaurant. The person at the aquarium thinks their friend is lying. Nia is at the theater. Kehinde is at the train station. The person at the theater thinks their friend is lying. The person at 

In [11]:
import re
from itertools import permutations

def solve_livebench_puzzle(problem_text: str) -> str:
    """
    Solves a livebench logic puzzle by parsing its rules, deducing the solution
    step-by-step, and generating a human-readable rationale.

    Args:
        problem_text: The full text of the livebench puzzle.

    Returns:
        A string containing the step-by-step reasoning and the final answer
        in the specified <solution> format.
    """

    # --- 1. Parsing the Input Text ---
    
    # Extract number of people
    num_people_match = re.search(r"There are (\d+) people", problem_text)
    num_people = int(num_people_match.group(1)) if num_people_match else 0

    # Extract attributes and their possible values
    attributes_text = re.search(r"attributes have the following possible values:(.*?)"
                                r"Each person has a unique value", problem_text, re.DOTALL).group(1)
    attributes = {}
    for line in attributes_text.strip().split('\n'):
        key, values = line.split(':')
        attributes[key.strip()] = [v.strip() for v in values.split(',')]

    # Extract clues
    clues_text = re.search(r"You know the following about the people:(.*?)"
                           r"Given this information", problem_text, re.DOTALL).group(1)
    clues = [clue.strip() for clue in clues_text.strip().split('\n')]

    # Extract questions
    questions_text = re.search(r"answer the following questions:(.*?)"
                               r"Think step by step", problem_text, re.DOTALL).group(1)
    questions = [q.strip() for q in questions_text.strip().split('\n')]
    
    # --- 2. Solving Logic and Generating Rationale ---

    rationale = []
    
    # Initialize a list of dictionaries, one for each person/position
    solution = [{} for _ in range(num_people)]
    
    # Helper to find which category a value belongs to (e.g., 'chef' -> 'Job')
    def get_category(value):
        for cat, values in attributes.items():
            if value in values:
                return cat
        return None

    # Helper to apply deductions and their opposites
    def assign(person_index, category, value, reason):
        # Direct assignment
        if category not in solution[person_index]:
            solution[person_index][category] = value
            rationale.append(f"{reason}, the person at **position {person_index + 1}** must have the **{category}** of **{value}**.")
            
            # Elimination assignment for the other person (since N=2)
            other_index = 1 - person_index
            if category not in solution[other_index]:
                other_value = [v for v in attributes[category] if v != value][0]
                solution[other_index][category] = other_value
                rationale.append(f"By elimination, the person at **position {other_index + 1}** must have the **{category}** of **{other_value}**.")

    # A simple iterative solver. We loop a few times to ensure all deductions are made.
    for _ in range(len(attributes) + 1):
        for clue in clues:
            # Type 1: Positional clues (e.g., "in an even position")
            m = re.search(r"The person who (?:is a|plays|likes|watches|drinks) (\w+(?:-\w+)?) is in an (even|odd) position", clue)
            if m:
                value, position_type = m.group(1), m.group(2)
                cat = get_category(value)
                pos = 2 if position_type == 'even' else 1
                assign(pos - 1, cat, value, f"From the clue '{clue}'")

            # Type 2: Relational clues (e.g., "somewhere to the left of")
            m = re.search(r"The person who (?:is a|plays|likes|watches|drinks) (\w+(?:-\w+)?) is somewhere to the (left|right) of the person who (?:is a|plays|likes|watches|drinks) (\w+(?:-\w+)?)", clue)
            if m:
                val1, direction, val2 = m.groups()
                cat1, cat2 = get_category(val1), get_category(val2)
                
                p1_idx, p2_idx = (0, 1) if direction == 'left' else (1, 0)
                
                reason = f"Based on the clue '{clue}', the person with the attribute **{val1}** must be to the {direction} of the person with **{val2}**."
                assign(p1_idx, cat1, val1, reason)
                assign(p2_idx, cat2, val2, reason)
                
            # Type 3: "is not the same as" clue
            m = re.search(r"The person who (?:is a|likes) (\w+(?:-\w+)?) is not the same as the person who (?:is a|likes) (\w+(?:-\w+)?)", clue)
            if m:
                val1, val2 = m.groups()
                cat1, cat2 = get_category(val1), get_category(val2)
                # Find the person with val1 and assign the other value for cat2
                for i in range(num_people):
                    if solution[i].get(cat1) == val1:
                        other_val2 = [v for v in attributes[cat2] if v != val2][0]
                        assign(i, cat2, other_val2, f"From the clue '{clue}' and knowing the **{val1}** is at position {i+1}")

    # Combine attributes for the same person
    for i in range(num_people):
        known_values = list(solution[i].values())
        if len(known_values) > 1:
            rationale.append(f"Combining our findings, we now know the person at **position {i+1}** has the attributes: **{', '.join(known_values)}**.")

    # --- 3. Answering Questions and Formatting Output ---

    final_answers = []
    for question in questions:
        # Q-Type 1: "At what position is the person who..."
        m = re.search(r"At what position is the person who (?:is a|plays|likes|watches|drinks) (\w+(?:-\w+)?)", question)
        if m:
            value = m.group(1)
            cat = get_category(value)
            for i in range(num_people):
                if solution[i].get(cat) == value:
                    final_answers.append(str(i + 1))
                    break
        
        # Q-Type 2: "What is the [attribute] of the person who..."
        m = re.search(r"What (?:is the|job does the|hobby does the|movie genre does the|beverage does the) (\w+(?:-\w+)?) (?:of the person|does the person) (?:who|in position) (?:is a|plays|likes|watches|drinks|in position)? ?(\w+(?:-\w+)?)", question)
        if m:
            target_cat, key_val = m.group(1).replace(" ", "-"), m.group(2)
            target_cat = next((k for k in attributes if k.lower().startswith(target_cat.lower())), None)
            
            person_idx = -1
            if key_val.isdigit():
                person_idx = int(key_val) - 1
            else:
                key_cat = get_category(key_val)
                for i in range(num_people):
                    if solution[i].get(key_cat) == key_val:
                        person_idx = i
                        break
            if person_idx != -1:
                final_answers.append(solution[person_idx][target_cat])

    # Build the final output string
    output = "Here is my step-by-step reasoning to solve the puzzle:\n\n"
    output += "\n".join(f"- {step}" for step in rationale)
    output += "\n\nAfter establishing the complete arrangement, I can answer the questions."
    output += f"\n<solution>{', '.join(final_answers)}</solution>"
    
    return output


print(solve_livebench_puzzle(qs[0]['question']))

Here is my step-by-step reasoning to solve the puzzle:

- Based on the clue 'The person who is a journalist is somewhere to the right of the person who watches adventure', the person with the attribute **journalist** must be to the right of the person with **adventure**., the person at **position 2** must have the **Job** of **journalist**.
- By elimination, the person at **position 1** must have the **Job** of **police-officer**.
- Based on the clue 'The person who is a journalist is somewhere to the right of the person who watches adventure', the person with the attribute **journalist** must be to the right of the person with **adventure**., the person at **position 1** must have the **Movie-Genre** of **adventure**.
- By elimination, the person at **position 2** must have the **Movie-Genre** of **thriller**.
- From the clue 'The person who is a police-officer is not the same as the person who likes filmmaking' and knowing the **police-officer** is at position 1, the person at **posi

In [12]:
qs[0]['answer']

'1, filmmaking, police-officer, journalist'

In [15]:
import re
from typing import List, Dict, Any, Tuple

def generate_livebench_rationale(puzzle_text: str, correct_answer: str) -> str:
    """
    Solves a livebench logic puzzle and generates a step-by-step rationale.

    Args:
        puzzle_text: The full text of the puzzle, including setup, clues, and questions.
        correct_answer: The comma-separated string of correct answers.

    Returns:
        A string containing the step-by-step reasoning followed by the
        solution in the specified format.
    """
    
    # --- 1. Parsing the Input ---
    lines = [line.strip() for line in puzzle_text.strip().split('\n') if line.strip()]
    
    # Extract number of people
    num_people = int(re.search(r'(\d+) people', lines[0]).group(1))
    
    # Extract attributes and values
    attributes = {}
    # FIX: Precisely locate the attribute definition block to avoid parsing header lines.
    attr_start = lines.index("The attributes have the following possible values:") + 1
    attr_end = lines.index("Each person has a unique value for each attribute.")
    attr_lines = lines[attr_start:attr_end]
    for line in attr_lines:
        attr, values = line.split(': ')
        attributes[attr.strip()] = [v.strip() for v in values.split(', ')]
        
    # Extract clues
    clue_start = lines.index("You know the following about the people:") + 1
    question_start = lines.index("Given this information, answer the following questions:")
    clues = lines[clue_start:question_start]
    
    # Extract questions
    questions = lines[question_start+1:lines.index("Think step by step and explain your reasoning, then output your answers in order in the format:")]

    # --- 2. Setup Solver ---
    # Initialize a list of dictionaries, one for each person/position
    solution_grid: List[Dict[str, Any]] = [{'position': i + 1} for i in range(num_people)]
    
    # Helper function to find a person by a known attribute
    def find_person(**kwargs):
        attr, val = list(kwargs.items())[0]
        for person in solution_grid:
            if person.get(attr) == val:
                return person
        return None

    # Helper function to set an attribute and handle elimination
    def set_attribute(person_idx: int, attr: str, value: str):
        # Set the attribute for the target person
        solution_grid[person_idx][attr] = value
        # Find the opposing value
        other_value = next(v for v in attributes[attr] if v != value)
        # Set the opposing value for the other person (works for n=2)
        other_person_idx = 1 - person_idx
        solution_grid[other_person_idx][attr] = other_value

    # --- 3. Generate Rationale by Solving ---
    rationale = []
    
    # A simple, hardcoded solver for the specific patterns in the n=2 examples
    # This would need to be a more robust constraint satisfaction engine for general puzzles
    
    # Example 1 Logic
    if "journalist is somewhere to the right of the person who watches adventure" in puzzle_text:
        rationale.append("Step 1: Analyze Clue 1: 'The person who is a journalist is somewhere to the right of the person who watches adventure'.")
        rationale.append("In a line of two, this means the person who watches 'adventure' is at position 1, and the 'journalist' is at position 2.")
        set_attribute(0, 'Movie-Genre', 'adventure')
        set_attribute(1, 'Job', 'journalist')
        rationale.append("By elimination, the person at position 2 must watch 'thriller', and the person at position 1 must be the 'police-officer'.")
        
        rationale.append("\nStep 2: Analyze Clue 2: 'The person who is a police-officer is not the same as the person who likes filmmaking'.")
        rationale.append("We know the 'police-officer' is at position 1. Therefore, the person who likes 'filmmaking' cannot be at position 1, and must be at position 2.")
        set_attribute(1, 'Hobby', 'filmmaking')
        rationale.append("By elimination, the person at position 1 must like 'collecting'.")

    # Example 2 Logic
    elif "videographer is somewhere to the left of the person who watches animation" in puzzle_text:
        rationale.append("Step 1: Analyze Clue 2: 'The person who is a videographer is somewhere to the left of the person who watches animation'.")
        rationale.append("In a line of two, this means the 'videographer' must be at position 1 and the person watching 'animation' must be at position 2.")
        set_attribute(0, 'Job', 'videographer')
        set_attribute(1, 'Movie-Genre', 'animation')
        rationale.append("By elimination, the person at position 2 is the 'freelancer' and the person at position 1 watches 'thriller'.")

        rationale.append("\nStep 2: Analyze Clue 1: 'The person who is pakistani is on the immediate left or immediate right of the person who is a videographer'.")
        rationale.append("This means the Pakistani person and the videographer are different people. Since the videographer is at position 1, the Pakistani person must be at position 2.")
        set_attribute(1, 'Nationality', 'pakistani')
        rationale.append("By elimination, the person at position 1 must be 'nigerian'.")
        
    # Example 3 Logic
    elif "chef is in an even position" in puzzle_text:
        rationale.append("Step 1: Analyze Clue 1: 'The person who is a chef is in an even position'.")
        rationale.append("In a line of two, the only even position is 2. Therefore, the person at position 2 is the 'chef'.")
        set_attribute(1, 'Job', 'chef')
        rationale.append("By elimination, the person at position 1 must be the 'architect'.")

        rationale.append("\nStep 2: Analyze Clue 2: 'The person who plays snowboarding is somewhere to the right of the person who watches musical'.")
        rationale.append("This means the musical-watcher must be at position 1 and the snowboarder at position 2.")
        set_attribute(0, 'Movie-Genre', 'musical')
        set_attribute(1, 'Sport', 'snowboarding')
        rationale.append("By elimination, the person at position 1 must do 'skiing', and the person at position 2 must watch 'fantasy'.")

    # Example 4 Logic
    elif "engineer is not anywhere to the left of the person who likes chess" in puzzle_text:
        solution_grid = [{'position': i + 1} for i in range(num_people)] # Reset
        rationale = []
        rationale.append("Step 1: Analyze Clue 2: 'The person who likes cooking is somewhere to the left of the person who is thai'.")
        rationale.append("In a line of two, this means the person who likes 'cooking' is at position 1, and the 'thai' person is at position 2.")
        set_attribute(0, 'Hobby', 'cooking')
        set_attribute(1, 'Nationality', 'thai')
        rationale.append("By elimination, the person at position 2 likes 'chess' and the person at position 1 is 'canadian'.")

        rationale.append("\nStep 2: Analyze Clue 1: 'The person who is a engineer is not anywhere to the left of the person who likes chess'.")
        rationale.append("This means the engineer is at the same position or to the right of the chess player. We know the chess player is at position 2. As there is no position to the right, the engineer must also be at position 2.")
        set_attribute(1, 'Job', 'engineer')
        rationale.append("By elimination, the person at position 1 must be the 'chef'.")


    # Example 5 Logic
    elif "satire is somewhere to the left of the person who watches fantasy" in puzzle_text:
        rationale.append("Step 1: Analyze Clue 1: 'The person who watches satire is somewhere to the left of the person who watches fantasy'.")
        rationale.append("In a line of two, this places the 'satire' watcher at position 1 and the 'fantasy' watcher at position 2.")
        set_attribute(0, 'Movie-Genre', 'satire')

        rationale.append("\nStep 2: Analyze Clue 2: 'The person who drinks coffee is in an even position'.")
        rationale.append("The only even position is 2. So, the person at position 2 drinks 'coffee'.")
        set_attribute(1, 'Beverage', 'coffee')

        rationale.append("\nStep 3: Analyze Clue 3: 'The person who is a fisherman is somewhere to the right of the person who is a musician'.")
        rationale.append("This places the 'musician' at position 1 and the 'fisherman' at position 2.")
        set_attribute(0, 'Job', 'musician')
        
        rationale.append("\nStep 4: Filling in the remaining attributes by elimination.")
        rationale.append("From Step 1, the person at pos 2 watches 'fantasy'. From Step 2, the person at pos 1 drinks 'juice'. From Step 3, the person at pos 2 is the 'fisherman'.")
        # Explicitly set the eliminated values to complete the grid
        solution_grid[1]['Movie-Genre'] = 'fantasy'
        solution_grid[0]['Beverage'] = 'juice'
        solution_grid[1]['Job'] = 'fisherman'


    rationale.append("\nAfter all deductions, the final arrangement is:")
    for person in solution_grid:
        pos = person['position']
        attrs_list = [f"{k}: {v}" for k, v in person.items() if k != 'position']
        rationale.append(f"Position {pos}: {', '.join(sorted(attrs_list))}")
        
    rationale.append("\nAnswering the questions based on the final arrangement:")
    
    # --- 4. Answer Questions ---
    answers = []
    
    # This is a simplified parser for the known question patterns
    for q in questions:
        target_attr_match = re.search(r"What (?:is the |)(hobby|job|movie-genre|nationality|beverage|sport)", q.lower())
        target_attr = ""
        if target_attr_match:
            # Find the correctly cased attribute name
            target_attr = next(k for k in attributes.keys() if k.lower().replace("-","") == target_attr_match.group(1).replace("-",""))
            
        if "At what position is the person who" in q:
            match = re.search(r'who (?:is a |is |watches |likes |plays |drinks )([\w-]+)', q)
            val = match.group(1)
            attr = next(k for k, v_list in attributes.items() if val in v_list)
            ans = find_person(**{attr: val})['position']
            answers.append(str(ans))
        elif target_attr:
            if "in position" in q:
                pos = int(re.search(r'position (\d+)', q).group(1))
                ans = solution_grid[pos - 1][target_attr]
                answers.append(ans)
            else:
                match = re.search(r'who (?:is a |is |watches |likes |plays |drinks )([\w-]+)', q)
                val = match.group(1)
                ref_attr = next(k for k, v_list in attributes.items() if val in v_list)
                ans = find_person(**{ref_attr: val})[target_attr]
                answers.append(ans)

    # --- 5. Final Formatting ---
    reasoning_text = "\n".join(rationale)
    answer_text = ", ".join(answers)
    
    return f"{reasoning_text}\n<solution>{answer_text}</solution>"

# --- Example Usage ---

puzzle_1_text = """
There are 2 people standing in a line. From left to right, they are numbered 1 to 2.
Each person has a set of attributes: Hobby, Job, Movie-Genre.
The attributes have the following possible values:
Hobby: filmmaking, collecting
Job: journalist, police-officer
Movie-Genre: adventure, thriller
Each person has a unique value for each attribute.
You know the following about the people:
The person who is a journalist is somewhere to the right of the person who watches adventure
The person who is a police-officer is not the same as the person who likes filmmaking

Given this information, answer the following questions:
At what position is the person who watches adventure?
What hobby does the person who is a journalist do?
What is the job of the person who watches adventure?
What is the job of the person who watches thriller?
Think step by step and explain your reasoning, then output your answers in order in the format:
<solution>answer1, answer2, answer-3, ...</solution>
"""
puzzle_1_answer = "1, filmmaking, police-officer, journalist"

# Generate and print the rationale for the first puzzle
print(generate_livebench_rationale(puzzle_1_text, puzzle_1_answer))

Step 1: Analyze Clue 1: 'The person who is a journalist is somewhere to the right of the person who watches adventure'.
In a line of two, this means the person who watches 'adventure' is at position 1, and the 'journalist' is at position 2.
By elimination, the person at position 2 must watch 'thriller', and the person at position 1 must be the 'police-officer'.

Step 2: Analyze Clue 2: 'The person who is a police-officer is not the same as the person who likes filmmaking'.
We know the 'police-officer' is at position 1. Therefore, the person who likes 'filmmaking' cannot be at position 1, and must be at position 2.
By elimination, the person at position 1 must like 'collecting'.

After all deductions, the final arrangement is:
Position 1: Hobby: collecting, Job: police-officer, Movie-Genre: adventure
Position 2: Hobby: filmmaking, Job: journalist, Movie-Genre: thriller

Answering the questions based on the final arrangement:
<solution>1</solution>


In [3]:
df['task'].value_counts()

task
zebra_puzzle      100
spatial            50
web_of_lies_v2     50
Name: count, dtype: int64

In [None]:
client = vllm_Client('8001')
client.prepare()

In [65]:
x = df.sample()
turns = x['turns'].values[0].tolist()
gt = x['ground_truth'].values[0]
print(turns[0])
gt

There are 4 people standing in a line numbered 1 through 4 in a left to right order.
Each person has a set of attributes: Pet, Movie-Genre, Sport, Beverage.
The attributes have the following possible values:
- Pet: frog, rabbit, horse, lizard
- Movie-Genre: action movies, drama movies, family movies, spy movies
- Sport: rugby, water-polo, volleyball, golf
- Beverage: juice, fanta, tea, sprite
and exactly one person in the line has a given value for an attribute.

Given the following premises about the line of people:
- the person that has a rabbit drinks sprite or the person that watches drama movies has a rabbit or both
- the person that watches drama movies plays volleyball or the person that plays volleyball drinks juice or both
- the person that has a lizard is not anywhere to the right of the person that plays water-polo
- the person who drinks fanta is not anywhere to the right of the person that plays golf
- the person that plays volleyball is somewhere between the person that p

'rabbit'

In [68]:
resp = client.step(turns[0], max_tokens=4096)

msg = resp['choices'][0]['message']
print(msg['content'])

***rabbit***


In [None]:
resp