In [39]:
from pathlib import Path
from pprint import pformat

import polars as pl

In [4]:
# Directory holding the JSONL data files. The path is relative, so it is
# resolved against the kernel's current working directory.
DATA_DIR = Path("..") / "data"

In [65]:
# Load the four newline-delimited JSON tables used by the helpers below.
# Questions: `question_id`, `question`.
question_df = pl.read_ndjson(DATA_DIR / "questions.jsonl")
# Answers: `question_id`, `answer_id`, `answer`, `answer_score`.
answer_df = pl.read_ndjson(DATA_DIR / "answers.jsonl")
# Ratings: `answer_id`, `assignment_id` and the five rating columns.
rating_df = pl.read_ndjson(DATA_DIR / "mturk_ratings.jsonl")
# Assignments: `assignment_id`, `Answer.reason_of_preference`.
assignment_df = pl.read_ndjson(DATA_DIR / "assignments.jsonl")

In [82]:
def search_questions(query: str) -> pl.DataFrame:
    """Return the questions whose text contains ``query``, ignoring case.

    The query is matched as a plain substring, not as a regular expression,
    so characters such as ``?``, ``(`` or ``.`` are searched for verbatim.

    Args:
        query: Text to look for in the ``question`` column.

    Returns:
        The rows of ``question_df`` whose lower-cased ``question`` contains
        the lower-cased ``query``.
    """
    needle = query.lower()
    return question_df.filter(
        # literal=True: by default Polars compiles the pattern as a regex, so
        # "why?" would be read as a pattern and an unbalanced "(" would raise.
        pl.col("question").str.to_lowercase().str.contains(needle, literal=True)
    )


def search_answers(query: str) -> pl.DataFrame:
    """Return the answers whose text contains ``query``, ignoring case.

    The query is matched as a plain substring, not as a regular expression,
    so characters such as ``?``, ``(`` or ``.`` are searched for verbatim.

    Args:
        query: Text to look for in the ``answer`` column.

    Returns:
        The rows of ``answer_df`` whose lower-cased ``answer`` contains the
        lower-cased ``query``.
    """
    needle = query.lower()
    return answer_df.filter(
        # literal=True: by default Polars compiles the pattern as a regex, so
        # "why?" would be read as a pattern and an unbalanced "(" would raise.
        pl.col("answer").str.to_lowercase().str.contains(needle, literal=True)
    )


def print_instance(question_id: str, answer_id: str = None) -> None:
    """Print a question, its answers, their ratings and the raters' reasons.

    For each selected answer this prints the answer text (with ``<br>``
    turned into newlines and wrapped by ``pformat``) followed by one line per
    rating, sorted by ``assignment_id``. After all answers, it prints the
    ``Answer.reason_of_preference`` of every assignment that rated at least
    one of the printed answers.

    Args:
        question_id: ``question_id`` of the question to display.
        answer_id: If given, only the answer with this ``answer_id`` is
            printed. By default (``None``) every answer to the question is
            shown.

    Raises:
        ValueError: If ``question_df`` has no row with ``question_id``.
    """
    matching_questions = question_df.filter(pl.col("question_id") == question_id)
    if matching_questions.height == 0:
        # Fail with a message naming the id instead of an opaque
        # out-of-bounds error from `.row(0)`.
        raise ValueError(f"No question found with question_id={question_id!r}")
    question = matching_questions.row(0, named=True)["question"]
    print(f"## Question:\n{question}\n")

    answers = answer_df.filter(pl.col("question_id") == question_id)
    if answer_id is not None:
        answers = answers.filter(pl.col("answer_id") == answer_id)

    # Collected across *all* printed answers, so the reasons section neither
    # depends on the last loop iteration nor breaks when there are no answers.
    seen_assignment_ids = set()

    for answer in answers.iter_rows(named=True):
        print(
            f"### Answer ID: {answer['answer_id']} (score: {answer['answer_score']})"
        )

        answer_str = answer["answer"].replace("<br>", "\n")
        print(f"{pformat(answer_str)}\n")

        print("#### Ratings:")

        ratings = rating_df.filter(
            pl.col("answer_id") == answer["answer_id"]
        ).sort("assignment_id")
        seen_assignment_ids.update(ratings.get_column("assignment_id").to_list())

        print("assignment_id\t\t\tFact\tInfo\tForm\tAcce\tPref")
        for rating in ratings.iter_rows(named=True):
            print(
                f"{rating['assignment_id']}\t{rating['factuality']}\t"
                f"{rating['amountInfo']}\t{rating['formality']}\t"
                f"{rating['acceptability']}\t{rating['preference']}"
            )

        print("\n")

    print("### Reason of preference:")
    for assignment_id in sorted(seen_assignment_ids):
        assignment = assignment_df.filter(
            pl.col("assignment_id") == assignment_id
        ).row(0, named=True)
        print(assignment_id, assignment["Answer.reason_of_preference"])

In [83]:
# Find the question (and its question_id) with a case-insensitive text search.
search_questions(query="why do we hate")

question_id,question
str,str
"""1fflvw""","""- Why do we hate our own voice…"


In [85]:
# Show the question with this id together with its answers, their ratings,
# and the raters' stated reasons of preference.
print_instance(question_id="1fflvw")

## Question:
- Why do we hate our own voice when we hear it recorded?

### Answer ID: chatgpt-formal-1fflvw (score: None)
('We perceive our voice differently when we hear it on a recording because we '
 'are not used to hearing it from that perspective. When we speak, we hear our '
 'voice through the bones in our skull as well as through the air, which '
 'creates a richer, deeper tone. However, when we hear a recording of our '
 'voice, we only hear it through the air, resulting in a higher-pitched sound '
 'that we may not like. Additionally, some people may be self-conscious about '
 'their voice because they associate it with how they look or their image of '
 'themselves.')

#### Ratings:
assignment_id			Fact	Info	Form	Acce	Pref
31LVTDXBLAB0HNGMIMU5ML1NVB3LRO	3	0	0	3	1
33L7PJKHCJZIIV9VSMQWHVGZZW28T3	2	0	1	3	0
3SNVL38CI7TLRF9LFPS9IQPM6AVKCE	3	0	0	3	1


### Answer ID: ca9sd6w (score: 18.0)
('When you hear your voice normally, you hear a sound transmitted through the '
 'air just li