In [6]:
import bz2

# Path is relative to the notebook's working directory, matching the other
# cells of this notebook, so the preview is not tied to one machine's home dir.
# NOTE(review): assumes the kernel is started from the repository root — confirm.
file_path = "data/crag_task_1_and_2_dev_v4.jsonl.bz2"

# Number of leading characters of the first record to show (records can be
# much longer than this, so the whole line is not printed).
PREVIEW_CHARS = 228

with bz2.open(file_path, "rt", encoding="utf-8") as f:
    # readline(size) returns at most `size` characters of the first line,
    # i.e. a truncated preview of the first record rather than the full line.
    print(f.readline(PREVIEW_CHARS))

{"interaction_id": "7bb29eb4-12f9-45f9-bf8a-66832b3c8962", "query_time": "03/10/2024, 23:19:21 PT", "domain": "sports", "question_type": "post-processing", "static_or_dynamic": "static", "query": "how many 3-point attempts did s


# Filtering the long-tailed questions

In [10]:
import bz2
import json
import pandas as pd
from collections import Counter

# Input: bz2-compressed JSONL dataset; output: plain JSONL with the sampled subset.
CRAG_DATA_PATH = "data/crag_task_1_and_2_dev_v4.jsonl.bz2"
FILTERED_DATA_PATH = "data/filtered_long_tailed_questions_test.jsonl"

# Criteria used by filter_long_tailed_questions to decide what counts as "long-tailed"
RARE_THRESHOLD = 5  # A word is "rare" when it occurs fewer than 5 times across all queries
LONG_TAIL_TYPES = ["multi-hop", "comparison", "false_premise"]  # Question types that are always kept, whatever their wording
SAMPLE_SIZE = 10  # Upper bound on the number of filtered questions that are sampled and saved

def load_crag_data(filepath):
    """Load a bz2-compressed JSONL file into a list of parsed records.

    Args:
        filepath: Path to a bz2-compressed file holding one JSON document per line.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read as bz2 data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with bz2.open(filepath, "rt", encoding="utf-8") as f:
        # Skip blank lines (e.g. a stray empty line at the end of the file);
        # json.loads would otherwise raise on them.
        return [json.loads(line) for line in f if line.strip()]

def filter_long_tailed_questions(data, rare_threshold=None, long_tail_types=None,
                                 sample_size=None, random_state=42):
    """Select a random sample of "long-tailed" questions.

    A question is kept when its query contains more than two rare words, or
    when its ``question_type`` is one of the long-tail types. Words are the
    whitespace-separated tokens of the query (case-sensitive, punctuation
    kept); a word is rare when it occurs fewer than ``rare_threshold`` times
    across all queries in ``data``.

    Args:
        data: List of records (dicts) with at least ``"query"`` (str) and
            ``"question_type"`` keys.
        rare_threshold: Occurrence count below which a word is rare.
            Defaults to the module-level ``RARE_THRESHOLD``.
        long_tail_types: Question types that are always kept.
            Defaults to the module-level ``LONG_TAIL_TYPES``.
        sample_size: Maximum number of rows to return.
            Defaults to the module-level ``SAMPLE_SIZE``.
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        pandas.DataFrame: At most ``sample_size`` matching rows, in sampled
        order. An empty frame is returned when ``data`` is empty.
    """
    # Resolve defaults lazily so the module constants are only needed when the
    # caller does not override them.
    if rare_threshold is None:
        rare_threshold = RARE_THRESHOLD
    if long_tail_types is None:
        long_tail_types = LONG_TAIL_TYPES
    if sample_size is None:
        sample_size = SAMPLE_SIZE

    df = pd.DataFrame(data)
    if df.empty:
        # Nothing to filter; avoids a KeyError on the missing "query" column.
        return df

    # Corpus-wide token frequencies (every occurrence counts, not just one per query).
    word_counts = Counter(word for query in df["query"] for word in query.split())

    def is_long_tailed(query, question_type):
        # Repeated rare words are counted once per occurrence.
        rare_count = sum(1 for word in query.split() if word_counts[word] < rare_threshold)
        return rare_count > 2 or question_type in long_tail_types

    # Build the boolean mask column-wise; this avoids the per-row Series
    # construction of DataFrame.apply(axis=1).
    mask = pd.Series(
        [is_long_tailed(query, qtype) for query, qtype in zip(df["query"], df["question_type"])],
        index=df.index,
        dtype=bool,
    )
    filtered_df = df[mask]
    return filtered_df.sample(n=min(sample_size, len(filtered_df)), random_state=random_state)

if __name__ == "__main__":
    # Pipeline: read the full dataset, keep the long-tailed subset, then persist it.
    crag_data = load_crag_data(CRAG_DATA_PATH)
    filtered_data = filter_long_tailed_questions(crag_data)

    # Persist the subset as JSONL: one JSON-encoded record per line.
    with open(FILTERED_DATA_PATH, "w") as out_file:
        out_file.writelines(
            json.dumps(row) + "\n" for row in filtered_data.to_dict(orient="records")
        )

    print(f"Filtered {len(filtered_data)} long-tailed questions saved to {FILTERED_DATA_PATH}")


Filtered 10 long-tailed questions saved to data/filtered_long_tailed_questions_test.jsonl


# Creating a limited, balanced dataset

In [12]:
import bz2
import json
import pandas as pd
from collections import Counter

# Input: bz2-compressed JSONL dataset; output: plain JSONL holding the per-type sample.
CRAG_DATA_PATH = "data/crag_task_1_and_2_dev_v4.jsonl.bz2"
# NOTE(review): the file name says "100", but the number of rows written is at most
# (distinct question types) x SAMPLE_PER_CATEGORY; the recorded run wrote 80 rows.
FILTERED_DATA_PATH = "data/balanced_100_questions.jsonl"
SAMPLE_PER_CATEGORY = 10  # Maximum number of questions sampled from each question_type

def load_crag_data(filepath):
    """Load a bz2-compressed JSONL file into a list of parsed records.

    Args:
        filepath: Path to a bz2-compressed file holding one JSON document per line.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read as bz2 data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with bz2.open(filepath, "rt", encoding="utf-8") as f:
        # Skip blank lines (e.g. a stray empty line at the end of the file);
        # json.loads would otherwise raise on them.
        return [json.loads(line) for line in f if line.strip()]

def sample_balanced_questions(data, sample_per_category=None, random_state=42):
    """Sample up to a fixed number of questions from every question type.

    Args:
        data: List of records (dicts) with at least a ``"question_type"`` key.
        sample_per_category: Maximum number of rows drawn per question type.
            Defaults to the module-level ``SAMPLE_PER_CATEGORY``.
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        pandas.DataFrame: The per-type samples concatenated in order of each
        type's first appearance in ``data``. Types with fewer rows than
        ``sample_per_category`` contribute all of their rows, so the result
        is only balanced when every type has enough rows. An empty frame is
        returned when ``data`` is empty.
    """
    # Resolve the default lazily so the module constant is only needed when
    # the caller does not override it.
    if sample_per_category is None:
        sample_per_category = SAMPLE_PER_CATEGORY

    df = pd.DataFrame(data)
    if df.empty:
        # Nothing to sample; avoids a KeyError on the missing column.
        return df

    # sort=False keeps groups in first-appearance order, like Series.unique().
    sampled_dfs = [
        group.sample(n=min(sample_per_category, len(group)), random_state=random_state)
        for _, group in df.groupby("question_type", sort=False)
    ]
    if not sampled_dfs:
        # Every question_type was missing (NaN): pd.concat would raise on an
        # empty list, so return an empty frame with the same columns instead.
        return df.iloc[0:0]

    return pd.concat(sampled_dfs)

if __name__ == "__main__":
    # Pipeline: read the full dataset, draw the per-type sample, then persist it.
    crag_data = load_crag_data(CRAG_DATA_PATH)
    balanced_data = sample_balanced_questions(crag_data)

    # Persist the sample as JSONL: one JSON-encoded record per line.
    with open(FILTERED_DATA_PATH, "w") as out_file:
        out_file.writelines(
            json.dumps(row) + "\n" for row in balanced_data.to_dict(orient="records")
        )

    print(f"Balanced dataset with {len(balanced_data)} questions saved to {FILTERED_DATA_PATH}")


Balanced dataset with 80 questions saved to data/balanced_100_questions.jsonl


# Compressing files to bz2

In [11]:
import bz2
import json

def compress_jsonl_to_bz2(jsonl_path, bz2_path):
    """Compress a file to bz2, preserving its bytes exactly.

    Args:
        jsonl_path: Path of the existing file to compress.
        bz2_path: Path of the bz2 archive to create (overwritten if present).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import shutil  # local import so this cell does not rely on another cell's imports

    # Copy in binary mode: a text-mode round trip would decode/encode with the
    # platform's default encoding and translate line endings, so the archive
    # could differ from the source file.
    with open(jsonl_path, "rb") as jsonl_file, bz2.open(bz2_path, "wb") as bz2_file:
        shutil.copyfileobj(jsonl_file, bz2_file)

if __name__ == "__main__":
    # Source JSONL file and destination archive; edit both to compress another file.
    jsonl_path, bz2_path = (
        "data/filtered_long_tailed_questions_test.jsonl",
        "data/filtered_long_tailed_questions_test.jsonl.bz2",
    )

    compress_jsonl_to_bz2(jsonl_path, bz2_path)
    print(f"Compressed {jsonl_path} to {bz2_path}")

Compressed data/filtered_long_tailed_questions_test.jsonl to data/filtered_long_tailed_questions_test.jsonl.bz2


In [13]:
import bz2
import json

def compress_jsonl_to_bz2(jsonl_path, bz2_path):
    """Compress a file to bz2, preserving its bytes exactly.

    Args:
        jsonl_path: Path of the existing file to compress.
        bz2_path: Path of the bz2 archive to create (overwritten if present).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import shutil  # local import so this cell does not rely on another cell's imports

    # Copy in binary mode: a text-mode round trip would decode/encode with the
    # platform's default encoding and translate line endings, so the archive
    # could differ from the source file.
    with open(jsonl_path, "rb") as jsonl_file, bz2.open(bz2_path, "wb") as bz2_file:
        shutil.copyfileobj(jsonl_file, bz2_file)

if __name__ == "__main__":
    # Source JSONL file and destination archive; edit both to compress another file.
    jsonl_path, bz2_path = (
        "data/balanced_100_questions.jsonl",
        "data/balanced_100_questions.jsonl.bz2",
    )

    compress_jsonl_to_bz2(jsonl_path, bz2_path)
    print(f"Compressed {jsonl_path} to {bz2_path}")

Compressed data/balanced_100_questions.jsonl to data/balanced_100_questions.jsonl.bz2
