In [None]:
def prepare_training_data(input_file, output_file):
    """Reshape a processed FEVEROUS dump into the training-example layout.

    Reads ``input_file`` (JSON with a ``"processed_claims"`` list), builds one
    example per claim and writes ``{"examples": [...]}`` to ``output_file``.

    Every example holds the claim's ``id``, ``claim`` and ``verdict`` (each
    defaulting to ``""`` when absent), the total number of evidence pieces
    across all evidence sets, and the evidence sets themselves with each piece
    reduced to ``content``, ``source`` and ``type``.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        raw = json.load(src)

    converted_claims = []

    for record in tqdm(raw["processed_claims"], desc="Preparing examples"):
        evidence = record.get("evidence", [])

        entry = {
            "id": record.get("id", ""),
            "claim": record.get("claim", ""),
            "verdict": record.get("verdict", ""),
            "evidence_count": sum(len(group) for group in evidence),
            "evidence_sets": []
        }

        for group in evidence:
            converted_group = []
            for piece in group:
                piece_id = piece.get("id", "")

                # The source is the text before the first underscore of the
                # ID; IDs without any underscore have no recognisable source.
                if "_" in piece_id:
                    origin = piece_id.split("_")[0]
                else:
                    origin = "unknown"

                converted_group.append({
                    "content": piece.get("content", ""),
                    "source": origin,
                    "type": determine_evidence_type(piece_id),
                })

            entry["evidence_sets"].append(converted_group)

        converted_claims.append(entry)

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump({"examples": converted_claims}, dst)

    print(f"Saved {len(converted_claims)} examples to {output_file}")

def determine_evidence_type(ev_id):
    """Classify an evidence ID by the element marker embedded in it.

    Markers are tried in a fixed order (sentence, cell, item), so an ID that
    contains several markers is classified by the first one in that order.
    Returns ``"other"`` when none of the markers occurs in ``ev_id``.
    """
    marker_labels = (
        ("_sentence_", "sentence"),
        ("_cell_", "table_cell"),
        ("_item_", "list_item"),
    )
    for marker, label in marker_labels:
        if marker in ev_id:
            return label
    return "other"

In [None]:
def split_into_chunks(input_file, output_dir, chunk_size=10000):
    """Split a large example file into numbered chunk files.

    Args:
        input_file: Path to a JSON file of the form ``{"examples": [...]}``.
        output_dir: Directory that receives the chunk files; created if it
            does not exist.
        chunk_size: Maximum number of examples per chunk file. Must be
            positive.

    Returns:
        The paths of the chunk files written, in order. The list is empty
        when the input contains no examples.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() yield nothing and the function would
    # "succeed" without writing any data; zero fails inside range() with an
    # unrelated-looking message. Reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    examples = data["examples"]
    os.makedirs(output_dir, exist_ok=True)

    chunk_files = []
    for chunk_idx, start in enumerate(range(0, len(examples), chunk_size)):
        chunk = examples[start:start + chunk_size]
        chunk_file = os.path.join(output_dir, f"feverous_chunk_{chunk_idx}.json")

        with open(chunk_file, 'w', encoding='utf-8') as f:
            json.dump({"examples": chunk}, f)

        chunk_files.append(chunk_file)
        print(f"Saved chunk {chunk_idx} with {len(chunk)} examples to {chunk_file}")

    return chunk_files

In [None]:
def create_stratified_samples(input_file, output_dir, sample_size=5000, seed=None):
    """Write a class-balanced random sample of the examples.

    Examples are grouped by verdict (``SUPPORTS``, ``REFUTES``,
    ``NOT ENOUGH INFO``); examples with any other verdict are left out. Up to
    ``sample_size // 3`` examples are drawn from each group (the whole group
    when it is smaller), the draws are shuffled together and written to
    ``feverous_balanced_<sample_size>.json`` inside ``output_dir``.

    Args:
        input_file: Path to a JSON file of the form ``{"examples": [...]}``
            where every example has a ``"verdict"`` key.
        output_dir: Directory that receives the sample file; created if
            missing.
        sample_size: Target total number of examples across all classes.
        seed: Optional seed. When given, sampling and shuffling use a private
            ``random.Random(seed)`` so repeated runs produce the same file.
            When ``None`` the module-level ``random`` state is used, as
            before.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Group by verdict
    by_verdict = {
        "SUPPORTS": [],
        "REFUTES": [],
        "NOT ENOUGH INFO": []
    }

    skipped = 0
    for example in data["examples"]:
        bucket = by_verdict.get(example["verdict"])
        if bucket is None:
            skipped += 1
        else:
            bucket.append(example)

    if skipped:
        print(f"Skipped {skipped} examples with unrecognized verdicts")

    os.makedirs(output_dir, exist_ok=True)

    # A dedicated generator keeps seeded runs independent of whatever else
    # has consumed the global random state.
    rng = random if seed is None else random.Random(seed)

    per_class = sample_size // len(by_verdict)

    balanced_sample = []
    for verdict, members in by_verdict.items():
        # Take a random sample, or the whole class if it is too small
        sample_count = min(per_class, len(members))
        balanced_sample.extend(rng.sample(members, sample_count))
        print(f"Added {sample_count} {verdict} examples")

    # Shuffle so the classes are interleaved in the output
    rng.shuffle(balanced_sample)

    sample_file = os.path.join(output_dir, f"feverous_balanced_{sample_size}.json")
    with open(sample_file, 'w', encoding='utf-8') as f:
        json.dump({"examples": balanced_sample}, f)

    print(f"Saved balanced sample with {len(balanced_sample)} examples to {sample_file}")

In [None]:
def create_prompt_files(input_file, output_dir):
    """Render every example as an instruction prompt and save the result.

    Loads ``{"examples": [...]}`` from ``input_file``, turns each example's
    claim and evidence into a single prompt string, and writes the list of
    ``{"id", "prompt", "verdict"}`` records to
    ``feverous_formatted_prompts.json`` inside ``output_dir`` (created if
    missing).
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        records = json.load(src)["examples"]

    os.makedirs(output_dir, exist_ok=True)

    rendered = []
    for record in tqdm(records, desc="Formatting prompts"):
        # Evidence pieces are numbered consecutively across all evidence
        # sets, so the running length of the list doubles as the counter.
        evidence_lines = []
        for group in record["evidence_sets"]:
            for piece in group:
                source = piece["source"]
                content = piece["content"]
                table_tag = ", table" if piece["type"] == "table_cell" else ""
                number = len(evidence_lines) + 1

                evidence_lines.append(
                    f"Evidence {number} (from {source} article{table_tag}): {content}"
                )

        claim_text = record["claim"]
        evidence_block = "\n".join(evidence_lines)

        prompt = f"""Instruction: Analyze the following claim and evidence to determine if the claim is supported, refuted, or cannot be determined from the evidence provided.

Claim: {claim_text}

Evidence:
{evidence_block}

Question: Based on the evidence, does the claim appear to be SUPPORTED, REFUTED, or NOT ENOUGH INFORMATION?"""

        rendered.append({
            "id": record["id"],
            "prompt": prompt,
            "verdict": record["verdict"]
        })

    destination = os.path.join(output_dir, "feverous_formatted_prompts.json")
    with open(destination, 'w', encoding='utf-8') as dst:
        json.dump({"examples": rendered}, dst)

    print(f"Saved {len(rendered)} formatted prompts to {destination}")

In [None]:
def create_tfrecords(input_file, output_dir):
    """Serialize the examples into TFRecord files of up to 5000 records each.

    Loads ``{"examples": [...]}`` from ``input_file`` and writes
    ``feverous_chunk_<n>.tfrecord`` files into ``output_dir`` (created if
    missing). Each record stores the UTF-8 encoded ``id``, ``claim`` and
    flattened ``evidence`` text plus an integer ``label``; verdicts outside
    the label mapping are stored as label 2.
    """
    import tensorflow as tf

    with open(input_file, 'r', encoding='utf-8') as src:
        records = json.load(src)["examples"]

    os.makedirs(output_dir, exist_ok=True)

    label_ids = {
        "SUPPORTS": 0,
        "REFUTES": 1,
        "NOT ENOUGH INFO": 2
    }
    records_per_file = 5000

    def _bytes_feature(value):
        """Wrap a bytes value (or a tensor holding one) in a bytes_list feature."""
        if isinstance(value, type(tf.constant(0))):
            value = value.numpy()
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(value):
        """Wrap an integer in an int64_list feature."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _flatten_evidence(record):
        """Join every evidence piece as a "source: content" line."""
        return "".join(
            f"{piece['source']}: {piece['content']}\n"
            for group in record["evidence_sets"]
            for piece in group
        )

    for chunk_no, start in enumerate(range(0, len(records), records_per_file)):
        batch = records[start:start + records_per_file]
        tfrecord_path = os.path.join(output_dir, f"feverous_chunk_{chunk_no}.tfrecord")

        with tf.io.TFRecordWriter(tfrecord_path) as writer:
            for record in tqdm(batch, desc=f"Writing chunk {chunk_no}"):
                claim_text = record["claim"]
                evidence_text = _flatten_evidence(record)

                features = tf.train.Features(feature={
                    'id': _bytes_feature(record["id"].encode('utf-8')),
                    'claim': _bytes_feature(claim_text.encode('utf-8')),
                    'evidence': _bytes_feature(evidence_text.encode('utf-8')),
                    'label': _int64_feature(label_ids.get(record["verdict"], 2))
                })
                serialized = tf.train.Example(features=features).SerializeToString()

                writer.write(serialized)

        print(f"Saved {len(batch)} examples to {tfrecord_path}")

In [None]:
def create_pytorch_files(input_file, output_dir):
    """Flatten the examples and store them with ``torch.save``.

    Loads ``{"examples": [...]}`` from ``input_file`` and writes a list of
    ``{"id", "claim", "evidence", "label"}`` dicts to ``feverous_pytorch.pkl``
    inside ``output_dir`` (created if missing). ``evidence`` is every evidence
    piece rendered as a ``"source: content"`` line; verdicts outside the label
    mapping are stored as label 2.
    """
    import torch

    with open(input_file, 'r', encoding='utf-8') as src:
        records = json.load(src)["examples"]

    os.makedirs(output_dir, exist_ok=True)

    label_ids = {
        "SUPPORTS": 0,
        "REFUTES": 1,
        "NOT ENOUGH INFO": 2
    }

    rows = []
    for record in tqdm(records, desc="Processing for PyTorch"):
        evidence_text = "".join(
            f"{piece['source']}: {piece['content']}\n"
            for group in record["evidence_sets"]
            for piece in group
        )

        rows.append({
            "id": record["id"],
            "claim": record["claim"],
            "evidence": evidence_text,
            "label": label_ids.get(record["verdict"], 2)
        })

    destination = os.path.join(output_dir, "feverous_pytorch.pkl")
    with open(destination, 'wb') as dst:
        torch.save(rows, dst)

    print(f"Saved {len(rows)} examples to {destination}")

In [None]:
def process_feverous_for_training(input_train, input_dev, output_dir):
    """Run the complete FEVEROUS preparation pipeline for both splits.

    Steps, in order:
      1. Convert the train and dev inputs to the standard example format.
      2. Split the train set into chunk files.
      3. Create a stratified sample of the train set.
      4. Create formatted prompt files for train and dev.
      5. Create TensorFlow and PyTorch files for train and dev.

    Steps 4 and 5 give each split its own ``train``/``dev`` subdirectory.
    The format writers choose their own output file names inside the
    directory they receive, so handing both splits the same directory lets
    the dev run overwrite the train run's files.

    Args:
        input_train: Path to the processed train JSON file.
        input_dev: Path to the processed dev JSON file.
        output_dir: Base directory for everything the pipeline writes.
    """
    # Create base output directory
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Convert to standard format
    standard_files = {
        "train": os.path.join(output_dir, "feverous_train_standard.json"),
        "dev": os.path.join(output_dir, "feverous_dev_standard.json"),
    }

    prepare_training_data(input_train, standard_files["train"])
    prepare_training_data(input_dev, standard_files["dev"])

    # Step 2: Create chunks for training set
    train_chunks_dir = os.path.join(output_dir, "train_chunks")
    split_into_chunks(standard_files["train"], train_chunks_dir)

    # Step 3: Create stratified samples
    samples_dir = os.path.join(output_dir, "stratified_samples")
    create_stratified_samples(standard_files["train"], samples_dir)

    # Step 4: Create formatted prompts (one subdirectory per split)
    prompts_dir = os.path.join(output_dir, "formatted_prompts")
    for split, standard_file in standard_files.items():
        create_prompt_files(standard_file, os.path.join(prompts_dir, split))

    # Step 5: Create framework-specific files (one subdirectory per split)
    tf_dir = os.path.join(output_dir, "tensorflow")
    pt_dir = os.path.join(output_dir, "pytorch")

    for split, standard_file in standard_files.items():
        create_tfrecords(standard_file, os.path.join(tf_dir, split))

    for split, standard_file in standard_files.items():
        create_pytorch_files(standard_file, os.path.join(pt_dir, split))

    print("Complete FEVEROUS processing pipeline finished!")

In [None]:
import json
import os
import random
from tqdm import tqdm

def prepare_training_data(input_file, output_file):
    """Convert processed FEVEROUS data to a clean, training-ready format.

    Args:
        input_file: Path to a JSON file with a ``"processed_claims"`` list.
        output_file: Path of the JSON file to write. Its parent directory is
            created when needed; a bare file name (no directory part) is
            written to the current working directory.

    Returns:
        The list of examples that was written. Each example has ``id``,
        ``claim`` and ``verdict`` (``""`` when absent in the input) and
        ``evidence``: a list of evidence sets whose pieces are dicts with
        ``content``, ``type`` and ``source``.
    """
    print(f"Processing {input_file}...")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    claims = data["processed_claims"]
    print(f"Found {len(claims)} claims")

    # Checked in this order; the first marker found in the ID wins.
    type_markers = (
        ("_sentence_", "sentence"),
        ("_cell_", "table_cell"),
        ("_item_", "list_item"),
    )

    training_examples = []

    for claim in tqdm(claims, desc="Processing claims"):
        # Basic claim information
        example = {
            "id": claim.get("id", ""),
            "claim": claim.get("claim", ""),
            "verdict": claim.get("verdict", ""),
            "evidence": []
        }

        # Process all evidence pieces
        for ev_set in claim.get("evidence", []):
            evidence_set = []
            for ev_piece in ev_set:
                ev_id = ev_piece.get("id", "")

                ev_type = next(
                    (label for marker, label in type_markers if marker in ev_id),
                    "other",
                )

                # Source document is the text before the first underscore
                source = ev_id.split("_")[0] if "_" in ev_id else "unknown"

                evidence_set.append({
                    "content": ev_piece.get("content", ""),
                    "type": ev_type,
                    "source": source
                })

            example["evidence"].append(evidence_set)

        training_examples.append(example)

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({"examples": training_examples}, f, indent=2)

    print(f"Saved {len(training_examples)} examples to {output_file}")
    return training_examples

def create_prompt_files(examples, output_file):
    """Create a file of formatted prompts for training.

    Args:
        examples: Iterable of example dicts with ``id``, ``claim``,
            ``verdict`` and ``evidence`` (a list of evidence sets whose
            pieces have ``source``, ``content`` and ``type``).
        output_file: Path of the JSON file to write. Its parent directory is
            created when needed; a bare file name (no directory part) is
            written to the current working directory.
    """
    print("Creating formatted prompts...")

    # Extra tag shown after the source for non-sentence evidence.
    type_tags = {"table_cell": ", table", "list_item": ", list"}

    formatted_examples = []
    for example in tqdm(examples, desc="Formatting prompts"):
        # Evidence pieces are numbered consecutively across all evidence
        # sets, so the running length of the list doubles as the counter.
        all_evidence = []
        for ev_set in example["evidence"]:
            for ev_piece in ev_set:
                source = ev_piece["source"]
                content = ev_piece["content"]
                tag = type_tags.get(ev_piece["type"], "")

                all_evidence.append(
                    f"Evidence {len(all_evidence)+1} (from {source}{tag}): {content}"
                )

        # Create prompt
        prompt = f"""Instruction: Analyze the following claim and evidence to determine if the claim is supported, refuted, or cannot be determined from the evidence provided.

Claim: {example["claim"]}

Evidence:
{chr(10).join(all_evidence)}

Question: Based on the evidence, does the claim appear to be SUPPORTED, REFUTED, or NOT ENOUGH INFORMATION?"""

        formatted_examples.append({
            "id": example["id"],
            "prompt": prompt,
            "verdict": example["verdict"]
        })

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({"examples": formatted_examples}, f, indent=2)

    print(f"Saved {len(formatted_examples)} formatted prompts to {output_file}")

def main(train_file="train_all_processed.json",
         dev_file="dev_all_processed.json",
         output_dir="feverous_prepared"):
    """Prepare the FEVEROUS train and dev splits and their prompt files.

    Called without arguments this uses the original fixed file names.

    Args:
        train_file: Path to the processed train JSON file.
        dev_file: Path to the processed dev JSON file.
        output_dir: Directory that receives ``feverous_train.json``,
            ``feverous_dev.json`` and the two ``*_prompts.json`` files;
            created if missing.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Process training data
    train_examples = prepare_training_data(
        train_file,
        os.path.join(output_dir, "feverous_train.json")
    )

    # Process dev data
    dev_examples = prepare_training_data(
        dev_file,
        os.path.join(output_dir, "feverous_dev.json")
    )

    # Create prompt files
    create_prompt_files(
        train_examples,
        os.path.join(output_dir, "feverous_train_prompts.json")
    )

    create_prompt_files(
        dev_examples,
        os.path.join(output_dir, "feverous_dev_prompts.json")
    )

    print("FEVEROUS data preparation complete!")

# Entry point: run the preparation pipeline only when this code executes as
# the top-level program, not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Example loading code
# The prompt files are written with encoding='utf-8', so read them back with
# the same encoding instead of relying on the platform default.
with open("feverous_prepared/feverous_train_prompts.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)["examples"]

# Each record carries the rendered prompt and its gold verdict.
for example in training_data:
    prompt = example["prompt"]
    label = example["verdict"]
    # Use in your training loop

In [None]:
import json
import os
import random
from tqdm import tqdm

def prepare_training_data(input_file, output_file):
    """Convert processed FEVEROUS data to a clean, training-ready format.

    Args:
        input_file: Path to a JSON file with a ``"processed_claims"`` list.
        output_file: Path of the JSON file to write. Its parent directory is
            created when needed; a bare file name (no directory part) is
            written to the current working directory.

    Returns:
        The list of examples that was written. Each example has ``id``,
        ``claim`` and ``verdict`` (``""`` when absent in the input) and
        ``evidence``: a list of evidence sets whose pieces are dicts with
        ``content``, ``type`` and ``source``.
    """
    print(f"Processing {input_file}...")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    claims = data["processed_claims"]
    print(f"Found {len(claims)} claims")

    # Checked in this order; the first marker found in the ID wins.
    type_markers = (
        ("_sentence_", "sentence"),
        ("_cell_", "table_cell"),
        ("_item_", "list_item"),
    )

    training_examples = []

    for claim in tqdm(claims, desc="Processing claims"):
        # Basic claim information
        example = {
            "id": claim.get("id", ""),
            "claim": claim.get("claim", ""),
            "verdict": claim.get("verdict", ""),
            "evidence": []
        }

        # Process all evidence pieces
        for ev_set in claim.get("evidence", []):
            evidence_set = []
            for ev_piece in ev_set:
                ev_id = ev_piece.get("id", "")

                ev_type = next(
                    (label for marker, label in type_markers if marker in ev_id),
                    "other",
                )

                # Source document is the text before the first underscore
                source = ev_id.split("_")[0] if "_" in ev_id else "unknown"

                evidence_set.append({
                    "content": ev_piece.get("content", ""),
                    "type": ev_type,
                    "source": source
                })

            example["evidence"].append(evidence_set)

        training_examples.append(example)

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({"examples": training_examples}, f, indent=2)

    print(f"Saved {len(training_examples)} examples to {output_file}")
    return training_examples

def create_prompt_files(examples, output_file):
    """Create a file of formatted prompts for training.

    Args:
        examples: Iterable of example dicts with ``id``, ``claim``,
            ``verdict`` and ``evidence`` (a list of evidence sets whose
            pieces have ``source``, ``content`` and ``type``).
        output_file: Path of the JSON file to write. Its parent directory is
            created when needed; a bare file name (no directory part) is
            written to the current working directory.
    """
    print("Creating formatted prompts...")

    # Extra tag shown after the source for non-sentence evidence.
    type_tags = {"table_cell": ", table", "list_item": ", list"}

    formatted_examples = []
    for example in tqdm(examples, desc="Formatting prompts"):
        # Evidence pieces are numbered consecutively across all evidence
        # sets, so the running length of the list doubles as the counter.
        all_evidence = []
        for ev_set in example["evidence"]:
            for ev_piece in ev_set:
                source = ev_piece["source"]
                content = ev_piece["content"]
                tag = type_tags.get(ev_piece["type"], "")

                all_evidence.append(
                    f"Evidence {len(all_evidence)+1} (from {source}{tag}): {content}"
                )

        # Create prompt
        prompt = f"""Instruction: Analyze the following claim and evidence to determine if the claim is supported, refuted, or cannot be determined from the evidence provided.

Claim: {example["claim"]}

Evidence:
{chr(10).join(all_evidence)}

Question: Based on the evidence, does the claim appear to be SUPPORTED, REFUTED, or NOT ENOUGH INFORMATION?"""

        formatted_examples.append({
            "id": example["id"],
            "prompt": prompt,
            "verdict": example["verdict"]
        })

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({"examples": formatted_examples}, f, indent=2)

    print(f"Saved {len(formatted_examples)} formatted prompts to {output_file}")

def main(train_file="train_all_processed.json",
         dev_file="dev_all_processed.json",
         output_dir="feverous_prepared"):
    """Prepare the FEVEROUS train and dev splits and their prompt files.

    Called without arguments this uses the original fixed file names.

    Args:
        train_file: Path to the processed train JSON file.
        dev_file: Path to the processed dev JSON file.
        output_dir: Directory that receives ``feverous_train.json``,
            ``feverous_dev.json`` and the two ``*_prompts.json`` files;
            created if missing.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Process training data
    train_examples = prepare_training_data(
        train_file,
        os.path.join(output_dir, "feverous_train.json")
    )

    # Process dev data
    dev_examples = prepare_training_data(
        dev_file,
        os.path.join(output_dir, "feverous_dev.json")
    )

    # Create prompt files
    create_prompt_files(
        train_examples,
        os.path.join(output_dir, "feverous_train_prompts.json")
    )

    create_prompt_files(
        dev_examples,
        os.path.join(output_dir, "feverous_dev_prompts.json")
    )

    print("FEVEROUS data preparation complete!")

# Entry point: run the preparation pipeline only when this code executes as
# the top-level program, not when it is imported as a module.
if __name__ == "__main__":
    main()