In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import json

# Load the Climate Fever dataset (JSON Lines: one JSON object per line)
file_path = "climate-fever-dataset-r1.jsonl"
climate_data = []

# Decode explicitly as UTF-8 instead of relying on the platform default, and skip
# blank lines (e.g. a trailing newline) that would otherwise crash json.loads.
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            climate_data.append(json.loads(line))

# Build one {claim, context, answer} record per claim
contextual_data = []
for entry in climate_data:
    claim = entry["claim"]
    evidences = entry.get("evidences", [])

    # Prefer evidence labelled SUPPORTS; if there is none, fall back to all evidence
    supporting = [ev["evidence"] for ev in evidences if ev["evidence_label"] == "SUPPORTS"]
    evidence_text = " ".join(supporting) or " ".join(ev["evidence"] for ev in evidences)

    # NOTE(review): context and answer are always the same text, so every record asks
    # for the context to be repeated as the answer — confirm this is intended.
    context = evidence_text
    answer = evidence_text

    # Claims without any evidence text are dropped
    if evidence_text:
        contextual_data.append({"claim": claim, "context": context, "answer": answer})

# Save the contextualized data with answers in JSON format
with open("climate_contextual_data.json", "w", encoding="utf-8") as f:
    json.dump(contextual_data, f, indent=4)

print(f"Generated {len(contextual_data)} contextualized claims with answers.")

# Generate a text file for GPT training
output_file = "climate_context_qna.txt"

# The evidence text is written verbatim, so the encoding is fixed to UTF-8 to keep the
# output independent of the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    for entry in contextual_data:
        claim = entry["claim"]
        context = entry["context"]
        answer = entry["answer"]
        # One record = context, question and answer lines followed by a '---' separator
        formatted_entry = f"Context: {context}\nQ: {claim}\nA: {answer}\n---\n"
        f.write(formatted_entry)

print(f"Context, question, and answer pairs saved to '{output_file}'.")


Generated 1535 contextualized claims with answers.
Context, question, and answer pairs saved to 'climate_context_qna.txt'.


In [5]:
from datasets import load_dataset

# Load the "rexarski/climate_fever_fixed" dataset via the Hugging Face `datasets` loader
ds = load_dataset(path="rexarski/climate_fever_fixed")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

(…)-00000-of-00001-d2bb9fbd7d385150.parquet:   0%|          | 0.00/763k [00:00<?, ?B/s]

(…)-00000-of-00001-33faf1a72443ee69.parquet:   0%|          | 0.00/279k [00:00<?, ?B/s]

(…)-00000-of-00001-1823c907d6d1891b.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4298 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1535 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/1842 [00:00<?, ? examples/s]

In [6]:
# Bare expression: the notebook displays the DatasetDict repr (splits, features, row counts)
ds

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 4298
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 1535
    })
    valid: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 1842
    })
})

In [8]:
from datasets import DatasetDict, concatenate_datasets

# `ds` is the DatasetDict loaded in an earlier cell, with train / test / valid splits.

# Fold the original test split into the training data
combined_train = concatenate_datasets([ds[split] for split in ("train", "test")])

# Re-key the splits: the merged data becomes 'train', the old 'valid' split becomes 'test'
dataset = DatasetDict()
dataset["train"] = combined_train
dataset["test"] = ds["valid"]

# Show the resulting split layout
print(dataset)

# Write the re-split dataset to the 'combined_climate_dataset' directory
dataset.save_to_disk("combined_climate_dataset")

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 5833
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 1842
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5833 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1842 [00:00<?, ? examples/s]

In [9]:
from datasets import DatasetDict, concatenate_datasets

# Assuming 'dataset' is the combined Hugging Face DatasetDict
# Structure: train and test splits

# Function to create the text format
def format_context_qa(example):
    """
    Render one dataset row as a "Context / Q / A" text record.

    The row's "evidence" field fills both the Context and the A slots, and its
    "claim" field is used as the question. The record is terminated by a '---'
    separator line.
    """
    evidence_text = example["evidence"]
    record_lines = [
        f"Context: {evidence_text}",
        f"Q: {example['claim']}",
        f"A: {evidence_text}",
        "---",
    ]
    return "\n".join(record_lines) + "\n"

# Write every example of both splits (train first, then test) to one text file.
# The encoding is fixed to UTF-8 so the output does not depend on the platform default.
output_file = "climate_context_qna_2.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for split_name in ("train", "test"):
        for example in dataset[split_name]:
            f.write(format_context_qa(example))

print(f"Context-QA pairs saved to '{output_file}'.")

Context-QA pairs saved to 'climate_context_qna_2.txt'.


In [10]:
import os

def combine_text_files(file1, file2, output_file):
    """Concatenate two text files into a single output file.

    The files are copied as raw bytes, file1 first and then file2, so the result
    does not depend on the platform's default text encoding, and the inputs are
    streamed instead of being read into memory in full.

    Args:
        file1: Path of the file whose content comes first.
        file2: Path of the file whose content is appended after file1.
        output_file: Path of the file to write (truncated if it already exists).

    Returns:
        None. A status line is printed on success; on FileNotFoundError an error
        line is printed instead of raising.
    """
    import shutil  # local import keeps the helper usable from whichever cell defines it

    try:
        # Inputs are opened before the output, so a missing input never leaves
        # behind an empty or truncated output file.
        with open(file1, 'rb') as f1, open(file2, 'rb') as f2, open(output_file, 'wb') as outfile:
            for source in (f1, f2):
                shutil.copyfileobj(source, outfile)
        print(f"Successfully combined '{file1}' and '{file2}' into '{output_file}'.")
    except FileNotFoundError:
        print("Error: One or both input files not found.")

# Merge the two climate Q&A text files produced earlier into one file
combine_text_files(
    file1="climate_context_qna.txt",
    file2="climate_context_qna_2.txt",
    output_file="combined_climate_data.txt",
)

Successfully combined 'climate_context_qna.txt' and 'climate_context_qna_2.txt' into 'combined_climate_data.txt'.


In [14]:
from datasets import load_dataset, concatenate_datasets
import re

# Load the SQuAD 2.0 dataset ("squad_v2") with the Hugging Face `datasets` loader
squad_dataset = load_dataset(path="squad_v2")

# Terms that mark a context or question as IPCC-related
keywords = [
    "IPCC",
    "Intergovernmental Panel on Climate Change",
    "climate report",
    "climate assessment",
]

# Function to filter examples based on keywords
def filter_ipcc_reports(example, terms=None):
    """Return True when the example's context or question mentions any search term.

    Matching is a case-insensitive substring search. Each term is escaped before
    being handed to the regex engine, so terms are always matched literally even
    if they contain regex metacharacters such as '.', '(' or '+'.

    Args:
        example: Mapping with string fields "context" and "question".
        terms: Optional iterable of search strings. When omitted, the module-level
            `keywords` list is used.

    Returns:
        bool: True if at least one term occurs in the context or the question.
    """
    if terms is None:
        terms = keywords
    searched_texts = (example["context"], example["question"])
    return any(
        re.search(re.escape(term), text, re.IGNORECASE)
        for term in terms
        for text in searched_texts
    )

# Keep only the rows of each SQuAD split that filter_ipcc_reports accepts
ipcc_train = squad_dataset["train"].filter(filter_ipcc_reports)
ipcc_validation = squad_dataset["validation"].filter(filter_ipcc_reports)

# Merge both filtered splits into a single dataset.
# concatenate_datasets is already imported at the top of this cell, so it is not
# imported a second time here.
filtered_ipcc = concatenate_datasets([ipcc_train, ipcc_validation])

# Function to format context, question, and answer into the desired text format
def format_context_qna(example):
    """Render one example as a "Context / Q / A" text record ending in a '---' line.

    The answer is the first entry of example["answers"]["text"]; when that list is
    empty the placeholder "No answer provided" is written instead.
    """
    context = example["context"]
    question = example["question"]

    candidate_answers = example["answers"]["text"]
    if candidate_answers:
        answer = candidate_answers[0]
    else:
        answer = "No answer provided"

    record_lines = [f"Context: {context}", f"Q: {question}", f"A: {answer}", "---"]
    return "\n".join(record_lines) + "\n"

# Write the formatted records to a text file.
# The encoding is fixed to UTF-8 so the output does not depend on the platform default.
output_file = "ipcc_context_qna.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for example in filtered_ipcc:
        f.write(format_context_qna(example))

print(f"Filtered Context-QA pairs saved to '{output_file}'.")

Filtered Context-QA pairs saved to 'ipcc_context_qna.txt'.


In [15]:
def combine_text_files(file1, file2, output_file):
    """Concatenate two text files into a single output file.

    The files are copied as raw bytes, file1 first and then file2, so the result
    does not depend on the platform's default text encoding, and the inputs are
    streamed instead of being read into memory in full.

    Args:
        file1: Path of the file whose content comes first.
        file2: Path of the file whose content is appended after file1.
        output_file: Path of the file to write (truncated if it already exists).

    Returns:
        None. A status line is printed on success; on FileNotFoundError an error
        line is printed instead of raising.
    """
    import shutil  # local import keeps the helper usable from whichever cell defines it

    try:
        # Inputs are opened before the output, so a missing input never leaves
        # behind an empty or truncated output file.
        with open(file1, 'rb') as f1, open(file2, 'rb') as f2, open(output_file, 'wb') as outfile:
            for source in (f1, f2):
                shutil.copyfileobj(source, outfile)
        print(f"Successfully combined '{file1}' and '{file2}' into '{output_file}'.")
    except FileNotFoundError:
        print("Error: One or both input files not found.")

# Append the IPCC-filtered SQuAD records to the combined climate file
combine_text_files(
    file1="combined_climate_data.txt",
    file2="ipcc_context_qna.txt",
    output_file="final_combined_data.txt",
)

Successfully combined 'combined_climate_data.txt' and 'ipcc_context_qna.txt' into 'final_combined_data.txt'.
