In [1]:
import json, os, pandas as pd, numpy as np, csv
import requests
import io
import tarfile
import zipfile
from datasets import load_dataset

### Download datasets into the data folder

In [2]:
#XSUM data
def download_XSUM(seed=None):
    """Fetch the QAGS-XSUM annotations and save them as ``qags_xsum_download.csv``.

    The JSON file is read from the ctc-gen-eval GitHub repository. Its
    ``document``/``summary``/``consistency`` columns are renamed to
    ``grounding``/``generated_text``/``label``, an ``id`` of the form
    ``'qags_xsum<row index>'`` is added, and every row is randomly assigned to
    the ``'val'`` cut (draw below 0.7) or the ``'test'`` cut (otherwise).

    Args:
        seed: Optional seed for the val/test assignment. ``None`` (the default)
            keeps the historical behaviour of drawing from NumPy's global
            random state, so the split differs between runs. Pass an integer
            to obtain the same split on every run.

    Side effects:
        Creates ``<parent of the current working directory>/data`` if needed
        and writes ``qags_xsum_download.csv`` (without the index) into it.
    """
    df = pd.read_json('https://raw.githubusercontent.com/tanyuqian/ctc-gen-eval/master/train/data/qags_xsum.json')

    # One uniform draw per row decides the cut; only a seeded call is reproducible.
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(seed).random(len(df))

    # Row id = dataset name followed by the row index, e.g. 'qags_xsum0'.
    df['id'] = 'qags_xsum' + df.index.astype(str)
    df['cut'] = np.where(draws < 0.7, 'val', 'test')
    df['dataset_origin'] = 'qags_xsum'

    # Map the source column names onto the schema shared by all downloaded CSVs.
    df = df.rename(columns={'document': 'grounding', 'summary': 'generated_text', 'consistency': 'label'})
    xsum_df = df[['id', 'grounding', 'generated_text', 'label', 'cut', 'dataset_origin']]

    # The CSV goes into a 'data' folder next to the directory the notebook runs from.
    save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(save_dir, exist_ok=True)
    xsum_df.to_csv(os.path.join(save_dir, 'qags_xsum_download.csv'), index=False)

# Run the download; writes qags_xsum_download.csv into the 'data' folder under the parent of the working directory.
download_XSUM()

In [3]:
#CNN/DM data
def download_CNNDM(seed=None):
    """Fetch the QAGS-CNN/DM annotations and save them as ``qags_cnndm_download.csv``.

    The JSON file is read from the ctc-gen-eval GitHub repository. ``document``
    and ``summary`` are renamed to ``grounding`` and ``generated_text``, an
    ``id`` of the form ``'qags_cnndm<row index>'`` is added, and ``label`` is
    set to 1 when ``consistency`` equals 1 and to 0 for any other value. Every
    row is randomly assigned to ``'val'`` (draw below 0.7) or ``'test'``.

    Args:
        seed: Optional seed for the val/test assignment. ``None`` (the default)
            keeps the historical behaviour of drawing from NumPy's global
            random state, so the split differs between runs. Pass an integer
            to obtain the same split on every run.

    Side effects:
        Creates ``<parent of the current working directory>/data`` if needed
        and writes ``qags_cnndm_download.csv`` (without the index) into it.
    """
    df = pd.read_json('https://raw.githubusercontent.com/tanyuqian/ctc-gen-eval/master/train/data/qags_cnndm.json')

    # One uniform draw per row decides the cut; only a seeded call is reproducible.
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(seed).random(len(df))

    # Row id = dataset name followed by the row index, e.g. 'qags_cnndm0'.
    df['id'] = 'qags_cnndm' + df.index.astype(str)
    df['cut'] = np.where(draws < 0.7, 'val', 'test')
    df['dataset_origin'] = 'qags_cnndm'

    # Binarise: only a consistency of exactly 1 counts as a positive label.
    df['label'] = (df['consistency'] == 1).astype(int)

    # Map the source column names onto the schema shared by all downloaded CSVs.
    df = df.rename(columns={'document': 'grounding', 'summary': 'generated_text'})
    cnndm_df = df[['id', 'grounding', 'generated_text', 'label', 'cut', 'dataset_origin']]

    # The CSV goes into a 'data' folder next to the directory the notebook runs from.
    save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(save_dir, exist_ok=True)
    cnndm_df.to_csv(os.path.join(save_dir, 'qags_cnndm_download.csv'), index=False)

# Run the download; writes qags_cnndm_download.csv into the 'data' folder under the parent of the working directory.
download_CNNDM()

In [4]:
# Aggre-Fact data
def download_AggreFact():
    """Fetch the AggreFact CSV and save a trimmed copy as ``aggre_fact_download.csv``.

    The source columns ``doc``, ``summary`` and ``dataset`` are renamed to
    ``grounding``, ``generated_text`` and ``dataset_origin``; ``id``, ``label``
    and ``cut`` are taken from the source file as they are. The result is
    written (without the index) into the ``data`` folder under the parent of
    the current working directory, which is created if it does not exist.
    """
    source_url = 'https://raw.githubusercontent.com/Liyan06/AggreFact/refs/heads/main/data/aggre_fact_final.csv'
    kept_columns = ['id', 'grounding', 'generated_text', 'label', 'cut', 'dataset_origin']

    # Read, rename to the shared schema and keep only the shared columns.
    renamed = pd.read_csv(source_url).rename(
        columns={'doc': 'grounding', 'summary': 'generated_text', 'dataset': 'dataset_origin'}
    )
    aggrefact_df = renamed[kept_columns]

    # Make sure the output folder exists before writing the CSV.
    out_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, 'aggre_fact_download.csv')
    aggrefact_df.to_csv(out_file, index=False)
        
# Run the download; writes aggre_fact_download.csv into the 'data' folder under the parent of the working directory.
download_AggreFact()

In [5]:
# True data - download PAWS, FEVER and VitaminC datasets

# PAWS
def download_paws(seed=None):
    """Download the PAWS dev split and save it as ``paws_download.csv``.

    The labelled PAWS-Wiki archive is fetched over HTTP, ``final/dev.tsv`` is
    read from it, ``sentence1``/``sentence2`` are renamed to
    ``grounding``/``generated_text``, and every row is randomly assigned to
    ``'val'`` (draw below 0.7) or ``'test'``. All other columns of the TSV are
    kept unchanged.

    Args:
        seed: Optional seed for the val/test assignment. ``None`` (the default)
            keeps the historical behaviour of drawing from NumPy's global
            random state, so the split differs between runs. Pass an integer
            to obtain the same split on every run.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If ``final/dev.tsv`` exists in the archive but is not a
            regular file.

    Side effects:
        Creates ``<parent of the current working directory>/data`` if needed
        and writes ``paws_download.csv`` (without the index) into it.
    """
    url = 'https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz'

    # The context manager releases the connection even when the request fails.
    # raise_for_status() turns an HTTP error into a clear exception instead of
    # letting an error page reach tarfile as a corrupt archive.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        archive_bytes = response.raw.read()

    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode='r:gz') as tar_file:
        member = tar_file.extractfile('final/dev.tsv')
        if member is None:
            # extractfile() returns None for members that are not regular files.
            raise ValueError("'final/dev.tsv' is not a regular file in the PAWS archive")
        df = pd.read_csv(member, sep='\t')

    df = df.rename(columns={'sentence1': 'grounding', 'sentence2': 'generated_text'})

    # One uniform draw per row decides the cut; only a seeded call is reproducible.
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(seed).random(len(df))
    df['cut'] = np.where(draws < 0.7, 'val', 'test')
    df['dataset_origin'] = 'PAWS'

    # The CSV goes into a 'data' folder next to the directory the notebook runs from.
    save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, 'paws_download.csv')
    df.to_csv(save_path, index=False)
        



# Vitamin C dataset
def download_vitc(seed=None):
    """Download the VitaminC dev split and save it as ``vitc_download.csv``.

    The VitaminC zip archive is fetched over HTTP and ``vitaminc/dev.jsonl``
    is read from it. ``label`` becomes 1 for ``'SUPPORTS'`` and 0 for any other
    value; ``unique_id``/``evidence``/``claim`` are renamed to
    ``id``/``grounding``/``generated_text``; every row is randomly assigned to
    ``'val'`` (draw below 0.7) or ``'test'``.

    Args:
        seed: Optional seed for the val/test assignment. ``None`` (the default)
            keeps the historical behaviour of drawing from NumPy's global
            random state, so the split differs between runs. Pass an integer
            to obtain the same split on every run.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    Side effects:
        Creates ``<parent of the current working directory>/data`` if needed
        and writes ``vitc_download.csv`` (without the index) into it.
    """
    url = 'https://github.com/TalSchuster/talschuster.github.io/raw/master/static/vitaminc.zip'

    # The context manager releases the connection even when the request fails.
    # raise_for_status() turns an HTTP error into a clear exception instead of
    # letting an error page reach zipfile as a corrupt archive.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        archive_bytes = response.raw.read()

    # Only the dev split is used, so it is read straight into one DataFrame.
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as z:
        with z.open('vitaminc/dev.jsonl') as file:
            df = pd.read_json(file, lines=True)

    # One uniform draw per row decides the cut; only a seeded call is reproducible.
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(seed).random(len(df))
    df['cut'] = np.where(draws < 0.7, 'val', 'test')

    # Binarise: 'SUPPORTS' is the positive class, every other label is 0.
    df['label'] = (df['label'] == 'SUPPORTS').astype(int)
    df['dataset_origin'] = 'Vitamin C'

    # Map the source column names onto the schema shared by all downloaded CSVs.
    df = df.rename(columns={'unique_id': 'id', 'evidence': 'grounding', 'claim': 'generated_text'})
    subset_df = df[['id', 'grounding', 'generated_text', 'label', 'cut', 'dataset_origin']]

    # The CSV goes into a 'data' folder next to the directory the notebook runs from.
    save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, 'vitc_download.csv')
    subset_df.to_csv(save_path, index=False)
        
       

    
    
    
# Run both downloads; each writes its CSV (paws_download.csv, vitc_download.csv)
# into the 'data' folder under the parent of the working directory.
download_paws()
download_vitc()


In [6]:
# Download the FEVER NLI zip file into the data folder.

# URL of the Dropbox zip file
url = 'https://www.dropbox.com/scl/fi/nyvxxz7n0hwwuvozmiknb/nli_fever.zip?rlkey=2yfvi7c2cxglyklexhgezioqr&e=1&dl=1'

# Directory where the zip file will be saved ('data' under the parent of the working directory)
save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Path to save the downloaded zip file
save_path = os.path.join(save_dir, 'nli_fever.zip')

# Stream the archive to disk in 8 KiB chunks. The context manager closes the
# streamed response on every path, including the non-200 branch where the body
# is never consumed and the connection would otherwise stay open.
with requests.get(url, stream=True) as response:
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f'File downloaded successfully and saved to {save_path}')
    else:
        # Best effort: report the failure and continue without raising.
        print('Failed to download file')



File downloaded successfully and saved to /storage/coda1/p-dsgt_clef2025/0/kmarturi3/simpletext-2025-controlcreativity/data/nli_fever.zip


In [7]:
# Mapping of FEVER ids to labels using the FEVER dataset from HuggingFace Datasets
def get_fever_labels(data_split = 'labelled_dev'):
    """Return a dict mapping each FEVER example id to its label.

    The examples come from ``load_dataset('fever', 'v1.0', split=data_split)``.
    When an id occurs more than once, the label of its last occurrence is kept.

    Args:
        data_split: Name of the dataset split to load.

    Returns:
        dict: ``{example['id']: example['label']}`` for every example in the split.
    """
    fever_data = load_dataset('fever', 'v1.0', split=data_split)
    return {example['id']: example['label'] for example in fever_data}

def download_fever(seed=None):
    """Build ``fever_download.csv`` from the previously downloaded ``nli_fever.zip``.

    ``nli_fever/dev_fitems.jsonl`` is extracted from ``nli_fever.zip`` (both in
    the ``data`` folder under the parent of the current working directory) and
    loaded. Each row's label is looked up by ``cid`` in the mapping returned by
    ``get_fever_labels('labelled_dev')``: 1 for ``'SUPPORTS'``, 0 for any other
    label. ``cid``/``context``/``query`` are renamed to
    ``id``/``grounding``/``generated_text`` and every row is randomly assigned
    to ``'val'`` (draw below 0.7) or ``'test'``.

    Args:
        seed: Optional seed for the val/test assignment. ``None`` (the default)
            keeps the historical behaviour of drawing from NumPy's global
            random state, so the split differs between runs. Pass an integer
            to obtain the same split on every run.

    Raises:
        FileNotFoundError: If ``nli_fever.zip`` is not present in the data folder.
        KeyError: If a ``cid`` has no entry in the FEVER id-to-label mapping.

    Side effects:
        Extracts ``nli_fever/dev_fitems.jsonl`` into the data folder and writes
        ``fever_download.csv`` (without the index) there.
    """
    save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    zip_file_path = os.path.join(save_dir, 'nli_fever.zip')

    # A single exist_ok call covers both the extraction target and the CSV output.
    os.makedirs(save_dir, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extract('nli_fever/dev_fitems.jsonl', save_dir)

    fever_id_to_label = get_fever_labels('labelled_dev')
    df = pd.read_json(os.path.join(save_dir, 'nli_fever/dev_fitems.jsonl'), lines=True)

    # One uniform draw per row decides the cut; only a seeded call is reproducible.
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(seed).random(len(df))
    df['cut'] = np.where(draws < 0.7, 'val', 'test')

    # Direct dict indexing is deliberate: an unknown cid raises KeyError rather
    # than being silently labelled 0.
    df['label'] = df['cid'].apply(lambda cid: 1 if fever_id_to_label[int(cid)] == 'SUPPORTS' else 0)
    df['dataset_origin'] = 'Fever'

    # Map the source column names onto the schema shared by all downloaded CSVs.
    df = df.rename(columns={'cid': 'id', 'context': 'grounding', 'query': 'generated_text'})
    subset_df = df[['id', 'grounding', 'generated_text', 'label', 'cut', 'dataset_origin']]

    save_path = os.path.join(save_dir, 'fever_download.csv')
    subset_df.to_csv(save_path, index=False)

        
# Build fever_download.csv; requires nli_fever.zip to already be in the data folder.
download_fever()

In [8]:
# Download HaluEval and TofuEval

# Load the dataset from Hugging Face
dataset = load_dataset("achandlr/FactualConsistencyScoresTextSummarization")

# Convert the 'train' split to a pandas DataFrame
df = pd.DataFrame(dataset['train'])

# Map the source column names onto the schema shared by all downloaded CSVs
new_column_names = {'unique_id':'id','context':'grounding', 'summary':'generated_text', 'benchmark_origin':'dataset_origin'}
df = df.rename(columns=new_column_names)

# Define the list of columns to subset
columns_to_subset = ['id', 'grounding', 'generated_text', 'label', 'cut', 'dataset_origin']

# Create the subset DataFrame
subset_df = df[columns_to_subset]

# HaluEval dataset
# reset_index() returns a new frame, so the assignment below never writes into a slice of subset_df.
df_halu_eval = subset_df[subset_df['dataset_origin'] == 'HaluEval'].reset_index(drop=True)
# Randomly assign 70% of the rows to 'val' and 30% to 'test'.
# Both values are written explicitly; previously only the 'val' rows were set
# and the rest kept whatever 'cut' value the source dataset carried.
halu_mask = np.random.rand(len(df_halu_eval)) < 0.7
df_halu_eval['cut'] = np.where(halu_mask, 'val', 'test')

# TofuEval dataset
df_tofu_eval = subset_df[subset_df['dataset_origin'] == 'TofuEval'].reset_index(drop=True)
# Randomly assign 70% of the rows to 'val' and 30% to 'test'
tofu_mask = np.random.rand(len(df_tofu_eval)) < 0.7
df_tofu_eval['cut'] = np.where(tofu_mask, 'val', 'test')

# Directory where the CSV files will be saved ('data' under the parent of the working directory)
save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Save datasets as CSV files
df_halu_eval.to_csv(os.path.join(save_dir, 'halueval_download.csv'), index=False)
df_tofu_eval.to_csv(os.path.join(save_dir, 'tofueval_download.csv'), index=False)

In [9]:
#samsum data
def download_samsum(seed=None):
    """Fetch the GoFigure SAMSum annotations and save them as ``samsum_download.csv``.

    The JSONL file is read from the GoFigure GitHub repository. ``label``
    becomes 1 for ``'factual'`` and 0 for any other value; ``article`` and
    ``summary`` are renamed to ``grounding`` and ``generated_text``; an ``id``
    of the form ``'samsum<row index>'`` is added; every row is randomly
    assigned to ``'val'`` (draw below 0.7) or ``'test'``.

    Args:
        seed: Optional seed for the val/test assignment. ``None`` (the default)
            keeps the historical behaviour of drawing from NumPy's global
            random state, so the split differs between runs. Pass an integer
            to obtain the same split on every run.

    Side effects:
        Creates ``<parent of the current working directory>/data`` if needed
        and writes ``samsum_download.csv`` (without the index) into it.
    """
    # URL of the raw JSONL file on GitHub
    jsonl_url = 'https://raw.githubusercontent.com/skgabriel/GoFigure/main/human_eval/samsum.jsonl'
    df = pd.read_json(jsonl_url, lines=True)

    # One uniform draw per row decides the cut; only a seeded call is reproducible.
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(seed).random(len(df))

    # Row id = dataset name followed by the row index, e.g. 'samsum0'.
    df['id'] = 'samsum' + df.index.astype(str)
    df['cut'] = np.where(draws < 0.7, 'val', 'test')
    df['dataset_origin'] = 'samsum'

    # Binarise the textual label: 'factual' is the positive class.
    df['label'] = (df['label'] == 'factual').astype(int)

    # Map the source column names onto the schema shared by all downloaded CSVs.
    df = df.rename(columns={'article': 'grounding', 'summary': 'generated_text'})
    samsum_df = df[['id', 'grounding', 'generated_text', 'label', 'cut', 'dataset_origin']]

    # The CSV goes into a 'data' folder next to the directory the notebook runs from.
    save_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(save_dir, exist_ok=True)
    samsum_df.to_csv(os.path.join(save_dir, 'samsum_download.csv'), index=False)

# Run the download; writes samsum_download.csv into the 'data' folder under the parent of the working directory.
download_samsum()