In [10]:
import datasets
import os
from datasets import load_dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
def resplit_and_save(dataset_name, input_path, output_path):
    """
    Resplits a dataset and saves it in Parquet format.

    Args:
        dataset_name (str): Name of the dataset.
        input_path (str): Path to the input dataset files.
        output_path (str): Path to save the resplit datasets.

    Returns:
        None
    """
    # Ensure the output directory exists
    dataset_output_path = f'{output_path}/{dataset_name}'
    os.makedirs(dataset_output_path, exist_ok=True)

    # Load the dataset
    dataset_dict = load_dataset('parquet', data_files={
        'dev': f'{input_path}/{dataset_name}/dev-00000-of-00001.parquet',
        'test': f'{input_path}/{dataset_name}/test-00000-of-00001.parquet',
        'validation': f'{input_path}/{dataset_name}/validation-00000-of-00001.parquet'
    })

    # Step 1: Combine all rows from dev, validation, and test into one dataset
    combined = concatenate_datasets([
        dataset_dict['dev'],
        dataset_dict['validation'],
        dataset_dict['test']
    ])

    # Step 2: Extract up to 16 rows for the new dev set
    num_dev_rows = min(16, len(combined))
    new_dev = combined.select(range(num_dev_rows))

    # Step 3: Use the remaining rows for validation and test splits
    if len(combined) > num_dev_rows:
        remaining = combined.select(range(num_dev_rows, len(combined)))
        split = remaining.train_test_split(test_size=0.1, seed=42)  # Set seed for reproducibility
        new_validation = split['train']
        new_test = split['test']
    else:
        # If there are fewer than 16 rows, copy the dev set to validation and test
        new_validation = new_dev
        new_test = new_dev

    # Step 4: Save the new DatasetDict in Parquet format
    new_dev.to_parquet(f'{dataset_output_path}/dev-00000-of-00001.parquet')
    new_validation.to_parquet(f'{dataset_output_path}/validation-00000-of-00001.parquet')
    new_test.to_parquet(f'{dataset_output_path}/test-00000-of-00001.parquet')

    print(f"Dataset resplit and saved to {dataset_output_path}/")


In [11]:
dataset_names = os.listdir('/workdir/AGRIVQA')
input_path = '/workdir/AGRIVQA'
output_path = '/workdir/important_datasets/AGRIVQA_light'

for dataset_name in dataset_names:
    resplit_and_save(dataset_name, input_path, output_path)

NameError: name 'resplit_and_save' is not defined

In [26]:
input_path = '/workdir/important_datasets/AGRIVQA_light'
dataset = {}
for dataset_name in dataset_names:
    dataset[dataset_name] = load_dataset('parquet', data_files={
        'dev': f'{input_path}/{dataset_name}/dev-00000-of-00001.parquet',
        'test': f'{input_path}/{dataset_name}/test-00000-of-00001.parquet',
        'validation': f'{input_path}/{dataset_name}/validation-00000-of-00001.parquet'
    })

In [27]:
dataset

{'Identification': DatasetDict({
     dev: Dataset({
         features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
         num_rows: 16
     })
     test: Dataset({
         features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
         num_rows: 9
     })
     validation: Dataset({
         features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
         num_rows: 75
     })
 }),
 'AgriExam': DatasetDict({
     dev: Dataset({
         features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_d

In [22]:
for dataset_name in dataset_names:
    dataset[dataset_name]

KeyError: 'Identification'