In [1]:
from datasets import Features, Value, Sequence, Image, DatasetInfo, Dataset, DatasetDict
from tqdm import tqdm
import os
from datasets import load_dataset, concatenate_datasets
import json
import re
from copy import deepcopy

import asyncio
import math
import numpy as np
import random
import pandas as pd
from ollama import AsyncClient

from random import sample
from collections import Counter
from openai import OpenAI

from workdir.utils import save_json, load_json

from random import shuffle, sample
from workdir.utils import load_dataset_dict

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# def load_dataset_dict(folder_path, concat=False): # TODO check that work with multiple files
#     dataset_name = "parquet"
#     data_files = {
#         "dev": f"{folder_path}/dev-0*.parquet",
#         "test": f"{folder_path}/test-*.parquet",
#         "validation": f"{folder_path}/validation-*.parquet"
#     }
#     dataset = load_dataset(dataset_name,data_files=data_files)
#     if concat:
#         return concatenate_datasets([dataset['dev'], dataset['test'], dataset['validation']])
#     return dataset

### Split and Save utils

In [2]:
RANDOM_SEED = 420


def split_dataset(dataset, dev_count=16, test_ratio=0.8, validation_ratio=None, random_seed=RANDOM_SEED):
    """
    Carve a Hugging Face dataset into 'dev', 'test' and 'validation' splits.

    The dataset is shuffled, the first `dev_count` rows become 'dev', and the
    remaining rows are divided between 'test' and 'validation' according to
    the two ratios. When exactly one ratio is None it is set to 1 minus the
    other one.

    Args:
        dataset (Dataset): Hugging Face Dataset to split.
        dev_count (int): Number of rows placed in the dev split.
        test_ratio (float, optional): Share of the non-dev rows for 'test'.
        validation_ratio (float, optional): Share of the non-dev rows for
            'validation'. Both ratios together must sum to 1.
        random_seed (int): Seed passed to both `shuffle` and `train_test_split`.

    Returns:
        DatasetDict: The 'dev', 'test' and 'validation' splits.

    Raises:
        ValueError: If both ratios are None, a ratio lies outside [0, 1], the
            ratios do not sum to 1, or `dev_count` is not smaller than the
            number of rows.
    """
    # Fill in whichever ratio was left out.
    if test_ratio is None:
        if validation_ratio is None:
            raise ValueError("At least one of test_ratio or validation_ratio must be provided.")
        test_ratio = 1 - validation_ratio
    elif validation_ratio is None:
        validation_ratio = 1 - test_ratio

    # Sanity-check the ratios before touching the data.
    for ratio in (test_ratio, validation_ratio):
        if ratio < 0 or ratio > 1:
            raise ValueError("Ratios must be between 0 and 1.")
    # Rounding absorbs floating point noise such as 0.8 + 0.19999999999999996.
    if round(test_ratio + validation_ratio, 6) != 1:
        raise ValueError("test_ratio and validation_ratio must add up to 1.")

    shuffled = dataset.shuffle(seed=random_seed)
    row_count = len(shuffled)

    # The dev split must leave at least one row for the other two splits.
    if not dev_count < row_count:
        raise ValueError("dev_count is too large for the dataset size.")

    dev_rows = shuffled.select(range(dev_count))
    other_rows = shuffled.select(range(dev_count, row_count))

    # train_test_split names its parts 'train'/'test'; here 'train' holds the
    # (1 - validation_ratio) share and 'test' holds the validation_ratio share.
    parts = other_rows.train_test_split(test_size=validation_ratio, seed=random_seed)

    return DatasetDict({
        "dev": dev_rows,
        "test": parts['train'],
        "validation": parts['test'],
    })
    
    
            
def rename_split(split_dict, dataset_name=None):
    """
    Give every row of every split an id of the form "{split}_{name}_{n}".

    Args:
        split_dict: Mapping of split name to a dataset exposing
            `map(function, with_indices=True)`.
        dataset_name: Name used in the new id. When it is falsy, each row's
            own current 'id' value is used instead.

    Returns:
        dict: Split name mapped to the dataset returned by `map`.
    """
    def make_renamer(split_name):
        # One renamer per split so the split name is fixed for that split.
        def renamer(row, index):
            label = dataset_name if dataset_name else row['id']
            # Row numbers in the id start at 1, not 0.
            row['id'] = f"{split_name}_{label}_{index+1}"
            return row
        return renamer

    return {
        split_name: split_data.map(make_renamer(split_name), with_indices=True)
        for split_name, split_data in split_dict.items()
    }
        
        
            
def split_and_save(all_rows,
                   root_folder = '/workdir/important_datasets/AGRIVQA/',
                   max_len = None,
                   dataset_name = None,
                   dev_count = 16, test_ratio = 0.8, validation_ratio = None,
                   random_seed = RANDOM_SEED):
    """
    Split a list of row dicts into dev/test/validation and write Parquet files.

    The rows are turned into a Dataset, split with `split_dataset`, re-identified
    with `rename_split`, and every split is written under
    `{root_folder}/{dataset_name}/` as
    `{split}-{file index}-of-{file count}.parquet` (both numbers zero-padded to
    five digits).

    Args:
        all_rows (list[dict]): Rows to save; must not be empty.
        root_folder (str): Folder under which the dataset folder is created.
        max_len (int, optional): Maximum number of rows per file. None (or 0)
            writes each split as a single file.
        dataset_name (str, optional): Folder name and id component. Defaults to
            the 'id' of the first row.
        dev_count, test_ratio, validation_ratio, random_seed: Forwarded to
            `split_dataset`.

    Raises:
        ValueError: If `all_rows` is empty or `max_len` is negative, plus
            anything raised by `split_dataset`.
    """
    if len(all_rows) == 0:
        raise ValueError("all_rows is empty; nothing to split and save.")
    if max_len is not None and max_len < 0:
        raise ValueError("max_len must be a positive integer or None.")

    if not dataset_name:
        dataset_name = all_rows[0]['id']

    full_dataset = Dataset.from_list(all_rows)

    split_dict = split_dataset(full_dataset, dev_count=dev_count, test_ratio=test_ratio, validation_ratio=validation_ratio, random_seed=random_seed)

    split_dict = rename_split(split_dict, dataset_name=dataset_name)

    # Every file of every split goes into the same folder; create it once.
    output_dir = os.path.join(root_folder, dataset_name)
    os.makedirs(output_dir, exist_ok=True)

    for split, split_data in split_dict.items():
        total_rows = len(split_data)
        if total_rows == 0:
            # A zero chunk size would make range() raise; there is nothing to write.
            print(f'Skipping empty split: {split}')
            continue

        chunk_size = max_len if max_len else total_rows

        # Ceiling division: an exact multiple of chunk_size must not count an
        # extra, non-existent file.
        num_files = (total_rows + chunk_size - 1) // chunk_size

        for file_index, start in enumerate(range(0, total_rows, chunk_size)):
            # Select a batch of rows from the split
            chunk = split_data.select(range(start, min(start + chunk_size, total_rows)))

            # Create the file name with the appropriate split and chunk indices
            file_name = f'{split}-{str(file_index).zfill(5)}-of-{str(num_files).zfill(5)}.parquet'
            file_path = os.path.join(output_dir, file_name)

            # Save the chunk to a Parquet file
            print(f'Saving dataset file: {file_path}')
            chunk.to_parquet(file_path)

    

### AgriExam


In [None]:
# Row template for the AgriExam dataset. The "{...}" strings are placeholders
# that show which value belongs in each field.
AgriExam_row = dict(
    id="AgriExam",
    question="{question}",
    # Options are expected to be a list of strings.
    options=["{option_1}", "{option_2}", "{option_3}", "{option_4}"],
    answer="{answer}",
    category="{category}",
    question_type="{question_type}",
    metadata=dict(
        source="{source}",
        license="{license}",
        url="{url}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
)

In [62]:
# Build one AgriExam row per question of the category dictionary.
AgriExam_rows = []

# Mapping of category name -> list of question records.
AgriExam_data = load_json('/workdir/agriexam_category_dictionary.json')

for category, questions in AgriExam_data.items():

    for q in tqdm(questions):
        # 'options' and 'metadata' are stored as strings and parsed here.
        # NOTE(review): eval() executes whatever the file contains; if these
        # strings are always plain literals, ast.literal_eval would be safer.
        options = eval(q['options']) or []
        metadata = eval(q['metadata'])

        # Start from a fresh copy so the shared template is never modified and
        # re-running this cell always starts from the same state.
        row = deepcopy(AgriExam_row)
        row['id'] = "AgriExam"
        row['question'] = q['question']
        row['options'] = options
        row['answer'] = q['answer']
        row['category'] = category

        # A question with no options is treated as open-ended.
        row['question_type'] = 'multiple-choice' if options else 'open-ended'

        for key in ('source', 'license', 'url', 'language', 'verbose_answer'):
            row['metadata'][key] = metadata[key]

        AgriExam_rows.append(row)

100%|██████████| 79/79 [00:00<00:00, 19975.28it/s]
100%|██████████| 192/192 [00:00<00:00, 20349.89it/s]
  0%|          | 0/347 [00:00<?, ?it/s]

100%|██████████| 347/347 [00:00<00:00, 20167.16it/s]
100%|██████████| 74/74 [00:00<00:00, 20200.36it/s]
100%|██████████| 317/317 [00:00<00:00, 19552.86it/s]
100%|██████████| 745/745 [00:00<00:00, 20121.81it/s]
100%|██████████| 155/155 [00:00<00:00, 19983.93it/s]
100%|██████████| 114/114 [00:00<00:00, 20118.26it/s]
100%|██████████| 30/30 [00:00<00:00, 19750.29it/s]
100%|██████████| 221/221 [00:00<00:00, 20311.62it/s]
100%|██████████| 37/37 [00:00<00:00, 18554.43it/s]
100%|██████████| 67/67 [00:00<00:00, 20510.79it/s]
100%|██████████| 73/73 [00:00<00:00, 20286.50it/s]
100%|██████████| 99/99 [00:00<00:00, 19480.02it/s]
100%|██████████| 16/16 [00:00<00:00, 18406.16it/s]
100%|██████████| 31/31 [00:00<00:00, 18819.43it/s]
100%|██████████| 166/166 [00:00<00:00, 18742.72it/s]
100%|██████████| 50/50 [00:00<00:00, 19686.02it/s]
100%|██████████| 88/88 [00:00<00:00, 12275.88it/s]
100%|██████████| 388/388 [00:00<00:00, 19717.10it/s]
100%|██████████| 66/66 [00:00<00:00, 19889.64it/s]
100%|██████████

#### Save and check

In [None]:
# Split the AgriExam rows, write them as Parquet, then load them back to check.
dataset_name = 'AgriExam'
root_folder = '/workdir/AGRIVQA/'
# Pass dataset_name explicitly so the folder written here is the folder read
# back below, instead of depending on the 'id' of the first row.
split_and_save(AgriExam_rows, root_folder=root_folder, dataset_name=dataset_name)

dataset = load_dataset_dict(os.path.join(root_folder, dataset_name))
dataset

Map: 100%|██████████| 16/16 [00:00<00:00, 2103.00 examples/s]
Map: 100%|██████████| 16160/16160 [00:02<00:00, 7731.92 examples/s] 
Map: 100%|██████████| 4041/4041 [00:00<00:00, 11189.11 examples/s]


Saving dataset file: /workdir/AGRIVQA/Agri500P/dev-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1058.37ba/s]


Saving dataset file: /workdir/AGRIVQA/Agri500P/test-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 17/17 [00:00<00:00, 285.08ba/s]


Saving dataset file: /workdir/AGRIVQA/Agri500P/validation-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 347.60ba/s]
Generating dev split: 16 examples [00:00, 4116.10 examples/s]
Generating test split: 16160 examples [00:00, 178046.87 examples/s]
Generating validation split: 4041 examples [00:00, 152566.14 examples/s]


### Agri500P

In [None]:
# Row template for the Agri500P dataset. The "{...}" strings are placeholders
# that show which value belongs in each field.
Agri500P_row = dict(
    id="Agri500P",
    question="{question}",
    answer="{answer}",
    context="{context}",
    category="{category}",  # Example: Plant Science, Pests, Taxonomy
    question_type="{question_type}",  # Example: multiple choice, open ended
    metadata=dict(
        source="{source}",
        license="{license}",
        language="{language}",
        book_title="{book_title}",
        chapter_title="{chapter_title}",
    ),
)

In [70]:
# Read the three existing 500P Parquet splits and merge them into one dataset.
task_name = '500P'

# "parquet" is the loader name handed to load_dataset, not a dataset folder.
dataset_name = "parquet"
data_files = {
    split_name: f"/workdir/important_datasets/AGRIVQA/{task_name}/{split_name}-00000-of-00001.parquet"
    for split_name in ("dev", "test", "validation")
}

dataset = load_dataset(dataset_name, data_files=data_files)

Agri500P_dataset = concatenate_datasets(
    [dataset[split_name] for split_name in ("dev", "test", "validation")]
)

In [74]:
# Build one Agri500P row per question of the merged 500P dataset.
Agri500P_rows = []

# The JSON maps Portuguese book titles to English ones; invert it so the
# English title found in the metadata can be turned back into Portuguese.
pt_en_title = load_json('/workdir/translate_title.json')
en_pt_title = {en: pt for pt, en in pt_en_title.items()}

# The JSON maps each category to a list of "book: chapter" themes; flatten it
# into theme -> category.
Agri500P_category = load_json('/workdir/500P_categories.json')
theme_category_dict = {theme: category for category, theme_list in Agri500P_category.items() for theme in theme_list}


for q in tqdm(Agri500P_dataset):
    # 'metadata' is stored as a string and parsed here.
    # NOTE(review): eval() executes whatever the string contains; if it is
    # always a plain literal, ast.literal_eval would be safer.
    metadata = eval(q['metadata'])

    # Normalise the titles so they match the keys of theme_category_dict
    # and en_pt_title.
    book_title = metadata['book_title'].replace(' – The Producer Asks, Embrapa Answers', 's')
    chapter_title = metadata['chapter_title'].replace('Strawberry production in greenhouses is a more common topic, but I assume you meant... ', '')

    category = theme_category_dict[f"{book_title}: {chapter_title}"]

    # Start from a fresh copy so the shared template is never modified.
    row = deepcopy(Agri500P_row)
    row['id'] = "Agri500P"
    row['question'] = q['question']
    row['answer'] = q['answer']
    row['category'] = category
    row['context'] = f"This question is sourced from the book titled '{book_title}', specifically found in the chapter '{chapter_title}'."

    # Agri500P rows have no 'options' field, so every question is open-ended.
    # This must not depend on an `options` variable left over from another cell.
    row['question_type'] = 'open-ended'

    row['metadata']['source'] = '500 Perguntas e 500 Respostas: ' + en_pt_title[book_title]
    row['metadata']['license'] = metadata['license']
    row['metadata']['language'] = metadata['language']
    # The metadata keeps the original, un-normalised titles.
    row['metadata']['book_title'] = metadata['book_title']
    row['metadata']['chapter_title'] = metadata['chapter_title']

    Agri500P_rows.append(row)

100%|██████████| 20217/20217 [00:04<00:00, 4957.55it/s]


#### Save and check

In [81]:
# Split the Agri500P rows, write them as Parquet, then load them back to check.
dataset_name = 'Agri500P'
root_folder = '/workdir/AGRIVQA/'
# Pass dataset_name explicitly so the folder written here is the folder read
# back below, instead of depending on the 'id' of the first row.
split_and_save(Agri500P_rows, root_folder=root_folder, dataset_name=dataset_name)

dataset = load_dataset_dict(os.path.join(root_folder, dataset_name))
dataset

Map: 100%|██████████| 16/16 [00:00<00:00, 2395.89 examples/s]
Map: 100%|██████████| 16160/16160 [00:01<00:00, 10164.16 examples/s]
Map: 100%|██████████| 4041/4041 [00:00<00:00, 7711.41 examples/s]


Saving dataset file: /workdir/AGRIVQA/Agri500P/dev-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 842.91ba/s]


Saving dataset file: /workdir/AGRIVQA/Agri500P/test-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 17/17 [00:00<00:00, 296.62ba/s]


Saving dataset file: /workdir/AGRIVQA/Agri500P/validation-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 359.79ba/s]
Generating dev split: 16 examples [00:00, 2684.68 examples/s]
Generating test split: 16160 examples [00:00, 181283.62 examples/s]
Generating validation split: 4041 examples [00:00, 215235.91 examples/s]


DatasetDict({
    dev: Dataset({
        features: ['id', 'question', 'answer', 'context', 'category', 'question_type', 'metadata'],
        num_rows: 16
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'context', 'category', 'question_type', 'metadata'],
        num_rows: 16160
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'context', 'category', 'question_type', 'metadata'],
        num_rows: 4041
    })
})

### EPPO



In [76]:
# Read the three existing EPPO Parquet splits and merge them into one dataset.
task_name = 'EPPO'

# "parquet" is the loader name handed to load_dataset, not a dataset folder.
dataset_name = "parquet"
data_files = {
    split_name: f"/workdir/important_datasets/AGRIVQA/{task_name}/{split_name}-00000-of-00001.parquet"
    for split_name in ("dev", "test", "validation")
}

dataset = load_dataset(dataset_name, data_files=data_files)

EPPO_dataset = concatenate_datasets(
    [dataset[split_name] for split_name in ("dev", "test", "validation")]
)


In [78]:
# Question template tag -> category assigned to the row.
template_dict = dict(
    common_name='Common Name',
    damage_cause='Taxonomy',
    genus_name='Taxonomy',
    growth_stage='Growth Stage',
    scientific_name='Taxonomy',
    weed_identification='Taxonomy',
)

# Kingdom names; each one has its own "<kingdom>_EPPO_to_GBIF.json" file.
kingdoms = ['Archaea', 'Bacteria', 'Chromista', 'Viruses_and_viroids',
            'Fungi', 'Protista', 'Plantae', 'Animalia']

# Kingdom -> list() of the loaded JSON content of that kingdom's file.
kingdom_eppo_dict = dict(
    (k, list(load_json(f'/workdir/important_datasets/EPPO_to_GBIF/{k}_EPPO_to_GBIF.json')))
    for k in kingdoms
)

In [79]:
# Row template for the EPPO dataset. The "{...}" strings are placeholders that
# show which value belongs in each field.
EPPO_row = dict(
    id="EPPO",
    question="{question}",
    options=["{option_1}", "{option_2}", "{option_3}", "{option_4}"],  # List of options
    answer="{answer}",
    image="{image_path}",  # Path to the image
    taxon_rank=0,  # Integer
    options_difficulty=0,  # Integer
    kingdom="{kingdom}",  # Example: Plantae, Animalia
    category="{category}",  # Example: Taxonomy, Growth Stage, Common Name
    question_template="{question_template}",  # Example: scientific_name, common_name
    question_type="{question_type}",  # Example: multiple choice, open ended
    metadata=dict(
        source="{source}",
        license="{license}",
        image_url="{image_url}",
        language="{language}",
        verbose_answer="{verbose_answer}",
        eppo_code="{eppo_code}",
        gbif_key="{gbif_key}",
        common_name_language="{common_name_language}",
    ),
)

In [80]:
# Build one EPPO row per question of EPPO_dataset.
EPPO_rows = []

# Answer letters, in the same order as the entries of 'options'.
OPTIONS = ['A','B','C','D','E']

for q in tqdm(EPPO_dataset):
    # 'options' and 'metadata' are stored as strings and parsed here.
    # NOTE(review): eval() executes whatever the string contains; if these are
    # always plain literals, ast.literal_eval would be safer.
    options = eval(q['options']) or []
    metadata = eval(q['metadata'])
    kingdom = metadata['kingdom']

    # Copy the template first and fill the copy: the shared template stays
    # untouched and the image is stored as-is instead of being deep-copied
    # together with the finished row.
    row = deepcopy(EPPO_row)
    row['id'] = "EPPO"
    row['question'] = q['question']
    row['answer'] = q['answer']
    row['options'] = options
    row['image'] = q['image_1']
    row['options_difficulty'] = int(q['options_difficulty'])
    row['kingdom'] = kingdom
    # Position of the EPPO code within the list loaded for its kingdom.
    row['taxon_rank'] = kingdom_eppo_dict[kingdom].index(metadata['eppo_code'])
    row['question_template'] = metadata['tag']
    row['category'] = template_dict[metadata['tag']]

    # A question with no options is treated as open-ended.
    row['question_type'] = 'multiple-choice' if options else 'open-ended'

    if options:
        # The answer is a letter; look up the option text it points to.
        verbose_answer = options[OPTIONS.index(q['answer'])]
    else:
        # Without options there is no letter to resolve; keep the answer itself.
        verbose_answer = q['answer']

    row['metadata']['source'] = 'EPPO'
    row['metadata']['license'] = metadata['license']
    row['metadata']['language'] = metadata['language']
    row['metadata']['image_url'] = metadata['url']
    row['metadata']['verbose_answer'] = verbose_answer
    row['metadata']['eppo_code'] = metadata['eppo_code']
    row['metadata']['gbif_key'] = metadata['gbif_key']
    row['metadata']['common_name_language'] = metadata['common_name_language']

    EPPO_rows.append(row)

100%|██████████| 20648/20648 [03:30<00:00, 97.99it/s] 


#### Save and check

In [82]:
# Split the EPPO rows, write them as Parquet, then load them back to check.
dataset_name = 'EPPO'
root_folder = '/workdir/AGRIVQA/'
# Pass dataset_name explicitly so the folder written here is the folder read
# back below, instead of depending on the 'id' of the first row.
split_and_save(EPPO_rows, root_folder=root_folder, dataset_name=dataset_name)

dataset = load_dataset_dict(os.path.join(root_folder, dataset_name))
dataset

Map: 100%|██████████| 16/16 [00:00<00:00, 1360.57 examples/s]
Map: 100%|██████████| 16505/16505 [00:12<00:00, 1371.13 examples/s]
Map: 100%|██████████| 4127/4127 [00:02<00:00, 1872.95 examples/s]


Saving dataset file: /workdir/AGRIVQA/EPPO/dev-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 95.46ba/s]


Saving dataset file: /workdir/AGRIVQA/EPPO/test-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 166/166 [00:04<00:00, 37.57ba/s]


Saving dataset file: /workdir/AGRIVQA/EPPO/validation-00000-of-00001.parquet


Creating parquet from Arrow format: 100%|██████████| 42/42 [00:01<00:00, 35.87ba/s]
Generating dev split: 16 examples [00:00, 664.75 examples/s]
Generating test split: 16505 examples [00:04, 3662.74 examples/s]
Generating validation split: 4127 examples [00:00, 4670.45 examples/s]


DatasetDict({
    dev: Dataset({
        features: ['id', 'question', 'options', 'answer', 'image', 'taxon_rank', 'options_difficulty', 'kingdom', 'category', 'question_template', 'question_type', 'metadata'],
        num_rows: 16
    })
    test: Dataset({
        features: ['id', 'question', 'options', 'answer', 'image', 'taxon_rank', 'options_difficulty', 'kingdom', 'category', 'question_template', 'question_type', 'metadata'],
        num_rows: 16505
    })
    validation: Dataset({
        features: ['id', 'question', 'options', 'answer', 'image', 'taxon_rank', 'options_difficulty', 'kingdom', 'category', 'question_template', 'question_type', 'metadata'],
        num_rows: 4127
    })
})

### GBIF !!!TODO!!!

In [5]:
# Row template for the GBIF dataset. The "{...}" strings are placeholders that
# show which value belongs in each field.
GBIF_row = {
    "id": "_GBIF_{number}",
    "question": "{question}",
    "options": ["{option_1}", "{option_2}", "{option_3}", "{option_4}"],  # List of options
    "answer": "{answer}",
    # image_1 ... image_5, each holding the path to one image
    **{f"image_{n}": f"{{image_path_{n}}}" for n in range(1, 6)},
    "options_difficulty": 0,  # Integer
    "region": "{region}",  # Example: Europe, Asia
    "event_date": "{event_date}",  # Example: 2024-12-16
    "question_template": "{question_template}",  # Example: common_name, scientific_name
    "question_type": "{question_type}",  # Example: multiple choice, open ended
    "metadata": dict(
        source="{source}",
        license="{license}",
        image_url=["{image_url_1}", "{image_url_2}"],  # List of image URLs
        gbif_id="{gbif_id}",
        eppo_codes=["{eppo_code_1}", "{eppo_code_2}"],  # List of EPPO codes
        gbif_taxon_key="{gbif_taxon_key}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
}

In [368]:
# Read the three existing Identification Parquet splits and merge them.
task_name = 'Identification'

# "parquet" is the loader name handed to load_dataset, not a dataset folder.
dataset_name = "parquet"
data_files = {
    split_name: f"/workdir/important_datasets/AGRIVQA/{task_name}/{split_name}-00000-of-00001.parquet"
    for split_name in ("dev", "test", "validation")
}

dataset = load_dataset(dataset_name, data_files=data_files)

# NOTE(review): the merged 'Identification' data is stored under the name
# EPPO_dataset, replacing whatever that name held before - confirm this is
# intended for the GBIF section.
EPPO_dataset = concatenate_datasets(
    [dataset[split_name] for split_name in ("dev", "test", "validation")]
)


Generating dev split: 16 examples [00:00, 191.98 examples/s]
Generating test split: 9 examples [00:00, 158.35 examples/s]
Generating validation split: 75 examples [00:00, 335.71 examples/s]


In [None]:
# Question template tag -> category assigned to the row.
template_dict = dict(
    common_name='Common Name',
    damage_cause='Taxonomy',
    genus_name='Taxonomy',
    growth_stage='Growth Stage',
    scientific_name='Taxonomy',
    weed_identification='Taxonomy',
)

In [None]:
# Kingdom names; each one has its own "<kingdom>_EPPO_to_GBIF.json" file.
kingdoms = ['Archaea', 'Bacteria', 'Chromista', 'Viruses_and_viroids',
            'Fungi', 'Protista', 'Plantae', 'Animalia']

# Kingdom -> list() of the loaded JSON content of that kingdom's file.
kingdom_eppo_dict = dict(
    (k, list(load_json(f'/workdir/important_datasets/EPPO_to_GBIF/{k}_EPPO_to_GBIF.json')))
    for k in kingdoms
)

In [None]:
# Display the first row of EPPO_dataset to inspect the raw field layout.
EPPO_dataset[0]

{'id': 'dev__EPPO_1',
 'question': 'What is the common name in English of this plant? <image 1>',
 'options': "['Indian ginseng', 'violet tube flower', 'sticky tailflower', 'tomato']",
 'explanation': None,
 'image_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1365>,
 'image_2': None,
 'image_3': None,
 'image_4': None,
 'image_5': None,
 'img_type': "['Picture']",
 'answer': 'D',
 'options_difficulty': '4',
 'question_type': 'multiple-choice',
 'subfield': 'Plantae / common_name',
 'metadata': '{"source": "EPPO", "author": "EPPO", "license": "", "url": "https://gd.eppo.int/media/data/taxon/L/LYPES/pics/1024x0/4984.jpg", "language": "English", "verbose_answer": "{verbose_answer}", "eppo_code": "LYPES", "gbif_key": "2930137", "kingdom": "Plantae", "tag": "common_name", "common_name_language": "English"}'}

In [None]:
# Row template for the EPPO dataset. The "{...}" strings are placeholders that
# show which value belongs in each field.
EPPO_row = dict(
    id="EPPO",
    question="{question}",
    options=["{option_1}", "{option_2}", "{option_3}", "{option_4}"],  # List of options
    answer="{answer}",
    image="{image_path}",  # Path to the image
    taxon_rank=0,  # Integer
    options_difficulty=0,  # Integer
    kingdom="{kingdom}",  # Example: Plantae, Animalia
    category="{category}",  # Example: Taxonomy, Growth Stage, Common Name
    question_template="{question_template}",  # Example: scientific_name, common_name
    question_type="{question_type}",  # Example: multiple choice, open ended
    metadata=dict(
        source="{source}",
        license="{license}",
        image_url="{image_url}",
        language="{language}",
        verbose_answer="{verbose_answer}",
        eppo_code="{eppo_code}",
        gbif_key="{gbif_key}",
        common_name_language="{common_name_language}",
    ),
)

In [None]:
# Build one EPPO-style row per question of EPPO_dataset.
EPPO_rows = []

# Answer letters, in the same order as the entries of 'options'.
OPTIONS = ['A','B','C','D','E']

for q in tqdm(EPPO_dataset):
    # 'options' and 'metadata' are stored as strings and parsed here.
    # NOTE(review): eval() executes whatever the string contains; if these are
    # always plain literals, ast.literal_eval would be safer.
    options = eval(q['options']) or []
    metadata = eval(q['metadata'])
    kingdom = metadata['kingdom']

    # Copy the template first and fill the copy: the shared template stays
    # untouched and the image is stored as-is instead of being deep-copied
    # together with the finished row.
    row = deepcopy(EPPO_row)
    row['id'] = "EPPO"
    row['question'] = q['question']
    row['answer'] = q['answer']
    row['options'] = options
    row['image'] = q['image_1']
    row['options_difficulty'] = int(q['options_difficulty'])
    row['kingdom'] = kingdom
    # Position of the EPPO code within the list loaded for its kingdom.
    row['taxon_rank'] = kingdom_eppo_dict[kingdom].index(metadata['eppo_code'])
    row['question_template'] = metadata['tag']
    row['category'] = template_dict[metadata['tag']]

    # A question with no options is treated as open-ended.
    row['question_type'] = 'multiple-choice' if options else 'open-ended'

    if options:
        # The answer is a letter; look up the option text it points to.
        verbose_answer = options[OPTIONS.index(q['answer'])]
    else:
        # Without options there is no letter to resolve; keep the answer itself.
        verbose_answer = q['answer']

    row['metadata']['source'] = 'EPPO'
    row['metadata']['license'] = metadata['license']
    row['metadata']['language'] = metadata['language']
    row['metadata']['image_url'] = metadata['url']
    row['metadata']['verbose_answer'] = verbose_answer
    row['metadata']['eppo_code'] = metadata['eppo_code']
    row['metadata']['gbif_key'] = metadata['gbif_key']
    row['metadata']['common_name_language'] = metadata['common_name_language']

    EPPO_rows.append(row)

100%|██████████| 20648/20648 [02:43<00:00, 126.65it/s]


In [None]:
# Save the EPPO rows, then read Parquet files back and show the first dev row.
# NOTE(review): split_and_save is called without root_folder, so it writes
# under its default root, while the files read below come from `path` -
# confirm that both refer to the same data.
split_and_save(EPPO_rows)
path = '/workdir/important_datasets/AGRIVQA_v2'
dataset_name = 'EPPO'

dataset_dict = load_dataset('parquet', data_files={
    split_name: f'{path}/{dataset_name}/{split_name}-00000-of-00001.parquet'
    for split_name in ('dev', 'test', 'validation')
})

dataset_dict['dev'][0]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1824.40ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 567.18ba/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 529.65ba/s]
Generating dev split: 16 examples [00:00, 4733.98 examples/s]
Generating test split: 3625 examples [00:00, 321288.84 examples/s]
Generating validation split: 907 examples [00:00, 206538.56 examples/s]


{'id': 'dev_AgriExam_1',
 'question': 'Which of the following states is the largest producer of annual flowers’ seeds?',
 'options': ['Punjab', 'Karnataka', 'Tamil Nadu', 'Kerala'],
 'answer': 'A',
 'category': 'Horticulture and Ornamental Plants',
 'question_type': 'multiple-choice',
 'metadata': {'language': 'English',
  'license': '',
  'source': 'AgriExam',
  'url': 'https://www.agriexam.com/horticulture-jrf-2020',
  'verbose_answer': 'Punjab'}}

### WikiHow

In [58]:
# Row template for the WikiHow dataset. The "{...}" strings are placeholders
# that show which value belongs in each field.
WikiHow_row = dict(
    id="_WikiHow_{number}",
    question="{question}",
    options=["{option_1}", "{option_2}", "{option_3}", "{option_4}"],  # List of options
    answer="{answer}",
    options_difficulty=0,  # Integer
    category="{category}",  # Example: TODO
    subcategory="{subcategory}",
    question_type="{question_type}",  # Example: multiple choice, open ended
    question_template="{question_template}",  # Example: questiontype
    metadata=dict(
        source="{source}",
        license="{license}",
        url="{url}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
)

In [53]:
# Load the wikiHow question records; the build loop below iterates over them
# and reads 'question', 'options', 'answer', 'options_difficulty',
# 'question_type' and 'url' from each one.
wikihow_dataset = load_json('/workdir/wikihow/wikihow.json')

In [54]:
# Load the category file; it is consumed below as a mapping of
# category -> collection of article URLs.
wikihow_category = load_json('/workdir/wikihow/wikihow_category.json')

In [55]:
# Invert category -> URLs into URL -> category. A URL listed under several
# categories keeps the category that comes last.
wikihow_category_url = dict(
    (url, category)
    for category, urls in wikihow_category.items()
    for url in urls
)

In [59]:
# Build one wikiHow row per question record.
WikiHow_rows = []

# Answer letters, in the same order as the entries of 'options'.
OPTIONS = ['A','B','C','D','E']

for q in tqdm(wikihow_dataset):
    options = q['options']
    if options:
        # One option per line; the first three characters of every line are
        # dropped, and the answer text is turned into its letter.
        options = [line[3:] for line in options.split('\n')]
        answer = OPTIONS[options.index(q['answer'])]
    else:
        options = []
        answer = q['answer']

    # Falsy difficulty values are passed through unchanged.
    options_difficulty = q['options_difficulty']
    if options_difficulty:
        options_difficulty = int(options_difficulty)

    # The category path has the form "category[/subcategory...]" once the
    # leading 'Gardening/' is removed.
    category = wikihow_category_url[q['url']].replace('Gardening/','')
    category_parts = category.split('/')

    # Fill the shared template in place; a deep copy of it is stored below.
    WikiHow_row.update({
        'id': "wikiHow",
        'question': q['question'],
        'answer': answer,
        'options': options,
        'options_difficulty': options_difficulty,
        'question_template': q['question_type'],
        'category': category_parts[0],
        'subcategory': category_parts[1] if len(category_parts) > 1 else None,
        # A question with no options is treated as open-ended.
        'question_type': 'multiple-choice' if options else 'open-ended',
    })
    WikiHow_row['metadata'].update({
        'source': 'wikiHow',
        'license': "",
        'language': 'English',
        'url': q['url'],
        'verbose_answer': q['answer'],
    })

    WikiHow_rows.append(deepcopy(WikiHow_row))

100%|██████████| 2141/2141 [00:00<00:00, 40791.32it/s]


In [65]:
# Split the wikiHow rows, write them as Parquet, then load them back to check.
# The name matches the 'id' stored in every row ('wikiHow'), which is the name
# split_and_save would fall back to; passing it explicitly makes the folder
# written and the folder read back below the same one.
dataset_name = 'wikiHow'
root_folder = '/workdir/important_datasets/AGRIVQA'
split_and_save(WikiHow_rows, root_folder=root_folder, dataset_name=dataset_name)

dataset = load_dataset_dict(os.path.join(root_folder, dataset_name))
dataset

Map: 100%|██████████| 16/16 [00:00<00:00, 2694.49 examples/s]
Map:   0%|          | 0/1700 [00:00<?, ? examples/s]

Map: 100%|██████████| 1700/1700 [00:00<00:00, 6413.01 examples/s]
Map: 100%|██████████| 425/425 [00:00<00:00, 8991.21 examples/s]


PermissionError: [Errno 13] Permission denied: '/workdir/important_datasets/AGRIVQA/wikiHow'