In [11]:
from datasets import Features, Value, Sequence, Image, DatasetInfo
from tqdm import tqdm
import os
from datasets import load_dataset, concatenate_datasets
import json
import re
from copy import deepcopy

import asyncio
import math
import numpy as np
import random
import pandas as pd
from ollama import AsyncClient

from random import sample
from collections import Counter
from openai import OpenAI

### Setting Features for each dataset

In [20]:
# Column schema for the AgriExam dataset.
AgriExam_features = Features({
    # --- question / answer fields ---
    'id': Value('string'),
    'question': Value('string'),
    'options': Sequence(feature=Value('string')),
    'answer': Value('string'),

    # --- labels ---
    'category': Value('string'),       # Plant Science, Pests, Taxonomy, Scientific_name
    'question_type': Value('string'),  # multiple choice, open ended

    # --- provenance ---
    # NOTE(review): `datasets` stores a Sequence of a dict as a dict of lists;
    # if each row carries a single metadata record, a plain dict feature may
    # be what is intended — confirm against the data being written.
    'metadata': Sequence(feature={
        'source': Value('string'),
        'license': Value('string'),
        'url': Value('string'),
        'language': Value('string'),
        'verbose_answer': Value('string'),
    }),
})

In [21]:
# Column schema for the Agri500P dataset (questions paired with a text context).
Agri500P_features = Features({
    # --- question / answer fields ---
    'id': Value('string'),
    'question': Value('string'),
    'answer': Value('string'),

    # --- supporting text ---
    'context': Value('string'),

    # --- labels ---
    'category': Value('string'),       # Plant Science, Pests, Taxonomy
    'question_type': Value('string'),  # multiple choice, open ended

    # --- provenance ---
    # NOTE(review): `datasets` stores a Sequence of a dict as a dict of lists;
    # if each row carries a single metadata record, a plain dict feature may
    # be what is intended — confirm against the data being written.
    'metadata': Sequence(feature={
        'source': Value('string'),
        'license': Value('string'),
        'language': Value('string'),
        'book_title': Value('string'),
        'chapter_title': Value('string'),
    }),
})

In [26]:
# Column schema for the EPPO dataset (single-image questions).
EPPO_features = Features({
    # --- question / answer fields ---
    'id': Value('string'),
    'question': Value('string'),
    'options': Sequence(feature=Value('string')),
    'answer': Value('string'),

    # --- image, decoded on access ---
    'image': Image(decode=True),

    # --- difficulty and taxonomy ---
    'taxon_difficulty': Value('int32'),
    'options_difficulty': Value('int32'),
    'kingdom': Value('string'),

    # --- labels ---
    'category': Value('string'),           # Taxonomy, Growth Stage, Common Name
    # NOTE(review): singular key here, whereas the GBIF and WikiHow schemas
    # use 'question_templates' — confirm whether the difference is intended.
    'question_template': Value('string'),  # scientific_name, common_name
    'question_type': Value('string'),      # multiple choice, open ended

    # --- provenance ---
    # NOTE(review): `datasets` stores a Sequence of a dict as a dict of lists;
    # if each row carries a single metadata record, a plain dict feature may
    # be what is intended — confirm against the data being written.
    'metadata': Sequence(feature={
        'source': Value('string'),
        'license': Value('string'),
        'image_url': Value('string'),
        'language': Value('string'),
        'verbose_answer': Value('string'),
        'eppo_code': Value('string'),
        'gbif_key': Value('string'),
        'common_name_language': Value('string'),
    }),
})

In [28]:
# Column schema for the GBIF dataset (up to five images per question).
GBIF_features = Features({
    # --- question / answer fields ---
    'id': Value('string'),
    'question': Value('string'),
    'options': Sequence(feature=Value('string')),
    'answer': Value('string'),

    # --- image columns image_1 .. image_5, decoded on access ---
    **{f'image_{slot}': Image(decode=True) for slot in range(1, 6)},

    # --- difficulty ---
    'options_difficulty': Value('int32'),   # previously topic_difficulty
    'category_difficulty': Value('int32'),  # previously subfield last digit

    # --- observation details ---
    'region': Value('string'),
    'event_date': Value('string'),

    # --- labels ---
    # A 'category' column (previously subfield; e.g. Horticulture, tropical
    # plants, Plant Science, Pests, Taxonomy) is currently left out of this schema.
    'question_templates': Value('string'),
    'question_type': Value('string'),       # multiple choice, open ended

    # --- provenance ---
    # NOTE(review): `datasets` stores a Sequence of a dict as a dict of lists;
    # if each row carries a single metadata record, a plain dict feature may
    # be what is intended — confirm against the data being written.
    'metadata': Sequence(feature={
        'source': Value('string'),
        'license': Value('string'),
        'image_url': Sequence(feature=Value('string')),
        'gbif_id': Value('string'),
        'eppo_codes': Sequence(feature=Value('string')),
        'gbif_taxon_key': Value('string'),
        'language': Value('string'),
        'verbose_answer': Value('string'),
    }),
})

In [29]:
# Load the locally stored WikiHow dataset and inspect the first record of its
# 'test' split to see the raw column layout.
name = "WikiHow"
dataset_path = f'/workdir/important_datasets/AGRIVQA/{name}'
datset = load_dataset(dataset_path)
ds = datset['test']
ds[0]

Generating validation split: 1933 examples [00:00, 60801.60 examples/s]
Generating test split: 213 examples [00:00, 26392.52 examples/s]


{'id': 'validation__wikihow_1261',
 'question': 'What step is missing from the "Applying Herbicide to the Stump" instructions in the "Prune Lilacs" procedure?\n\nSteps:\n1. ...............\n2. Select an herbicide.\n3. Wear safety equipment.\n4. Apply herbicide to a stump.',
 'options': "['Assess the risks', 'Consider the benefits and drawbacks', 'Identify potential hazards', 'Know the risks.']",
 'explanation': None,
 'image_1': None,
 'image_2': None,
 'image_3': None,
 'image_4': None,
 'image_5': None,
 'img_type': '',
 'answer': 'D',
 'options_difficulty': '4',
 'question_type': 'multiple-choice',
 'subfield': '{subfield}',
 'metadata': '{"source": "WikiHow", "author": "WikiHow", "license": "", "url": "{url}", "language": "English", "verbose_answer": "{verbose_answer}", "question_type": "missing_step"}'}

In [None]:
# Column schema for the WikiHow dataset.
WikiHow_features = Features({
    # --- question / answer fields ---
    'id': Value('string'),
    'question': Value('string'),
    'options': Sequence(feature=Value('string')),
    'answer': Value('string'),

    # --- difficulty ---
    'options_difficulty': Value('int32'),

    # --- labels ---
    'category': Value('string'),            # TODO
    'question_type': Value('string'),       # multiple choice, open ended
    'question_templates': Value('string'),  # previously metadata[questiontype]

    # --- provenance ---
    # NOTE(review): `datasets` stores a Sequence of a dict as a dict of lists;
    # if each row carries a single metadata record, a plain dict feature may
    # be what is intended — confirm against the data being written.
    'metadata': Sequence(feature={
        'source': Value('string'),
        'url': Value('string'),
        'language': Value('string'),
        'verbose_answer': Value('string'),
    }),
})

### Creating the parquet files

In [None]:
from datasets import Dataset

# Export every split of `df` as sharded parquet files for the AgriExam dataset.
#
# NOTE(review): `dev`, `validate`, `test` and `df` are not defined in the cells
# visible here — presumably row-label collections and the full question
# DataFrame; confirm they are created earlier in the notebook.
output_dir = '/workdir/AGRIVQA/AgriExam'
os.makedirs(output_dir, exist_ok=True)  # to_parquet needs the folder to exist

# save each batch of length 10000
max_len = 10000

for idx in [dev, validate, test]:
    dev_set = df.loc[idx].copy().reset_index(drop=True)
    if dev_set.empty:
        # Nothing to write, and there is no first id to derive the split name from.
        continue

    # rename id with progressive number {set}_Identification_{number}:
    # keep everything before the last '_' and append the 1-based row position.
    dev_set['id'] = [
        f"{old_id[:old_id.rindex('_')]}_{position}"
        for position, old_id in enumerate(dev_set['id'], start=1)
    ]

    # The split name is the part of the id before its first '_'.
    name = dev_set['id'].iloc[0].split('_')[0]

    # Ceiling division: an exact multiple of max_len must not count an extra,
    # never-written shard in the "-of-NNNNN" suffix.
    num_shards = (len(dev_set) + max_len - 1) // max_len

    for i in range(0, len(dev_set), max_len):
        shard_path = (
            f'{output_dir}/{name}-{str(i // max_len).zfill(5)}'
            f'-of-{str(num_shards).zfill(5)}.parquet'
        )
        # Cast each shard to the AgriExam schema defined earlier in this notebook.
        Dataset.from_pandas(
            dev_set.iloc[i:i + max_len], features=AgriExam_features
        ).to_parquet(shard_path)