In [1]:
from tqdm import tqdm
import os
from datasets import load_dataset
import json
import nltk
import re
import difflib
from copy import deepcopy

import json
import numpy as np
import random
import pandas as pd
from ollama import AsyncClient

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Shared asynchronous Ollama client used by `chat`.
# NOTE(review): the host is a hard-coded LAN address and the header looks like a placeholder.
client_ollama = AsyncClient(host='http://192.168.1.5:11434', headers={'x-some-header': 'some-value'})

async def chat(content):
    """Send ``content`` as a single user message to the 'llama3.2-vision' model.

    Returns the text stored under ``['message']['content']`` of the reply.
    """
    reply = await client_ollama.chat(
        model='llama3.2-vision',
        messages=[{'role': 'user', 'content': content}],
    )
    return reply['message']['content']

# Agriexam

In [4]:
# Template record: the build loop overwrites its fields in place and stores a
# deep copy per question. Values are placeholders until overwritten.
row = dict(
    id="_agri_identification_{number}",
    question="{question}",
    options=[],
    explanation="string:{explanation}",
    image_1="{image_1}",
    image_2="{image_2}",
    image_3="{image_3}",
    image_4="{image_4}",
    image_5="{image_5}",
    img_type="{img_type}",
    answer="{answer}",
    topic_difficulty="{topic_difficulty}",
    question_type="{question_type}",
    subfield="{subfield}",
    metadata=dict(
        source="{source}",
        author="{author}",
        license="{license}",
        url="{url}",
        #"tag": "{tag}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
)

In [5]:
def load_json(file_path):
    """Read the JSON document stored at ``file_path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 so that non-ASCII text (the
    question files contain Devanagari characters) is read identically on
    every platform instead of depending on the locale's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Load a single AgriExam file. `agri_dict` is not referenced again in the
# visible cells; the build loop below re-reads every file in the directory.
agri_dict=load_json('/workdir/important_datasets/Agriexam/agri-business-management-jrf-2020.json')

In [6]:
def compare_strings(str1,str2):
    """Return ``nltk.edit_distance(str1, str2)`` for the two strings."""
    # Previous similarity measure, kept for reference:
    #return difflib.SequenceMatcher(None,str1,str2,False).ratio()
    distance = nltk.edit_distance(str1, str2)
    return distance

In [7]:
def has_indian(text):
    """Return True if ``text`` contains a character in U+0900..U+097F (Devanagari block).

    Always returns a bool; an empty string or a string without such
    characters yields False.
    """
    return any('\u0900' <= char <= '\u097F' for char in text)

In [8]:
def remove_after_indian_char(text):
    """Strip Devanagari (U+0900..U+097F) content from ``text``.

    If ``text`` contains '/', every '/'-separated part holding at least one
    Devanagari character is dropped and the remaining parts are re-joined.
    Otherwise the text is cut at the first Devanagari character and the kept
    prefix is stripped. Text without such characters is returned unchanged.
    """
    def _is_devanagari(char):
        return '\u0900' <= char <= '\u097F'

    parts = text.split('/')
    if len(parts) >= 2:
        # Build a new list instead of removing while iterating, which skipped
        # the element following each removed part.
        parts = [part for part in parts if not any(map(_is_devanagari, part))]
        text = '/'.join(parts)
    for i, char in enumerate(text):
        if _is_devanagari(char):
            return text[:i].strip()
    return text  # no Devanagari character found
def has_multiple_lines(s):
    """Return True when ``s`` contains a line feed."""
    # '\r\n' always contains '\n', so one membership test covers both cases.
    return '\n' in s

In [9]:
def has_multiple_lines(s):
    """Return True when ``s`` contains a line feed."""
    # NOTE(review): this function is also defined in the previous cell.
    # '\r\n' always contains '\n', so one membership test covers both cases.
    return '\n' in s
def clean_question(question):
    """Strip a leading question number/label from a single-line question.

    Multi-line questions are returned untouched. Otherwise the substitutions
    below are tried in order, each on the original text, until one changes
    it; the result is then passed through ``remove_after_indian_char``.
    """
    if has_multiple_lines(question):
        return question
    # Earlier, broader pattern kept for reference:
    #pattern = r'^[Qq]?[^\w]*(\d+)?[^\w]*\d*[^\w]*'
    substitutions = (
        # "12. ", "Q12: ", "Q.", "12) " style prefixes
        (r'^\s*(\d+)\.\s*|^\s*[Qq]+[^\w]*(\d+)?\:?\.?\s*|^\s*(\d+)\)\s*', ''),
        # bare number followed by one whitespace character
        (r'^\s*\d+\s', ''),
        # number glued to a leading ellipsis: keep the ellipsis
        (r'^\d+\…', '…'),
    )
    cleaned_question = question
    for regex, replacement in substitutions:
        cleaned_question = re.sub(regex, replacement, question).strip()
        if cleaned_question != question:
            break
    return remove_after_indian_char(cleaned_question)

In [10]:
def best_match(input_str, str_list):
    """Return the entry of ``str_list`` closest to ``input_str``, or None.

    Uses ``difflib.get_close_matches`` with a similarity cutoff of 0.8 and
    keeps only the single best candidate.
    """
    candidates = difflib.get_close_matches(input_str, str_list, n=1, cutoff=0.8)
    return candidates[0] if candidates else None

In [11]:
def remove_numbers(strings):
    """Remove a leading '(1) ' / '(1). ' or '1. ' enumeration from each string.

    Args:
        strings: a single string or a list of strings.

    Returns:
        A string when a single string was passed, otherwise a list of the
        same length as the input. A one-element list stays a list; it used
        to be unwrapped into a bare string.
    """
    is_single = not isinstance(strings, list)
    items = [strings] if is_single else strings
    # First the parenthesised form "(1)" / "(1).", then the dotted form "1. ".
    cleaned_strings = [re.sub(r'^\(\d+\)\.?\s*', '', s) for s in items]
    cleaned_strings = [re.sub(r'^\d+\.\s+', '', s) for s in cleaned_strings]
    return cleaned_strings[0] if is_single else cleaned_strings

In [12]:
def remove_first_letter(options):
    """Drop the 'A ', 'B ', 'C ', 'D ' labels from the options.

    The first options (up to four) are compared position-wise with the
    expected labels. If any differs, the input list is returned as is;
    otherwise the first two characters of every option are removed.
    """
    expected_prefixes = ('A ', 'B ', 'C ', 'D ')
    if all(option[:2] == prefix for option, prefix in zip(options, expected_prefixes)):
        return [option[2:] for option in options]
    return options

In [13]:

def clean_options_answer(options,answer):
    """Normalise the options and resolve the answer to one of them.

    Devanagari text, enumeration prefixes, 'A '..'D ' labels and a leading
    'Answer:' are stripped first. The answer is then resolved, in order, as
    a 1-based option index (1..4), an exact option match, or the closest
    option according to ``best_match``.

    Returns:
        ``(options, answer_text)`` on success, or ``None`` (after printing
        'No best match') when the answer cannot be mapped to any option.
        Callers must handle the ``None`` case before unpacking.
    """
    options = [remove_after_indian_char(option) for option in options]
    answer = remove_after_indian_char(answer)
    if answer.startswith('Answer:'):
        answer = answer.replace('Answer:','').strip()
    options = remove_numbers(options)
    answer = remove_numbers(answer)
    options = remove_first_letter(options)
    # If answer is a number between 1 and 4 the answer is the index; the
    # bound on len(options) avoids an IndexError for short option lists.
    if answer.isdigit() and 0 < int(answer) <= min(4, len(options)):
        if ''.join(options): # TODO Options are all numbers
            return options, options[int(answer)-1]
    if answer in options:
        return options, answer
    matching_option = best_match(answer,options)
    if matching_option:
        return options, matching_option
    print('No best match')
    return None
        

In [14]:
# Build one record per AgriExam question from every JSON file in `base_dir`.
OPTIONS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
rows = []
base_dir = '/workdir/important_datasets/Agriexam'
for file_path in os.listdir(base_dir):
    file_path = base_dir+'/'+file_path
    if not file_path.endswith('.json'):
        continue
    agridict=load_json(file_path)
    link=agridict['link']
    questions=agridict['test']
    for q in questions:
        cleaned = clean_options_answer(q['options'],q['correct_answer'])
        if cleaned is None:
            # The answer could not be mapped to an option; unpacking None
            # would raise TypeError, so the question is skipped.
            continue
        options, verbose_answer = cleaned

        row['id'] = f"_agriexam_{len(rows)+1}"
        row['question'] = q['question']

        row['explanation'] = None
        row['image_1'] = None
        row['image_2'] = None
        row['image_3'] = None
        row['image_4'] = None
        row['image_5'] = None
        row['img_type'] = ''

        # The answer is stored as the letter of the matching option.
        row['answer'] = OPTIONS[options.index(verbose_answer)]
        row['options'] = str(options)

        row['question_type'] = 'multiple-choice'
        row['topic_difficulty'] = 3
        # NOTE(review): row['subfield'] is never set here and keeps the
        # template placeholder "{subfield}" — confirm the intended value.

        row['metadata']['verbose_answer'] = verbose_answer
        row['metadata']['source'] = 'AgriExam'
        row['metadata']['author'] = 'AgriExam'
        row['metadata']['license'] = ''
        row['metadata']['url'] = link
        row['metadata']['language'] = 'English'
        rows.append(deepcopy(row))

# Drop questions that reference an image, then renumber the remaining ids.
# A separate loop variable keeps the `row` template from being rebound.
rows = [kept for kept in rows if 'Image: h' not in kept['question']]
for i, kept_row in enumerate(rows):
    kept_row['id'] = f"_agriexam_{i+1}"
    

In [None]:
# split rows in dev, validation and test sets
df = pd.DataFrame(rows)
#df.metadata = df.metadata.apply(json.dumps)
index = df.index.tolist()
random.shuffle(index)
# Cut points at 10% and 20% of the shuffled index: the first 10% is dev,
# the next 10% is test and the remaining 80% is the validation split.
# The last part must be named `validate`, which the following cells read.
dev, test, validate = np.split(index, [int(len(index)*0.1), int(len(index)*0.2)])
validation = validate  # alias kept for code that used the previous name


In [16]:
# Prefix each id with the name of the split its row was assigned to.
df.loc[dev, 'id'] = 'dev_' + df.loc[dev, 'id']
df.loc[validate, 'id'] = 'validation_' + df.loc[validate, 'id']
df.loc[test, 'id'] = 'test_' + df.loc[test, 'id']

In [17]:
len(dev), len(validate), len(test)

(625, 5003, 625)

In [18]:
from datasets import Features, Value, Sequence, Image, DatasetInfo

# Dataset schema: image_1..image_5 are decoded images, every other column is
# stored as a string. Column order follows the tuple below.
features = Features({
    column: Image(decode=True, id=None) if column.startswith('image_') else Value(dtype='string', id=None)
    for column in (
        'id', 'question', 'options', 'explanation',
        'image_1', 'image_2', 'image_3', 'image_4', 'image_5',
        'img_type', 'answer', 'topic_difficulty', 'question_type',
        'subfield', 'metadata',
    )
})

In [19]:
# from datasets import Dataset

# for idx in [dev, validate, test]:
#     # rename id with progressive number {set}_Identification_{number}
#     dev_set = df.loc[idx].copy().reset_index(drop=True)
#     dev_set['index'] = range(1, len(dev_set)+1)
#     dev_set['id'] = dev_set.apply(lambda x: f"{x['id'][:x['id'].rindex('_')]}_{x['index']}", axis=1)
#     # remove index column
#     dev_set.drop(columns=['index'], inplace=True)
#     name = dev_set.id[0].split('_')[0]
#     # save each batch of length 10000
#     max_len = 10000
#     for i in range(0, len(dev_set), max_len):
#         Dataset.from_pandas(dev_set[i:i+max_len], features=features).to_parquet(f'/workdir/AGRIVQA/AgriExam/{name}-{str(i//max_len).zfill(5)}-of-{str((len(dev_set)//max_len)+1).zfill(5)}.parquet')


In [None]:
# Template record: the build loop overwrites its fields in place and stores a
# deep copy per question. Values are placeholders until overwritten.
row = dict(
    id="_agri_identification_{number}",
    question="{question}",
    options=[],
    explanation="string:{explanation}",
    image_1="{image_1}",
    image_2="{image_2}",
    image_3="{image_3}",
    image_4="{image_4}",
    image_5="{image_5}",
    img_type="{img_type}",
    answer="{answer}",
    topic_difficulty="{topic_difficulty}",
    question_type="{question_type}",
    subfield="{subfield}",
    metadata=dict(
        source="{source}",
        author="{author}",
        license="{license}",
        url="{url}",
        #"tag": "{tag}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
)

In [None]:
# Build one record per AgriExam question from every JSON file in `base_dir`.
OPTIONS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
rows = []
base_dir = '/workdir/important_datasets/Agriexam'
for file_path in os.listdir(base_dir):
    file_path = base_dir+'/'+file_path
    if not file_path.endswith('.json'):
        continue
    agridict=load_json(file_path)
    link=agridict['link']
    questions=agridict['test']
    for q in questions:
        cleaned = clean_options_answer(q['options'],q['correct_answer'])
        if cleaned is None:
            # The answer could not be mapped to an option; unpacking None
            # would raise TypeError, so the question is skipped.
            continue
        options, verbose_answer = cleaned

        row['id'] = f"_agriexam_{len(rows)+1}"
        row['question'] = q['question']

        row['explanation'] = None
        row['image_1'] = None
        row['image_2'] = None
        row['image_3'] = None
        row['image_4'] = None
        row['image_5'] = None
        row['img_type'] = ''

        # The answer is stored as the letter of the matching option.
        row['answer'] = OPTIONS[options.index(verbose_answer)]
        row['options'] = str(options)

        row['question_type'] = 'multiple-choice'
        row['topic_difficulty'] = 3
        # NOTE(review): row['subfield'] is never set here and keeps the
        # template placeholder "{subfield}" — confirm the intended value.

        row['metadata']['verbose_answer'] = verbose_answer
        row['metadata']['source'] = 'AgriExam'
        row['metadata']['author'] = 'AgriExam'
        row['metadata']['license'] = ''
        row['metadata']['url'] = link
        row['metadata']['language'] = 'English'
        rows.append(deepcopy(row))

# Drop questions that reference an image, then renumber the remaining ids.
# A separate loop variable keeps the `row` template from being rebound.
rows = [kept for kept in rows if 'Image: h' not in kept['question']]
for i, kept_row in enumerate(rows):
    kept_row['id'] = f"_agriexam_{i+1}"
    

In [None]:
# Shuffle the row indices and cut them at 10% and 20%: the first part is dev,
# the second is test and the remainder is the validation split.
df = pd.DataFrame(rows)
df['metadata'] = df['metadata'].apply(json.dumps)  # metadata dicts become JSON strings
index = df.index.tolist()
random.shuffle(index)
cut_points = [int(len(index)*0.1), int(len(index)*0.2)]
dev, test, validate = np.split(index, cut_points)

In [None]:
# Prefix each id with the name of the split its row was assigned to.
df.loc[dev, 'id'] = 'dev_' + df.loc[dev, 'id']
df.loc[validate, 'id'] = 'validation_' + df.loc[validate, 'id']
df.loc[test, 'id'] = 'test_' + df.loc[test, 'id']

In [None]:
len(dev), len(validate), len(test)

(625, 5003, 625)

In [None]:
from datasets import Features, Value, Sequence, Image, DatasetInfo

# Dataset schema: image_1..image_5 are decoded images, every other column is
# stored as a string. Column order follows the tuple below.
features = Features({
    column: Image(decode=True, id=None) if column.startswith('image_') else Value(dtype='string', id=None)
    for column in (
        'id', 'question', 'options', 'explanation',
        'image_1', 'image_2', 'image_3', 'image_4', 'image_5',
        'img_type', 'answer', 'topic_difficulty', 'question_type',
        'subfield', 'metadata',
    )
})

In [None]:
# from datasets import Dataset

# for idx in [dev, validate, test]:
#     # rename id with progressive number {set}_Identification_{number}
#     dev_set = df.loc[idx].copy().reset_index(drop=True)
#     dev_set['index'] = range(1, len(dev_set)+1)
#     dev_set['id'] = dev_set.apply(lambda x: f"{x['id'][:x['id'].rindex('_')]}_{x['index']}", axis=1)
#     # remove index column
#     dev_set.drop(columns=['index'], inplace=True)
#     name = dev_set.id[0].split('_')[0]
#     # save each batch of length 10000
#     max_len = 10000
#     for i in range(0, len(dev_set), max_len):
#         Dataset.from_pandas(dev_set[i:i+max_len], features=features).to_parquet(f'/workdir/AGRIVQA/AgriExam/{name}-{str(i//max_len).zfill(5)}-of-{str((len(dev_set)//max_len)+1).zfill(5)}.parquet')


In [20]:
# aids = load_dataset("parquet", data_files={'validation': '/workdir/AGRIVQA/AgriExam/validation-*.parquet',
#                                                'dev': '/workdir/AGRIVQA/AgriExam/dev-*.parquet',
#                                                'test': '/workdir/AGRIVQA/AgriExam/test-*.parquet'})

# Perguntas


In [21]:
# Template record for the 500P questions: the build loop overwrites its
# fields in place and stores a deep copy per question.
row = dict(
    id="_500P_{number}",
    question="{question}",
    options=[],
    explanation="string:{explanation}",
    image_1="{image_1}",
    image_2="{image_2}",
    image_3="{image_3}",
    image_4="{image_4}",
    image_5="{image_5}",
    img_type="{img_type}",
    answer="{answer}",
    topic_difficulty="{topic_difficulty}",
    question_type="{question_type}",
    subfield="{subfield}",
    metadata=dict(
        source="{source}",
        author="{author}",
        license="{license}",
        url="{url}",
        #"tag": "{tag}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
)

In [22]:
# Load one file of the 500P/500R collection; the next cell displays it.
perguntas_dict=load_json('/workdir/important_datasets/500P_500R_json_EN/1.json')

In [23]:
perguntas_dict

{'QAs': [{'QA id': 486,
   'question': 'Can the cowpea be consumed as a pod, in the same way as the common bean?',
   'answer': 'Yes. In Brazil, two cultivar groups of cowpea are cultivated, the Unguiculata which comprises almost the entire set of local and improved varieties that are mainly destined for dry or green grain consumption, and can be consumed as a pod in softer and less fibrous varieties. The second cultivar group is the Sesquipedalis, commonly known as the yardlong bean, and destined for green pod production and consumption.',
   'book title': '500 Perguntas 500 Respostas - Feijão-caupi',
   'book id': 1,
   'chapter title': 'Pós-Colheita e Industrialização'},
  {'QA id': 487,
   'question': 'Are there industrialized products manufactured with cowpea grains?',
   'answer': 'Yes. Sterile commercial cowpea, in hermetically sealed packaging, ready for consumption, is one of these products. Another product is the flour for acarajé, which is obtained through the grinding of co

In [24]:
translate_title_dict = load_json('/workdir/translate_title.json')
def translate_title(pt_title):
    """Look up the translation of a book title in ``translate_title_dict``.

    The series name (several spellings) and the '- ' / ': ' separators are
    removed from ``pt_title`` first; the stripped remainder is the lookup
    key. Raises ``KeyError`` when that key is not in the dictionary.
    """
    title = pt_title
    for fragment in (
        '500 Perguntas e 500 Respostas:',
        'Colección 500 Preguntas 500 Respuestas',
        '500 Perguntas 500 Respostas',
        '500 perguntas 500 respostas ',
        '- ',
        ': ',
    ):
        title = title.replace(fragment, '')
    return translate_title_dict[title.strip()]

In [25]:
translate_chapters_dict = load_json('/workdir/translate_chapters.json')
def translate_chapter(pt_chapter):
    """Return the entry of ``translate_chapters_dict`` for ``pt_chapter``.

    Raises ``KeyError`` when the chapter title is not in the dictionary.
    """
    translated = translate_chapters_dict[pt_chapter]
    return translated

In [26]:
# Build one open-ended record per question/answer pair of the 500P collection.
OPTIONS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
rows = []
base_dir = '/workdir/important_datasets/500P_500R_json_EN'
all_chapters = set()
all_book_title = set()
for file_path in os.listdir(base_dir):
    file_path = base_dir+'/'+file_path
    # Skip the 500P_500R.json file and anything that is not JSON.
    if file_path == '/workdir/important_datasets/500P_500R_json_EN/500P_500R.json' or not file_path.endswith('.json'):
        continue
    p_dict=load_json(file_path)['QAs']
    for q in p_dict:
        chapter_title = translate_chapter(q.get('chapter title'))
        book_title = translate_title(q.get('book title'))
        all_chapters.add(chapter_title)
        all_book_title.add(book_title)

        # NOTE(review): 'subfield' and metadata 'url' / 'verbose_answer' are
        # not set here and keep their template placeholders — confirm.
        row.update({
            'id': f"_500P_{len(rows)+1}",
            'question': q['question'],
            'explanation': None,
            'image_1': None,
            'image_2': None,
            'image_3': None,
            'image_4': None,
            'image_5': None,
            'img_type': '',
            'options': '[]',
            'answer': q['answer'],
            'question_type': 'open-ended',
            'topic_difficulty': 5,
        })
        row['metadata'].update({
            'book_title': book_title,
            'chapter_title': chapter_title,
            'source': file_path,
            'question_id': q['QA id'],
            'author': '500 Perguntas e 500 Respostas',
            'license': '',
            'language': 'English',
        })
        rows.append(deepcopy(row))
    

In [27]:
# Shuffle the row indices and cut them at 10% and 20%: the first part is dev,
# the second is test and the remainder is the validation split.
df = pd.DataFrame(rows)
df['metadata'] = df['metadata'].apply(json.dumps)  # metadata dicts become JSON strings
index = df.index.tolist()
random.shuffle(index)
cut_points = [int(len(index)*0.1), int(len(index)*0.2)]
dev, test, validate = np.split(index, cut_points)

In [28]:
# Prefix each id with the name of the split its row was assigned to.
df.loc[dev, 'id'] = 'dev_' + df.loc[dev, 'id']
df.loc[validate, 'id'] = 'validation_' + df.loc[validate, 'id']
df.loc[test, 'id'] = 'test_' + df.loc[test, 'id']

In [29]:
len(dev), len(validate), len(test)

(2021, 16174, 2022)

In [30]:
from datasets import Features, Value, Sequence, Image, DatasetInfo

# Dataset schema: image_1..image_5 are decoded images, every other column is
# stored as a string. Column order follows the tuple below.
features = Features({
    column: Image(decode=True, id=None) if column.startswith('image_') else Value(dtype='string', id=None)
    for column in (
        'id', 'question', 'options', 'explanation',
        'image_1', 'image_2', 'image_3', 'image_4', 'image_5',
        'img_type', 'answer', 'topic_difficulty', 'question_type',
        'subfield', 'metadata',
    )
})

In [65]:
from datasets import Dataset

for idx in [dev, validate, test]:
    # Rows of this split, re-indexed from 0.
    dev_set = df.loc[idx].copy().reset_index(drop=True)
    # Replace the trailing number of every id with the row's 1-based
    # position inside the split.
    dev_set['id'] = [
        f"{old_id[:old_id.rindex('_')]}_{position}"
        for position, old_id in enumerate(dev_set['id'], start=1)
    ]
    # The split name is the id prefix before the first underscore.
    name = dev_set['id'][0].split('_')[0]
    # Maximum number of rows per parquet shard (the export is disabled).
    max_len = 20000
    #for i in range(0, len(dev_set), max_len):
    #    Dataset.from_pandas(dev_set[i:i+max_len], features=features).to_parquet(f'/workdir/AGRIVQA/500P/{name}-{str(i//max_len).zfill(5)}-of-{str((len(dev_set)//max_len)+1).zfill(5)}.parquet')


In [66]:
# Reload the exported 500P parquet shards, one glob pattern per split.
parquet_globs = {
    'validation': '/workdir/AGRIVQA/500P/validation-*.parquet',
    'dev': '/workdir/AGRIVQA/500P/dev-*.parquet',
    'test': '/workdir/AGRIVQA/500P/test-*.parquet',
}
aids = load_dataset("parquet", data_files=parquet_globs)

In [69]:
aids['validation']['question']

['Do intact male and confined male lambs exhibit greater weight gain than castrated lambs?',
 'What is the ideal size of the maternity pen?',
 'What are the damages caused by high populations of the whitefly?',
 'For how long should the bull remain in the herd?',
 'What is partial budgeting method?',
 'Which horticultural crops have the greatest economic importance in Brazil?',
 'What is a stretch yarn composed of?',
 'How does the issue of ownership of an asset arise if it is developed in conjunction with partners?',
 'What is liquid supplementation? When and how should it be used?',
 'What does the value of liquid protein mean in terms of nutrition?',
 'Where do female broomrape flies usually lay their eggs?',
 'What consists of obtaining coconut hybrids through directed fertilization?',
 'In which horticultural products are stresses more relevant?',
 'What is the importance of using corn in the feeding of poultry and swine?',
 'What are the main grafting methods used in budding?',
 

In [33]:
# Create one `chat` coroutine per question for the first 1000 rows.
# NOTE(review): the coroutines are only created here, never awaited, so no
# request is sent by this cell; they would need e.g. `await asyncio.gather(*tasks)`.
prompt = 'Just answer yes or no. Tell me if there is a Portuguese word in the following text: '
# A comprehension keeps the loop variable local, so the module-level `row`
# template is no longer rebound to the last inspected record.
tasks = [chat(prompt+record['question']) for record in tqdm(rows[:1000])]

100%|██████████| 1000/1000 [00:00<00:00, 1294538.27it/s]


# EPPO to GBIF

In [34]:
# Template record for the EPPO questions: the build loop overwrites its
# fields in place and stores a deep copy per question.
row = dict(
    id="_EPPO_{number}",
    question="{question}",
    options=[],
    explanation="string:{explanation}",
    image_1="{image_1}",
    image_2="{image_2}",
    image_3="{image_3}",
    image_4="{image_4}",
    image_5="{image_5}",
    img_type="{img_type}",
    answer="{answer}",
    options_difficulty="{options_difficulty}",
    question_type="{question_type}",
    subfield="{subfield}",
    metadata=dict(
        source="{source}",
        author="{author}",
        license="{license}",
        url="{url}",
        #"tag": "{tag}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
)

In [35]:
# Load the EPPO visual-identification questions; the next cell iterates over
# this dict to list one example question per tag.
eppo_dict=load_json('/workdir/important_datasets/EPPO_questions/visual_identification.json')

In [36]:
# Print the first question encountered for every distinct tag.
seen_tags = []

for i, q in eppo_dict.items():
    tag = q['tag']
    if tag in seen_tags:
        continue
    seen_tags.append(tag)
    print(f'{tag:<30}', q['question'])

scientific_name                What is the scientific name of this bacterium?
common_name                    What is the common name in English of this bacterium?
genus_name                     Which genus does this bacterium belong to?
damage_cause                   What is the scientific name of the bacterium causing this damage?
weed_identification            What is the scientific name of the invasive weed in this image?
growth_stage                   At what growth stage is this animal?


In [37]:
# Question tags, in the order printed by the previous cell.
tags = [
    'scientific_name', 'common_name', 'genus_name',
    'damage_cause', 'weed_identification', 'growth_stage',
]

In [48]:
# Sub-sample the EPPO questions. A question is kept when its (tag, image)
# pair has not been kept yet and its difficulty equals the currently drawn
# random difficulty; every 'growth_stage' question is kept. A new difficulty
# is drawn after each kept question.
# NOTE(review): `random` is not seeded, so the selection differs between runs.
file_path = '/workdir/important_datasets/EPPO_questions/visual_identification.json'
eppo_dict=load_json(file_path)

done = []           # kept (tag, image_path) pairs, in selection order
seen_pairs = set()  # same pairs, for O(1) membership tests instead of a list scan
next_difficulty = random.randint(1,5)
questions = []
for i, q in tqdm(eppo_dict.items()):
    tag = q['tag']
    image_path = q['image_path']
    difficulty = q['difficulty']
    pair = (tag, image_path)
    if (pair not in seen_pairs and next_difficulty==difficulty) or tag=='growth_stage':
        next_difficulty = random.randint(1,5)
        done.append(pair)
        seen_pairs.add(pair)
        questions.append(q)
        

100%|██████████| 107277/107277 [00:26<00:00, 4001.81it/s] 


In [49]:
def create_name(eppo_url):
    """Build a local file name from an EPPO image URL.

    Joins the fourth-from-last and the last '/'-separated URL segments with
    an underscore, e.g. '.../PHYPSO/pics/1024x0/1225.jpg' -> 'PHYPSO_1225.jpg'.
    """
    segments = eppo_url.split('/')
    return f"{segments[-4]}_{segments[-1]}"


In [50]:
import asyncio
import aiohttp
import os

# Directory where downloaded EPPO images are stored (absolute, machine-specific path)
SAVE_DIR = "/workdir/images/EPPO"
# Create the directory (and any missing parents); no error if it already exists
os.makedirs(SAVE_DIR, exist_ok=True)

async def download_image(session, url, file_path):
    """
    Download ``url`` with ``session`` and write the response body to ``file_path``.

    Only HTTP 200 responses are saved. Failures are reported with ``print``
    and never raised. The body is read completely before the destination is
    opened, so a failed read does not leave an empty file behind that would
    later be mistaken for a downloaded image.
    """
    try:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Failed to download {url}: HTTP {response.status}")
                return
            payload = await response.read()
        with open(file_path, 'wb') as f:
            f.write(payload)
        print(f"Downloaded: {file_path}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

async def download_images(urls):
    """
    Download every URL in ``urls`` concurrently within one aiohttp session.

    Each image is saved in ``SAVE_DIR`` under the name from ``create_name``.
    """
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(
            download_image(session, url, os.path.join(SAVE_DIR, create_name(url)))
            for url in urls
        ))




In [51]:
# Distinct image URLs of the selected questions; non-string paths are ignored.
image_urls = list({
    question['image_path']
    for question in questions
    if isinstance(question['image_path'], str)
})

In [52]:
len(os.listdir(SAVE_DIR))

11466

In [53]:
import os
import requests
from PIL import Image
from io import BytesIO


def image_to_byte_array(image: "Image.Image") -> bytes:
    """Serialise ``image`` into bytes using its own format.

    The image is saved into an in-memory buffer with ``image.format``. When
    the image carries no format it is encoded as PNG instead of failing.
    """
    # BytesIO is a file-like buffer stored in memory
    buffer = BytesIO()
    image.save(buffer, format=image.format or 'PNG')
    return buffer.getvalue()

def check_if_image_exists(image_path, save_dir):
    """Return True if the image for URL ``image_path`` already exists in ``save_dir``.

    The local file name is derived from the URL with ``create_name``.
    """
    return os.path.exists(os.path.join(save_dir, create_name(image_path)))

def get_image(image_path, save_dir):
    """Return the encoded bytes of the image at URL ``image_path``.

    Args:
        image_path: URL of the image.
        save_dir: local cache directory. If the image is already there it is
            read from disk; otherwise it is downloaded and stored there. A
            falsy value disables caching (this used to raise
            ``UnboundLocalError``).

    Raises:
        requests.HTTPError: when the download returns an error status,
            instead of handing an error page to the image decoder.
    """
    # NOTE(review): `Image` must be PIL's module here; the `datasets` import
    # of `Image` in other cells rebinds the same name — confirm cell order.
    cached_path = os.path.join(save_dir, create_name(image_path)) if save_dir else None
    if cached_path and os.path.exists(cached_path):
        image = Image.open(cached_path)
    else:
        # A timeout keeps a stalled connection from blocking the whole loop.
        response = requests.get(image_path, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        if cached_path:
            image.save(cached_path)

    return image_to_byte_array(image)

In [54]:
questions[0]

{'question': 'What is the scientific name of this bacterium?',
 'correct_answer': 'Phytoplasma solani',
 'options': ['Phytoplasma tritici',
  'Phytoplasma sacchari',
  'Phytoplasma solani',
  'Phytoplasma pruni'],
 'image_path': 'https://gd.eppo.int/media/data/taxon/P/PHYPSO/pics/1024x0/1225.jpg',
 'difficulty': 5,
 'tag': 'scientific_name',
 'language': '',
 'EPPO_code': 'PHYPSO',
 'GBIF_key': 10839170,
 'kingdom': 'Bacteria',
 'task': 'visual_identification'}

In [55]:
# Build one multiple-choice record, with its image, per selected EPPO question.
SAVE_DIR = "/workdir/images/EPPO"

OPTIONS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
rows = []

for q in tqdm(questions):
    row['id'] = f"_EPPO_{len(rows)+1}"
    row['question'] = q['question'] + ' <image 1>'

    row['explanation'] = None
    # Image bytes, read from SAVE_DIR when cached and downloaded otherwise.
    row['image_1'] = get_image(q['image_path'], SAVE_DIR)
    row['image_2'] = None
    row['image_3'] = None
    row['image_4'] = None
    row['image_5'] = None
    row['img_type'] = str(['Picture'])

    row['options'] = str(q['options'])

    # The answer is stored as the letter of the correct option.
    row['answer'] = OPTIONS[q['options'].index(q['correct_answer'])]

    row['question_type'] = 'multiple-choice'
    row['options_difficulty'] = q['difficulty']

    row['subfield'] = q['kingdom'] + ' / ' + q['tag']
    row['metadata']['source'] = 'EPPO'
    row['metadata']['author'] = 'EPPO'
    row['metadata']['license'] = ''
    row['metadata']['url'] = q['image_path']
    row['metadata']['eppo_code'] = str(q['EPPO_code'])
    row['metadata']['gbif_key'] = str(q['GBIF_key'])
    row['metadata']['kingdom'] = q['kingdom']
    row['metadata']['tag'] = q['tag']
    row['metadata']['common_name_language'] = q['language']
    row['metadata']['language'] = 'English'
    # Store the answer text so the template placeholder "{verbose_answer}"
    # is not written into the dataset.
    row['metadata']['verbose_answer'] = q['correct_answer']

    rows.append(deepcopy(row))
    

  0%|          | 0/20648 [00:00<?, ?it/s]

100%|██████████| 20648/20648 [04:00<00:00, 85.83it/s] 


In [57]:
len(rows)

20648

In [58]:
# Shuffle the row indices and cut them at 10% and 20%: the first part is dev,
# the second is test and the remainder is the validation split.
df = pd.DataFrame(rows)
df['metadata'] = df['metadata'].apply(json.dumps)  # metadata dicts become JSON strings
index = df.index.tolist()
random.shuffle(index)
cut_points = [int(len(index)*0.1), int(len(index)*0.2)]
dev, test, validate = np.split(index, cut_points)

In [59]:
# Prefix each id with the name of the split its row was assigned to.
df.loc[dev, 'id'] = 'dev_' + df.loc[dev, 'id']
df.loc[validate, 'id'] = 'validation_' + df.loc[validate, 'id']
df.loc[test, 'id'] = 'test_' + df.loc[test, 'id']

In [60]:
len(dev), len(validate), len(test)

(2064, 16519, 2065)

In [61]:
from datasets import Features, Value, Sequence, Image, DatasetInfo

# Dataset schema: image_1..image_5 are decoded images, every other column is
# stored as a string. Column order follows the tuple below.
features = Features({
    column: Image(decode=True, id=None) if column.startswith('image_') else Value(dtype='string', id=None)
    for column in (
        'id', 'question', 'options', 'explanation',
        'image_1', 'image_2', 'image_3', 'image_4', 'image_5',
        'img_type', 'answer', 'options_difficulty', 'question_type',
        'subfield', 'metadata',
    )
})

In [70]:
from datasets import Dataset

for idx in [dev, validate, test]:
    # Rows of this split, re-indexed from 0.
    dev_set = df.loc[idx].copy().reset_index(drop=True)
    # Replace the trailing number of every id with the row's 1-based
    # position inside the split.
    dev_set['id'] = [
        f"{old_id[:old_id.rindex('_')]}_{position}"
        for position, old_id in enumerate(dev_set['id'], start=1)
    ]
    # The split name is the id prefix before the first underscore.
    name = dev_set['id'][0].split('_')[0]
    # Maximum number of rows per parquet shard (the export is disabled).
    max_len = 20000
    # for i in range(0, len(dev_set), max_len):
    #     Dataset.from_pandas(dev_set[i:i+max_len], features=features).to_parquet(f'/workdir/AGRIVQA/EPPO/{name}-{str(i//max_len).zfill(5)}-of-{str((len(dev_set)//max_len)+1).zfill(5)}.parquet')


In [63]:
# Reload the exported EPPO parquet shards, one glob pattern per split.
parquet_globs = {
    'validation': '/workdir/AGRIVQA/EPPO/validation-*.parquet',
    'dev': '/workdir/AGRIVQA/EPPO/dev-*.parquet',
    'test': '/workdir/AGRIVQA/EPPO/test-*.parquet',
}
aids = load_dataset("parquet", data_files=parquet_globs)

Generating validation split: 16519 examples [00:05, 3034.96 examples/s]
Generating dev split: 2064 examples [00:00, 4004.27 examples/s]
Generating test split: 2065 examples [00:00, 3472.06 examples/s]


# WikiHow

##### TODO: Save the procedure name, etc. in the metadata.

In [86]:
# Template record for the WikiHow questions: the build loop overwrites its
# fields in place and stores a deep copy per question.
row = dict(
    id="_agri_identification_{number}",
    question="{question}",
    options=[],
    explanation="string:{explanation}",
    image_1="{image_1}",
    image_2="{image_2}",
    image_3="{image_3}",
    image_4="{image_4}",
    image_5="{image_5}",
    img_type="{img_type}",
    answer="{answer}",
    options_difficulty="{options_difficulty}",
    question_type="{question_type}",
    subfield="{subfield}",
    metadata=dict(
        source="{source}",
        author="{author}",
        license="{license}",
        url="{url}",
        #"tag": "{tag}",
        language="{language}",
        verbose_answer="{verbose_answer}",
    ),
)

In [72]:
# Load the WikiHow questions for inspection.
# NOTE(review): the build cell further down reloads `wikihow_dict` from a
# different path ('/workdir/important_datasets/wikihow.json') — confirm which one is current.
wikihow_dict=load_json('/workdir/wikihow/wikihow.json')

In [90]:
wikihow_dict[1]

{'question': 'Write down all the steps from the "Propagating Baby Spider Plants in Water" instructions in the "Prune Lilacs" procedure.',
 'options': None,
 'correct_answer': '1. Snip off the baby spider plants on the plant’s stems.\n2. Set the baby spider plant in a small jar of distilled water.\n3. Place the baby spider plants in indirect sun and watch them grow roots.\n4. Plant the baby plant in soil when its roots are 2-3 inches (5-7.6 cm) long.',
 'options_difficulty': None,
 'question_type': 'open_question'}

In [92]:
def transform_options(options):
    """Split a newline-separated options string into a list of option texts.

    The first three characters of every line are dropped (presumably an
    enumeration prefix such as ``"A. "`` -- confirm against the raw data).

    Args:
        options: the raw options string, or a falsy value (``None`` / ``""``).

    Returns:
        A list with one entry per line of ``options``; an empty list when
        ``options`` is falsy.
    """
    if not options:
        return []
    return [line[3:] for line in options.split('\n')]

In [94]:
# Convert every raw WikiHow question into a record shaped like the `row` template.
OPTIONS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']  # answer letters, by option position
rows = []
# NOTE(review): an earlier cell loads '/workdir/wikihow/wikihow.json' instead -- confirm the canonical path.
base_dir = '/workdir/important_datasets/wikihow.json'
wikihow_dict = load_json(base_dir)

for q in wikihow_dict:
    # Provisional id; only the running number matters at this stage.
    row['id'] = f"_wikihow_{len(rows)+1}"
    row['question'] = q['question']

    # This source has no explanations and no images.
    row['explanation'] = None
    row['image_1'] = None
    row['image_2'] = None
    row['image_3'] = None
    row['image_4'] = None
    row['image_5'] = None
    row['img_type'] = ''
    # No subfield is available here; without this assignment the literal
    # template placeholder "{subfield}" would be exported in every record.
    row['subfield'] = ''

    row['options'] = transform_options(q['options'])

    if q['options']:
        # Multiple choice: the answer is the letter of the matching option.
        # list.index raises ValueError when correct_answer is not among the options.
        row['answer'] = OPTIONS[row['options'].index(q['correct_answer'])]
        row['question_type'] = 'multiple-choice'
    else:
        # Open question: the answer is the free text itself.
        row['answer'] = q['correct_answer']
        row['question_type'] = 'open-ended'

    # The options list is stored as its string representation.
    row['options'] = str(row['options'])
    # Always store the difficulty as a string (the export schema declares this
    # column as a string); a missing/falsy difficulty defaults to '5'.
    row['options_difficulty'] = str(q['options_difficulty']) if q['options_difficulty'] else '5'

    # Keep the source's own question type next to the normalised one above.
    row['metadata']['question_type'] = q['question_type']
    row['metadata']['source'] = 'WikiHow'
    row['metadata']['author'] = 'WikiHow'
    row['metadata']['license'] = ''
    # Not available for this source; blank them so the template placeholders
    # "{url}" and "{verbose_answer}" do not leak into the exported metadata.
    row['metadata']['url'] = ''
    row['metadata']['verbose_answer'] = ''
    row['metadata']['language'] = 'English'
    # `row` is reused on every iteration, so store an independent copy.
    rows.append(deepcopy(row))
    

In [96]:
# Split the rows into dev / test / validation index sets (10% / 10% / 80%).
SPLIT_SEED = 42  # fixed seed so a fresh run reproduces the same split

df = pd.DataFrame(rows)
# `metadata` is a nested dict per row; serialise it to a JSON string.
df.metadata = df.metadata.apply(json.dumps)
index = df.index.tolist()
# Shuffle with a private, seeded generator: deterministic and does not touch
# the global `random` state used elsewhere in the notebook.
random.Random(SPLIT_SEED).shuffle(index)
# Cut points at 10% and 20%: first slice -> dev, second -> test, rest -> validate.
dev, test, validate = np.split(index, [int(len(index)*0.1), int(len(index)*0.2)])

In [97]:
# Prepend the split name to every id in that split, e.g. "_wikihow_7" -> "dev__wikihow_7".
for split_prefix, split_rows in (("dev", dev), ("validation", validate), ("test", test)):
    df.loc[split_rows, 'id'] = df.loc[split_rows].apply(
        lambda record, prefix=split_prefix: f"{prefix}_{record['id']}",
        axis=1,
    )

In [98]:
# Show the number of rows in each split, in the order (dev, validation, test).
len(dev), len(validate), len(test)

(214, 1717, 215)

In [99]:
from datasets import Features, Value, Sequence, Image, DatasetInfo

# Column order of the exported dataset.
_FEATURE_COLUMNS = (
    'id', 'question', 'options', 'explanation',
    'image_1', 'image_2', 'image_3', 'image_4', 'image_5',
    'img_type', 'answer', 'options_difficulty', 'question_type',
    'subfield', 'metadata',
)

# Export schema: the five image_N columns are decoded images, every other
# column (including the serialised options and metadata) is a string.
features = Features({
    column: (
        Image(decode=True, id=None)
        if column.startswith('image_')
        else Value(dtype='string', id=None)
    )
    for column in _FEATURE_COLUMNS
})

In [100]:
from datasets import Dataset

# Write every split to disk as parquet shards named
# {split}-{shard:05d}-of-{total:05d}.parquet.
max_len = 10000  # maximum number of rows per shard
out_dir = '/workdir/AGRIVQA/WikiHow'
os.makedirs(out_dir, exist_ok=True)  # make sure the target directory exists

for idx in [dev, validate, test]:
    # Renumber the ids inside the split: keep everything up to the last '_'
    # and replace the trailing number with a 1-based running index.
    dev_set = df.loc[idx].copy().reset_index(drop=True)
    dev_set['index'] = range(1, len(dev_set)+1)
    dev_set['id'] = dev_set.apply(lambda x: f"{x['id'][:x['id'].rindex('_')]}_{x['index']}", axis=1)
    # The helper column is only needed for the renumbering above.
    dev_set = dev_set.drop(columns=['index'])
    # The split name is the part of the id before the first '_'.
    name = dev_set.id[0].split('_')[0]
    # Ceiling division: the previous `len // max_len + 1` reported one shard
    # too many whenever the split size was an exact multiple of max_len.
    n_shards = (len(dev_set) + max_len - 1) // max_len
    for i in range(0, len(dev_set), max_len):
        shard_path = f'{out_dir}/{name}-{str(i//max_len).zfill(5)}-of-{str(n_shards).zfill(5)}.parquet'
        Dataset.from_pandas(dev_set[i:i+max_len], features=features).to_parquet(shard_path)


Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 656.42ba/s]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 1252.70ba/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 1213.63ba/s]


In [101]:
# Glob patterns for the WikiHow parquet shards, keyed by split name.
wikihow_parquet_globs = {
    'validation': '/workdir/AGRIVQA/WikiHow/validation-*.parquet',
    'dev': '/workdir/AGRIVQA/WikiHow/dev-*.parquet',
    'test': '/workdir/AGRIVQA/WikiHow/test-*.parquet',
}
# Read the shards back through the generic "parquet" builder, one split per key.
aids = load_dataset("parquet", data_files=wikihow_parquet_globs)

Generating validation split: 1717 examples [00:00, 28954.96 examples/s]
Generating dev split: 214 examples [00:00, 23559.79 examples/s]
Generating test split: 215 examples [00:00, 21292.89 examples/s]


In [102]:
# Display the first record of the reloaded validation split as a spot check.
aids['validation'][0]

{'id': 'validation__wikihow_1',
 'question': 'What is the next step in the "Choosing a Filler Material" instructions in the "Prune Lilacs" procedure?\n\nSteps:\n1. Use collected soda bottles or cans for a low-cost, long-lasting option.\n2. Try a smaller inverted pot if you have an extra one.\n3. Stand a tall, rectangular wood beam into your planter for a simple choice.\n4. Use rocks or broken crockery for a sturdy option.\n5. Choose wood chip mulch or pinecones as a lightweight choice.\n6. Use Styrofoam packing peanuts if you have them..',
 'options': "['Use glass jars, plastic Easter eggs, or metal containers as a long-lasting choice.', 'Try paper bags, plastic Easter eggs, or cardboard boxes as a long-lasting choice.', 'Try plastic Easter eggs, juice bottles, or containers as a long-lasting choice.', 'Try plastic Easter eggs, milk jugs, or containers as a long-lasting choice.']",
 'explanation': None,
 'image_1': None,
 'image_2': None,
 'image_3': None,
 'image_4': None,
 'image_5':