In [34]:
from tqdm import tqdm
import os
from datasets import load_dataset, concatenate_datasets
import json
import nltk
import re
import difflib
from copy import deepcopy

import asyncio
import math
import json
import numpy as np
import random
from random import shuffle, sample
import pandas as pd
from ollama import AsyncClient
import json

from collections import Counter



# Utils

In [2]:
def save_to_json(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
        
def load_from_json(filename):
    with open(filename, 'r') as f:
        return json.load(f)

In [29]:
client_ollama = AsyncClient(
  host='http://192.168.1.5:11434',
)

async def chat(content,image_path):
    message = {'role': 'user', 'content': content}
    if image_path != None:
        message.update({'images': [image_path]})
    response = await client_ollama.chat(model='llama3.3', messages=[message],options={"temperature":0.2})
    return response['message']['content']

# Dataset

In [4]:
task_name = '500P'

dataset_name = "parquet"
data_files = {
    "dev": "/workdir/important_datasets/AGRIVQA/"+task_name+"/dev-00000-of-00001.parquet",
    "test": "/workdir/important_datasets/AGRIVQA/"+task_name+"/test-00000-of-00001.parquet",
    "validation": "/workdir/important_datasets/AGRIVQA/"+task_name+"/validation-00000-of-00001.parquet"
}
split = "validation"

dataset = load_dataset(dataset_name,data_files=data_files)

Generating dev split: 16 examples [00:00, 2462.98 examples/s]
Generating test split: 2021 examples [00:00, 64267.00 examples/s]
Generating validation split: 18180 examples [00:00, 72761.15 examples/s]


## Unsplit Dataset

In [5]:
# Load dataset with splits

# Combine (concatenate) splits into a single dataset
combined_dataset = concatenate_datasets([dataset['dev'], dataset['test'], dataset['validation']])

# Now combined_dataset contains data from all splits
print(combined_dataset)


Dataset({
    features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
    num_rows: 20217
})


In [6]:
dataset=combined_dataset
dataset

Dataset({
    features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
    num_rows: 20217
})

## Collect all contexts as book_title: chapter_title

In [67]:
contexts_set = set()

for doc in dataset:
    metadata = eval(doc['metadata'])
    book_title = metadata['book_title'].replace(' – The Producer Asks, Embrapa Answers', 's')
    chapter_title = metadata['chapter_title'].replace('Strawberry production in greenhouses is a more common topic, but I assume you meant... ', '')
    contexts_set.add(f"{book_title}: {chapter_title}")

contexts = "\n".join(contexts_set)

In [68]:
print(contexts)

Banana: Varieties
Sesame: Harvest and Processing
Castor Bean: Cuphea seed oil
Post-Harvest of Vegetables: Vegetable Quality
Soybean: Seed Production
Corn: Second Crop Corn
Genetic Resources: Principles and Concepts on Genetic Resources
Peanut: Transfer of Technology and Production Organization
Small Fruits: Main diseases and their control/management in production areas
Grapes: Varieties
Citrus: Irrigation and Fertilization by Irrigation
Beef Cattle: Animal Health
Grapes: Weather or Climate
Pineapple: Nutritional Value and Fruit Processing
Grapes: Harvest and Post-Harvest
Banana: Processing and Products
Cotton: Colored Cotton in Brazil and around the World
Castor Bean: Improvement, Crops and Biotechnology
Cassava: Climate and Soil
Sorghum: Weed Management
Vegetables: Wild Plants and Solarization
Irrigated Fruit Farming: Organic Soil Management
Beans: Interplanting
Citrus: Post-harvest
Cotton: Cotton Harvest
Cassava: Selection and Preparation of Sowing Material
Cotton: Direct Cotton Plan

## Classify context based on category

In [69]:
print(contexts)

Banana: Varieties
Sesame: Harvest and Processing
Castor Bean: Cuphea seed oil
Post-Harvest of Vegetables: Vegetable Quality
Soybean: Seed Production
Corn: Second Crop Corn
Genetic Resources: Principles and Concepts on Genetic Resources
Peanut: Transfer of Technology and Production Organization
Small Fruits: Main diseases and their control/management in production areas
Grapes: Varieties
Citrus: Irrigation and Fertilization by Irrigation
Beef Cattle: Animal Health
Grapes: Weather or Climate
Pineapple: Nutritional Value and Fruit Processing
Grapes: Harvest and Post-Harvest
Banana: Processing and Products
Cotton: Colored Cotton in Brazil and around the World
Castor Bean: Improvement, Crops and Biotechnology
Cassava: Climate and Soil
Sorghum: Weed Management
Vegetables: Wild Plants and Solarization
Irrigated Fruit Farming: Organic Soil Management
Beans: Interplanting
Citrus: Post-harvest
Cotton: Cotton Harvest
Cassava: Selection and Preparation of Sowing Material
Cotton: Direct Cotton Plan

In [108]:
categories = [
    "Soil and Fertilization Management",
    "Irrigation and Water Management",
    "Pest, Disease, and Weed Management",
    "Plant Breeding and Genetic Improvement",
    "Post-Harvest Management and Processing",
    "Livestock Management",
    "Crop-Livestock-Forestry Integration",
    "Environmental and Geotechnological Considerations",
    "Sustainable and Organic Agriculture",
    "Economics, Market, and Development",
    "Biotechnology and Advanced Techniques",
    "Animal Nutrition and Breeding",
    "Nutrition, Health Benefits, and Food Processing",
    "Agricultural Experimentation, Extension, and Technology Transfer"
]

In [107]:
shuffle(categories)
categories_string = '\n'.join(categories)
print(categories_string)

Livestock Management
Miscellanea
Nutrition, Health Benefits, and Food Processing
Environmental and Geotechnological Considerations
Soil and Fertilization Management
Irrigation and Water Management
Sustainable and Organic Agriculture
Plant Breeding and Genetic Improvement
Crop-Livestock-Forestry Integration
Post-Harvest Management and Processing
Pest, Disease, and Weed Management
Biotechnology and Advanced Techniques
Agricultural Experimentation, Extension, and Technology Transfer
Economics, Market, and Development
Animal Nutrition and Breeding


## Classify contexts via LLM

In [110]:
prompt = '''
Choose from the following options the one that best categorizes the title.
[Title]
{title}
[End of Title]

[Options]
{categories_string}
[End of Options]
Your answer must include only the chosen option and nothing else. If you cannot find a match answer 'Miscellanea'.
'''

In [112]:
categories_dict = {c: [] for c in categories+['Miscellanea']}
for context in tqdm(list(contexts_set)):
    response = await chat(prompt.format(title=context, categories_string=categories_string), None)
    response = response.strip()
    if response == 'Miscellanea':
        print(f'Toppettignao: {context}')
    if response not in categories:
        print(f'Response: {response}, Context: {context}')
    else:
        categories_dict[response].append(context)

 23%|██▎       | 158/673 [02:17<07:03,  1.22it/s]

Toppettignao: Castor Bean: Curious facts or Curiosities
Response: Miscellanea, Context: Castor Bean: Curious facts or Curiosities


 29%|██▉       | 198/673 [02:52<06:15,  1.26it/s]

Toppettignao: Apple: Generalities
Response: Miscellanea, Context: Apple: Generalities


 32%|███▏      | 217/673 [03:09<06:00,  1.27it/s]

Toppettignao: Castor Oil Plant: Curious facts or Curiosities
Response: Miscellanea, Context: Castor Oil Plant: Curious facts or Curiosities


100%|██████████| 673/673 [09:52<00:00,  1.14it/s]


In [77]:
for k, v in categories_dict.items():
    print(k, len(v))
    for x in v:
        print('\t\t', x)

Irrigation and Water Management 33
		 Citrus: Irrigation and Fertilization by Irrigation
		 Irrigated Fruit Farming: Soil-Water-Plant Relationship
		 Cowpea Beans: Irrigation
		 Irrigated Fruit Farming: Grapefruit Management
		 Irrigated Fruit Farming: Fertigation
		 Cashew: Irrigation and Fertilization by Irrigation
		 Irrigated Fruit Farming: Banana Culture Management
		 Sesame: Irrigation
		 Corn: Irrigation
		 Small Fruits: Principles on irrigation and fertigation
		 Beans: Irrigation
		 Peach, Nectarine, and Plum: Irrigation and Fertigation
		 Irrigated Fruit Farming: Vineyard Management
		 Irrigated Fruit Farming: Irrigation
		 Rice: Irrigation
		 Mango: Irrigation
		 Sorghum: Irrigation
		 Gardens: Irrigation
		 Vegetables: Water Management
		 Coconut: Irrigation
		 Papaya: Irrigation and Fertilization by Irrigation
		 Pear: Irrigation and Fertilization by Irrigation
		 Castor Bean: Irrigation and Drainage
		 Apple: Irrigation and Fertilization by Irrigation
		 Banana: Irrigatio

## We remove a category and reclassify its elements, because it is to small and prone to errors.

In [86]:
reclassify = [
'Cashew: Establishment of Orchard and Cultural Practices',
'Irrigated Fruit Farming: Mango Culture Management',
'Passion Fruit: Passion Fruit Production in Associated Systems or Agroforestry',
'Pear: Orchard Installation'
]

new_categories = [c for c in categories if not c.startswith('Agroforestry and Perennial Farming')]
new_string = '\n'.join(new_categories)
new_categories_dict = {c: [] for c in new_categories}

for context in tqdm(reclassify):
    response = await chat(prompt.format(title=context, categories_string=new_string), None)
    response = response.strip()
    if response not in new_categories:
        print(f'Response: {response}, Context: {context}')
    else:
        new_categories_dict[response].append(context)



100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


In [87]:
for k, v in new_categories_dict.items():
    if len(v)>0:
        print(k, len(v))
        for x in v:
            print('\t\t', x)

Irrigation and Water Management 1
		 Irrigated Fruit Farming: Mango Culture Management
Agricultural Experimentation, Extension, and Technology Transfer 2
		 Cashew: Establishment of Orchard and Cultural Practices
		 Pear: Orchard Installation
Crop-Livestock-Forestry Integration 1
		 Passion Fruit: Passion Fruit Production in Associated Systems or Agroforestry


In [90]:
for category, contexts in new_categories_dict.items():
    for context in contexts:
        categories_dict[category].append(context)

categories_dict = {k: v for k, v in categories_dict.items() if k != 'Agroforestry and Perennial Farming'}

In [92]:
for k, v in categories_dict.items():
    if len(v)>0:
        print(k, len(v))
        for x in v:
            print('\t\t', x)

Irrigation and Water Management 34
		 Citrus: Irrigation and Fertilization by Irrigation
		 Irrigated Fruit Farming: Soil-Water-Plant Relationship
		 Cowpea Beans: Irrigation
		 Irrigated Fruit Farming: Grapefruit Management
		 Irrigated Fruit Farming: Fertigation
		 Cashew: Irrigation and Fertilization by Irrigation
		 Irrigated Fruit Farming: Banana Culture Management
		 Sesame: Irrigation
		 Corn: Irrigation
		 Small Fruits: Principles on irrigation and fertigation
		 Beans: Irrigation
		 Peach, Nectarine, and Plum: Irrigation and Fertigation
		 Irrigated Fruit Farming: Vineyard Management
		 Irrigated Fruit Farming: Irrigation
		 Rice: Irrigation
		 Mango: Irrigation
		 Sorghum: Irrigation
		 Gardens: Irrigation
		 Vegetables: Water Management
		 Coconut: Irrigation
		 Papaya: Irrigation and Fertilization by Irrigation
		 Pear: Irrigation and Fertilization by Irrigation
		 Castor Bean: Irrigation and Drainage
		 Apple: Irrigation and Fertilization by Irrigation
		 Banana: Irrigatio

In [None]:
for category, contexts in categories_dict.items():
    if category == 'Agroforestry and Perennial Farming':
        for context in contexts:
            for k, v in new_categories_dict:
                if context in v:
                    categories_dict[k] 
        categories_dict[]

## Manually adjust some classifications

In [None]:
def find_category(context):
    for category, contexts in categories_dict.items():
        if context in contexts:
            return category

In [102]:
context = 'Rice: Chemical treatment'
categories_dict[find_category(context)].remove(context)
categories_dict['Pest, Disease, and Weed Management'].append(context)
print(find_category(context))

Pest, Disease, and Weed Management


In [93]:
for c in categories_dict:
    print(c)

Irrigation and Water Management
Plant Breeding and Genetic Improvement
Animal Nutrition and Breeding
Livestock Management
Agricultural Experimentation, Extension, and Technology Transfer
Pest, Disease, and Weed Management
Soil and Fertilization Management
Crop-Livestock-Forestry Integration
Economics, Market, and Development
Nutrition, Health Benefits, and Food Processing
Sustainable and Organic Agriculture
Biotechnology and Advanced Techniques
Environmental and Geotechnological Considerations
Post-Harvest Management and Processing


In [None]:
#backup = deepcopy(categories_dict)

## Save dictionary categories-contexts

In [None]:
data = categories_dict
filename = '/workdir/500P_categories.json'
save_to_json(data, filename)

## Double Check

In [104]:
for k, v in categories_dict.items():
    if len(v)>0:
        print(k, len(v))
        for x in v:
            print('\t\t', x)

Irrigation and Water Management 34
		 Citrus: Irrigation and Fertilization by Irrigation
		 Irrigated Fruit Farming: Soil-Water-Plant Relationship
		 Cowpea Beans: Irrigation
		 Irrigated Fruit Farming: Grapefruit Management
		 Irrigated Fruit Farming: Fertigation
		 Cashew: Irrigation and Fertilization by Irrigation
		 Irrigated Fruit Farming: Banana Culture Management
		 Sesame: Irrigation
		 Corn: Irrigation
		 Small Fruits: Principles on irrigation and fertigation
		 Beans: Irrigation
		 Peach, Nectarine, and Plum: Irrigation and Fertigation
		 Irrigated Fruit Farming: Vineyard Management
		 Irrigated Fruit Farming: Irrigation
		 Rice: Irrigation
		 Mango: Irrigation
		 Sorghum: Irrigation
		 Gardens: Irrigation
		 Vegetables: Water Management
		 Coconut: Irrigation
		 Papaya: Irrigation and Fertilization by Irrigation
		 Pear: Irrigation and Fertilization by Irrigation
		 Castor Bean: Irrigation and Drainage
		 Apple: Irrigation and Fertilization by Irrigation
		 Banana: Irrigatio

In [28]:
print(response)

This appears to be a list of topics related to agriculture, specifically focusing on various crops, farming practices, and agricultural management. The topics cover a wide range of subjects, including:

1. Crop-specific information (e.g., wheat, soybean, corn, cotton, etc.)
2. Farming practices (e.g., irrigation, fertilization, pest management, etc.)
3. Agricultural management (e.g., soil conservation, crop rotation, etc.)
4. Post-harvest handling and storage
5. Marketing and trade
6. Climate change and environmental modeling
7. Genetic resources and conservation

Some of the specific topics that caught my attention include:

* "Wheat: Integrated Pest Management"
* "Soybean: Organic Soybean"
* "Peach, Nectarine, and Plum: Health benefits"
* "Banana: Integrated Production"
* "Coconut: Composting"
* "Genetic Resources: In Situ Conservation of Genetic Resources"

These topics suggest that the list is focused on providing information on sustainable agricultural practices, crop management, 

In [None]:
categories = [Crop Management and Systems, Soil Management, Irrigation and Water Management, Pest and Disease Management, Genetic Improvement and Biotechnology, Post-Harvest Management and Processing, Plant Nutrition and Fertilization, Animal Husbandry and Health, Agro-Economic Aspects, Sustainability and Environmental Management, Cultural Practices, Integrated Systems, Harvesting Techniques, Climate and Weather Adaptation, Agrometeorology and Geoinformation, Organic and Sustainable Production, Legislation and Certification, Rural Development and Technology Transfer, Ecophysiology and Plant Biology, Value Addition and Product Development]

In [None]:
def get_prompt(prompt):
    question=doc['question']
    options=doc['options']
    if not categories:
        return '[Question]\n'+question+'\nOptions: '+options+'\n[End of Question]\n'+prompt
    return '[Question]\n'+question+'\nOptions: '+options+'\n[End of Question]\n'+'[Categories]\n'+categories+'\n[End of Categories]\n'+prompt

In [11]:
prompt = '''Try to assign one category from the list of categories to the previous question, if the question is not related to any of them create a new category. 
This question is taken from an agronomy exam.
Your answer should start with one line containing only the category name.'''

In [13]:
print(get_prompt(dataset[10], prompt))

[Question]
Pink boll worm is a serious pest of?
Options: ['Mustard', 'Cotton', 'Gram', 'Soybean']
[End of Question]
Try to assign one category from the list of categories to the previous question, if the question is not related to any of them create a new category. 
This question is taken from an agronomy exam.
Your answer should start with one line containing only the category name.


In [15]:
ds = sample(list(dataset), 2000)

In [264]:
# , so try to be more specific than 'Agriculture' if it is a subfield.

In [277]:
categories = {
    'Agricultural Engineering',
    'Animal Science',
    'General Agriculture',
    'Agricultural Genetics',
    'Agricultural Health',
    'Agricultural History',
    'Plant Breeding',
    'Soil Science',
    'Agricultural Statistics',
    'Wordplay and Analogies',
    'Economics',
    'Geography and History',
    'Geology',
    'Geometry',
    'Health Science',
    'Leadership and Management',
    'Math Concepts',
    'Mathematics',
    'Sports Knowledge'
}


In [323]:
categories = {'Agricultural Economics',
 'Agricultural Education',
 'Agricultural Engineering',
 'Agricultural Genetics',
 'Agricultural Geography',
 'Agricultural Health',
 'Agricultural History',
 'Agricultural Mathematics',
 'Agricultural Science',
 'Agricultural Statistics',
 'Anatomy',
 'Animal Science',
 'Biochemistry',
 'Economics',
 'English Language',
 'Environmental Science',
 'General Agriculture',
 'General Knowledge',
 'Geography and History',
 'Geology',
 'Geometry',
 'Health Science',
 'Leadership and Management',
 'Math Concepts',
 'Mathematics',
 'Plant Biology',
 'Plant Breeding',
 'Social Stratification',
 'Soil Science',
 'Sports Knowledge',
 'Wordplay and Analogies'}

In [None]:

#categories = set()
for doc in tqdm(dataset):
    if not categories:
        string_categories='[]'
    else:
        string_categories=str(list(categories))
    response = await chat(get_prompt(doc,prompt,string_categories),None)
    try:
        category = response.split('\n')[0]
        if category not in categories:
            print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        categories.add(category)
    except:
        print('Gnappows')

  4%|▎         | 230/6253 [00:32<14:16,  7.03it/s]

Category: Environmental Science | Question: Ozone concentration in the atmosphere is reduced by? | Options: ['CO2 and O2', 'H2S and CH4', 'CFC and Cl2', 'N2 and N2O']


100%|██████████| 6253/6253 [15:17<00:00,  6.82it/s]


In [324]:
from collections import defaultdict
categories_dict = defaultdict(list)

for doc in tqdm(dataset):
    string_categories=str(list(categories))
    response = await chat(get_prompt(doc,prompt,string_categories),None)
    try:
        category = response.split('\n')[0]
        if category not in categories:
            print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        else:
            categories_dict[category].append(doc)
    except:
        print('Gnappows')

 41%|████      | 2555/6253 [06:16<09:07,  6.76it/s]

Category: Computer Security | Question: Which of the following is a threat for electronic payment systems? | Options: ['Computer Virus', 'Computer Worms', 'Trojan Horses', 'All of the above']


100%|██████████| 6253/6253 [15:18<00:00,  6.81it/s]


In [332]:
for key in categories_dict:
    print(key,len(categories_dict[key]))
    print()
    for i in range(3):
        print(categories_dict[key][i]['question'])
    print()

General Knowledge 56

Japanese style of gardens also known as?
FOB in international trade stands for?
In Indians context meaning of the term extension’ is transfer of technology while in USA it is?

General Agriculture 1240

Identify the correct order of the following methods of irrigation as per their water-use efficiency from the maximum to minimum efficiency.?
The insect, which attacks potato both in field as well as in storage is?
Disc Harrow which type tillage instrument?

Animal Science 167

A ______ dry matter intake100 kg body weight is recommended for breeding bull?
Which Fish has the highest protein content?
What is the Average Gestation period of buffalo? / 

Soil Science 394

Which of the following is the CORRECT sequence of soil-water erosion?
What is the total number of soil orders?
The most important feature of soil taxonomy is that it is based upon?

Plant Breeding 967

Synthetic seeds are…..?
Sundaram is a very high yielding clone of?
A variety developed by mixing the 

In [373]:
prompt_agronomy = '''Evaluate the exam question provided and determine if it pertains to the field of agronomy. Please begin your response with a single word: 'Yes' if the question pertains to or is relevant in the context of agronomy, or 'No' if it is not.'''

In [None]:
dataset_light = list(dataset)[:200]

In [374]:
agronomy_dict = {'Yes': [], 'No': []}

for doc in tqdm(dataset):
    response = await chat(get_prompt(doc, prompt_agronomy),None)
    if response.lower().startswith('yes'):
        agronomy_dict['Yes'].append(doc)
    elif response.lower().startswith('no'):
        agronomy_dict['No'].append(doc)        
    else:
        print(f"Question: {doc['question']} | Options: {doc['options']}")

100%|██████████| 6253/6253 [32:00<00:00,  3.26it/s] 


In [377]:
print(len(agronomy_dict['No']))

1670


In [384]:
no = sample(agronomy_dict['No'], 40)
for doc in no:
    print(f"Question: {doc['question']} | Options: {doc['options']}")

Question: Which one of the following compounds forms the backbone of fats and oils? | Options: ['Glycerol', 'Glucose', 'Palmitiacid', 'Amino alcohol']
Question: National Botanical Research Institute is located at? | Options: ['New Delhi', 'Lucknow', 'Vadodara', 'Chennai']
Question: Blood group system was discovered by? | Options: ['Camillo Golgi', 'Ernst Haeckel', 'Thomas Cooley', 'Karl Landsteiner']
Question: “The first task of this Assembly is to free India through a new Constitution, to feed the starving people and to cloth the naked masses and to give every Indian the fullest opportunity to develop himself according to his capacity.” This quotation was given by? | Options: ['Pandit Jawaharlal Nehru', 'Dr. B. R. Ambedkar', 'Dr. S. Radha Krishnan', 'Dr. Rajendra Prasad']
Question: Nucleic acids are? | Options: ['Micro molecular compounds', 'Macro molecular compounds', 'Micro and macro molecular compounds', 'Non-molecular compounds']
Question: Watershed is a? | Options: ['Hydrological

In [383]:
prompt_categories = '''Try to assign to the previous question one category from the list of categories, if the question is not related to any of them create a new category. 
This question is taken from an agronomy exam.
Your answer should start with one line containing only the category name.'''

In [386]:
categories = {'Agricultural Economics',
 'Agricultural Education',
 'Agricultural Engineering',
 'Agricultural Genetics',
 'Agricultural Geography',
 'Agricultural Health',
 'Agricultural History',
 'Agricultural Mathematics',
 'Agricultural Science',
 'Agricultural Statistics',
 'Anatomy',
 'Animal Science',
 'Biochemistry',
 'Economics',
 'English Language',
 'Environmental Science',
 'General Agriculture',
 'General Knowledge',
 'Geography and History',
 'Geology',
 'Geometry',
 'Health Science',
 'Leadership and Management',
 'Math Concepts',
 'Mathematics',
 'Plant Biology',
 'Plant Breeding',
 'Social Stratification',
 'Soil Science',
 'Sports Knowledge',
 'Wordplay and Analogies'}

In [398]:
categories = {
    'Agricultural Economics',
    'Agricultural Education',
    'Agricultural Engineering',
    'Genetics and Breeding',
    'Sustainable Farming',
    'Agricultural Health',
    'Agricultural History',
    'Agricultural Geography',
    'Botany',
    'Data Science in Agriculture',
    'Climate and Agriculture',
    'Animal Science',
    'Biochemistry in Agriculture',
    'Environmental Economics',
    'Farm Management',
    'Food Science',
    'Irrigation and Water Management',
    'Plant Protection',
    'Precision Agriculture',
    'Rural Sociology',
    'Soil Science',
    'Crop Production',
    'Weed and Pest Management'
}

In [396]:
from collections import defaultdict

#categories = set()
categories_dict = defaultdict(list)

for doc in tqdm(agronomy_dict['Yes']):
    if not categories:
        string_categories='[]'
    else:
        string_categories=str(list(categories))
    response = await chat(get_prompt(doc,prompt_categories,string_categories),None)
    try:
        category = response.split('\n')[0]
        if category not in categories:
            print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        #categories.add(category)
        categories_dict[category].append(doc)
    except:
        print('Gnappows')

  5%|▌         | 245/4583 [00:34<10:41,  6.76it/s]

Category: Food Science in Agriculture | Question: The Indian name for clarified butterfat is…? | Options: ['(A) cream', '(B) buttermilk', '(C) dahi', '(D) ghee']


  6%|▌         | 264/4583 [00:36<09:52,  7.29it/s]

Category: Agriculture | Question: Golden revolution is related to? | Options: ['Dairy', 'Pulses', 'Agriculture', 'Horticulture']


 18%|█▊        | 839/4583 [01:55<08:18,  7.52it/s]

Category: Plant Anatomy | Question: Complex tissue is? | Options: ['Parenchyma', 'Collenchyma', 'Sclerenchyma', 'Phloem']


 28%|██▊       | 1304/4583 [03:00<06:52,  7.96it/s]

Category: Plant Physiology | Question: Name the condition in which protoplast of the plant cell shrinks away.? | Options: ['Turgid', 'Plasmolysis', 'Flaccid', 'Rigid']


 30%|██▉       | 1356/4583 [03:08<07:24,  7.26it/s]

Category: Plant Physiology | Question: Match List I with List II
Table:
| List I | List II |
|--------|---------|
| A.Phototropism | I.Bending towards soil |
| B.Geotropism | II.Response to day length |
| C.Thigmotropism | III.Bending towards light |
| D.Photoperiodism | IV.Touch me not |

Choose the correct answer from the options given below | Options: ['A-I, B-II, C-IV, D-III', 'A-II, B-III, C-I, D-IV', 'A-IV, B-II, G-III, D-I', 'A-III, B-I, C-IV, D-II']


 43%|████▎     | 1953/4583 [04:31<05:50,  7.51it/s]

Category: Community Ecology | Question: Biotic components, producers, consumers and decomposers are structural components of? | Options: ['Habitat', 'Community Ecology', 'Population Ecology', 'Ecosystem']


 47%|████▋     | 2134/4583 [04:56<05:52,  6.95it/s]

Category: Agricultural Geography | Question: Indian Grassland and Fodder Research Institute is located at? | Options: ['Barapani', 'Jhansi', 'Ranchi', 'Patna']


 50%|████▉     | 2276/4583 [05:16<04:56,  7.77it/s]

Category: Botany | Question: Inflorescence of fig is known as? | Options: ['Hypanthodium', 'Panicle', 'Catkin', 'Balusta']


 53%|█████▎    | 2422/4583 [05:36<05:00,  7.19it/s]

Category: Plant Physiology | Question: Which plant hormone causes plant tropism? | Options: ['Cytokinin', 'Auxin', 'Ethylene', 'GA3']


 62%|██████▏   | 2819/4583 [06:31<03:49,  7.69it/s]

Category: Plant Anatomy | Question: Conjoint, collateral and open vascular bundles are found in a? | Options: ['Dicot stem', 'Monocot stem', 'Dicot root', 'Momocot root']


 64%|██████▍   | 2934/4583 [06:47<03:43,  7.39it/s]

Category: Crop Ecology | Question: Crop Ecology means.? | Options: ['(A) Relationship of crop plants to their cropping system', '(B) Relationship of crop plants to their distribution', '(C) Relationship of crop plants to economic factors', '(D) Relationship of crop plants to their environment']


 69%|██████▉   | 3178/4583 [07:21<03:02,  7.69it/s]

Category: Plant Physiology | Question: An aquatic plant with floating leaf have? | Options: ['Stomata', 'Stomata on petiole only', 'Stomata on upper surface', 'Stomata on lower surface']


 71%|███████   | 3256/4583 [07:32<03:10,  6.96it/s]

Category: Plant Biology | Question: In plant cells non-pigmented plastids are called? | Options: ['Chloroplast', 'Leucoplast', 'Amyloplast', 'Chromoplast']


 72%|███████▏  | 3288/4583 [07:36<02:50,  7.58it/s]

Category: Plant Anatomy | Question: The largest plant cells are? | Options: ['Xylem vessel cell', 'Parenchyma cells', 'Sieve tube cells', 'Sclerenchyma fibers']


 73%|███████▎  | 3364/4583 [07:47<02:42,  7.49it/s]

Category: Plant Physiology | Question: The cell to cell continuity is maintained by? | Options: ['Middle lamella', 'Thickening of lignin', 'Plasmodesmata', 'Cell membrane']


 78%|███████▊  | 3564/4583 [08:14<02:17,  7.43it/s]

Category: Plant Physiology | Question: Which of the following is the Volatile Plant Hormone? | Options: ['(A) Auxin', '(B) Ethylene', '(C) Cytokinins', '(D) Abscisic acid']


 80%|███████▉  | 3648/4583 [08:26<02:13,  7.01it/s]

Category: Plant Anatomy | Question: The hypodermis in moncot stem is? | Options: ['Parenchymatous', 'Chlorenchymatous', 'Collenchymatous', 'Sclerenchymatous']


 91%|█████████ | 4166/4583 [09:38<00:59,  7.06it/s]

Category: Plant Physiology | Question: Study of the plant processes is called? | Options: ['Biotechnology', 'Plant physiology', 'Plant molecular biology', 'Ecology']


 94%|█████████▍| 4324/4583 [10:01<00:35,  7.31it/s]

Category: Plant Anatomy | Question: The cork in dicotyledonous plants is formed by? | Options: ['Phellogen', 'Phelloderm', 'Phellum', 'Cambium']


100%|██████████| 4583/4583 [10:36<00:00,  7.20it/s]


In [407]:
for key in categories_dict:
        
        print(key, len(categories_dict[key]))

Irrigation and Water Management 152
Animal Science 167
Soil Science 633
Plant Protection 503
Genetics and Breeding 927
Agricultural Engineering 81
Geography in Agriculture 43
Climate and Agriculture 56
Crop Production 1296
Agricultural Economics 170
Agricultural Health 5
Data Science in Agriculture 19
Biochemistry in Agriculture 152
Agricultural History 81
Environmental Economics 7
Precision Agriculture 19
Weed and Pest Management 111
Food Safety 14
Agricultural Education 45
Farm Management 54
Sustainable Farming 25
Food Science in Agriculture 1
Agriculture 1
Plant Anatomy 5
Plant Physiology 7
Community Ecology 1
Agricultural Geography 1
Rural Sociology 4
Botany 1
Crop Ecology 1
Plant Biology 1


In [406]:
for c in categories_dict['Data Science in Agriculture']:
    #agronomy_dict['Yes'].remove(c)
    print(c['question'])

The value of r (correlation coefficient) ranges between?
Which test is used to test the significance of the difference between two means?
Which of the following is ideal measure of disperson?
The correlation coefficient is used to determine?
Coefficient of variation is expressed as percentage of?
Analysis of variance is a statistical method of comparing the of several populations?
What will be the error d.f. for a R.B.D. field experiment with 8 treatments and 3 replications?
Comprehension:
The table given here shows the production of five types of tractors by a company in the year 2012 to 2017. Study the table and answer the questions that follow
Table: Production of Tractors by a Company
Table:
| Company /Year | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | Total |
|---------------|------|------|------|------|------|------|-------|
| A | 8 | 20 | 16 | 17 | 21 | 6 | 88 |
| B | 16 | 10 | 14 | 12 | 12 | 14 | 78 |
| C | 21 | 17 | 16 | 15 | 13 | 8 | 90 |
| D | 4 | 6 | 10 | 16 | 20 | 31 | 87 |


# Categorize Agronomy questions

## Load questions that are about agronomy

In [14]:
dataset = load_from_json('/workdir/agriexam_category.json')
dataset

[{'id': 'dev__agriexam_2',
  'question': 'Identify the correct order of the following methods of irrigation as per their water-use efficiency from the maximum to minimum efficiency.?',
  'options': "['Continuous flowing> Flooding >Cablegation> Bubbler', 'Bubbler >Cablegation> Flooding > Continuous flowing', 'Cablegation> Flooding » Bubbler > Continuous flowing', 'Bubbler> Flooding > Continuous flowing Cablegation']",
  'explanation': None,
  'image_1': None,
  'image_2': None,
  'image_3': None,
  'image_4': None,
  'image_5': None,
  'img_type': '',
  'answer': 'B',
  'topic_difficulty': '3',
  'question_type': 'multiple-choice',
  'subfield': '{subfield}',
  'metadata': '{"source": "AgriExam", "author": "AgriExam", "license": "", "url": "https://www.agriexam.com/agronomy-jrf-2020", "language": "English", "verbose_answer": "Bubbler >Cablegation> Flooding > Continuous flowing"}'},
 {'id': 'dev__agriexam_3',
  'question': 'A ______ dry matter intake100 kg body weight is recommended for 

## Establish possible categories

In [39]:
categories = {
    
    'Agricultural Economics',
    'Agricultural Education',
    'Agricultural Engineering',
    'Agricultural Health',
    'Agricultural History',
    'Agricultural Geography',
    
    'Data Science in Agriculture',
    'Precision Agriculture',
    
    'Genetics, Breeding and Seeds Management',
    
    'Sustainability and Ecology',
    'Climate and Agriculture',
    
    'Biochemistry',
    
    'Rural Sociology',
    
    'Irrigation and Water Management',
    'Animal Science',
    'Farm Management',
    'Food Science',
    'Soil Science',
    'Crop Production',
    
    'Pests and Weeds Management',
    'Plant Pathology',
    'Plant Protection',
    
    'Horticulture and Ornamental Plants',
    'Plant Structure and Physiology',
    'Plant Taxonomy and Scientific Naming',
    'PLant Science'
}

In [40]:
len(categories)

26

## Prompt LLM to attribute a category to each question

In [27]:
prompt_categories = '''Assign to the previous question one category from the list of categories, if in doubt between categories, choose the more specific option. Your answer should start with one line containing only the category name.'''

In [28]:
print(prompt_categories)

Assign to the previous question one category from the list of categories, if in doubt between categories, choose the more specific option. Your answer should start with one line containing only the category name.


In [29]:
categories_dict = {c: [] for c in categories}

dataset_light = sample(list(dataset), 200)
dataset_all = list(dataset)

for doc in tqdm(dataset_all):
    response = await chat(get_prompt(doc, prompt_categories, str(list(categories))), None)
    try:
        category = response.split('\n')[0]
        if category not in categories:
            print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        #categories.add(category)
        categories_dict[category].append(doc)
    except:
        print(f"Split error, Question: {doc['question']} | Options: {doc['options']}")

 64%|██████▍   | 2924/4564 [06:27<03:48,  7.17it/s]

Category: Crop Ecology | Question: Crop Ecology means.? | Options: ['(A) Relationship of crop plants to their cropping system', '(B) Relationship of crop plants to their distribution', '(C) Relationship of crop plants to economic factors', '(D) Relationship of crop plants to their environment']
Split error, Question: Crop Ecology means.? | Options: ['(A) Relationship of crop plants to their cropping system', '(B) Relationship of crop plants to their distribution', '(C) Relationship of crop plants to economic factors', '(D) Relationship of crop plants to their environment']


 69%|██████▉   | 3171/4564 [06:59<03:54,  5.94it/s]

Category: Nutrition | Question: ICMR recommendation of vegetables  in grams? (From AgriExam Free Material) | Options: ['300 gram', '400 gram', '500 gram', '200 gram', '100 gram']
Split error, Question: ICMR recommendation of vegetables  in grams? (From AgriExam Free Material) | Options: ['300 gram', '400 gram', '500 gram', '200 gram', '100 gram']


100%|██████████| 4564/4564 [10:03<00:00,  7.57it/s]


In [30]:
from collections import Counter

def print_keys_by_length(categories_dict):
    """
    Prints the keys and lengths of their corresponding values from a dictionary,
    in decreasing order of length.

    Args:
        categories_dict (dict): A dictionary where the values are iterable.
    """
    # Create a list of tuples (key, length of value)
    lengths = [(k, len(v)) for k, v in categories_dict.items()]
    
    # Sort the list by length (second item in tuple) in descending order
    sorted_lengths = sorted(lengths, key=lambda x: x[1], reverse=True)

    # Print the sorted keys and lengths
    for k, length in sorted_lengths:
        print(k, length)

print_keys_by_length(categories_dict)

Botany 1158
Soil Science 857
Crop Production 805
Plant Protection 568
Genetics and Breeding 278
Agricultural Economics 188
Animal Science 150
Irrigation and Water Management 102
Agricultural History 75
Climate and Agriculture 70
Agricultural Geography 66
Agricultural Engineering 43
Farm Management 37
Weed and Pest Management 33
Food Science 32
Agricultural Education 30
Sustainability and Ecology 24
Rural Sociology 23
Precision Agriculture 14
Agricultural Health 8
Data Science in Agriculture 1
Environmental Economics 0
Biochemistry in Agriculture 0


In [None]:
# Plant Science 1036
# Crop Production 984
# Soil Science 788
# Genetics and Breeding 492
# Plant Protection 434
# Agricultural Economics 182
# Animal Science 141
# Irrigation and Water Management 101
# Agricultural History 72
# Climate and Agriculture 62
# Agricultural Education 46
# Agricultural Engineering 45
# Farm Management 39
# Agricultural Geography 38
# Weed and Pest Management 31
# Sustainable Farming 17
# Precision Agriculture 17
# Rural Sociology 14
# Food Science 13
# Agricultural Health 6
# Environmental Economics 4
# Data Science in Agriculture 0
# Others 0
# Biochemistry in Agriculture 0

In [38]:
for doc in sample(categories_dict['Climate and Agriculture'],10):
    print(f"Question: {doc['question']} | Options: {doc['options']}")

Question: In general, lower the latitude of a place, the? | Options: ['More the solar energy it receives', 'Lesser the solar energy it receives', 'The more windy is the place', 'The climate is cold']
Question: Which of the following climate classification was modified by ICRISAT? | Options: ['Troll', 'Thornthwaite', 'Lang', 'Martnees']
Question: J & K, Uttarakhand, Himachal pradesh lies under which agro-climatic zone? | Options: ['Eastern Himalaya', 'Central Himalaya', 'Western Himalayas', 'Central Plains', 'None of these']
Question: When an abnormal failure of rainfall occurs, the type of drought is referred as? | Options: ['Seasonal drought', 'Contingent drought', 'Invisible drought', 'Atmospheric drought']
Question: Which crop has a harmful effect on the increase of RH? | Options: ['Sunflower', 'Maize', 'Sugarcane', 'Sorghum']
Question: When the annual rainfall is less than normal for a prolonged period over an area, it is considered as | Options: ['Hydrological drought', 'Agricultu

In [42]:
to_check = ['Agricultural Geography', 'Agricultural History', 'Farm Management', 'Food Science']

for category in categories:
    docs = categories_dict[category]
    if len(docs) <= 10:
        for doc in docs:
            print(f"{category} | Question: {doc['question']} | Options: {doc['options']}")

Agricultural Geography | Question: NRC on Seed Spices is located at? | Options: ['Bikaner', 'Jaipur', 'Ajmer', 'Udaipur']
Agricultural Geography | Question: “Regional Centre of International Rice Research Institute” is recently established in India at? | Options: ['Varanasi', 'Lucknow', 'New Delhi', 'Hyderabad']
Agricultural Geography | Question: The Central Seed Testing Laboratory is located at: | Options: ['New Delhi', 'Banglore', 'Kolkata', 'Chandigarh']
Agricultural History | Question: In India Lab to Land programme was started in the year… ? | Options: ['(A) 1951', '(B) 1947', '(C) 1976', '(D) 1979']
Agricultural History | Question: Yellow revolution represents the production of ………………………..? | Options: ['Oilseeds', 'Forage', 'Pulse', 'Cereal']
Agricultural History | Question: The word “Agriculture” has been derived from….word. | Options: ['(A) German', '(B) Greek', '(C) Latin', '(D) American']
Farm Management | Question: Which of the following is a management decision ? | Options: