In [1]:
from tqdm import tqdm
import os
from datasets import load_dataset, concatenate_datasets
import json
import re
from copy import deepcopy

import asyncio
import math
import numpy as np
import random
import pandas as pd
from ollama import AsyncClient

from random import sample
from collections import Counter
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


# Utils

In [24]:
def save_to_json(data, filename):
    """Serialise ``data`` as pretty-printed JSON into ``filename``.

    Args:
        data: Any JSON-serialisable object.
        filename: Path of the file to create or overwrite.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is opened explicitly as UTF-8 instead of relying on the platform's
    default encoding.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
        
def load_from_json(filename):
    """Read ``filename`` and return the decoded JSON content.

    Args:
        filename: Path of a JSON file.

    Returns:
        The Python object produced by ``json.load``.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way regardless of the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

In [2]:
# No credentials are passed explicitly: the client defaults to os.environ.get("OPENAI_API_KEY").
client = OpenAI()

def chat_openai(content, image_path, system_prompt=None):
    """Send one user message to the OpenAI ``gpt-4o`` chat model and return its reply text.

    Args:
        content: Text of the user message.
        image_path: ``None`` for a text-only request. Otherwise an ``http(s)``
            URL, or a path to a local image file that is read and inlined as a
            base64 data URL.
        system_prompt: Optional system message placed before the user message.

    Returns:
        The ``content`` of the first choice in the completion.
    """
    import base64
    import mimetypes

    messages = []
    if system_prompt:
        messages.append({'role': 'system', 'content': system_prompt})

    if image_path is None:
        user_content = content
    else:
        # OpenAI chat messages carry images as `image_url` content parts;
        # an `images` key on the message is the Ollama convention.
        location = str(image_path)
        if location.startswith(('http://', 'https://')):
            image_url = location
        else:
            mime_type = mimetypes.guess_type(location)[0] or 'image/jpeg'
            with open(image_path, 'rb') as image_file:
                encoded = base64.b64encode(image_file.read()).decode('ascii')
            image_url = f'data:{mime_type};base64,{encoded}'
        user_content = [
            {'type': 'text', 'text': content},
            {'type': 'image_url', 'image_url': {'url': image_url}},
        ]

    messages.append({'role': 'user', 'content': user_content})
    chat_completion = client.chat.completions.create(model="gpt-4o", messages=messages)
    return chat_completion.choices[0].message.content

In [46]:
# Async Ollama client bound to the server at this fixed host and port.
client_ollama = AsyncClient(host='http://192.168.1.5:11434')

async def chat(content, image_path, system_prompt=None, model='llama3.3', temperature=0.2):
    """Send one user message through ``client_ollama`` and return the reply text.

    Args:
        content: Text of the user message.
        image_path: ``None`` for a text-only request; otherwise the value is
            attached to the user message as a one-element ``images`` list.
        system_prompt: Optional system message placed before the user message.
        model: Name of the model to query (defaults to the previously
            hard-coded ``'llama3.3'``).
        temperature: Sampling temperature forwarded in ``options`` (defaults to
            the previously hard-coded ``0.2``).

    Returns:
        ``response['message']['content']`` of the chat response.
    """
    messages = []
    if system_prompt:
        messages.append({'role': 'system', 'content': system_prompt})

    user_message = {'role': 'user', 'content': content}
    if image_path is not None:
        user_message['images'] = [image_path]
    messages.append(user_message)

    response = await client_ollama.chat(
        model=model,
        messages=messages,
        options={"temperature": temperature},
    )
    return response['message']['content']

In [28]:
def get_prompt(doc, prompt, categories=None):
    """Build the text sent to the model for one exam question.

    Args:
        doc: Mapping with at least the keys ``'question'`` and ``'options'``.
        prompt: Instruction appended after the question (and category) block.
        categories: Optional textual list of categories. When falsy (``None``
            or an empty string) the ``[Categories]`` section is omitted.

    Returns:
        The question block, the optional category block and ``prompt``,
        concatenated in that order.

    The fields are interpolated rather than concatenated, so ``options`` may be
    either a string or a list; for string inputs the result is unchanged.
    """
    question_block = (
        f"[Question]\n{doc['question']}\n"
        f"Options: {doc['options']}\n"
        "[End of Question]\n"
    )
    categories_block = f"[Categories]\n{categories}\n[End of Categories]\n" if categories else ''
    return question_block + categories_block + prompt

# Locura (exploratory scratch work)

In [5]:
task_name = 'AgriExam'
dataset_name = "parquet"
split = "validation"

# One parquet shard per split, all stored under the task's directory.
data_files = {
    split_name: f"/workdir/important_datasets/AGRIVQA/{task_name}/{split_name}-00000-of-00001.parquet"
    for split_name in ("dev", "test", "validation")
}

dataset = load_dataset(dataset_name, data_files=data_files)

Generating dev split: 16 examples [00:00, 2091.47 examples/s]
Generating test split: 624 examples [00:00, 37828.03 examples/s]
Generating validation split: 5613 examples [00:00, 56699.71 examples/s]


In [6]:
# Display the loaded DatasetDict to check the available splits and columns.
dataset

DatasetDict({
    dev: Dataset({
        features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
        num_rows: 16
    })
    test: Dataset({
        features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
        num_rows: 624
    })
    validation: Dataset({
        features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
        num_rows: 5613
    })
})

In [8]:
# Merge the dev, test and validation splits (in that order) into one dataset.
combined_dataset = concatenate_datasets(
    [dataset[split_name] for split_name in ('dev', 'test', 'validation')]
)

# Summary of the merged dataset: columns and total row count.
print(combined_dataset)


Dataset({
    features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
    num_rows: 6253
})


In [9]:
# NOTE: rebinds `dataset` from the per-split DatasetDict to the single concatenated
# Dataset; the loops below iterate `dataset` row by row and rely on this rebinding.
dataset=combined_dataset
dataset

Dataset({
    features: ['id', 'question', 'options', 'explanation', 'image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'img_type', 'answer', 'topic_difficulty', 'question_type', 'subfield', 'metadata'],
    num_rows: 6253
})

In [11]:
# Instruction that get_prompt appends after the question (and the category list, when
# one is given) for the open-ended categorisation loops.
prompt = '''Try to assign one category from the list of categories to the previous question, if the question is not related to any of them create a new category. 
This question is taken from an agronomy exam.
Your answer should start with one line containing only the category name.'''

In [13]:
# Preview the prompt built for one sample question (no category list supplied).
print(get_prompt(dataset[10], prompt))

[Question]
Pink boll worm is a serious pest of?
Options: ['Mustard', 'Cotton', 'Gram', 'Soybean']
[End of Question]
Try to assign one category from the list of categories to the previous question, if the question is not related to any of them create a new category. 
This question is taken from an agronomy exam.
Your answer should start with one line containing only the category name.


In [15]:
# Random subset of 2000 questions; `sample` is unseeded, so the selection differs per run.
# NOTE(review): `ds` is not referenced by any later cell visible here — confirm it is still needed.
ds = sample(list(dataset), 2000)

In [264]:
# Unused prompt fragment kept for reference: ", so try to be more specific than 'Agriculture' if it is a subfield."

In [277]:
# Initial category vocabulary.
# NOTE: the next cell assigns `categories` again, replacing this set when cells run in order.
categories = {
    'Agricultural Engineering',
    'Animal Science',
    'General Agriculture',
    'Agricultural Genetics',
    'Agricultural Health',
    'Agricultural History',
    'Plant Breeding',
    'Soil Science',
    'Agricultural Statistics',
    'Wordplay and Analogies',
    'Economics',
    'Geography and History',
    'Geology',
    'Geometry',
    'Health Science',
    'Leadership and Management',
    'Math Concepts',
    'Mathematics',
    'Sports Knowledge'
}


In [323]:
# Larger category vocabulary that replaces the previous definition. The discovery loop
# below adds to this set, so re-running this cell resets whatever the loop added.
categories = {'Agricultural Economics',
 'Agricultural Education',
 'Agricultural Engineering',
 'Agricultural Genetics',
 'Agricultural Geography',
 'Agricultural Health',
 'Agricultural History',
 'Agricultural Mathematics',
 'Agricultural Science',
 'Agricultural Statistics',
 'Anatomy',
 'Animal Science',
 'Biochemistry',
 'Economics',
 'English Language',
 'Environmental Science',
 'General Agriculture',
 'General Knowledge',
 'Geography and History',
 'Geology',
 'Geometry',
 'Health Science',
 'Leadership and Management',
 'Math Concepts',
 'Mathematics',
 'Plant Biology',
 'Plant Breeding',
 'Social Stratification',
 'Soil Science',
 'Sports Knowledge',
 'Wordplay and Analogies'}

In [None]:

#categories = set()
for doc in tqdm(dataset):
    if not categories:
        string_categories='[]'
    else:
        string_categories=str(list(categories))
    response = await chat(get_prompt(doc,prompt,string_categories),None)
    try:
        category = response.split('\n')[0]
        if category not in categories:
            print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        categories.add(category)
    except:
        print('Gnappows')

  4%|▎         | 230/6253 [00:32<14:16,  7.03it/s]

Category: Environmental Science | Question: Ozone concentration in the atmosphere is reduced by? | Options: ['CO2 and O2', 'H2S and CH4', 'CFC and Cl2', 'N2 and N2O']


100%|██████████| 6253/6253 [15:17<00:00,  6.82it/s]


In [324]:
from collections import defaultdict
categories_dict = defaultdict(list)

for doc in tqdm(dataset):
    string_categories=str(list(categories))
    response = await chat(get_prompt(doc,prompt,string_categories),None)
    try:
        category = response.split('\n')[0]
        if category not in categories:
            print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        else:
            categories_dict[category].append(doc)
    except:
        print('Gnappows')

 41%|████      | 2555/6253 [06:16<09:07,  6.76it/s]

Category: Computer Security | Question: Which of the following is a threat for electronic payment systems? | Options: ['Computer Virus', 'Computer Worms', 'Trojan Horses', 'All of the above']


100%|██████████| 6253/6253 [15:18<00:00,  6.81it/s]


In [332]:
# For every category: its size, then up to three example questions.
for key in categories_dict:
    print(key,len(categories_dict[key]))
    print()
    # Slice instead of indexing 0..2 so buckets with fewer than three questions do not raise.
    for example in categories_dict[key][:3]:
        print(example['question'])
    print()

General Knowledge 56

Japanese style of gardens also known as?
FOB in international trade stands for?
In Indians context meaning of the term extension’ is transfer of technology while in USA it is?

General Agriculture 1240

Identify the correct order of the following methods of irrigation as per their water-use efficiency from the maximum to minimum efficiency.?
The insect, which attacks potato both in field as well as in storage is?
Disc Harrow which type tillage instrument?

Animal Science 167

A ______ dry matter intake100 kg body weight is recommended for breeding bull?
Which Fish has the highest protein content?
What is the Average Gestation period of buffalo? / 

Soil Science 394

Which of the following is the CORRECT sequence of soil-water erosion?
What is the total number of soil orders?
The most important feature of soil taxonomy is that it is based upon?

Plant Breeding 967

Synthetic seeds are…..?
Sundaram is a very high yielding clone of?
A variety developed by mixing the 

In [373]:
# Yes/No relevance instruction: asks the model whether a question pertains to agronomy.
prompt_agronomy = '''Evaluate the exam question provided and determine if it pertains to the field of agronomy. Please begin your response with a single word: 'Yes' if the question pertains to or is relevant in the context of agronomy, or 'No' if it is not.'''

In [None]:
# First 200 questions only.
# NOTE(review): the filtering loop in the next cell iterates `dataset`, not this subset.
dataset_light = list(dataset)[:200]

In [374]:
agronomy_dict = {'Yes': [], 'No': []}

for doc in tqdm(dataset):
    response = await chat(get_prompt(doc, prompt_agronomy),None)
    if response.lower().startswith('yes'):
        agronomy_dict['Yes'].append(doc)
    elif response.lower().startswith('no'):
        agronomy_dict['No'].append(doc)        
    else:
        print(f"Question: {doc['question']} | Options: {doc['options']}")

100%|██████████| 6253/6253 [32:00<00:00,  3.26it/s] 


In [377]:
# Number of questions the model judged not to pertain to agronomy.
print(len(agronomy_dict['No']))

1670


In [384]:
# Spot-check up to 40 randomly chosen questions that were rejected as non-agronomy.
# `sample` raises ValueError when asked for more items than exist, hence the cap.
no = sample(agronomy_dict['No'], min(40, len(agronomy_dict['No'])))
for doc in no:
    print(f"Question: {doc['question']} | Options: {doc['options']}")

Question: Which one of the following compounds forms the backbone of fats and oils? | Options: ['Glycerol', 'Glucose', 'Palmitiacid', 'Amino alcohol']
Question: National Botanical Research Institute is located at? | Options: ['New Delhi', 'Lucknow', 'Vadodara', 'Chennai']
Question: Blood group system was discovered by? | Options: ['Camillo Golgi', 'Ernst Haeckel', 'Thomas Cooley', 'Karl Landsteiner']
Question: “The first task of this Assembly is to free India through a new Constitution, to feed the starving people and to cloth the naked masses and to give every Indian the fullest opportunity to develop himself according to his capacity.” This quotation was given by? | Options: ['Pandit Jawaharlal Nehru', 'Dr. B. R. Ambedkar', 'Dr. S. Radha Krishnan', 'Dr. Rajendra Prasad']
Question: Nucleic acids are? | Options: ['Micro molecular compounds', 'Macro molecular compounds', 'Micro and macro molecular compounds', 'Non-molecular compounds']
Question: Watershed is a? | Options: ['Hydrological

In [383]:
# Categorisation instruction used for the questions kept by the agronomy filter.
prompt_categories = '''Try to assign to the previous question one category from the list of categories, if the question is not related to any of them create a new category. 
This question is taken from an agronomy exam.
Your answer should start with one line containing only the category name.'''

In [386]:
# General category vocabulary.
# NOTE(review): the next cell assigns `categories` again, so this definition is
# overwritten when cells run top to bottom — confirm it is still needed.
categories = {'Agricultural Economics',
 'Agricultural Education',
 'Agricultural Engineering',
 'Agricultural Genetics',
 'Agricultural Geography',
 'Agricultural Health',
 'Agricultural History',
 'Agricultural Mathematics',
 'Agricultural Science',
 'Agricultural Statistics',
 'Anatomy',
 'Animal Science',
 'Biochemistry',
 'Economics',
 'English Language',
 'Environmental Science',
 'General Agriculture',
 'General Knowledge',
 'Geography and History',
 'Geology',
 'Geometry',
 'Health Science',
 'Leadership and Management',
 'Math Concepts',
 'Mathematics',
 'Plant Biology',
 'Plant Breeding',
 'Social Stratification',
 'Soil Science',
 'Sports Knowledge',
 'Wordplay and Analogies'}

In [398]:
# Agronomy-focused category vocabulary, passed to the model by the categorisation
# loop in the following cell.
categories = {
    'Agricultural Economics',
    'Agricultural Education',
    'Agricultural Engineering',
    'Genetics and Breeding',
    'Sustainable Farming',
    'Agricultural Health',
    'Agricultural History',
    'Agricultural Geography',
    'Botany',
    'Data Science in Agriculture',
    'Climate and Agriculture',
    'Animal Science',
    'Biochemistry in Agriculture',
    'Environmental Economics',
    'Farm Management',
    'Food Science',
    'Irrigation and Water Management',
    'Plant Protection',
    'Precision Agriculture',
    'Rural Sociology',
    'Soil Science',
    'Crop Production',
    'Weed and Pest Management'
}

In [396]:
from collections import defaultdict

#categories = set()
categories_dict = defaultdict(list)

for doc in tqdm(agronomy_dict['Yes']):
    if not categories:
        string_categories='[]'
    else:
        string_categories=str(list(categories))
    response = await chat(get_prompt(doc,prompt_categories,string_categories),None)
    try:
        category = response.split('\n')[0]
        if category not in categories:
            print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        #categories.add(category)
        categories_dict[category].append(doc)
    except:
        print('Gnappows')

  5%|▌         | 245/4583 [00:34<10:41,  6.76it/s]

Category: Food Science in Agriculture | Question: The Indian name for clarified butterfat is…? | Options: ['(A) cream', '(B) buttermilk', '(C) dahi', '(D) ghee']


  6%|▌         | 264/4583 [00:36<09:52,  7.29it/s]

Category: Agriculture | Question: Golden revolution is related to? | Options: ['Dairy', 'Pulses', 'Agriculture', 'Horticulture']


 18%|█▊        | 839/4583 [01:55<08:18,  7.52it/s]

Category: Plant Anatomy | Question: Complex tissue is? | Options: ['Parenchyma', 'Collenchyma', 'Sclerenchyma', 'Phloem']


 28%|██▊       | 1304/4583 [03:00<06:52,  7.96it/s]

Category: Plant Physiology | Question: Name the condition in which protoplast of the plant cell shrinks away.? | Options: ['Turgid', 'Plasmolysis', 'Flaccid', 'Rigid']


 30%|██▉       | 1356/4583 [03:08<07:24,  7.26it/s]

Category: Plant Physiology | Question: Match List I with List II
Table:
| List I | List II |
|--------|---------|
| A.Phototropism | I.Bending towards soil |
| B.Geotropism | II.Response to day length |
| C.Thigmotropism | III.Bending towards light |
| D.Photoperiodism | IV.Touch me not |

Choose the correct answer from the options given below | Options: ['A-I, B-II, C-IV, D-III', 'A-II, B-III, C-I, D-IV', 'A-IV, B-II, G-III, D-I', 'A-III, B-I, C-IV, D-II']


 43%|████▎     | 1953/4583 [04:31<05:50,  7.51it/s]

Category: Community Ecology | Question: Biotic components, producers, consumers and decomposers are structural components of? | Options: ['Habitat', 'Community Ecology', 'Population Ecology', 'Ecosystem']


 47%|████▋     | 2134/4583 [04:56<05:52,  6.95it/s]

Category: Agricultural Geography | Question: Indian Grassland and Fodder Research Institute is located at? | Options: ['Barapani', 'Jhansi', 'Ranchi', 'Patna']


 50%|████▉     | 2276/4583 [05:16<04:56,  7.77it/s]

Category: Botany | Question: Inflorescence of fig is known as? | Options: ['Hypanthodium', 'Panicle', 'Catkin', 'Balusta']


 53%|█████▎    | 2422/4583 [05:36<05:00,  7.19it/s]

Category: Plant Physiology | Question: Which plant hormone causes plant tropism? | Options: ['Cytokinin', 'Auxin', 'Ethylene', 'GA3']


 62%|██████▏   | 2819/4583 [06:31<03:49,  7.69it/s]

Category: Plant Anatomy | Question: Conjoint, collateral and open vascular bundles are found in a? | Options: ['Dicot stem', 'Monocot stem', 'Dicot root', 'Momocot root']


 64%|██████▍   | 2934/4583 [06:47<03:43,  7.39it/s]

Category: Crop Ecology | Question: Crop Ecology means.? | Options: ['(A) Relationship of crop plants to their cropping system', '(B) Relationship of crop plants to their distribution', '(C) Relationship of crop plants to economic factors', '(D) Relationship of crop plants to their environment']


 69%|██████▉   | 3178/4583 [07:21<03:02,  7.69it/s]

Category: Plant Physiology | Question: An aquatic plant with floating leaf have? | Options: ['Stomata', 'Stomata on petiole only', 'Stomata on upper surface', 'Stomata on lower surface']


 71%|███████   | 3256/4583 [07:32<03:10,  6.96it/s]

Category: Plant Biology | Question: In plant cells non-pigmented plastids are called? | Options: ['Chloroplast', 'Leucoplast', 'Amyloplast', 'Chromoplast']


 72%|███████▏  | 3288/4583 [07:36<02:50,  7.58it/s]

Category: Plant Anatomy | Question: The largest plant cells are? | Options: ['Xylem vessel cell', 'Parenchyma cells', 'Sieve tube cells', 'Sclerenchyma fibers']


 73%|███████▎  | 3364/4583 [07:47<02:42,  7.49it/s]

Category: Plant Physiology | Question: The cell to cell continuity is maintained by? | Options: ['Middle lamella', 'Thickening of lignin', 'Plasmodesmata', 'Cell membrane']


 78%|███████▊  | 3564/4583 [08:14<02:17,  7.43it/s]

Category: Plant Physiology | Question: Which of the following is the Volatile Plant Hormone? | Options: ['(A) Auxin', '(B) Ethylene', '(C) Cytokinins', '(D) Abscisic acid']


 80%|███████▉  | 3648/4583 [08:26<02:13,  7.01it/s]

Category: Plant Anatomy | Question: The hypodermis in moncot stem is? | Options: ['Parenchymatous', 'Chlorenchymatous', 'Collenchymatous', 'Sclerenchymatous']


 91%|█████████ | 4166/4583 [09:38<00:59,  7.06it/s]

Category: Plant Physiology | Question: Study of the plant processes is called? | Options: ['Biotechnology', 'Plant physiology', 'Plant molecular biology', 'Ecology']


 94%|█████████▍| 4324/4583 [10:01<00:35,  7.31it/s]

Category: Plant Anatomy | Question: The cork in dicotyledonous plants is formed by? | Options: ['Phellogen', 'Phelloderm', 'Phellum', 'Cambium']


100%|██████████| 4583/4583 [10:36<00:00,  7.20it/s]


In [407]:
# Size of every category bucket, in insertion order.
for key in categories_dict:
    print(f"{key} {len(categories_dict[key])}")

Irrigation and Water Management 152
Animal Science 167
Soil Science 633
Plant Protection 503
Genetics and Breeding 927
Agricultural Engineering 81
Geography in Agriculture 43
Climate and Agriculture 56
Crop Production 1296
Agricultural Economics 170
Agricultural Health 5
Data Science in Agriculture 19
Biochemistry in Agriculture 152
Agricultural History 81
Environmental Economics 7
Precision Agriculture 19
Weed and Pest Management 111
Food Safety 14
Agricultural Education 45
Farm Management 54
Sustainable Farming 25
Food Science in Agriculture 1
Agriculture 1
Plant Anatomy 5
Plant Physiology 7
Community Ecology 1
Agricultural Geography 1
Rural Sociology 4
Botany 1
Crop Ecology 1
Plant Biology 1


In [406]:
# List the questions that landed in one bucket.
# Optional clean-up while iterating (disabled):
#     agronomy_dict['Yes'].remove(question_doc)
for question_doc in categories_dict['Data Science in Agriculture']:
    print(question_doc['question'])

The value of r (correlation coefficient) ranges between?
Which test is used to test the significance of the difference between two means?
Which of the following is ideal measure of disperson?
The correlation coefficient is used to determine?
Coefficient of variation is expressed as percentage of?
Analysis of variance is a statistical method of comparing the of several populations?
What will be the error d.f. for a R.B.D. field experiment with 8 treatments and 3 replications?
Comprehension:
The table given here shows the production of five types of tractors by a company in the year 2012 to 2017. Study the table and answer the questions that follow
Table: Production of Tractors by a Company
Table:
| Company /Year | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | Total |
|---------------|------|------|------|------|------|------|-------|
| A | 8 | 20 | 16 | 17 | 21 | 6 | 88 |
| B | 16 | 10 | 14 | 12 | 12 | 14 | 78 |
| C | 21 | 17 | 16 | 15 | 13 | 8 | 90 |
| D | 4 | 6 | 10 | 16 | 20 | 31 | 87 |


# Categorize Agronomy questions

## Load questions that are about agronomy

In [29]:
# Rebinds `dataset` to the content of a previously saved JSON file (the displayed value
# is a list of question dicts). The path is absolute and specific to this machine.
dataset = load_from_json('/workdir/agriexam_category.json')
dataset

[{'id': 'dev__agriexam_2',
  'question': 'Identify the correct order of the following methods of irrigation as per their water-use efficiency from the maximum to minimum efficiency.?',
  'options': "['Continuous flowing> Flooding >Cablegation> Bubbler', 'Bubbler >Cablegation> Flooding > Continuous flowing', 'Cablegation> Flooding » Bubbler > Continuous flowing', 'Bubbler> Flooding > Continuous flowing Cablegation']",
  'explanation': None,
  'image_1': None,
  'image_2': None,
  'image_3': None,
  'image_4': None,
  'image_5': None,
  'img_type': '',
  'answer': 'B',
  'topic_difficulty': '3',
  'question_type': 'multiple-choice',
  'subfield': '{subfield}',
  'metadata': '{"source": "AgriExam", "author": "AgriExam", "license": "", "url": "https://www.agriexam.com/agronomy-jrf-2020", "language": "English", "verbose_answer": "Bubbler >Cablegation> Flooding > Continuous flowing"}'},
 {'id': 'dev__agriexam_3',
  'question': 'A ______ dry matter intake100 kg body weight is recommended for 

## Establish possible categories

In [145]:
# Final category vocabulary, written as a list and grouped by theme. A later cell
# shuffles this list in place before building the system prompt.
categories = [
    # Agricultural sub-topics
    'Agricultural Economics',
    'Agricultural Education',
    'Agricultural Engineering',
    'Agricultural Health',
    'Agricultural History',
    'Agricultural Geography',
    'Agricultural Extension',
    
    # Too specific / candidates for removal
    'Data Science in Agriculture',
    'Precision Agriculture',
    'Rural Sociology',
    
    # Environment
    'Sustainability and Ecology',
    'Climate and Agriculture',
    'Forestry', 
    
    # Farming
    'Animal Science',
    'Farm Management',
    
    # Agriculture
    'Fertilizers',
    'Crop Nutrient Management',
    'Irrigation and Water Management',
    'Food Science',
    'Soil Science',
    'Crop Production',
    
    # Plant disease
    'Pests and Weeds Management',
    'Plant Pathology',
    'Plant Protection',
    
    # Plant science
    'Entomology',
    'Microbiology',
    'Plant Biochemistry',
    'Genetics, Breeding and Seeds Management',
    'Horticulture and Ornamental Plants',
    'Plant Anatomy',
    'Plant Physiology',
    'Plant Taxonomy and Scientific Naming',
    'Plant Science'
]

In [146]:
def list_to_string(categories):
    """Render ``categories`` as a numbered list, one ``"<n>. <name>"`` entry per line.

    Numbering starts at 1 and every entry, including the last, ends with a
    newline. An empty input yields an empty string.
    """
    return ''.join(
        f'{position}. {name}\n'
        for position, name in enumerate(categories, start=1)
    )

In [147]:
from random import shuffle

# Shuffles `categories` in place, so the numbered list in the prompt no longer follows
# the order in which the list was written. The shuffle is unseeded: every run of this
# cell produces a different order (and therefore a different prompt).
# NOTE(review): presumably meant to avoid an ordering bias — confirm.
shuffle(categories)
system_prompt = f'''You are an expert in agronomy, skilled at categorizing questions based on their topics. I will provide you with a question from an agronomy exam. Your task is to assign one of the following categories to the question: 

{list_to_string(categories)}

Carefully review the question's content and context to determine the most appropriate category it belongs to. Select the most specific category available from the list. Your answer should start with a single line containing only the name of the chosen category.
'''

In [148]:
# Show the rendered system prompt, including the shuffled, numbered category list.
print(system_prompt)

You are an expert in agronomy, skilled at categorizing questions based on their topics. I will provide you with a questions from an agronomy exam. Your task is to assign one of the following categories to the question: 

1. Microbiology
2. Plant Science
3. Plant Pathology
4. Agricultural Education
5. Plant Physiology
6. Genetics, Breeding and Seeds Management
7. Entomology
8. Irrigation and Water Management
9. Farm Management
10. Crop Nutrient Management
11. Agricultural Health
12. Data Science in Agriculture
13. Climate and Agriculture
14. Fertilizers
15. Plant Anatomy
16. Agricultural Geography
17. Precision Agriculture
18. Plant Protection
19. Agricultural Economics
20. Sustainability and Ecology
21. Agricultural History
22. Pests and Weeds Management
23. Soil Science
24. Agricultural Engineering
25. Animal Science
26. Crop Production
27. Plant Biochemistry
28. Forestry
29. Rural Sociology
30. Horticulture and Ornamental Plants
31. Plant Taxonomy and Scientific Naming
32. Food Scien

## Prompt LLM to attribute a category to each question

In [30]:
# Two alternative wordings of the closing categorisation instruction.
# NOTE(review): the categorisation loop in the next cell uses its own `end_prompt`;
# neither variable is referenced there — confirm whether they are still needed.
prompt_categories = '''Assign to the previous question one category from the list of categories, if in doubt between categories, choose the more specific option. Your answer should start with one line containing only the category name.'''
prompt_categories_2 = '''Assign one category from the provided list to the previous question. If unsure, select the more specific category that best fits. Begin your response with a single line containing only the category name'''

In [149]:
categories_dict = {c: [] for c in categories}
extra_dict = {'suggestions' : [], 'catastrophies' : []}

end_prompt =  'Assign a category to the question above. Your answer should include only the with a single line containing only the name of the chosen category.'

dataset_light = sample(list(dataset), 200)
dataset_all = list(dataset)

for doc in tqdm(dataset_all):
    response = await chat(get_prompt(doc, end_prompt, ''), None, system_prompt=system_prompt)
    try:
        category = response.split('\n')[0].strip()
        if category not in categories:
            print('Suggestion: ', response)
            extra_dict['suggestions'].append((response,doc))
            category = response.split('\n')[-1].strip()
            if category not in categories:
                print('Category not present')
                print(f"Category: {category} | Question: {doc['question']} | Options: {doc['options']}")
        #categories.add(category)
        categories_dict[category].append(doc)
    except:
        extra_dict['catastrophies'].append((response,doc))
        print('Catastrophy', response)
        print(f"Question: {doc['question']} | Options: {doc['options']}")

  5%|▍         | 220/4564 [02:11<40:54,  1.77it/s] 

Suggestion:  Pest and Weeds Management
Category not present
Category: Pest and Weeds Management | Question: Active ingredient in pendimethalin is? | Options: ['33%', '30%', '27%', '35%']
Catastrophy Pest and Weeds Management
Question: Active ingredient in pendimethalin is? | Options: ['33%', '30%', '27%', '35%']


  5%|▌         | 236/4564 [02:20<44:59,  1.60it/s]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management
Category not present
Category: Plant Nutrition or more specifically Crop Nutrient Management | Question: Little leaf of Litchi? | Options: ['Zn', 'Mn', 'Mg', 'Cu', 'Al']
Catastrophy Plant Nutrition or more specifically Crop Nutrient Management
Question: Little leaf of Litchi? | Options: ['Zn', 'Mn', 'Mg', 'Cu', 'Al']


  8%|▊         | 376/4564 [03:35<1:32:18,  1.32s/it]

Suggestion:  Plant Nutrition is not available, so the closest match would be Crop Nutrient Management or Plant Physiology, but since it specifically deals with how plants utilize nitrogen from the atmosphere, which involves microbial processes, 
Crop Nutrient Management


 11%|█         | 483/4564 [04:34<1:00:14,  1.13it/s]

Suggestion:  Plant Nutrition is not available, so the closest match is Crop Nutrient Management
Category not present
Category: Plant Nutrition is not available, so the closest match is Crop Nutrient Management | Question: How many elements are essential for plant growth?  | Options: ['15', '17', '22', '18']
Catastrophy Plant Nutrition is not available, so the closest match is Crop Nutrient Management
Question: How many elements are essential for plant growth?  | Options: ['15', '17', '22', '18']


 11%|█         | 488/4564 [04:37<47:12,  1.44it/s]  

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management
Category not present
Category: Plant Nutrition or more specifically Crop Nutrient Management | Question: Boron deficiency causes: | Options: ['Blue chaff disease', 'Die back disease', 'Gray speck disease', 'Brown heart disease']
Catastrophy Plant Nutrition or more specifically Crop Nutrient Management
Question: Boron deficiency causes: | Options: ['Blue chaff disease', 'Die back disease', 'Gray speck disease', 'Brown heart disease']


 15%|█▌        | 706/4564 [06:35<1:32:56,  1.45s/it]

Suggestion:  Plant Nutrition is not available, but the closest related field from the given options would be Crop Nutrient Management or Plant Science, however since pulses are being discussed in relation to their nutritional content, it seems more aligned with 
Crop Nutrient Management


 17%|█▋        | 758/4564 [07:05<1:24:56,  1.34s/it]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management is not directly applicable here, however, considering the context of the question which seems related to nutritional aspects of plants (cereals), it can be best categorized under: 
Crop Production


 17%|█▋        | 795/4564 [07:27<1:01:44,  1.02it/s]

Suggestion:  Experimental Design is not explicitly listed, but based on the context, the closest match would be: 
Crop Production


 20%|██        | 928/4564 [08:41<46:19,  1.31it/s]  

Suggestion:  Nematology is not listed, but the closest related field is 
Plant Pathology


 21%|██        | 951/4564 [08:57<1:32:02,  1.53s/it]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management is not directly applicable here, however, considering the context of nutrient deficiency in cereals which relates to their composition and quality, the closest match from the given categories would be: 
Crop Production


 24%|██▎       | 1075/4564 [10:10<2:09:58,  2.24s/it]

Suggestion:  Ecology is not listed, but the closest match would be a field that deals with interactions between organisms and their environment, however since 'Sustainability and Ecology' and 'Plant Science' are available options and this question seems to relate more closely with concepts typically found within ecology which can sometimes be encompassed by broader plant science topics or sustainability discussions, I'll choose the one most directly related to how plants interact with each other in terms of competition.
Plant Science


 24%|██▎       | 1076/4564 [10:11<1:45:33,  1.82s/it]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management
Category not present
Category: Plant Nutrition or more specifically Crop Nutrient Management | Question: Boron deficiency causes? | Options: ['Blue chaff disease', 'Die back disease', 'Gray speck disease', 'Brown heart disease']
Catastrophy Plant Nutrition or more specifically Crop Nutrient Management
Question: Boron deficiency causes? | Options: ['Blue chaff disease', 'Die back disease', 'Gray speck disease', 'Brown heart disease']


 24%|██▍       | 1102/4564 [10:26<53:32,  1.08it/s]  

Suggestion:  Statistics is not an option, however, the closest related field from the given options would be 
Data Science in Agriculture


 36%|███▋      | 1658/4564 [15:36<2:49:21,  3.50s/it]

Suggestion:  Plant Nutrition is not available, so the closest match is Crop Nutrient Management, but since it specifically asks about the content of vegetables which could also relate to their nutritional value or composition, a more specific category would be Horticulture and Ornamental Plants as it deals with the cultivation and care of plants like vegetables. However, given the context of nutrition, another suitable option could be considered, yet based on the provided list, none directly mention nutrition in the context of human health or dietary needs. Among the options provided that are closest to discussing plant composition or nutritional aspects indirectly through the management or science of plants, Crop Nutrient Management seems less fitting than a category focused on the type of plants being discussed (vegetables), thus:
Horticulture and Ornamental Plants


 41%|████      | 1881/4564 [17:33<50:17,  1.12s/it]  

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management is not directly mentioned but related, however the closest and most specific match from the given list would be 
Crop Nutrient Management


 44%|████▍     | 2022/4564 [18:49<30:41,  1.38it/s]

Suggestion:  Plant Nutrition is not available, so the closest match is Crop Nutrient Management
Category not present
Category: Plant Nutrition is not available, so the closest match is Crop Nutrient Management | Question: Those elements whose absence prevent plants from completing its life cycle are called…..? | Options: ['Micro element', 'Macro element', 'Nonessential elements', 'Essential elements']
Catastrophy Plant Nutrition is not available, so the closest match is Crop Nutrient Management
Question: Those elements whose absence prevent plants from completing its life cycle are called…..? | Options: ['Micro element', 'Macro element', 'Nonessential elements', 'Essential elements']


 51%|█████     | 2334/4564 [22:29<33:23,  1.11it/s]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management is not directly mentioned but closest would be 
Crop Nutrient Management


 56%|█████▌    | 2565/4564 [24:40<47:19,  1.42s/it]

Suggestion:  Plant Nutrition is not available, so the closest related category from the list provided would be Crop Nutrient Management, but since it specifically deals with components of cereal which leans more towards biochemistry aspects of plants, hence: 
Plant Biochemistry


 57%|█████▋    | 2580/4564 [24:54<1:03:52,  1.93s/it]

Suggestion:  Nematology is not listed, but the closest match would be Entomology's neighboring field or more specifically Plant Pathology's related area of nematology, however since "Entomology" and "Plant Pathology" are available options and considering nematodes are often studied in relation to plant pathology: 
Plant Pathology


 58%|█████▊    | 2625/4564 [25:18<23:37,  1.37it/s]  

Suggestion:  Plant Nutrition is not available, so the closest match is Crop Nutrient Management
Category not present
Category: Plant Nutrition is not available, so the closest match is Crop Nutrient Management | Question: As a general rule, symptoms of nitrogen deficiency is first noticeable in? | Options: ['Upper leaves', 'Lower leaves', 'Younger leaves', 'Lower and older leaves']
Catastrophy Plant Nutrition is not available, so the closest match is Crop Nutrient Management
Question: As a general rule, symptoms of nitrogen deficiency is first noticeable in? | Options: ['Upper leaves', 'Lower leaves', 'Younger leaves', 'Lower and older leaves']


 62%|██████▏   | 2830/4564 [27:17<29:16,  1.01s/it]

Suggestion:  Experimental Design and Statistical Analysis is not available, so the closest related field from the given options would be: 
Crop Production


 62%|██████▏   | 2835/4564 [27:20<23:11,  1.24it/s]

Suggestion:  Plant Nutrition is not available, so the closest match is: Crop Nutrient Management
Category not present
Category: Plant Nutrition is not available, so the closest match is: Crop Nutrient Management | Question: Which is non-mineral nutrient essential for plants? | Options: ['Nitrogen', 'Phosphorus', 'Carbon', 'Potassium']
Catastrophy Plant Nutrition is not available, so the closest match is: Crop Nutrient Management
Question: Which is non-mineral nutrient essential for plants? | Options: ['Nitrogen', 'Phosphorus', 'Carbon', 'Potassium']


 65%|██████▌   | 2969/4564 [28:34<26:02,  1.02it/s]

Suggestion:  Agricultural Awards or recognition is not explicitly listed, however the closest and most relevant category would be: 
Agricultural Education


 70%|███████   | 3214/4564 [30:52<13:42,  1.64it/s]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management
Category not present
Category: Plant Nutrition or more specifically Crop Nutrient Management | Question: Indicator plant for boron deficiency is: | Options: ['Sugarbeet', 'Sunflower', 'Corn', 'Potato']
Catastrophy Plant Nutrition or more specifically Crop Nutrient Management
Question: Indicator plant for boron deficiency is: | Options: ['Sugarbeet', 'Sunflower', 'Corn', 'Potato']


 74%|███████▍  | 3388/4564 [32:25<16:19,  1.20it/s]

Suggestion:  Nematology is not listed, but the closest related field is 
Plant Pathology


 76%|███████▌  | 3463/4564 [33:08<17:54,  1.02it/s]

Suggestion:  Nematology is not listed, but the closest related field from the given options would be 
Entomology


 87%|████████▋ | 3949/4564 [37:49<07:11,  1.43it/s]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management
Category not present
Category: Plant Nutrition or more specifically Crop Nutrient Management | Question: Indicator plant for P deficiency indication is …………………..? | Options: ['Maize', 'Tomato', 'Maize and Tomato both', 'None of these']
Catastrophy Plant Nutrition or more specifically Crop Nutrient Management
Question: Indicator plant for P deficiency indication is …………………..? | Options: ['Maize', 'Tomato', 'Maize and Tomato both', 'None of these']


 88%|████████▊ | 4031/4564 [38:38<11:41,  1.32s/it]

Suggestion:  Agricultural Statistics is not explicitly listed, but given the context of production data, the closest match would be related to the management and oversight of agricultural activities, hence: 
Farm Management


 92%|█████████▏| 4216/4564 [40:46<14:14,  2.45s/it]

Suggestion:  Agroforestry is most closely related to Sustainability and Ecology, but since Agroforestry can be considered a part of a broader category that encompasses various aspects including ecology, the closest match from the given list would be 
Sustainability and Ecology


 94%|█████████▍| 4299/4564 [42:06<14:26,  3.27s/it]

Suggestion:  Water-related issues are more closely associated with categories like Crop Nutrient Management or Irrigation and Water Management, but given the context of ecosystem changes due to nutrient enrichment, the most fitting category seems to be related to broader environmental impacts. However, considering the direct relation to water and nutrients affecting ecosystems, a category that encompasses ecological and sustainability aspects is most relevant. 
 Sustainability and Ecology


 94%|█████████▍| 4300/4564 [42:14<20:17,  4.61s/it]

Suggestion:  Plant Nutrition is not available, so the closest related field would be Crop Nutrient Management, but since it specifically deals with pulses which are plants and their nutrient content, a more specific category is not available, however,  the question seems to relate more directly to the nutritional aspects of plants (pulses), hence: 
Crop Nutrient Management


 98%|█████████▊| 4467/4564 [44:35<01:57,  1.22s/it]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management
Category not present
Category: Plant Nutrition or more specifically Crop Nutrient Management | Question: Criteria of essentiality by given by: | Options: ['Julis Von Suches', 'Arnon', 'Jean Baptist', 'Liegbig']
Catastrophy Plant Nutrition or more specifically Crop Nutrient Management
Question: Criteria of essentiality by given by: | Options: ['Julis Von Suches', 'Arnon', 'Jean Baptist', 'Liegbig']


 98%|█████████▊| 4473/4564 [44:41<01:37,  1.07s/it]

Suggestion:  Plant Nutrition or more specifically Crop Nutrient Management
Category not present
Category: Plant Nutrition or more specifically Crop Nutrient Management | Question: Who among the following presented the theory of essential mineral nutrients in plants?  | Options: ['Aristotle', 'Arnon and stout', 'Liebig', 'Carl Linnaeus']
Catastrophy Plant Nutrition or more specifically Crop Nutrient Management
Question: Who among the following presented the theory of essential mineral nutrients in plants?  | Options: ['Aristotle', 'Arnon and stout', 'Liebig', 'Carl Linnaeus']


100%|██████████| 4564/4564 [45:57<00:00,  1.66it/s]


In [177]:
from collections import Counter

def print_keys_by_length(categories_dict):
    """
    Report how many items each key of a dictionary holds, biggest first.

    One line is printed per key in the form ``<key> <len(value)>``. Keys
    whose values have the same length keep the order in which they appear
    in the dictionary.

    Args:
        categories_dict (dict): Mapping whose values all support ``len()``.
    """
    # Measure every value once so the sort and the printout share the numbers.
    size_of = {key: len(members) for key, members in categories_dict.items()}

    # sorted() is stable, so ties stay in dictionary order even with reverse=True.
    for key in sorted(size_of, key=size_of.get, reverse=True):
        print(key, size_of[key])

# List every key of categories_dict with the length of its value, largest first.
print_keys_by_length(categories_dict)

Genetics, Breeding and Seeds Management 745
Soil Science 388
Horticulture and Ornamental Plants 377
Plant Pathology 347
Crop Production 340
Plant Physiology 317
Crop Nutrient Management 221
Plant Science 192
Animal Science 173
Agricultural Economics 166
Entomology 155
Plant Taxonomy and Scientific Naming 141
Irrigation and Water Management 114
Agricultural Geography 99
Pests and Weeds Management 88
Microbiology 79
Agricultural Education 74
Plant Anatomy 73
Fertilizers 67
Agricultural Engineering 66
Plant Biochemistry 52
Agricultural History 50
Agricultural Extension 47
Food Science 38
Climate and Agriculture 37
Plant Protection 31
Farm Management 30
Sustainability, Forestry and Ecology 25
Precision Agriculture 16


In [178]:
# Number of keys currently held in categories_dict.
len(categories_dict)

29

In [175]:
# Combine the lists from 'key1' and 'key2' into a new key 'new_key'

def combine_keys(new_key, key1, key2, categories=None):
    """
    Store the concatenation of two keys' lists under ``new_key``.

    The result is ``categories[key1] + categories[key2]`` (key1's items
    first). Afterwards ``key2`` is removed from the mapping, while ``key1``
    is left in place, exactly as before. ``new_key`` may be equal to
    ``key1`` or ``key2``; in that case the merged list simply replaces the
    existing entry instead of being deleted again.

    Args:
        new_key: Key that receives the merged list.
        key1: First source key (kept in the mapping).
        key2: Second source key (removed unless it is also ``new_key``).
        categories (dict, optional): Mapping to modify in place. Defaults to
            the notebook-level ``categories_dict``.

    Raises:
        KeyError: If ``key1`` or ``key2`` is not present in the mapping.
    """
    # Explicit None check so an (empty) dict passed by the caller is honoured.
    target = categories_dict if categories is None else categories

    # Build the merged list before touching the mapping, so a missing key
    # raises without leaving a half-applied merge behind.
    merged = target[key1] + target[key2]
    target[new_key] = merged

    # Dropping key2 when it is also the destination would throw away the
    # merged list that was just written.
    if key2 != new_key:
        target.pop(key2)

In [179]:
# Write categories_dict to disk as JSON using the save_to_json helper.
# NOTE(review): the destination is a hard-coded absolute path — confirm it exists
# in the target environment or move it to a configurable directory.
save_to_json(categories_dict, '/workdir/category_dictionary.json')

In [182]:
# Print up to 16 randomly chosen 'Precision Agriculture' questions for a manual spot-check.
precision_docs = categories_dict['Precision Agriculture']
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the category's current size.
for doc in sample(precision_docs, min(16, len(precision_docs))):
    print(f"Question: {doc['question']} | Options: {doc['options']}")

Question: Which system provides the accurate positional information, which will be more useful in location the spatial variability with accuracy?  | Options: ['DSS', 'GPS', 'GIS', 'All of these']
Question: Application of Remote sensing and GIS based technologies are essential for? | Options: ['Integrated Farming', 'Intensive Farming', 'Precision Farming', 'Mixed Farming']
Question: Which system provides the accurate positional information, which will be more useful in locating the spatial variability with accuracy? | Options: ['GPS', 'GIS', 'DSS', 'All of these']
Question: Which one of the following techniques is used to monitor crop health? | Options: ['Surveillance', 'Vigilance', 'Remote sensing', 'All of these']
Question: Development of precision GIS / GPS auto negative systems increased the efficiency of field operations in ……………………….? | Options: ['Processing farming', 'Conservation agriculture', 'Cooperative farming', 'Multistorey farming']
Question: Which software is used for spa