In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"neonkazuha","key":"e308374317107aca47893808a0cd6b36"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d jonathanomara/agronomic-question-and-answer-dataset

Dataset URL: https://www.kaggle.com/datasets/jonathanomara/agronomic-question-and-answer-dataset
License(s): Apache 2.0
Downloading agronomic-question-and-answer-dataset.zip to /content
  0% 0.00/71.4k [00:00<?, ?B/s]
100% 71.4k/71.4k [00:00<00:00, 39.5MB/s]


In [4]:
!unzip '/content/agronomic-question-and-answer-dataset.zip'

Archive:  /content/agronomic-question-and-answer-dataset.zip
  inflating: AgroQA Dataset.csv      


In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
df = pd.read_csv('/content/AgroQA Dataset.csv')
df.head()

Unnamed: 0,Crop,Question,Answer
0,maize,"Apart from hand weeding, what other method use...",Machinery weeders are available
1,beans,"Apart from insecticide, what other method used...",Use resistant verities and increase on water a...
2,maize,Apart from sun drying which other method used ...,Use tarpaulins or cemented floor free from dust
3,cassava,"Apart from sun drying, what other method can I...",Solar driers
4,beans,As a farmer when should I harvest beans.,When the beans pods are yellowish green or dry...


## Creating Answers with TFidf

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
df.drop(['Crop'], axis=1, inplace=True)

In [9]:
df.head()

Unnamed: 0,Question,Answer
0,"Apart from hand weeding, what other method use...",Machinery weeders are available
1,"Apart from insecticide, what other method used...",Use resistant verities and increase on water a...
2,Apart from sun drying which other method used ...,Use tarpaulins or cemented floor free from dust
3,"Apart from sun drying, what other method can I...",Solar driers
4,As a farmer when should I harvest beans.,When the beans pods are yellowish green or dry...


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  3044 non-null   object
 1   Answer    3043 non-null   object
dtypes: object(2)
memory usage: 47.7+ KB


In [11]:
df.isnull().sum()

Question    0
Answer      1
dtype: int64

In [12]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3043 entries, 0 to 3043
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  3043 non-null   object
 1   Answer    3043 non-null   object
dtypes: object(2)
memory usage: 71.3+ KB


In [14]:
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000)

In [15]:
len(df)

3043

In [16]:
data = vectorizer.fit_transform(df['Question'])
data

<3043x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 11796 stored elements in Compressed Sparse Row format>

In [17]:
vocab = vectorizer.get_feature_names_out()

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

## Generate Predictions

In [19]:
question = 'Which method can be used for drying maize'
question_vector = vectorizer.transform([question])
question_vector

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [20]:
similarities = cosine_similarity(question_vector, data)
similarities.shape

(1, 3043)

In [21]:
type(similarities)

numpy.ndarray

In [22]:
max_sim_index = similarities.argmax()
max_sim_index

2

In [23]:
print(f'Question: {question} \nAnswer: {df["Answer"][max_sim_index]}')

Question: Which method can be used for drying maize 
Answer: Use tarpaulins or cemented floor free from dust


In [24]:
def generate_answer(question):
  question_vector = vectorizer.transform([question])
  similarities = cosine_similarity(question_vector, data)
  max_sim_index = similarities.argmax()
  return df['Answer'][max_sim_index]

In [25]:
generate_answer('Harvest Beans')

'Pull them out of ground with hands'

## Applying LSA

In [26]:
from sklearn.decomposition import TruncatedSVD

In [36]:
svd = TruncatedSVD(n_components=100)

In [40]:
Y = data
Y = svd.fit_transform(Y)
Y.shape

(3043, 100)

In [41]:
question = 'Which method can be used for drying maize'
question_vector = vectorizer.transform([question])
print(question_vector.shape)
question_vector = svd.transform(question_vector)
# similarities = cosine_similarity(question_vector, Y)
# max_sim_index = similarities.argmax()
question_vector.shape

(1, 1000)


(1, 100)

In [44]:
def generate_answer(question):
  question_vector = vectorizer.transform([question])
  question_vector = svd.transform(question_vector)
  similarities = cosine_similarity(question_vector, Y)
  max_sim_index = similarities.argmax()
  return df['Answer'][max_sim_index]

In [46]:
generate_answer('Which method can be used for drying maize')

'Use tarpaulins or cemented floor free from dust'