### 1. Generate Description

This version of description generation uses 3 functions to generate description. The methods used are RetrievalQA, create_pandas_dataframe_agent and chatGPT llm

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# LangChain imports
from langchain.agents import create_pandas_dataframe_agent
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredExcelLoader, UnstructuredFileLoader
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain import PromptTemplate

# import openAI API key - put correct key
import os
os.environ["OPENAI_API_KEY"] = "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"
api_key = "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"
# Display all columns from dataset
pd.set_option('display.max_columns', None)

Bot responses dataset

In [2]:
file_ = "../DGDatasets/FLW Feedback on chilli bot.xlsx"
df = pd.ExcelFile(file_)
print(f"This file has {len(df.sheet_names)} sheet(s)")

This file has 3 sheet(s)


In [3]:
pd.read_excel(file_, df.sheet_names[0]).shape

(155, 6)

In [4]:
pd.read_excel(file_, df.sheet_names[1]).shape

(0, 6)

In [5]:
pd.read_excel(file_, df.sheet_names[2]).shape

(0, 6)

In [3]:
sub_df = pd.read_excel(file_)
sub_df.head(3)

Unnamed: 0,S.No,Name of the FLW,Question asked,Response received,Feedback on the response,Over all Observations
0,1.0,B Balaji Naik,Chilli varieties please,"The high yielding varieties in chilli are G3, ...",Appropriate,Response observations from female FLW:\n1. Var...
1,,,,,,
2,2.0,B Balaji Naik,Which pesticides use for black thrips,"To control the sucking pests (Aphids, thrips, ...",Appropriate,Summary: 1. Most of questions asked by the FLW...


In [4]:
llm = OpenAI(openai_api_key=api_key, temperature=0)

In [40]:
# create method for prompt template
# create method for Retrieval QA
# generate description 3 times then summarize the description, pass the 3 desc and dataset to chatgpt

#### Wrapping the description generations in functions

In [5]:
# function to engineer prompt with column names only
def create_prompt_col(col):
    prompt = PromptTemplate(
        input_variables = ["columns"],
        template = "Explain this dataset based on {columns}"
    )
    query = prompt.format(columns = list(col))
    return query

In [6]:
# function for prompt engineering
# makes prompt from column names and 5 first rows
def create_prompt(col, row):
    prompt = PromptTemplate(
    input_variables=["columns", "rows"],
    template = "Generate a summary about this dataset based on {columns} and {rows} information"
    )
    query = prompt.format(columns = list(col), rows = row)
    return query

In [7]:
# function to generate description with pandas agent
def generate_desc_pd_agent(df, question):
    agent = create_pandas_dataframe_agent(OpenAI(temperature=0),
                                         df,
                                         verbose=True
                                         )
    description = agent.run(question)
    return description

In [8]:
# function to generate description from chatGPT model
def gen_desc_gpt(query):
    description = llm(query)
    return description

In [9]:
# function to generaate description with Retrieval QA
def gen_desc_rQA(file, api_k, llm, query):
    df = pd.read_excel(file)
    query_gpt = create_prompt_col(df.columns)
    loader = UnstructuredFileLoader(file)
    doc = loader.load()
    
    # split docs
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=400)
    docs = text_splitter.split_documents(doc)
    
    total_num_char = sum([len(x.page_content) for x in docs])
    # print(f"Now you have {len(docs[:4])} documents that have an average of {total_num_char / len(docs):,.0f} characters")
    # embeddings
    # creating vectors db
    embeddings = OpenAIEmbeddings(openai_api_key=api_k)
    docsearch = FAISS.from_documents(docs[:4], embeddings)
    
    # Create retrieval QA engine
    qa = RetrievalQA.from_chain_type(llm = llm,
                                     chain_type = "stuff",
                                     retriever = docsearch.as_retriever(),
                                     verbose = True)
    
    try:                     
        description = qa.run(query)
    except Exception:
        print("Error message:")
        description = gen_desc_gpt(query_gpt)
    finally:
        return description

##### a. Generating description from create pandas agent

In [10]:
question = "explain this dataset using all information provided in dataset"
desc_agent = generate_desc_pd_agent(sub_df, question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to look at the data and think about what it is telling me
Action: python_repl_ast
Action Input: print(df.head())[0m
Observation: [36;1m[1;3m   S.No Name of the FLW                                    Question asked   \
0   1.0  B Balaji Naik                             Chilli varieties please   
1   NaN             NaN                                                NaN   
2   2.0  B Balaji Naik               Which pesticides use for black thrips   
3   3.0  B Balaji Naik   Please share some pesticides available in the ...   
4   NaN             NaN                                                NaN   

                                  Response received   \
0  The high yielding varieties in chilli are G3, ...   
1                                                NaN   
2  To control the sucking pests (Aphids, thrips, ...   
3  Here are some pesticides that can be used to c...   
4                              

In [11]:
print(desc_agent)

This dataset contains information about questions asked by female farmers and the responses they received. It includes the question asked, the response received, the feedback on the response, and overall observations.


##### b. Generate description from chatGPT model

In [12]:
query_gpt = create_prompt(sub_df.columns, sub_df.head())
desc_gpt = gen_desc_gpt(query_gpt)

In [13]:
print(desc_gpt.strip())

This dataset contains information about the questions asked by a female farmer (FLW) and the responses received from the FLW. The responses received were appropriate and the feedback on the response was also appropriate. Overall, the observations from the female FLW were found to be satisfactory. The questions asked by the FLW included chilli varieties, pesticides for black thrips, and other pesticides available in the market. The responses received provided information about high yielding chilli varieties, pesticides to control sucking pests, and other pesticides available in the market.


##### c. Generate description from retrieval QA

In [22]:
query = "Provide a meaningful summary of this dataset given all provided information"
desc_qa = gen_desc_rQA(file_, api_key, llm, query)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [23]:
print(desc_qa)

 The dataset provides information on how to answer questions related to chilli cultivation. The bot is able to provide appropriate responses to questions about chilli varieties, pest control, fruit rot, intercropping, and yellow chilli cultivation. The bot is also able to provide responses in Telugu, although there are some translation issues. The bot is able to provide detailed answers to questions, although some basic questions are not answered.


In [24]:
# different question
query2 = "Give a meaningful summary of this dataset"
desc_qa2 = gen_desc_rQA(file_, api_key, llm, query2)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [25]:
print(desc_qa2)

 This dataset provides information about the cultivation of chilli crops, including tips on controlling pests, using resistant varieties, maintaining good field hygiene, and using organic fungicides. It also provides advice on intercropping and border cropping, as well as tips for cultivating yellow chilli.


In [26]:
# different question
query3 = "Explain this dataset"
desc_qa3 = gen_desc_rQA(file_, api_key, llm, query3)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [27]:
print(desc_qa3)

 This dataset contains information about conversations between farmers and a bot about chilli cultivation. The conversations include questions about chilli varieties, pests, and cultivation techniques. The dataset also includes observations from female farmers about the bot's responses.


##### Comparing description generated by RetrievalQA vs pandas dataframe agent

In [29]:
file_2 = "../DGDatasets/Woreda DA Registry Records.xlsx"

In [30]:
query = "Explain this dataset based on all available information"
desc_qa = gen_desc_rQA(file_2, api_key, llm, query)



[1m> Entering new RetrievalQA chain...[0m
Error message:


In [31]:
print(desc_qa.strip())

This dataset contains information about individuals in Ethiopia. It includes their salutation, name, father's name, grandfather's name, sex, birth month and year, maritial status, phone number, alternate phone number (optional), email, education level, specialization, specialization (other), position, employment month and year (in the Ethiopian calendar), assignment month and year at Kebele (in the Ethiopian calendar), pension number, region, zone, woreda, kebele, kebele (translated), CIAT equivalent kebele, and similarity index (confidence). This data could be used to track individuals in Ethiopia, as well as to analyze trends in education, employment, and other demographic information.


In [32]:
pd.read_excel(file_2).head()

Unnamed: 0,Salutation,Name,Father Name,Grand Father Name,Sex,Birth Month,Birth Year,Maritial Status,Phone No,Alternate Phone No (Optional),Email,Education Level,Specialization,Specialization (Other),Position,Employment Month (Ethiopian Calendar),Employment Year (Ethiopian Calendar),Assignment Month at Kebele (Ethiopian Calendar),Assignment Year at Kebele (Ethiopian Calendar),Pension Number,Region,Zone,Woreda,Kebele,Kebele (Translated),CIAT Equivalent Kebele,Similarity Index (Confidence)
0,አቶ,ይበልጣል,ደሴ,ባያብል,Male,8.0,1984,ያላገባ,941134440.0,,,level4,እን/ሀ/ል/ማ/ኤክስቴንሽን,,እንሰሳትእርባታ,5,2005,5,2005.0,,Amhara,North Shewa,Basona Worena,መ/አምባ,Me/Amba,Amba,90.0
1,ወ/ሪት,የሺአረግ,ዘነበ,ብዙነህ,Female,12.0,1976,ያላገባ,921744060.0,,,ድግሪ,እንሰሳትሳይንስ,,እንሰሳትእርባታ,1,1997,1,1997.0,,Amhara,North Shewa,Basona Worena,ሳሪያ,Sariya,Sariya,100.0
2,አቶ,አበራ,አበበ,ያግቡ,Male,2.0,1968,ያላገባ,912907732.0,,,ድግሪ,እንሰሳትሳይንስ,,እንሰሳትእርባታ,11,1992,11,1992.0,,Amhara,North Shewa,Basona Worena,ባቄሎ,Bakelo,Bakelo,100.0
3,ወ/ሪት,ሰውሃረግ,አለሙ,ሞላ,Male,2.0,1987,ያላገባ,923547939.0,,,level4,ዲያሪፕሮዳከሽንቴክኒክ,,እንሰሳትእርባታ,7,2005,7,2005.0,,Amhara,North Shewa,Basona Worena,ውሻውሽኝ,Wshawshny,Wushawshegn,80.0
4,አቶ,ሞላ,ሲሳይ,ተፈራ,Male,,1984,ያላገባ,922919013.0,,,ድግሪ,,,እንሰሳትእርባታ,7,2010,7,2010.0,,Amhara,North Shewa,Basona Worena,ደሊላ,Delila,Del,90.0


In [33]:
question2 = "explain this dataset using all information provided in dataset"
desc_agent3 = generate_desc_pd_agent(pd.read_excel(file_2), question2)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I should look at the column names and the values in each row
Action: python_repl_ast
Action Input: print(df.columns)[0m
Observation: [36;1m[1;3mIndex(['Salutation', 'Name', 'Father Name', 'Grand Father Name', 'Sex',
       'Birth Month', 'Birth Year', 'Maritial Status', 'Phone No',
       'Alternate Phone No (Optional)', 'Email', 'Education Level',
       'Specialization', 'Specialization (Other)', 'Position',
       'Employment Month (Ethiopian Calendar)',
       'Employment Year (Ethiopian Calendar)',
       'Assignment Month at Kebele (Ethiopian Calendar)',
       'Assignment Year at Kebele (Ethiopian Calendar)', 'Pension Number',
       'Region', 'Zone', 'Woreda', 'Kebele', 'Kebele (Translated)',
       'CIAT Equivalent Kebele', 'Similarity Index (Confidence)'],
      dtype='object')
[0m
Thought:[32;1m[1;3m I should look at the values in each row
Action: python_repl_ast
Action Input: print(df.head())[0m
Ob

In [34]:
print(desc_agent3)

This dataset contains information about individuals, including their salutation, name, father name, grand father name, sex, birth month and year, maritial status, phone number, alternate phone number, email, education level, specialization, position, employment month and year, assignment month and year at kebele, pension number, region, zone, woreda, kebele, kebele (translated), CIAT equivalent kebele, and similarity index (confidence).


Video library dataset

In [38]:
file_3 = "../DGDatasets/VideoLibrary.xlsx"
df2 = pd.ExcelFile(file_)
# counting number of sheets in file
print(f"This file has {len(df2.sheet_names)} sheet(s)")

This file has 3 sheet(s)


In [47]:
df2.sheet_names

['APTS', 'Bihar', 'AP FLWs']

In [51]:
pd.read_excel(file_3).head(3)

Unnamed: 0,title,description,youtubeID,youtube_link,duration_sec,date,onlineLikes,onlineViews,offlineViews,adoptions,category,subcategory,topic,subtopic,subject,language,state,country
0,Vermicompost production in FYM pit,Producing Vermicompost in farm yard manure pi...,EikW0U9m3w0,https://www.youtube.com/watch?v=EikW0U9m3w0,1018,2011-01-14T00:00:00.000Z,20,7559,677,9,,,,,,Kannada,Karnataka,India
1,Mulching system for plant basin,Mulching system by agri wastes protects moistu...,1OW4UWiX_0Q,https://www.youtube.com/watch?v=1OW4UWiX_0Q,978,2011-01-25T00:00:00.000Z,45,9565,2244,46,Agriculture,Water Management,Irrigation Management,,,Kannada,Karnataka,India
2,Dehorning for cross bred calves,De horning system is dis budding of horn for ...,Nx7Dm373H_g,https://www.youtube.com/watch?v=Nx7Dm373H_g,1118,2012-05-31T00:00:00.000Z,14,2719,1037,54,Animal Husbandry,Young Ones management,Early Management Practices,Dehorning,Calves,Kannada,Karnataka,India


In [55]:
# Define an empty list to store individual DataFrames
list_dfs = []

# Iterate through each sheet
for sheet in df2.sheet_names:
    
    # Parse data from each sheet as a DataFrame
    data = df2.parse(sheet)

    # Append individual df to list
    list_dfs.append(data)
    
# Combine all DataFrames into one
df_mgd = pd.concat(list_dfs, ignore_index=True)

In [57]:
# For code below check if row > 1 otherwise do not append to list

In [56]:
# Preview head
df_mgd.head()

Unnamed: 0,S.No,Name of the FLW,Question asked,Response received,Feedback on the response,Over all Observations,Name of the FLW / JEEViKA field staff
0,1.0,B Balaji Naik,Chilli varieties please,"The high yielding varieties in chilli are G3, ...",Appropriate,Response observations from female FLW:\n1. Var...,
1,,,,,,,
2,2.0,B Balaji Naik,Which pesticides use for black thrips,"To control the sucking pests (Aphids, thrips, ...",Appropriate,Summary: 1. Most of questions asked by the FLW...,
3,3.0,B Balaji Naik,Please share some pesticides available in the ...,Here are some pesticides that can be used to c...,Appropriate,,
4,,,,,,,


In [58]:
# Generate description

In [59]:
q = "Explain this dataset"
desc_qa1 = gen_desc_rQA(file_3, api_key, llm, q)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [60]:
print(desc_qa1)

 This dataset contains information about videos related to agriculture, animal husbandry, and horticulture in the Kannada language from Karnataka, India. The videos cover topics such as nutrient management, income generation activities, erosion control, seed treatment, cattle insurance, and plant protection. Each video includes information such as the title, YouTube link, date published, number of views, number of likes, number of comments, and the category of the video.


Wheat rust forecast dataset

In [64]:
file_4 = "../DGDatasets/Wheat_rust_forecast_final.xlsx"
dfEx = pd.ExcelFile(file_4)
# number of sheets
print(len(df4.sheet_names))

1


In [66]:
df4 = pd.read_excel(file_4)
df4.head(3)

Unnamed: 0,Year,Disease,Severity,Growth Stage,Region,Zone,Woreda,Kebele,Forecast Date,Message,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,2020,Yellow Rust / Stripe Rust,High (> 60%),Tillering,Oromia,West Arsi,Gedeb Assassa,,,Hi Mr. / Ms xxx I am a re fd mnpresentative fr...,,,
1,2020,Stem Rust,High (> 40%),Boot,Oromia,West Arsi,Jeju,"Huruta Dore, Hijara",,,,,
2,2020,Stem Rust,High (> 40%),Milk,Oromia,East Shoa,Boset,Sifa,,,,,


In [67]:
df4.columns

Index(['Year', 'Disease', 'Severity', 'Growth Stage', 'Region', 'Zone',
       'Woreda', 'Kebele', 'Forecast Date', 'Message', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')

In [68]:
df4.shape

(55, 13)

In [69]:
# remove columns with NaN > 80%?

In [73]:
df4['Unnamed: 12'].isna().sum()

54

In [74]:
df4.drop(columns=["Unnamed: 10", "Unnamed: 11", "Unnamed: 12"], inplace = True)

In [75]:
df4.head(3)

Unnamed: 0,Year,Disease,Severity,Growth Stage,Region,Zone,Woreda,Kebele,Forecast Date,Message
0,2020,Yellow Rust / Stripe Rust,High (> 60%),Tillering,Oromia,West Arsi,Gedeb Assassa,,,Hi Mr. / Ms xxx I am a re fd mnpresentative fr...
1,2020,Stem Rust,High (> 40%),Boot,Oromia,West Arsi,Jeju,"Huruta Dore, Hijara",,
2,2020,Stem Rust,High (> 40%),Milk,Oromia,East Shoa,Boset,Sifa,,


In [76]:
q = "Explain this dataset"
desc_qa1 = gen_desc_rQA(file_4, api_key, llm, q)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [77]:
print(desc_qa1)

 This dataset contains information about wheat rust incidence in Oromia, Ethiopia in 2020. It includes information about the type of rust (Stem Rust or Leaf Rust/Stripe Rust), the severity (Low, Moderate, or High), the stage of wheat growth (Tillering, Flowering, Boot, Maturity, or Dough), the region (East Shewa, West Arsi, Arsi, East Shoa, or Boset), and the specific location (Fentale, Sire, Tedechabela-hasse, Fentale Garadimaa, etc.).


### 2. Extract keywords

In [None]:
# Approaches for keyword extraction
# 1. Unique values from columns (col with unique vals <=3)
# 2. From generated description

In [79]:
question = "Extract top 10 words or tags related to agriculture, or year, or region included in this summary"
keywords = llm(desc_qa1 + question)

In [80]:
print(keywords)



1. Agriculture
2. Year (2020)
3. Region (Oromia, Ethiopia)
4. Rust (Stem Rust, Leaf Rust/Stripe Rust)
5. Severity (Low, Moderate, High)
6. Stage (Tillering, Flowering, Boot, Maturity, Dough)
7. East Shewa
8. West Arsi
9. Arsi
10. East Shoa


In [87]:
# Format the above dataset
question2 = "Extract top 10 tags related to agriculture, or date/ year, or region/ country included in this summary"

In [88]:
keywords2 = llm(desc_qa1 + question2)

In [89]:
print(keywords2)

.

1. Agriculture
2. Wheat Rust
3. Ethiopia
4. 2020
5. Stem Rust
6. Leaf Rust/Stripe Rust
7. Severity
8. Tillering
9. Flowering
10. Boot


In [90]:
type(keywords2)

str

In [91]:
# pass keywords to llm and extract words in a list removing the numbers