In [10]:
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import datetime
import calendar

In [11]:
DIR_RAW_DATA = Path.cwd().joinpath("data", "raw")
DIR_TABLES_DATA = Path.cwd().joinpath("data", "tables")

In [12]:
def convert2float(s):
    if isinstance(s, float):
        return s
    elif s == "#DIV/0!":
        return np.NaN
    elif "," in s:
        # Remove the comma
        s = s.replace(",", "")
        # Convert to float
        return float(s)
    else:
        return float(s)

# Process Price Data 

In [16]:
def get_last_day_of_month(month_name, year=2024):
    # Convert month name to month number
    month_number = datetime.strptime(month_name, '%B').month
    
    # If year is not provided, use the current year
    if year is None:
        year = datetime.now().year
    
    # Get the last day of the month
    last_day = calendar.monthrange(year, month_number)[1]
    
    # Create the date object for the last day of the month
    last_day_date = datetime(year, month_number, last_day)

    # Format the date object to string in 'YYYY-MM-DD' format
    last_day_date_str = last_day_date.strftime('%Y-%m-%d')
    
    return last_day_date_str

## Maize Prices

In [17]:
file_maize_prices = DIR_RAW_DATA.joinpath("prices-maize.csv")
df_maize = pd.read_csv(file_maize_prices)
df_maize.dropna(subset=['ADD'], inplace=True)
df_maize['Price'] = df_maize.Price.apply(convert2float)


df_maize.drop(columns=['Wk1', 'Wk2',
       'Wk3', 'Wk4', 'Wk5'], inplace=True)
df_maize['Year'] = df_maize.Year.astype(int)
df_maize.rename(columns={'District ': "District", 
                         "Month":'Month_Name', "Year":'Yr',
                         "ADD": "ADD_Name", "EPA": "EPA_Name"}, 
                inplace=True)
df_maize['Collection_Date'] = df_maize.Month_Name.apply(get_last_day_of_month)

In [22]:
df_maize.ADD.value_counts()

Lilongwe       75
Kasungu        70
Machinga       50
Blantyre       50
Mzuzu          45
Karonga        25
Shirevalley    25
Salima         20
Name: ADD, dtype: int64

In [29]:
df_maize.EPA.value_counts().index

Index(['Chitsime', 'Thondwe', 'Kaluluma', 'Misuku', 'Manjawira', 'Nasenga',
       'Mbwadzulu', 'Mpiya', 'Chikweo', 'Bazale', 'Nsipe', 'Mpokwa',
       'Tsangano', 'Sharpevale', 'Njolomole', 'Bembeke', 'Linthipe', 'Mtubwi',
       'Ngwelero', 'Lobi', 'Mbulumbuzi', 'Lunzu', 'Mtonda', 'Dwale',
       'Nkhonjeni', 'Mwanza', 'Neno', 'Nkhulambe', 'Naminjiwa', 'Milonde',
       'Mitole', 'Mikalango', 'Mbewe', 'Magoti', 'Mtakataka', 'Mitundu',
       'Chitekwere', 'Mwansambo', 'Lufita', 'Mpata', 'Vinthukutu', 'Bolero',
       'Katowo', 'Emsizini', 'Kazomba', 'Mbawa', 'Champhira', 'Mpamba',
       'Nkhata Bay', 'Chintheche', 'Nkhunga', 'Linga', 'Tembwe', 'Kavukuku',
       'Chikwatula', 'Malomo', 'Nachisaka', 'Mponela', 'Madisi', 'Bowe',
       'Nambuma', 'Santhe', 'Chipala', 'Chamama', 'Mkanda', 'Zulu', 'Chilaza',
       'Demera', 'Zunde'],
      dtype='object')

In [34]:
df_maize.District.value_counts().index

Index(['Ntcheu', 'Kasungu', 'Dedza', 'Dowa West', 'Zomba', 'Chikwawa',
       'Machinga', 'Lilongwe East', 'Lilongwe West', 'Chitipa', 'Nkhotakota',
       'Nkhata Bay', 'Mzimba South', 'Nsanje', 'Mchinji', 'Rumphi', 'Karonga',
       'Mangochi', 'Ntchisi', 'Phalombe', 'Blantyre', 'Thyolo', 'Neno',
       'Mulanje', 'Dowa East', 'Mwanza', 'Chiradzulu', 'Balaka', 'Salima',
       'Mzimba North'],
      dtype='object')

In [124]:
df_maize

Unnamed: 0,ADD_Name,EPA_Name,District,Market,Month_Name,Yr,Commodity,Price
0,Karonga,Misuku,Chitipa,Misuku,January,2024,Maize,544.445000
1,Karonga,Kavukuku,Chitipa,Nthalire,January,2024,Maize,
2,Karonga,Lufita,Chitipa,Chitipa,January,2024,Maize,
3,Karonga,Mpata,Karonga,Karonga,January,2024,Maize,752.330000
4,Karonga,Vinthukutu,Karonga,Chilumba,January,2024,Maize,601.866667
...,...,...,...,...,...,...,...,...
355,Shirevalley,Mitole,Chikwawa,Chikhwawa,May,2024,Maize,517.815000
356,Shirevalley,Mikalango,Chikwawa,Ngabu,May,2024,Maize,530.307500
357,Shirevalley,Mbewe,Chikwawa,Nchalo,May,2024,Maize,508.411667
358,Shirevalley,Magoti,Nsanje,Bangula,May,2024,Maize,


## Rest of the commodities Prices

In [47]:
prices = [df_maize]
for file in DIR_RAW_DATA.iterdir():
    if not "prices" in file.parts[-1]:
        continue
    if file.parts[-1] == "prices-maize.csv":
        continue
    df = pd.read_csv(file)
    df['Price'] = df.Price.apply(lambda x: convert2float(x))
    df.dropna(subset=['ADD'], inplace=True)
    df.rename(columns={'District ': "District", 
                         "Month":'Month_Name', "Year":'Yr',
                         "ADD": "ADD_Name", "EPA": "EPA_Name"}, inplace=True)
    df['Yr'] = df_maize.Yr.astype(int)
    df['Collection_Date'] = df.Month_Name.apply(get_last_day_of_month)
    prices.append(df)
df_prices = pd.concat(prices)
df_prices['District'] = df_prices.District.apply(lambda x: x.split()[0])


In [64]:
assert df_prices.shape[0] == len(df_maize)*6

In [35]:
dists = list(df_prices.District.unique())

In [50]:
df_prices.to_csv(DIR_TABLES_DATA.joinpath("prices.csv"), index=False)

In [56]:
df_prices.query('District == "Karonga" & Month_Name == "May" & Commodity == "Maize"').Price.mean()

524.6175000000001

In [59]:
df_prices.Commodity.unique()

array(['Maize', 'Rice', 'Soya beans', 'Beans', 'Cow peas', 'Groundnuts'],
      dtype=object)

# Process Production Data

In [109]:
df_prod = pd.read_csv(DIR_RAW_DATA.joinpath("production.csv"))
df_prod.dropna(subset=['District'], inplace=True)
df_prod['District'] = df_prod.District.apply(lambda x: x.split()[0])
df_prod2 = df_prod.groupby("District").sum().reset_index()
df_prod2['Tobacco'] = df_prod2.apply(lambda x: x['NDDF Tobacco'] + x['Burley tobacco'], axis=1)
df_prod2.drop(columns=["NDDF Tobacco", "Burley tobacco"], inplace=True)

In [111]:
value_cols = ["Maize", 'Beans', 'Cow peas', 'Dolichus beans ',
       'Soy beans', 'Ground beans', 'Paprika', 'Rice', 'Pigeon peas', 'Grams',
       'Sesame ', 'Field peas', 'Velvet beans', 'Chick peas', 'Wheat',
       'Millet', 'Sorghum ', 'Groundnuts', 'Cassava', 'Sweet potatoes',
       'Potatoes', 'Tobacco', 'Flue cured',
       'Sunflower ', 'Chillies', 'Cotton ', 'Bananas', 'Mangoes', 'Oranges',
       'Tangerines', 'Coffee', 'Pineapples', 'Guava', 'Pawpaws', 'Peaches',
       'Lemons', 'Grape fruits', 'Apples', 'Avocado pear', 'Macademia',
       'Tomatoes', 'Onions', 'Cabbage', 'Egg plants', 'Okra', 'Cucumber']
df_prod_long = df_prod2.melt(id_vars='District', value_vars=value_cols,
var_name= "Crop", value_name="Yield")
df_prod_long['Season'] = "2023-2024"
df_prod_long.replace("Rumphi ", 'Rumphi', inplace=True)

In [112]:
df_prod_long.head(10)

Unnamed: 0,District,Crop,Yield,Season
0,Balaka,Maize,43565.0,2023-2024
1,Blantyre,Maize,34369.0,2023-2024
2,Chikwawa,Maize,55804.0,2023-2024
3,Chiradzulu,Maize,35078.0,2023-2024
4,Chitipa,Maize,102319.0,2023-2024
5,Dedza,Maize,227724.0,2023-2024
6,Dowa,Maize,259890.0,2023-2024
7,Karonga,Maize,67998.0,2023-2024
8,Kasungu,Maize,318203.0,2023-2024
9,Likoma,Maize,255.1,2023-2024


In [79]:
df_prod_long.District.unique()

array(['Mzimba South', 'Mzimba North', 'Rumphi ', 'Nkhata Bay', 'Likoma',
       'Blantyre', 'Thyolo', 'Mulanje', 'Phalombe', 'mwanza',
       'Chiradzulu', 'Neno', 'Chitipa', 'Karonga', 'Kasungu', 'Mchinji',
       'Dowa East', 'Dowa West', 'Ntchisi', 'Lilongwe West',
       'Lilongwe East', 'Dedza', 'Ntcheu', 'Chikwawa', 'Nsanje',
       'Nkhotakota', 'Salima', 'Machinga', 'Mangochi', 'Zomba', 'Balaka'],
      dtype=object)

In [80]:
df_prod_long.District.unique()

array(['Mzimba South', 'Mzimba North', 'Rumphi ', 'Nkhata Bay', 'Likoma',
       'Blantyre', 'Thyolo', 'Mulanje', 'Phalombe', 'mwanza',
       'Chiradzulu', 'Neno', 'Chitipa', 'Karonga', 'Kasungu', 'Mchinji',
       'Dowa East', 'Dowa West', 'Ntchisi', 'Lilongwe West',
       'Lilongwe East', 'Dedza', 'Ntcheu', 'Chikwawa', 'Nsanje',
       'Nkhotakota', 'Salima', 'Machinga', 'Mangochi', 'Zomba', 'Balaka'],
      dtype=object)

In [113]:
df_prod_long.to_csv(DIR_TABLES_DATA.joinpath("production.csv"), index=False)

In [67]:
df_prod = pd.read_csv(DIR_TABLES_DATA.joinpath("production.csv"))

In [105]:
for c in list(df_prod_long.Crop.unique()):

    if "tobacco" in c.lower():
        print(c)

NDDF Tobacco
Burley tobacco


In [123]:
df_prices.query('Month_Name == "May" & Market == "Dowa" & Commodity == "Maize"')

Unnamed: 0,ADD_Name,EPA_Name,District,Market,Month_Name,Yr,Commodity,Price,Collection_Date
308,Kasungu,Nachisaka,Dowa,Dowa,May,2024,Maize,562.5,2024-05-31


In [124]:
df_prices.Commodity.unique()

array(['Maize', 'Rice', 'Soya beans', 'Beans', 'Cow peas', 'Groundnuts'],
      dtype=object)

In [128]:
x ="Which districts produce most Maize?"
x.lower().split()

['which', 'districts', 'produce', 'most', 'maize?']

In [151]:
df_prices.Price.isna().sum()

1116

# Process Security 

In [201]:
detailed_insecurity_levels = {1: "Households are able to meet essential food and\
non-food needs without engaging in atypical and unsustainable strategies to\
access food and income", 
2: "Households have minimally adequate food consumption\
but are unable to afford some essential non-food expenditures without\
engaging in stress-coping strategies",
3: "Households either have food consumption gaps that are\
reflected by high or above-usual acute malnutrition; or are marginally able to\
meet minimum food needs but only by depleting essential livelihood assets or\
through crisis-coping strategies.",
4: "Households either have large food consumption gaps\
that are reflected in very high acute malnutrition and excess mortality; or are able to\
mitigate large food consumption gaps but only by employing emergency livelihood\
strategies and asset liquidation",
5: "Households have an extreme lack of food\
and/or cannot meet other basic needs even after full employment of coping\
strategies. Starvation, death, destitution and extremely critical acute malnutrition\
levels are evident. For famine classification, area needs to have extreme critical levels\
of acute malnutrition and mortality"}
brief_insecurity_levels = {1: "None or minimal", 2: "stressed", 
                           3: "Crisis", 4: "Emergency", 5: "Catastrophe or famine"}
phaase2num = {"phase{}".format(i):i for i in range(1,6)}

In [213]:
df_list = []

for file in DIR_RAW_DATA.iterdir():
    if "food" not in file.parts[-1]:
        continue
    df = pd.read_csv(file)
    if "may" in file.parts[-1]:
        df['period'] = "May-Sept 2024"
    else:
        df['period'] = "October 2024-March 2025"
    df2 = df.melt(id_vars=["district", 'population', 'period'], 
            value_vars=['phase1', 'phase2', 'phase3', 'phase4',
       'phase5'],value_name="percentage_population", 
       var_name="insecurity")
    df_list.append(df2)
df_insec = pd.concat(df_list)
df_insec["insecurity_level"] = df_insec.insecurity.map(phaase2num)
df_insec["insecurity_desc_short"] = df_insec.insecurity_level.map(brief_insecurity_levels)
df_insec["insecurity_desc_long"] = df_insec.insecurity_level.map(detailed_insecurity_levels)
df_insec.drop(columns=['insecurity'], inplace=True)
df_insec['population'] = df_insec.population.apply(lambda x: convert2float(x))


In [214]:
df_insec.to_csv(DIR_TABLES_DATA.joinpath("food-insecurity.csv"), index=False)

In [62]:
df_prices_may = df_prices.query('Month_Name == "May"')


In [137]:
df_prices_may.query('Commodity == "Maize"').Price.mean()

599.5864912289474

In [64]:
df_prices_may.Market.nunique() == len(df_prices_may)

False

In [136]:
df_prices.query()

1776.6735181990423

# Create and Save Example SQL Queries 

In [139]:
import json

EXAMPLES_EN = [
    {
        "input": "What is the price of Maize in Rumphi", 
        "query": "SELECT district, AVG(price) as average_price FROM commodity_prices WHERE commodity = 'Maize' AND collection_date = (SELECT MAX(collection_date) FROM commodity_prices WHERE commodity = 'Maize') AND district = 'Rumphi' GROUP BY district;"
    },
    {
        "input": "What is the price of rice in Lilongwe district now", 
        "query": "SELECT district, AVG(price) as average_price FROM commodity_prices WHERE commodity = 'Rice' AND collection_date = (SELECT MAX(collection_date) FROM commodity_prices WHERE commodity = 'Rice') AND district = 'Lilongwe' GROUP BY district;"
    },
    {
        "input": "Which district has the lowest price for groundnuts?", 
        "query": """WITH latest_date AS (
    SELECT MAX(collection_date) AS max_date
    FROM commodity_prices
    WHERE commodity = 'Groundnuts'
),
average_prices AS (
    SELECT district, AVG(price) AS average_price
    FROM commodity_prices
    WHERE commodity = 'Groundnuts' AND collection_date = (SELECT max_date FROM latest_date)
    GROUP BY district
)
SELECT district, average_price
FROM average_prices
ORDER BY average_price ASC
LIMIT 1;"""
    },
    {
        "input": "Where is it cheap to buy Maize?", 
        "query": """WITH latest_date AS (
    SELECT MAX(collection_date) AS max_date
    FROM commodity_prices
    WHERE commodity = 'Maize'
),
average_prices AS (
    SELECT market, AVG(price) AS average_price
    FROM commodity_prices
    WHERE commodity = 'Maize' AND collection_date = (SELECT max_date FROM latest_date)
    GROUP BY market
)
SELECT market, average_price
FROM average_prices
ORDER BY average_price ASC
LIMIT 5;"""
    },
    {
        "input": "Where is the best place to sale soya?", 
        "query": """WITH latest_date AS (
    SELECT MAX(collection_date) AS max_date
    FROM commodity_prices
    WHERE commodity = 'Soya beans'
),
average_prices AS (
    SELECT market, AVG(price) AS average_price
    FROM commodity_prices
    WHERE commodity = 'Soya beans' AND collection_date = (SELECT max_date FROM latest_date)
    GROUP BY market
)
SELECT market, average_price
FROM average_prices
ORDER BY average_price DESC
LIMIT 1;"""
    },
    {
        "input": "Whats the price of beans?",
        "query": """WITH latest_date AS (
    SELECT MAX(collection_date) AS max_date
    FROM commodity_prices
    WHERE commodity = 'Beans'
)
SELECT AVG(price) AS average_price
FROM commodity_prices
WHERE commodity = 'Beans' AND collection_date = (SELECT max_date FROM latest_date);"""
    },
    {
        "input": "Which district produced the most Maize",
        "query": """WITH max_yield_district AS (
    SELECT district
    FROM production
    WHERE crop = 'Maize'
    ORDER BY yield DESC
    LIMIT 1
)
SELECT district
FROM max_yield_district;"""
    },
    {
        "input": "Where can I find alot of rice to purchase?",
        "query": """WITH max_yield_district AS (
    SELECT district
    FROM production
    WHERE crop = 'Rice'
    ORDER BY yield DESC
    LIMIT 1)
SELECT district
FROM max_yield_district;"""
    },
    {
        "input": "Which crops did well in Rumphi",
        "query": "SELECT crop FROM production WHERE district = 'Rumphi' ORDER BY yield DESC LIMIT 5;"
    },
    {
        "input": "Which districts harvested the most tobacco?",
        "query": """WITH max_yield_district AS (
    SELECT district
    FROM production
    WHERE crop = 'Tobacco'
    ORDER BY yield DESC
    LIMIT 5)
SELECT district
FROM max_yield_district;"""
    },
    {
        "input": "Where can I buy soya?",
        "query": """WITH max_yield_district AS (
        SELECT district
        FROM production
        WHERE crop = 'Soy beans'
        ORDER BY yield DESC
        LIMIT 5)
    SELECT district
    FROM max_yield_district;"""
    },
    {
        "input": "Which district produced more maize: Lilongwe or Kasungu",
        "query": """WITH district_yields AS (
    SELECT district, SUM(yield) AS total_yield
    FROM production
    WHERE crop = 'Maize' AND district IN ('Lilongwe', 'Kasungu')
    GROUP BY district
        )
        SELECT district
        FROM district_yields
        ORDER BY total_yield DESC
        LIMIT 1;"""
    },
    {
        "input": "How much is maize in Dowa?",
        "query": """WITH latest_date AS (
    SELECT MAX(collection_date) AS max_date
    FROM commodity_prices
    WHERE commodity = 'Maize' AND market = 'Dowa'
)
SELECT price
FROM commodity_prices
WHERE commodity = 'Maize' AND market = 'Dowa' AND collection_date = (SELECT max_date FROM latest_date);"""
    },
    {
        "input": "Districts with cheap beans",
        "query": """SELECT market, price
            FROM commodity_prices
            WHERE commodity = 'Beans' AND collection_date = (SELECT MAX(collection_date) FROM commodity_prices WHERE Commodity = 'Beans')
            ORDER BY price ASC
            LIMIT 5;"""
    },
    {
        "input": "Where can i get cheap groundnuts",
        "query": """SELECT market, price
            FROM commodity_prices
            WHERE commodity = 'Groundnuts' AND collection_date = (SELECT MAX(collection_date) FROM commodity_prices WHERE Commodity = 'Groundnuts')
            ORDER BY price ASC
            LIMIT 5;"""
    },
    {
        "input": "current price of Maize in the country",
        "query": """-- Query to retrieve the average price of a given commodity on the most recent collection_date
        SELECT AVG(Price) as average_price
        FROM commodity_prices
        WHERE Commodity = 'Maize' AND collection_date = (SELECT MAX(collection_date) FROM commodity_prices WHERE Commodity = 'Maize');"""
    }
]

# Save the list to a text file
with open('sql_examples_en.txt', 'w') as f:
    json.dump(EXAMPLES_EN, f, indent=4)


In [None]:
## Machine Translate Chichewa Questions into English and Use them as Example