In [18]:
import csv
import json
from tqdm import tqdm
import tiktoken
from openai import OpenAI
import concurrent.futures
import requests
import os
from settings import (
    YOUTUBE_API_KEY, 
    OPENAI_API_KEY
)

"""
wb_ag_datasets.csv,	
wb_youtube_videos.json,
all_usecases.csv
"""
# Accumulates the records from all sources (datasets, use cases, video snippets);
# the loading cells below extend this list.
data = []

Process datasets file

In [11]:
# Load the dataset catalogue.
# newline="" lets the csv module handle line endings itself, so quoted fields
# that contain embedded newlines are parsed correctly (per the csv module docs).
# NOTE(review): assumes the file is UTF-8 encoded — confirm. Pinning the encoding
# keeps the read independent of the platform default.
with open("data/wb_ag_datasets.csv", "r", newline="", encoding="utf-8") as f:
    datasets = list(csv.DictReader(f))

# Quick look at the raw rows: a sample, the column names, the value types and
# the "files" value of the first row.
print(datasets[0:3])
print(datasets[0].keys())
print([type(v) for v in datasets[0].values()])
print(datasets[0]["files"])

# Add the common record fields: "id" (copied from "dataset_id"), "type" and
# "title" (copied from "name").
datasets = [{ **d, "id": d["dataset_id"], "type": "dataset", "title": d["name"]} for d in datasets]

# Remove dataset records left over from an earlier run of this cell so that
# re-running it does not append duplicates to `data`.
data[:] = [record for record in data if record.get("type") != "dataset"]
data.extend(datasets)

[{'name': 'World - World Food Security Outlook', 'description': "Key components of the WFSO database cover severe food insecurity prevalence, estimates for countries lacking official data, population sizes of the severely food insecure, and required safety net financing. Data is presented in a user-friendly format.<br><br>WFSO data primarily relies on hunger and malnutrition data from the State of Food Security and Nutrition in the World (SOFI) report, led by the Food and agriculture Organization (FAO) in collaboration with multiple UN agencies. WFSO complements SOFI data by providing estimates for unreported countries. Historical estimates are produced with a World Bank machine learning model leveraging World Development Indicators (WDI) for global coverage. <br><br>Financing needs for safety nets are calculated similarly to past approaches by the International Development Association (IDA) to assess food insecurity response needs. Preliminary estimates and projections rely on the sam

In [14]:
# Load the use-case table.
# newline="" lets the csv module handle line endings itself, so quoted fields
# that contain embedded newlines are parsed correctly (per the csv module docs).
# NOTE(review): assumes the file is UTF-8 encoded — confirm. Pinning the encoding
# keeps the read independent of the platform default.
with open("data/all_usecases.csv", "r", newline="", encoding="utf-8") as f:
    usecases = list(csv.DictReader(f))

# Quick look at the raw rows: a sample, the column names, the value types and
# the "document" value of the first row.
print(usecases[0:3])
print(usecases[0].keys())
print([type(v) for v in usecases[0].values()])
print(usecases[0]["document"])

# Add the common record fields: "title" (copied from "use_case") and "type".
usecases = [{**u, "title":u["use_case"], "type": "usecase"} for u in usecases]

# Remove use-case records left over from an earlier run of this cell so that
# re-running it does not append duplicates to `data`.
data[:] = [record for record in data if record.get("type") != "usecase"]
data.extend(usecases)

[{'id': 'P154784', 'use_case': 'Upscaling Climate-Smart Practices', 'project': 'Kenya Climate Smart Agriculture Project', 'description': "The Kenya Climate Smart Agriculture Project (KCSAP) successfully implemented various methodologies for upscaling climate-smart agricultural practices, which led to remarkable outcomes in Kenyan agricultural sectors. One core methodology was mobilizing nearly 23,400 Common Interest Groups (CIGs) and Vulnerable and Marginalized Groups (VMGs), thereby impacting 370,000 farmers, 55 percent of whom were women. This effort surpassed initial targets by facilitating the adoption of climate-resilient practices among a significant number of farmers. Another notable strategy was the disbursement of 11,200 micro-projects versus an initial goal of 10,400, demonstrating efficient use of resources to catalyze agricultural transformation. Additionally, the project identified and trained 20,000 lead farmers, contributing significantly to community-level skill develop

In [15]:
# Cache mapping a YouTube video id to its title. Several snippets may come from
# the same video, so the title lookup only has to happen once per video id.
video_id_to_title = {}

In [17]:
# Load the video snippets. JSON text is UTF-8, so pin the encoding rather than
# relying on the platform default.
with open("data/wb_youtube_videos.json", "r", encoding="utf-8") as f:
    videos = json.load(f)

# Quick look at the raw records: a sample, the keys and the value types.
print(videos[0:3])
print(videos[0].keys())
print([type(v) for v in videos[0].values()])

# An "excerpt link" looks like https://www.youtube.com/watch?v=<video id>&t=<seconds>s.
# Strip the URL prefix once per record instead of once per derived field.
link_tails = [v["excerpt link"].replace("https://www.youtube.com/watch?v=", "") for v in videos]

# Derived fields go first and the original keys after them, so the key order of
# each record is unchanged:
#   video_id  - the part before the first "&"
#   timestamp - the "t=" value without its trailing unit character
#   id        - "<video id>_<timestamp>"
videos = [{
    "video_id": tail.split("&")[0],
    "timestamp": tail.split("&")[1].replace("t=", "")[:-1],
    "id": tail.replace("&t=", "_")[:-1],
    "type": "video",
    **v} for v, tail in zip(videos, link_tails)]

for v in tqdm(videos):
    video_id = v["video_id"]
    # Only hit the API for video ids whose title has not been cached yet.
    if not video_id_to_title.get(video_id):
        r = requests.get(
            "https://www.googleapis.com/youtube/v3/videos",
            # Let requests build (and URL-encode) the query string.
            params={"part": "snippet", "id": video_id, "key": YOUTUBE_API_KEY},
            # Without a timeout a stalled connection would hang the cell forever.
            timeout=30,
        )
        # Fail with the HTTP error (bad key, quota, ...) instead of a KeyError
        # on the missing "items" field of an error response.
        r.raise_for_status()
        # Parse the response body once.
        items = r.json().get("items")
        if not items:
            video_id_to_title[video_id] = "VIDEO NO LONGER AVAILABLE"
        else:
            video_id_to_title[video_id] = items[0]["snippet"]["title"]
    v["title"] = video_id_to_title[video_id]

# Remove video records left over from an earlier run of this cell so that
# re-running it does not append duplicates to `data`.
data[:] = [record for record in data if record.get("type") != "video"]
data.extend(videos)
 

[{'excerpt link': 'https://www.youtube.com/watch?v=sHMD3wBlD7E&t=1s', 'transcript excerpt': "agriculture webinar what's cooking series uh this has been a series which has been launched uh during the pandemic the the amount of knowledge and learning which is happening is exponential and we host in this series a number of uh practitioners thought leaders CEOs of uh technology data companies uh public sector decision makers and a number of people who are working on this field together and the idea is to have a free exchange of Innovations digital Solutions which are emerging at a very rapid rate across the world particularly in the area of Agriculture and rural development uh so we have a very interesting panel today and uh today's uh topic is also very important because", 'video summary': "A recording of a webinar from the World Bank's Data Driven Digital Agriculture YouTube channel detailing how KUZA leverages rural youth as change agents in agri-food sector transformation at scale."}, 

100%|██████████| 10194/10194 [00:15<00:00, 656.52it/s]


In [9]:
# OpenAI API client, authenticated with the key imported from settings; used by
# get_embedding to request embeddings.
client = OpenAI(api_key=OPENAI_API_KEY)

def get_text_to_embed(d: dict):
    """Flatten a record into the text that gets embedded.

    Every entry whose value is not None becomes a "key: value" line, in the
    dict's own order; the lines are joined with a newline followed by a space.
    An empty dict, or one holding only None values, yields an empty string.
    """
    lines = []
    for field, content in d.items():
        # Skip missing values; falsy-but-present values (0, "") are kept.
        if content is None:
            continue
        lines.append(f"{field}: {content}")
    return "\n ".join(lines)

def get_tokens(text: str):
    """Tokenize ``text`` with tiktoken's ``cl100k_base`` encoding.

    Newlines are replaced by single spaces before encoding. Returns the tokens
    exactly as produced by the encoding's ``encode`` method.
    """
    encoder = tiktoken.get_encoding("cl100k_base")
    flattened = text.replace("\n", " ")
    return encoder.encode(flattened)


def get_embedding(tokens: list, model: str = "text-embedding-3-small", max_tokens: int = 8191):
    """Request an embedding for an already-tokenized input.

    Args:
        tokens: Tokens to embed (e.g. the output of ``get_tokens``).
        model: Name of the embedding model passed to the API.
        max_tokens: Largest number of tokens sent in one request. Longer inputs
            are truncated from the end and a warning is printed.
            NOTE(review): 8191 is presumably the input limit of the default
            model — confirm before using a different ``model``.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    if len(tokens) > max_tokens:
        print(f"WARNING: Token length exceeds {max_tokens} tokens, truncating to {max_tokens} tokens")
        tokens = tokens[:max_tokens]

    response = client.embeddings.create(input=tokens, model=model)
    return response.data[0].embedding

# Embed every record on a pool of 10 worker threads. executor.map yields the
# results in the same order as `data`, so embeddings[i] belongs to data[i].
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    pending = executor.map(
        lambda record: get_embedding(get_tokens(get_text_to_embed(record))),
        data,
    )
    # The map result has no length, so the progress bar is told the total.
    progress = tqdm(pending, total=len(data))
    embeddings = list(progress)

100%|██████████| 2348/2348 [01:06<00:00, 35.45it/s]


In [12]:
# zip() stops at the shorter input, so a stale `embeddings` list (for example,
# one computed before more records were added to `data`) would silently drop
# records. Fail loudly instead.
if len(embeddings) != len(data):
    raise ValueError(
        f"Expected one embedding per record, got {len(embeddings)} embeddings "
        f"for {len(data)} records; re-run the embedding cell."
    )

# Attach each embedding to its record under the "embedding" key.
data = [{**d, "embedding": embedding} for d, embedding in zip(data, embeddings)]

In [13]:
# Standardize keys: every record ends up with the same set of keys, with None
# for the keys it did not have.
key_set = set()
for record in data:
    key_set |= set(record.keys())

# Template with every known key set to None; each record's own values are
# laid over a fresh copy of it.
blank_record = dict.fromkeys(key_set)
data = [{**blank_record, **record} for record in data]

In [14]:
import json

# Serialize before opening the output file: opening with "w" truncates it, so
# if a record turns out not to be JSON-serializable the error is raised while
# any existing records file is still intact.
serialized = json.dumps(data)
with open("records_v1.0.json", "w") as f:
    f.write(serialized)