In [2]:
import csv
import json
import os
from tqdm import tqdm
import tiktoken
from openai import OpenAI
import concurrent.futures
import requests
from dotenv import load_dotenv
import os

# Load variables from a local .env file (python-dotenv) before any key is read below.
load_dotenv()

# Read eagerly: a missing variable raises KeyError here instead of in the middle of
# the YouTube title lookup loop.
youtube_api_key = os.environ["YOUTUBE_DATA_API_KEY"]

# Bare string literal (no runtime effect) listing the source files this notebook combines.
"""
wb_ag_datasets.csv,	
wb_ag_projects.csv,	
wb_youtube_videos.json,
wb_ag_ext_papers.csv,
wb_ag_usecases.csv
"""
# Accumulator for the records of every source; each loading cell extends it.
data = []

Process datasets file

In [3]:
# Load the dataset catalogue. The csv module requires files to be opened with
# newline="" so that line breaks embedded in quoted fields are parsed correctly,
# and the encoding is pinned so the result does not depend on the platform
# default (the descriptions contain non-ASCII punctuation).
with open("../../data/wb_ag_datasets.csv", "r", newline="", encoding="utf-8") as f:
    datasets = list(csv.DictReader(f))

# Quick look at the raw rows: a sample, the column names, the value types and
# the `files` column of the first row.
print(datasets[0:3])
print(datasets[0].keys())
print([type(v) for v in datasets[0].values()])
print(datasets[0]["files"])

# Map onto the shared record schema: all original columns are kept, and
# `id`, `type` and `title` are set from `dataset_id`, a constant and `name`.
datasets = [{**d, "id": d["dataset_id"], "type": "dataset", "title": d["name"]} for d in datasets]

data.extend(datasets)

[{'name': 'Subnational Poverty and Inequality Database (SPID)', 'description': 'This data entry includes subnational poverty direct estimates from household surveys over time for more than 141 economies at the administrative unit level 1. Administrative unit level 1 refers to the highest subnational unit level (examples include ‘state’, ‘governorate’, ‘province’) from household surveys.<br>In 2013, the World Bank announced the goals of fighting poverty in all its forms by 2030 and promoting shared prosperity. Despite a remarkable progress made on reducing poverty in recent years, reaching the targets remains challenging. The decline in poverty has been uneven. The poverty reduction in recent years was dominated by the East Asia and Pacific (notably China and Indonesia) and South Asia (notably India) (World Bank, 2017). Despite the progress made so far, the number of extreme poor remains high, especially in Sub-Saharan Africa. In many countries, vast differences in poverty levels persis

In [4]:
# Load the projects table. newline="" is what the csv module requires for
# correct handling of line breaks inside quoted fields; the explicit encoding
# removes the dependency on the platform default.
with open("../../data/wb_ag_projects.csv", "r", newline="", encoding="utf-8") as f:
    projects = list(csv.DictReader(f))

# Quick look at the raw rows: a sample, the column names, the value types and
# the `documents` column of the first row.
print(projects[0:3])
print(projects[0].keys())
print([type(v) for v in projects[0].values()])
print(projects[0]["documents"])

# Map onto the shared record schema: original columns (including the CSV's own
# `id`) are kept; `type` and `title` are added, the latter from `project`.
projects = [{**p, "type": "project", "title": p["project"]} for p in projects]
data.extend(projects)

[{'id': 'P149827', 'project': 'BF REDD+ Readiness Preparation', 'implementer': "Programme d'Investissement Forestier - Coordination Nationale", 'region': 'Western and Central Africa', 'country': 'Burkina Faso', 'documents': '["\'Auditing Document\': \'http://documents.worldbank.org/curated/en/099081723082536082/pdf/P1498270c1377d0940a4f609eb0bb65ed60.pdf\'", "\'Procurement Plan\': \'http://documents.worldbank.org/curated/en/099143002072310138/pdf/P1498270780df40450a1750b0790a799001.pdf\'"]', 'sectors': "['Public Administration - Agriculture, Fishing & Forestry', 'Other Agriculture, Fishing and Forestry', 'Forestry']", 'years': "['2025']", 'contacts': "['Mirko Ivo Serkovic', 'Yasmina Oodally']"}, {'id': 'P180732', 'project': 'Ukraine Agriculture Recovery Inclusive Support Emergency (ARISE) Project', 'implementer': 'Ministry of Agrarian Policy and Food, Business Development Fund', 'region': 'Europe and Central Asia', 'country': 'Ukraine', 'documents': '["\'Environmental and Social Commit

In [5]:
# Load the external papers table. newline="" is what the csv module requires
# for correct handling of line breaks inside quoted fields; the explicit
# encoding removes the dependency on the platform default.
with open("../../data/wb_ag_ext_papers.csv", "r", newline="", encoding="utf-8") as f:
    papers = list(csv.DictReader(f))

# Quick look at the raw rows: a sample, the column names, the value types and
# the `document` column (the paper title) of the first row.
print(papers[0:3])
print(papers[0].keys())
print([type(v) for v in papers[0].values()])
print(papers[0]["document"])  # Title

# Map onto the shared record schema: `title` comes from `document`.
# NOTE(review): the CSV has its own `type` column (e.g. 'Working Paper'); it is
# overwritten here by the record type "paper" — confirm that losing it is intended.
papers = [{**p, "title": p["document"], "type": "paper"} for p in papers]
data.extend(papers)

[{'id': '166128', 'document': 'Empowering Women Farmers : Evidence from a RandomizedControl Trial in Mozambique', 'abstract': 'These results show how a psychological mindset training targeted to women farmers can lead to greater investment and entrepreneurship, helping their households transition out of subsistence farming and into market-oriented agriculture and off-farm businesses. The results also suggest the promise of psychology to improve the design and effectiveness of standard agricultural extension programs, showing the importance of teaching not only best production technologies and practices, but also developing an entrepreneurial mindset.', 'date': '16-Nov-21', 'type': 'Working Paper', 'authors': "['Montalvao Machado', 'Joao H. C.']", 'sectors': "['Crops and Crop Management Systems', 'Agricultural Extension', 'Climate Change and Agriculture', 'Private Sector Development Law', 'Marketing', 'Private Sector Economics', 'Labor Markets']", 'implementer': 'Gender Impact Evaluatio

In [6]:
# Load the use-case table. newline="" is what the csv module requires for
# correct handling of line breaks inside quoted fields; the explicit encoding
# removes the dependency on the platform default.
with open("../../data/wb_ag_usecases.csv", "r", newline="", encoding="utf-8") as f:
    usecases = list(csv.DictReader(f))

# Quick look at the raw rows: a sample, the column names, the value types and
# the `documents` column of the first row.
print(usecases[0:3])
print(usecases[0].keys())
print([type(v) for v in usecases[0].values()])
print(usecases[0]["documents"])

# Map onto the shared record schema: `title` comes from `use_case`.
# NOTE(review): the `id` column here appears to hold the parent project's id, so
# a use case and its project can share the same `id` — confirm consumers key on
# (`type`, `id`) rather than `id` alone.
usecases = [{**u, "title": u["use_case"], "type": "usecase"} for u in usecases]
data.extend(usecases)

[{'id': 'P180732', 'use_case': 'Accessible Agricultural Financing', 'project': 'Ukraine Agriculture Recovery Inclusive Support Emergency (ARISE) Project', 'description': 'This use case example reflects a strategic implementation of accessible financial support mechanisms to bolster agricultural productivity in the face of adversity. In the context of the Ukraine Agriculture Recovery Inclusive Support Emergency (ARISE) Project, financial accessibility was addressed through a two-pronged approach aimed at enhancing the resilience and sustainability of the agricultural sector amidst significant challenges, including conflict and natural or man-made disasters. Firstly, affordable credit was made available to farms, facilitating capital investment and operational continuity. This component targeted the broader agricultural ecosystem, ensuring entities of varying sizes could access the financial resources necessary for recovery and future growth. Secondly, the project instituted a grant syst

In [7]:
# Cache of video_id -> video title. Several/many excerpts may come from the same
# video, so keeping the titles here lets each one be looked up only once.
video_id_to_title = {}

In [8]:
# Load the video excerpts; the encoding is pinned so non-ASCII text in the
# excerpts does not depend on the platform default.
with open("../../data/wb_youtube_videos.json", "r", encoding="utf-8") as f:
    videos = json.load(f)

# Quick look at the raw entries: a sample, the keys and the value types.
print(videos[0:3])
print(videos[0].keys())
print([type(v) for v in videos[0].values()])

# Each link has the form https://www.youtube.com/watch?v=<video_id>&t=<seconds>s.
# Derive video_id, timestamp (seconds, without the trailing "s") and a combined
# "<video_id>_<seconds>" id from it. The derived keys come first and the original
# fields last, so an original field with the same name would win.
# A link without an "&..." part raises IndexError, as before.
watch_prefix = "https://www.youtube.com/watch?v="
video_records = []
for v in videos:
    link_tail = v["link"].replace(watch_prefix, "")  # "<video_id>&t=<seconds>s"
    link_parts = link_tail.split("&")
    video_records.append({
        "video_id": link_parts[0],
        "timestamp": link_parts[1].replace("t=", "")[:-1],
        "id": link_tail.replace("&t=", "_")[:-1],
        "type": "video",
        **v,
    })
videos = video_records

# Look up each video's title once via the YouTube Data API and cache it in
# video_id_to_title, then attach the title to every excerpt of that video.
youtube_videos_url = "https://www.googleapis.com/youtube/v3/videos"
for v in tqdm(videos):
    video_id = v["video_id"]
    if not video_id_to_title.get(video_id):
        r = requests.get(
            youtube_videos_url,
            # Let requests build and URL-encode the query string.
            params={"part": "snippet", "id": video_id, "key": youtube_api_key},
            # Without a timeout a stalled connection would hang the cell forever.
            timeout=30,
        )
        if not r.ok:
            # Not r.raise_for_status(): its message embeds the request URL,
            # which would print the API key into the notebook output.
            raise RuntimeError(
                f"YouTube Data API request for video {video_id} failed with HTTP {r.status_code}"
            )
        items = r.json().get("items")  # parse the body once
        if not items:
            video_id_to_title[video_id] = "VIDEO NO LONGER AVAILABLE"
        else:
            video_id_to_title[video_id] = items[0]["snippet"]["title"]
    v["title"] = video_id_to_title[video_id]

data.extend(videos)
 

[{'link': 'https://www.youtube.com/watch?v=g-0XrjMYUBo&t=13s', 'excerpt': "Srimathi Sridhar: Hello everyone, and welcome\nback to The Development Podcast from the World Bank. I'm Srimathi Sridhar, coming to you from Washington,\nDC. This is the first of some very special episodes\nwe've got for you here, where we're exploring how to end poverty on a livable planet, and\nwhat a livable planet really means. Today, we'll be turning our attention to climate\nchange. And as I speak to you, the 28th UN Climate\nSummit, or COP28, is already underway. The focus there is on cutting emissions, adaptation,\nand solutions that can address the challenges of a world that is warming far too quickly. We'll hear about the urgent need for climate\naction from the Prime Minister of the Pacific Island Nation of Samoa. Fiamē Naomi Mata'Afa: We know what the challenge\nis. We know the targets that we need to reach\nif we are to survive. Srimathi Sridhar: From youth activists urging\nthe international commun

100%|██████████| 944/944 [00:09<00:00, 97.97it/s] 


In [9]:
# OpenAI client used by get_embedding; a missing OPENAI_API_KEY raises KeyError here.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_text_to_embed(d: dict):
    """Flatten a record into a single string of ``key: value`` entries.

    Entries whose value is ``None`` are left out. The remaining entries keep the
    dict's iteration order and are separated by a newline followed by a space.
    """
    parts = []
    for field, content in d.items():
        if content is None:
            continue
        parts.append(f"{field}: {content}")
    return "\n ".join(parts)

def get_tokens(text: str):
    """Encode ``text`` with tiktoken's ``cl100k_base`` encoding.

    Every newline is replaced by a space before encoding. Returns the result of
    the encoding's ``encode`` call.
    """
    single_line = " ".join(text.split("\n"))
    return tiktoken.get_encoding("cl100k_base").encode(single_line)


def get_embedding(tokens: list, model: str = "text-embedding-3-small", max_tokens: int = 8191):
    """Return the embedding vector for an already-tokenized text.

    Args:
        tokens: Token ids, as produced by ``get_tokens``.
        model: Name of the OpenAI embedding model to call.
        max_tokens: Longest input sent to the API; longer inputs are truncated
            to their first ``max_tokens`` tokens and a warning is printed.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    if len(tokens) > max_tokens:
        print(f"WARNING: Token length exceeds {max_tokens} tokens, truncating to {max_tokens} tokens")
        tokens = tokens[:max_tokens]

    response = client.embeddings.create(input=tokens, model=model)
    return response.data[0].embedding

# Embed every record using a pool of 10 worker threads. executor.map yields
# results in the order of `data`, so embeddings[i] belongs to data[i].
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    embedding_results = executor.map(
        lambda d: get_embedding(get_tokens(get_text_to_embed(d))),
        data,
    )
    # The map iterator has no len(), so tqdm is told the bar length explicitly;
    # list() drains the iterator while the pool is still open.
    embeddings = list(tqdm(embedding_results, total=len(data)))

100%|██████████| 2348/2348 [01:06<00:00, 35.45it/s]


In [12]:
# Attach each embedding to its record. zip() stops silently at the shorter
# input, so if `data` and `embeddings` have drifted apart (e.g. a loading cell
# was re-run after the embeddings were computed) records would be dropped
# without any warning — fail loudly instead.
if len(data) != len(embeddings):
    raise ValueError(
        f"Cannot attach embeddings: {len(data)} records but {len(embeddings)} embeddings; "
        "re-run the embedding cell."
    )
data = [{**d, "embedding": embedding} for d, embedding in zip(data, embeddings)]

In [13]:
# Standardize keys: give every record the same keys, filling missing ones with None.
# Keys are collected in first-seen order in a dict rather than by iterating a
# set: the iteration order of a set of strings changes between interpreter runs,
# which made the key order of the written JSON non-reproducible.
key_defaults = {}
for d in data:
    key_defaults.update(dict.fromkeys(d))  # every value is None
key_set = set(key_defaults)  # kept for any later cell that reads it
data = [{**key_defaults, **d} for d in data]

In [14]:
import json

# Serialize `data` as a single JSON document in the current working directory.
with open("records_v1.0.json", "w") as f:
    json.dump(data, f)