In [1]:
import concurrent.futures
import csv
import json
import os
import warnings
from urllib.parse import parse_qs, urlsplit

import tiktoken
from openai import OpenAI
from tqdm import tqdm

# Directory holding the raw source files, relative to this notebook.
DATA_DIR = "../../data"

data_files = [
    "wb_ag_datasets.csv",
    "wb_ag_projects.csv",
    "wb_youtube_videos.json",
    "wb_ag_ext_papers.csv",
    "wb_ag_usecases.csv"
]
data = []

# Load every source file into one flat list of dict records, tagging each
# record with a "type" derived from the file it came from.
for file in data_files:
    print(f"FILE: {file}")
    path = os.path.join(DATA_DIR, file)
    if file.endswith(".json"):
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            rows = json.load(f)
    else:
        # newline="" lets the csv module do its own newline handling, as its
        # documentation requires for quoted fields that span lines.
        with open(path, "r", encoding="utf-8", newline="") as f:
            rows = list(csv.DictReader(f))
    if rows:
        # Show the schema of this source; skipped for an empty file, where
        # rows[0] would raise IndexError.
        print(rows[0].keys())
    # "wb_ag_datasets.csv" -> "dataset", "wb_youtube_videos.json" -> "youtube_video":
    # strip the prefixes and the extension, then drop the trailing plural "s".
    record_type = file.replace("wb_", "").replace("ag_", "").split(".")[0][:-1]
    # NOTE(review): wb_ag_ext_papers.csv has its own "type" column, which is
    # overwritten here by the file-derived type -- confirm that is intended.
    data.extend({**row, "type": record_type} for row in rows)
len(data)


FILE: wb_ag_datasets.csv
dict_keys(['name', 'description', 'dataset_id', 'project_id', 'files'])
FILE: wb_ag_projects.csv
dict_keys(['id', 'project', 'implementer', 'region', 'country', 'documents', 'sectors', 'years', 'contacts'])
FILE: wb_youtube_videos.json
dict_keys(['link', 'excerpt', 'summary'])
FILE: wb_ag_ext_papers.csv
dict_keys(['id', 'document', 'abstract', 'date', 'type', 'authors', 'sectors', 'implementer', 'url'])
FILE: wb_ag_usecases.csv
dict_keys(['id', 'use_case', 'project', 'description', 'implementer', 'region', 'country', 'documents', 'sectors', 'years', 'contacts'])


2348

In [2]:
# Give every record an "id". Records that already have one are left alone,
# datasets reuse their dataset_id, and YouTube excerpts get
# "<video_id>_<start offset>" derived from the watch link.
for d in data:
    if d.get("id"):
        continue
    if d.get("dataset_id"):
        d["id"] = d["dataset_id"]
    elif d["type"] == "youtube_video":
        # Parse the query string instead of slicing the URL, so a link with no
        # timestamp or with extra parameters no longer crashes or yields a
        # mangled id.
        query = parse_qs(urlsplit(d["link"]).query)
        video_id = query.get("v", [""])[0]
        if not video_id:
            raise ValueError(f"Cannot extract a video id from link: {d['link']!r}")
        start = query.get("t", [""])[0]
        # Offsets look like "123s"; strip the unit only when it is really
        # there, so a bare "123" does not lose its last digit.
        if start.endswith("s"):
            start = start[:-1]
        d["video_id"] = video_id
        d["timestamp"] = start
        d["id"] = f"{video_id}_{start}" if start else video_id
        

In [3]:
# Keys that this notebook adds to a record itself. They are left out of the
# text so that re-running this cell does not fold earlier output back in.
DERIVED_KEYS = {"text_to_embed", "embedding"}

# Build the text sent to the embedding model by joining a record's values.
# None values (the fillers added when records are padded to a common key set)
# are skipped, and values are passed through str() so a non-string value does
# not make join() raise TypeError.
for d in data:
    d["text_to_embed"] = ". ".join(
        str(value)
        for key, value in d.items()
        if key not in DERIVED_KEYS and value is not None
    )

In [4]:
# Collect the union of keys seen across all records ...
key_set = set()
for record in data:
    key_set |= set(record)

# ... then pad every record so it carries all of them, using None for any key
# the record did not originally have. Existing values always win.
blank_record = dict.fromkeys(key_set)
data = [{**blank_record, **record} for record in data]

In [10]:
# Shared OpenAI client used by get_embedding below. The key is read from the
# environment; os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def get_tokens(text: str):
    """Tokenise ``text`` with tiktoken's ``cl100k_base`` encoding.

    Newlines are replaced by single spaces before encoding.

    Args:
        text: The string to tokenise.

    Returns:
        The result of ``encode`` on the flattened text.
    """
    single_line = " ".join(text.split("\n"))
    return tiktoken.get_encoding("cl100k_base").encode(single_line)


def get_embedding(tokens: list, max_tokens: int = 8191, model: str = "text-embedding-3-small"):
    """Request an embedding for an already-tokenised text.

    Inputs longer than ``max_tokens`` are truncated, with a warning, rather
    than aborting the whole run. The previous version announced a truncation
    in its message but raised instead, leaving the truncation unreachable.

    Args:
        tokens: Token ids to embed (e.g. the output of ``get_tokens``).
        max_tokens: Maximum number of tokens sent to the API. Defaults to
            8191, the limit the original code enforced.
        model: Embedding model name passed to the API.

    Returns:
        The ``embedding`` field of the first item in the API response.
    """
    if len(tokens) > max_tokens:
        warnings.warn(
            f"Token length {len(tokens)} exceeds {max_tokens} tokens, "
            f"truncating to {max_tokens} tokens",
            stacklevel=2,
        )
        tokens = tokens[:max_tokens]

    response = client.embeddings.create(input=tokens, model=model)
    return response.data[0].embedding

# Fan the embedding requests out over a pool of 10 worker threads.
# executor.map returns results in input order, so embeddings[i] belongs to
# data[i].
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    embedding_iter = executor.map(
        lambda record: get_embedding(get_tokens(record["text_to_embed"])),
        data,
    )
    # Consuming the iterator through tqdm drives the progress bar; total is
    # passed explicitly because the map iterator has no length.
    embeddings = list(tqdm(embedding_iter, total=len(data)))

100%|█████████████████████████████████████████████████████████████████| 2348/2348 [01:14<00:00, 31.65it/s]


In [11]:
# zip() stops at the shorter input, so a stale `embeddings` list (for example
# from a run over a different `data`) would silently drop or mispair records.
# Fail loudly instead.
assert len(embeddings) == len(data), (
    f"{len(embeddings)} embeddings for {len(data)} records; "
    "re-run the embedding cell"
)
# Attach each embedding to a copy of its record.
data = [{**d, "embedding": embedding} for d, embedding in zip(data, embeddings)]

In [12]:
# Serialise before opening the output file: opening with "w" truncates it, so
# a serialisation error after that point would leave an empty
# records_v1.0.json behind. (json is already imported in the first cell.)
records_json = json.dumps(data)
with open("records_v1.0.json", "w", encoding="utf-8") as f:
    f.write(records_json)
# The serialised document is large; do not keep it alive in the kernel.
del records_json