In [None]:
import os
import random
import asyncio

In [None]:
import sys
sys.path.append('../../src')

from baml_client.async_client import b
from utils import read_raw_dataset

In [None]:
# Location of the raw dataset file, relative to the notebook's working
# directory (presumably a Kaggle download, going by the variable name).
kaggle_dataset_path = "../../../data/Wikipedia.json"
# Load it through the project helper. Later cells slice the result by row
# and call .iterrows() on the slices, so a pandas DataFrame is expected.
df = read_raw_dataset(kaggle_dataset_path)

In [None]:
# Directory that receives one parquet file per processed row.
# NOTE(review): this path goes up two levels while the dataset path goes up
# three — confirm both are intended.
metadata_path = "../../data/metadata"
# Create the directory (and missing parents) up front; exist_ok=True makes
# re-running this cell harmless.
os.makedirs(metadata_path, exist_ok=True)

In [None]:
async def process_batch(batch):
    """Extract metadata for each row of ``batch`` and save one parquet file per row.

    A row whose output file already exists under ``metadata_path`` is
    skipped, so an interrupted run can be restarted without redoing
    finished rows. Each output file holds a single row with the columns
    ``_id``, ``entities`` and ``triples``.

    Args:
        batch: DataFrame slice; every row is read for its ``_id`` and
            ``article`` values.

    Returns:
        None. Results are written to ``{metadata_path}/{_id}.parquet``.
    """
    # Each batch waits a random 1-60 s before starting, which staggers
    # batches that are launched together.
    to_sleep = random.randint(1, 60)
    print(f"Sleeping for {to_sleep} seconds")
    await asyncio.sleep(to_sleep)

    for _, row in batch.iterrows():
        # Index by label, like row['article'] below, instead of relying on
        # attribute access for a name that starts with an underscore.
        filename = f"{metadata_path}/{row['_id']}.parquet"
        if os.path.exists(filename):
            print(f"Skipping {filename}")
            continue

        article = row['article']
        metadata = await b.ExtractArticleMetadata(article)
        # Flatten each extracted entity to a plain dict of its name and the
        # name of its type.
        entities = [
            {"name": e.name, "type": e.type.name}
            for e in metadata.entities
        ]

        # Single-row frame built from the source row; the list wrappers make
        # the whole entity/triple list one cell value.
        df_row = row.to_frame().T.assign(
            entities=[entities],
            triples=[metadata.triples],
        )

        df_row = df_row[['_id', 'entities', 'triples']]

        # Write to a temporary name and rename into place. The skip check
        # above only tests for existence, so a write interrupted halfway must
        # never leave a truncated file under the final name.
        tmp_filename = f"{filename}.tmp"
        df_row.to_parquet(tmp_filename, index=False)
        os.replace(tmp_filename, filename)

        # Pause after every extraction call (presumably rate limiting — the
        # 6 s value is not explained anywhere in this notebook).
        await asyncio.sleep(6)

In [None]:
# Target number of batches; every batch becomes one concurrently running
# process_batch task.
nbatch = 100
# Rows per batch, rounded up so that at most nbatch batches are produced, and
# never below 1: a step of 0 would make range() raise ValueError when the
# batches are built for a dataset with fewer than nbatch rows.
batch_size = max(1, (len(df) + nbatch - 1) // nbatch)

In [None]:
# Cut df into consecutive row slices of batch_size rows each; the last slice
# holds whatever remains.
batch_starts = range(0, len(df), batch_size)
batches = [df[start:start + batch_size] for start in batch_starts]

In [None]:
await asyncio.gather(*[
    process_batch(batch)
    for batch in batches
])