In [None]:
from pathlib import Path
from typing import Any, Dict, Optional, Generator, List, TypedDict, TypeVar, Iterable

import jsonlines
from IPython.display import display, Markdown, HTML
from loguru import logger
from pydantic import BaseModel
from pydantic import ValidationError

from models.posts import PostEntry, PostRaw, PostMediaVariantEntry, PostFileEntry
from models.tags import TagEntry
from models.tag_alias import TagAliasEntry
from models.artists import ArtistEntry


In [None]:
def split_tags(tags: str) -> Iterable[str]:
    """Split a whitespace-separated tag string into individual tags.

    Runs of whitespace count as a single separator and leading/trailing
    whitespace is ignored, so no empty tags are ever produced.

    Args:
        tags: Tag names separated by whitespace.

    Returns:
        The tags in their original order; empty when ``tags`` is empty or
        contains only whitespace.
    """
    # str.split() with no separator collapses repeated whitespace and never
    # yields "" — unlike split(" "), which returns [""] for an empty string.
    return tags.split()

def get_id_tag_pairs(posts: Iterable[PostRaw]) -> Generator[tuple[int, str], None, None]:
    """Lazily yield one ``(post id, tag)`` pair per tag of every post.

    For each post, ``post["tag_string"]`` is split with ``split_tags`` and
    every resulting tag is paired with ``post["id"]``. Pairs are produced in
    post order, then in tag order within a post.
    """
    for post in posts:
        for tag in split_tags(post["tag_string"]):
            yield post["id"], tag

def batched_read_objs(path: str | Path,
                      batch_size: int = 1000) -> Generator[List[Dict[str, Any]], None, None]:
    """Read objects from a file opened with ``jsonlines.open``, in batches.

    Args:
        path: Location of the file to read.
        batch_size: Maximum number of objects per yielded batch. Values
            below 1 behave like 1 (every object is yielded on its own).

    Yields:
        Lists holding up to ``batch_size`` objects in file order. The final
        list may be shorter; nothing is yielded for an empty file. Every
        yielded list is a fresh object, so callers may keep or mutate it.
    """
    with jsonlines.open(path) as reader:
        batch: List[Dict[str, Any]] = []
        for obj in reader:
            batch.append(obj)
            if len(batch) >= batch_size:
                yield batch
                # Start a new list instead of clearing the old one: the
                # consumer may still hold a reference to the yielded batch.
                batch = []
        if batch:
            yield batch


# Input locations; relative paths, so they resolve against the current
# working directory.
PROJECT_ROOT = Path("..", "..")
RAW_DIR = PROJECT_ROOT.joinpath("raw")
POSTS_JSON = RAW_DIR.joinpath("posts.json")

# Number of objects per batch handed to batched_read_objs.
BATCH_SIZE = 5
# Number of batches the preview loop processes before it stops.
READ_BATCHES = 1

In [None]:
# Preview: log the (post id, tag) pairs of the first READ_BATCHES batches.
# batched_read_objs opens POSTS_JSON itself, so no separate file handle is
# opened here.
for batch_number, batch in enumerate(batched_read_objs(POSTS_JSON, BATCH_SIZE), start=1):
    posts = [PostRaw(**post) for post in batch]
    for post_id, tag in get_id_tag_pairs(posts):
        logger.info(f"{post_id} {tag}")
    # Checked after processing, so at least one batch is always logged.
    if batch_number >= READ_BATCHES:
        break