# **Setup**
## Imports

In [1]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# Logging
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram pivot functions
from pathlib import Path
from ngram_prep.ngram_pivot.config import PipelineConfig
from ngram_prep.ngram_pivot.pipeline import run_pivot_pipeline

# Utilities
from ngram_prep.utilities.peek import *

## Set up logging to file

In [2]:
# Send log output to a file rather than the notebook console (console=False).
# The log file is created inside the directory given as `db_path`; the call
# returns the log-file path, which the notebook displays as the cell result.
# NOTE(review): this directory is also the pivot destination DB, and the
# pipeline is run with mode="restart" -- confirm the log file survives that.
log_target_dir = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db"

setup_logger(
    db_path=log_target_dir,
    console=False,
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
    force=True,
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db/ngram_download_20251019_121326.log')

# **Pivot Multigrams to Create Yearly Indices**

In [3]:
# Locations used by the pivot: processed input DB, pivoted output DB, and a
# scratch directory for temporary files.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db")
dst_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db")
tmp_dir = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp")

# Per-bucket byte cap: 128 MiB expressed in bytes.
bucket_byte_cap = 128 * 1024 * 1024

# NOTE(review): the output saved with this notebook ends in
# "RuntimeError: ... only 6704/9191 units were ingested" -- the pivoted DB from
# that run is incomplete; investigate before relying on it.
pipeline_cfg = PipelineConfig(
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    num_workers=30,
    num_initial_work_units=5000,
    max_split_depth=50,
    work_unit_claim_order="sequential",
    split_check_interval_s=45.0,
    progress_every_s=60.0,
    mode="restart",
    ingest_batch_items=100_000,
    max_items_per_bucket=1_000_000,
    max_bytes_per_bucket=bucket_byte_cap,
)

run_pivot_pipeline(pipeline_cfg)


PARALLEL N-GRAM DATABASE PIVOT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-19 12:13:26
Mode:       RESTART

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Source DB:            ...dk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db
Target DB:            .../edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db
Temp directory:       /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp

Parallelism
────────────────────────────────────────────────────────────────────────────────────────────────────
Workers:              30
Initial work units:   5000
Max split depth:      50
Split check interval: 45.0s

Database Profiles
────────────────────────────────────────────────────────────────────────────────────────────────────
Reader profile:       read:packed24
Writer profile:       write

RuntimeError: Pipeline completed but only 6704/9191 units were ingested. Data is incomplete.

# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [None]:
# Database to inspect: the processed 5-gram DB (the pivot pipeline's source).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show the first 5 key-value pairs from the start of the database.
db_head(
    db_path,
    key_format="utf-8",
    value_format="packed",
    n=5,
)

## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [None]:
# Database to inspect: the processed 5-gram DB (the pivot pipeline's source).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show 5 key-value pairs, beginning at the given start key.
db_peek(
    db_path,
    start_key=b"quick brown <UNK> <UNK> <UNK>",
    key_format="utf-8",
    value_format="packed",
    n=5,
)


## `db_peek_prefix`: Print key-value pairs whose keys start with the specified prefix

In [None]:
# Database to inspect: the processed 5-gram DB (the pivot pipeline's source).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show a single entry (n=1) for the given key prefix, using the "summary"
# value format instead of the "packed" format used in the other peek cells.
db_peek_prefix(
    db_path,
    prefix=b"<UNK> united state <UNK> <UNK>",
    key_format="utf-8",
    value_format="summary",
    n=1,
)