# **Setup**

In [1]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# Logging
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram pivot functions
from pathlib import Path
from ngram_prep.ngram_pivot.config import PipelineConfig
from ngram_prep.ngram_pivot.pipeline import run_pivot_pipeline

# Utilities
from ngram_prep.utilities.peek import *

## Log to file

In [2]:
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db",
    console=False,
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
    force=True
)

OSError: [Errno 30] Read-only file system: '/vast'

# **Pivot Multigrams to Create Yearly Indices**

In [3]:
pipeline_cfg = PipelineConfig(
    src_db=Path("/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"),
    dst_db=Path("/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_pivoted.db"),
    tmp_dir=Path("/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/pivot_tmp"),
    num_workers=40,
    num_initial_work_units=80,
    max_split_depth=50,
    work_unit_claim_order="sequential",
    split_check_interval_s=45.0,
    progress_every_s=15.0,
    mode="restart",
    max_items_per_bucket=10_000_000,
    max_bytes_per_bucket=256 * 1024 * 1024,
    num_ingest_readers=40,
    ingest_batch_items=1_000_000
)

run_pivot_pipeline(pipeline_cfg)


PARALLEL N-GRAM DATABASE PIVOT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-24 17:45:52
Mode:       RESTART

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Source DB:            ...dk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Target DB:            .../edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_pivoted.db
Temp directory:       /scratch/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/pivot_tmp

Parallelism
────────────────────────────────────────────────────────────────────────────────────────────────────
Workers:              40
Initial work units:   80
Max split depth:      50
Split check interval: 45.0s

Database Profiles
────────────────────────────────────────────────────────────────────────────────────────────────────
Reader profile:       read:packed24
Writer profile:       writ

Shards Ingested: 100%|█████████████████████████████████████████████████████████| 83/83 [20:26<00:00]




Phase 4: Finalizing database...
════════════════════════════════════════════════════════════════════════════════════════════════════

Post-Ingestion Compaction
────────────────────────────────────────────────────────────────────────────────────────────────────
Initial DB size:         27.21 GB
Compaction completed in 0:02:09
Size before:             27.21 GB
Size after:              27.21 GB
Space saved:             247.91 KB (0.0%)

Pipeline Complete
════════════════════════════════════════════════════════════════════════════════════════════════════
Output database:      .../edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_pivoted.db
Total records:        970,620,875
Database size:        27.21 GB
End Time:             2025-10-24 18:12:48
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



# **Inspect the Procesed Database**
## `db_head`: Print the first _N_ key–value pairs

In [4]:
db_path = "/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_pivoted.db"

db_head(db_path, n=5)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   [1470] abstract
     Value: 1 occurrences in 1 documents

[ 2] Key:   [1470] angle
     Value: 2 occurrences in 2 documents

[ 3] Key:   [1470] arran
     Value: 1 occurrences in 1 documents

[ 4] Key:   [1470] arrange
     Value: 1 occurrences in 1 documents

[ 5] Key:   [1470] atomic
     Value: 1 occurrences in 1 documents



## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [4]:
db_path = "/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_pivoted.db"

db_peek(db_path, start_key="[2000] quick", n=5)


5 key-value pairs starting from 000007d0717569636b:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   [2000] quick
     Value: 882,968 occurrences in 239,075 documents

[ 2] Key:   [2000] quicka
     Value: 6 occurrences in 4 documents

[ 3] Key:   [2000] quickaccess
     Value: 526 occurrences in 62 documents

[ 4] Key:   [2000] quickaccount
     Value: 2 occurrences in 2 documents

[ 5] Key:   [2000] quickacting
     Value: 47 occurrences in 43 documents



## `db_peek_prefix`: Print key-value pairs containing the specified prefix

In [12]:
db_path = "/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_pivoted.db"

db_peek_prefix(db_path, prefix="[2011] unite", n=5)

5 key-value pairs with prefix 000007db756e697465:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   [2011] unite
     Value: 435,288 occurrences in 172,321 documents

[ 2] Key:   [2011] unitea
     Value: 23 occurrences in 19 documents

[ 3] Key:   [2011] uniteal
     Value: 3 occurrences in 3 documents

[ 4] Key:   [2011] uniteam
     Value: 2 occurrences in 2 documents

[ 5] Key:   [2011] unitec
     Value: 224 occurrences in 90 documents

