# **Setup**

In [2]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# Logging
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram pivot functions
from pathlib import Path
from ngram_prep.ngram_pivot.config import PipelineConfig
from ngram_prep.ngram_pivot.pipeline import run_pivot_pipeline

# Utilities
from ngram_prep.utilities.peek import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Log to file

In [3]:
# Configure file logging for this run. The call's return value (the log file
# path) is the last expression, so it is displayed as the cell output.
# NOTE(review): this path is on /scratch, while the pipeline cell below reads
# and writes under /vast — confirm the log is meant to live on a different
# filesystem from the database it is named after.
setup_logger(
    db_path="/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db",
    console=False,          # presumably suppresses log echo in the notebook — TODO confirm
    rotate=True,            # presumably enables log-file rotation — TODO confirm
    max_bytes=100_000_000,  # presumably the per-file size limit before rotating (~100 MB)
    backup_count=5,         # presumably the number of rotated files to keep
    force=True
)

PosixPath('/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db/ngram_download_20251101_235530.log')

# **Pivot Multigrams to Create Yearly Indices**

In [4]:
# Source DB, destination DB, and temp directory all live in the same corpus
# directory; naming it once keeps the three paths from drifting apart.
ngram_dir = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files")

# Settings for the pivot run. The pipeline banner echoes the paths, worker
# count, initial work units, and mode, so they can be checked against the
# printed output.
pipeline_cfg = PipelineConfig(
    src_db=ngram_dir / "5grams_processed.db",
    dst_db=ngram_dir / "5grams_pivoted.db",
    tmp_dir=ngram_dir / "pivot_tmp",
    num_workers=40,
    num_initial_work_units=1000,
    work_unit_claim_order="random",
    flush_interval_s=15.0,
    progress_every_s=30.0,
    mode="restart",  # NOTE(review): presumably discards earlier progress/output — confirm before re-running
    num_ingest_readers=1,
    ingest_buffer_shards=1,
    use_smart_partitioning=True,
    ingest_mode="direct_sst"
)

# NOTE(review): in the output saved with this notebook, post-ingestion
# compaction reports the size growing from 260.27 GB to 633.31 GB, while the
# final "Database size" line still shows 260.27 GB. The two figures cannot
# both describe the finished database — worth checking the size reporting.
run_pivot_pipeline(pipeline_cfg)


PARALLEL N-GRAM DATABASE PIVOT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-11-01 23:55:36
Mode:       RESTART

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Source DB:            ...dk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db
Target DB:            .../edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db
Temp directory:       /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp

Parallelism
────────────────────────────────────────────────────────────────────────────────────────────────────
Workers:              40
Initial work units:   1000

Database Profiles
────────────────────────────────────────────────────────────────────────────────────────────────────
Reader profile:       read:packed24
Writer profile:       write:packed24
Ingest profile:       write:packed24

Flush

SST Files Ingested: 100%|██████████████████████████████████████████████████| 1000/1000 [01:20<00:00]



Phase 4: Finalizing database...
════════════════════════════════════════════════════════════════════════════════════════════════════

Post-Ingestion Compaction
────────────────────────────────────────────────────────────────────────────────────────────────────
Initial DB size:         260.27 GB
Compaction completed in 1:12:38
Size before:             260.27 GB
Size after:              633.31 GB
Space saved:             -373.04 GB (-143.3%)

Pipeline Complete
════════════════════════════════════════════════════════════════════════════════════════════════════
Output database:      .../edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db
Total records:        18,135,984,601
Database size:        260.27 GB
End Time:             2025-11-02 01:48:49
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [23]:
# Location of the pivoted database written by the pipeline cell (its dst_db).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db"

# Print the first 5 key-value pairs as a quick sanity check of the output.
db_head(db_path, n=5)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   [1470] <UNK> <UNK> <UNK> <UNK> eng
     Value: 1 occurrences in 1 documents

[ 2] Key:   [1470] <UNK> <UNK> <UNK> atomic energy
     Value: 1 occurrences in 1 documents

[ 3] Key:   [1470] <UNK> <UNK> <UNK> eng <UNK>
     Value: 1 occurrences in 1 documents

[ 4] Key:   [1470] <UNK> <UNK> <UNK> much convenient
     Value: 1 occurrences in 1 documents

[ 5] Key:   [1470] <UNK> <UNK> <UNK> selection <UNK>
     Value: 1 occurrences in 1 documents



## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [123]:
# Location of the pivoted database written by the pipeline cell (its dst_db).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db"

# Print 3 key-value pairs starting at the given key, written as
# "[year] token token ...". The output header shows the key hex-encoded:
# a 4-byte year (0x000007d2 = 2002) followed by the bytes of the n-gram text.
db_peek(db_path, start_key="[2002] <UNK> <UNK> world trade center", n=3)


3 key-value pairs starting from 000007d23c554e4b3e203c554e4b3e20776f726c642074726164652063656e746572:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   [2002] <UNK> <UNK> world trade center
     Value: 63,108 occurrences in 32,377 documents

[ 2] Key:   [2002] <UNK> <UNK> world trade charter
     Value: 1 occurrences in 1 documents

[ 3] Key:   [2002] <UNK> <UNK> world trade climate
     Value: 1 occurrences in 1 documents



## `db_peek_prefix`: Print _N_ key-value pairs whose keys start with the specified prefix

In [124]:
# Location of the pivoted database written by the pipeline cell (its dst_db).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db"

# Print 3 key-value pairs matching a partial key: the year plus the leading
# tokens of the n-gram. The output header shows the prefix hex-encoded
# (0x000007bf = 1983, followed by the bytes of the token text).
db_peek_prefix(db_path, prefix="[1983] <UNK> nuclear war", n=3)

3 key-value pairs with prefix 000007bf3c554e4b3e206e75636c65617220776172:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   [1983] <UNK> nuclear war <UNK> <UNK>
     Value: 22,068 occurrences in 19,218 documents

[ 2] Key:   [1983] <UNK> nuclear war <UNK> acceptable
     Value: 5 occurrences in 3 documents

[ 3] Key:   [1983] <UNK> nuclear war <UNK> accident
     Value: 337 occurrences in 208 documents

