# **Setup**

In [1]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# Recompile Cython script
%cd /scratch/edk202/ngram-prep
!python setup.py build_ext --inplace -q

# Stop words
from stop_words import get_stop_words

# Lemmatization
from ngram_prep.ngram_filter.lemmatizer import SpacyLemmatizer

# Ngram acquisition functions
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_prep.ngram_filter.config import PipelineConfig, FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix

/scratch/edk202/ngram-prep
running build_ext


## Log to file

In [2]:
setup_logger(
    db_path="/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db",
    console=False,
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
    force=True
)

PosixPath('/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db/ngram_download_20251030_225802.log')

# **Download 5-Grams and Ingest to RocksDB**

In [None]:
download_and_ingest_to_rocksdb(
    ngram_size=5,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    db_path_stub="/scratch/edk202/NLP_corpora/Google_Books/",
    file_range=(0, 19422),
    random_seed=76,
    workers=30,
    use_threads=False,
    ngram_type="tagged",
    overwrite_db=False,
    write_batch_size=1_000_000,
    open_type="write:packed24",
    compact_after_ingest=True
)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-30 22:21:03

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/5-
DB path:              /scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db
File range:           0 to 19422
Total files:          19423
Files to get:         19423
Skipping:             0
Download workers:     30
Batch size:           1,000,000
Ngram size:           5
Ngram type:           tagged
Overwrite DB:         False
DB Profile:           write:packed24

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed:   0%|                                                           | 0/19423 [00:00<?]

# **Run Processing Pipeline**

In [None]:
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db")
dst_db = src_db.parent / "5grams_processed.db"
tmp_dir = src_db.parent / "processing_tmp"
whitelist_dir = src_db.parent.parent / "1gram_files" / "1grams_processed.db" / "whitelist.txt"

stop_set = set(get_stop_words("english"))
lemmatizer = SpacyLemmatizer(language="en")
spell_check_lang = "en"

filter_config = FilterConfig(
    stop_set=stop_set,
    lemma_gen=lemmatizer,
    whitelist_path = whitelist_dir
)

pipeline_config = PipelineConfig(
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    num_workers=40,
    num_initial_work_units=40,
    work_unit_claim_order="random",
    flush_interval_s=5.0,
    mode="restart",
    progress_every_s=5.0,
    ingest_num_readers=10,
    ingest_batch_items=1_000_000,
    ingest_queue_size=2,
)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:             restart
Compact after ingest: True

[4mWorkers[0m
Num Workers:        40
Initial work units: 40
Dynamic splitting:  Enabled
Profiles:           read=read:packed24, write=write:packed24
Flush interval:     5.0s

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db
Input whitelist: ..._corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt
  All tokens (min count: 1)
Output whitelist: None
Loading whitelist...
Loaded 5,000 tokens

Phase 1: Creating work units...
═══════════════════════════════════════════════════════════════════════════════════════════

, # **Inspect the Procesed Database**
## `db_head`: Print the first _N_ key–value pairs

In [4]:
db_path = "/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_head(db_path, key_format="utf-8", value_format="packed", n=5)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   <UNK> <UNK> <UNK> <UNK> aaa
     Value: [166 records] ... +156 earlier, (2010, 676, 433), (2011, 531, 360), (2012, 507, 342)
            (2013, 356, 246), (2014, 425, 286), (2015, 322, 219), (2016, 313, 232)
            (2017, 246, 202), (2018, 228, 125), (2019, 191, 123)

[ 2] Key:   <UNK> <UNK> <UNK> <UNK> aar
     Value: [220 records] ... +210 earlier, (2010, 63, 54), (2011, 62, 51), (2012, 31, 31), (2013, 51, 51)
            (2014, 80, 38), (2015, 30, 28), (2016, 19, 17), (2017, 52, 51), (2018, 1140, 1136)
            (2019, 15, 15)

[ 3] Key:   <UNK> <UNK> <UNK> <UNK> aaron
     Value: [304 records] ... +294 earlier, (2010, 1929, 1736), (2011, 2095, 1935), (2012, 2762, 2518)
            (2013, 2385, 2195), (2014, 2637, 2441), (2015, 2500, 2252), (2016, 2176, 1949)
            (2017, 9100, 6289), (2018, 2809, 2607), (2019, 2034, 1886)

[ 4] Key: 

## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [5]:
db_path = "/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_peek(db_path, start_key=b"quick brown <UNK> <UNK> <UNK>", key_format="utf-8", value_format="packed", n=5)


5 key-value pairs starting from 717569636b2062726f776e203c554e4b3e203c554e4b3e203c554e4b3e:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   quick brown <UNK> <UNK> <UNK>
     Value: [21 records] ... +11 earlier, (2006, 4, 3), (2007, 1, 1), (2008, 13, 3), (2009, 2, 2)
            (2010, 5, 4), (2011, 2, 2), (2012, 9, 7), (2013, 5, 3), (2014, 2, 1), (2016, 1, 1)

[ 2] Key:   quick brown eye <UNK> <UNK>
     Value: [156 records] ... +146 earlier, (2010, 5, 5), (2011, 6, 6), (2012, 10, 10), (2013, 19, 19)
            (2014, 18, 18), (2015, 22, 22), (2016, 18, 18), (2017, 26, 26), (2018, 99, 99)
            (2019, 16, 16)

[ 3] Key:   quick brown eye <UNK> butler
     Value: [10 records] (1866, 23, 23), (1867, 2, 2), (1869, 2, 2), (1870, 2, 2), (1871, 4, 4), (1875, 3, 3)
            (1891, 1, 1), (1892, 1, 1), (1903, 1, 1), (1908, 1, 1)

[ 4] Key:   quick brown eye take <UNK>
     Value: [36 records] ... +26 earlier, (2007, 1,

## `db_peek_prefix`: Print key-value pairs containing the specified prefix

In [6]:
db_path = "/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_peek_prefix(db_path, prefix=b"<UNK> <UNK> united state <UNK>", key_format="utf-8", value_format="summary", n=1)

1 key-value pairs with prefix 3c554e4b3e203c554e4b3e20756e69746564207374617465203c554e4b3e:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   <UNK> <UNK> united state <UNK>
     Value: Total: 309,111,055 occurrences in 125,635,403 volumes (1472-2019, 424 years)

