# **Setup**

In [4]:
%cd /scratch/edk202/ngram-prep
!python setup.py build_ext --inplace -q

/scratch/edk202/ngram-prep
running build_ext


In [2]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# Stop words
from stop_words import get_stop_words

# Lemmatization
from ngram_prep.ngram_filter.lemmatizer import SpacyLemmatizer

# Ngram acquisition functions
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_prep.ngram_filter.config import PipelineConfig, FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix

## Log to file

In [3]:
# Configure file logging for the steps below. The recorded output shows that
# setup_logger returns the path of the log file it created, located in the
# directory that contains db_path.
# NOTE(review): db_path names the 5-gram database on /scratch, while every other
# cell in this notebook works on unigrams. Confirm this is intentional and not a
# leftover from a 5-gram notebook.
setup_logger(
    db_path="/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db",
    console=False,  # presumably disables console logging so cell output stays clean — TODO confirm
    rotate=True,
    max_bytes=100_000_000,  # presumably the rotation threshold in bytes (~100 MB) — TODO confirm
    backup_count=5,  # presumably the number of rotated files kept — TODO confirm
    force=True
)

PosixPath('/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/ngram_download_20251029_234103.log')

# **Download Unigrams and Ingest to RocksDB**

In [4]:
# Download the Google Books unigram files and ingest them into a RocksDB database.
download_and_ingest_to_rocksdb(
    ngram_size=1,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    # Root directory only: the recorded run resolved the database to
    # <stub>/20200217/eng/1gram_files/1grams.db.
    db_path_stub="/vast/edk202/NLP_corpora/Google_Books/",
    file_range=(0, 23),  # inclusive on both ends: the recorded run reports 24 files
    random_seed=98,
    # NOTE(review): the recorded output reports 25 download workers, not 20, so it
    # appears to predate this value. Re-run the cell to refresh the output.
    workers=20,
    use_threads=False,
    ngram_type="tagged",
    overwrite_db=True,
    write_batch_size=100_000,
    open_type="write:packed24",
    # NOTE(review): in the recorded run, compaction grew the database from
    # 46.75 GB to 57.76 GB. Confirm post-ingest compaction is still wanted.
    compact_after_ingest=True
)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-29 20:54:31

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/1-
DB path:              /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
File range:           0 to 23
Total files:          24
Files to get:         24
Skipping:             0
Download workers:     25
Batch size:           100,000
Ngram size:           1
Ngram type:           tagged
Overwrite DB:         True
DB Profile:           write:packed24

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed: 100%|█████████████████████████████████████████████████████████| 24/24 [07:30<00:00]



Post-Ingestion Compaction
════════════════════════════════════════════════════════════════════════════════════════════════════
Initial DB size:         46.75 GB
Compaction completed in 0:07:58
Size before:             46.75 GB
Size after:              57.76 GB
Space saved:             -11.01 GB (-23.6%)

Processing complete!

Final Summary
════════════════════════════════════════════════════════════════════════════════════════════════════
Fully processed files:       24
Failed files:                0
Total entries written:       41,783,218
Write batches flushed:       24
Uncompressed data processed: 43.28 GB
Processing throughput:       47.36 MB/sec

End Time: 2025-10-29 21:10:06.699674
Total Runtime: 0:15:35.697970
Time per file: 0:00:38.987415
Files per hour: 92.3


# **Run Processing Pipeline**

In [None]:
# Locations: the raw unigram DB, the processed DB written next to it, and the
# working directory handed to the pipeline as tmp_dir.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db")
dst_db = src_db.with_name("1grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Filtering resources: English stop words (as a set) and a spaCy-based lemmatizer.
stop_set = {*get_stop_words("english")}
lemmatizer = SpacyLemmatizer(language="en")

filter_config = FilterConfig(lemma_gen=lemmatizer, stop_set=stop_set)

pipeline_config = PipelineConfig(
    # --- Databases and working directory ---
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    # NOTE(review): "restart" presumably discards any earlier partial run;
    # verify against PipelineConfig before re-running over existing output.
    mode="restart",
    # --- Workers and work-unit splitting ---
    num_workers=20,
    num_initial_work_units=20,
    work_unit_claim_order="random",
    max_split_depth=100,
    split_check_interval_s=45.0,
    progress_every_s=15.0,
    # --- Bucket limits (512 MiB byte cap) ---
    max_items_per_bucket=10_000_000,
    max_bytes_per_bucket=512 * 2**20,
    # --- Ingestion into the destination DB ---
    ingest_num_readers=20,
    ingest_batch_items=5_000_000,
    ingest_queue_size=1,
    # --- Whitelist output ---
    # NOTE(review): this path sits inside the destination DB's own path (dst_db);
    # confirm that is where the whitelist file is meant to live.
    output_whitelist_path=dst_db / "whitelist.txt",
    output_whitelist_top_n=5_000,
)

build_processed_db(pipeline_config, filter_config)

[2025-10-30 00:13:57,208] [INFO] Added vocab lookups: lexeme_norm
[2025-10-30 00:13:57,209] [INFO] Created vocabulary
[2025-10-30 00:13:57,210] [INFO] Finished initializing nlp object


# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [5]:
# Inspect the processed database. The processing pipeline cell writes
# 1grams_processed.db next to the source DB under /vast, so read it from there;
# the previous /scratch path did not match the pipeline's destination.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Print the first five entries, decoding keys as UTF-8 and values with the
# "packed" formatter.
db_head(db_path, key_format="utf-8", value_format="packed", n=5)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   aaa
     Value: [404 records] ... +394 earlier, (2010, 73431, 21325), (2011, 70646, 20906), (2012, 89564, 25953)
            (2013, 80458, 22051), (2014, 65780, 19258), (2015, 59141, 17123), (2016, 52199, 16834)
            (2017, 46773, 15045), (2018, 41298, 13335), (2019, 37433, 12542)

[ 2] Key:   aaaa
     Value: [338 records] ... +328 earlier, (2010, 2932, 1388), (2011, 4081, 1368), (2012, 3535, 1743)
            (2013, 3136, 1565), (2014, 2906, 1324), (2015, 21418, 1175), (2016, 6364, 1182)
            (2017, 2710, 1266), (2018, 2217, 1050), (2019, 1905, 1076)

[ 3] Key:   aaaaa
     Value: [274 records] ... +264 earlier, (2010, 805, 410), (2011, 727, 413), (2012, 965, 524)
            (2013, 817, 520), (2014, 964, 429), (2015, 653, 359), (2016, 694, 368)
            (2017, 700, 368), (2018, 569, 267), (2019, 393, 310)

[ 4] Key:   aaaaaa
     

## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [11]:
# Inspect the raw (unprocessed) unigram database. The download cell creates it
# under /vast, so read it from there; the previous /scratch path did not match
# the download destination.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db"

# Print five entries beginning at the first key at or after b"other_".
db_peek(db_path, start_key=b"other_", key_format="utf-8", value_format="packed", n=5)

5 key-value pairs starting from 6f746865725f:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   other_ADJ
     Value: [521 records] ... +511 earlier, (2010, 34424910, 260910), (2011, 33692399, 270927)
            (2012, 41956338, 340795), (2013, 43665881, 339134), (2014, 37566652, 329281)
            (2015, 32354317, 275662), (2016, 32758827, 267095), (2017, 32266995, 264232)
            (2018, 30586747, 250060), (2019, 27286016, 237519)

[ 2] Key:   other_NOUN
     Value: [324 records] ... +314 earlier, (2010, 392, 276), (2011, 262, 247), (2012, 306, 279)
            (2013, 392, 337), (2014, 517, 363), (2015, 277, 251), (2016, 307, 268)
            (2017, 301, 251), (2018, 478, 205), (2019, 183, 169)

[ 3] Key:   other_States_NOUN
     Value: [56 records] ... +46 earlier, (1966, 1, 1), (1967, 4, 4), (1968, 1, 1), (1969, 2, 2), (1970, 1, 1)
            (1971, 1, 1), (1975, 1, 1), (1978, 1, 1), (1979, 2, 2), (1990, 1, 1)

[ 

## `db_peek_prefix`: Print up to _N_ key-value pairs whose keys start with the specified prefix

In [9]:
# Inspect the processed database. The processing pipeline cell writes
# 1grams_processed.db next to the source DB under /vast, so read it from there;
# the previous /scratch path did not match the pipeline's destination.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Print one entry whose key starts with b"unite", using the "summary" value
# formatter (totals and year range rather than individual records).
db_peek_prefix(db_path, prefix=b"unite", key_format="utf-8", value_format="summary", n=1)

1 key-value pairs with prefix 756e697465:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   unite
     Value: Total: 55,373,396 occurrences in 15,400,322 volumes (1478-2019, 461 years)

