# **Setup**
## Recompile Cython Extensions

In [1]:
# Work from the project checkout so the editable install below targets it.
%cd /scratch/edk202/ngram-prep

# Set the locale environment variables to C.UTF-8 for this session.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Quiet, editable install of the current directory without build isolation;
# per the section heading, this is the step that recompiles the Cython extensions.
%pip install -e . --no-build-isolation -q

/scratch/edk202/ngram-prep
env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
Note: you may need to restart the kernel to use updated packages.


## Imports

In [1]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# NLTK resources
from nltk.corpus import stopwords; stopwords = set(stopwords.words("english"))
from nltk.stem import WordNetLemmatizer; lemmatizer = WordNetLemmatizer()

# Ngram acquisition functions
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_prep.ngram_filter.config import PipelineConfig, FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix

## Set up logging to file

In [2]:
# Logging options: console output disabled, rotating log files with a
# 100,000,000-byte cap and 5 backups. `force=True` is passed through as-is
# (presumably it replaces any existing handlers; confirm in setup_logger).
logger_options = {
    "db_path": "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db",
    "console": False,
    "rotate": True,
    "max_bytes": 100_000_000,
    "backup_count": 5,
    "force": True,
}

# Left as the final expression so the returned log-file path is displayed.
setup_logger(**logger_options)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db/ngram_download_20251016_221329.log')

# **Download Unigrams and Ingest to RocksDB**

In [3]:
# Options for acquiring the tagged English unigrams (release 20200217).
# NOTE: overwrite_db=True is requested, so this cell is not a cheap re-run.
ingest_options = {
    "ngram_size": 1,
    "repo_release_id": "20200217",
    "repo_corpus_id": "eng",
    "db_path_stub": "/vast/edk202/NLP_corpora/Google_Books/",
    "file_range": (0, 23),  # the run log reports this as 24 files
    "random_seed": 98,
    "workers": 25,
    "use_threads": False,
    "ngram_type": "tagged",
    "overwrite_db": True,
    "write_batch_size": 100_000,
    "open_type": "write:packed24",
    "compact_after_ingest": True,
}

download_and_ingest_to_rocksdb(**ingest_options)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-16 20:37:29

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/1-
DB path:              /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
File range:           0 to 23
Total files:          24
Files to get:         24
Skipping:             0
Download workers:     25
Batch size:           100,000
Ngram size:           1
Ngram type:           tagged
Overwrite DB:         True
DB Profile:           write:packed24

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed: 100%|█████████████████████████████████████████████████████████| 24/24 [07:48<00:00]



Post-Ingestion Compaction
════════════════════════════════════════════════════════════════════════════════════════════════════
Initial DB size:         52.04 GB
Compaction completed in 0:03:11
Size before:             52.04 GB
Size after:              57.76 GB
Space saved:             -5.73 GB (-11.0%)

Processing complete!

Final Summary
════════════════════════════════════════════════════════════════════════════════════════════════════
Fully processed files:       24
Failed files:                0
Total entries written:       41,783,218
Write batches flushed:       24
Uncompressed data processed: 43.28 GB
Processing throughput:       67.04 MB/sec

End Time: 2025-10-16 20:48:30.474334
Total Runtime: 0:11:01.084885
Time per file: 0:00:27.545204
Files per hour: 130.7


# **Run Processing Pipeline**

In [3]:
# The source DB, destination DB, and scratch directory are siblings in one folder.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db")
dst_db = src_db.with_name("1grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Filtering uses the stopword set and lemmatizer created in the imports cell.
filter_config = FilterConfig(stop_set=stopwords, lemma_gen=lemmatizer)

# Per-bucket byte limit: 128 * 1024 * 1024 = 134,217,728 bytes
bucket_byte_limit = 128 * 1024 * 1024

pipeline_config = PipelineConfig(
    # Locations
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    # Parallelism and work splitting
    num_workers=30,
    num_initial_work_units=30,
    max_split_depth=15,
    split_check_interval_s=30.0,
    # Run control and progress reporting
    mode="restart",
    progress_every_s=30.0,
    # Buffering limits
    max_items_per_bucket=100_000,
    max_bytes_per_bucket=bucket_byte_limit,
    # Whitelist output (written inside the destination DB directory)
    output_whitelist_path=dst_db.joinpath("whitelist.txt"),
    output_whitelist_top_n=40_000,
)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:             restart
Compact after ingest: True

[4mWorkers[0m
Num Workers:        30
Initial work units: 30
Dynamic splitting:  Enabled
Profiles:           read=read:packed24, write=write:packed24
Buffer:             100,000 items, 128.00 MB

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Input whitelist: None
Output whitelist: ...ks/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt (top 40,000 keys)

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean restart - creating new work units


# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [4]:
# Processed unigram database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Show the first five entries, decoding keys as UTF-8 and values as packed records
db_head(
    db_path,
    n=5,
    key_format="utf-8",
    value_format="packed",
)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   aaa
     Value: [404 records] ... +394 earlier, (2010, 73349, 21260), (2011, 70513, 20830), (2012, 89395, 25823)
            (2013, 80343, 21958), (2014, 65708, 19212), (2015, 59086, 17071), (2016, 52149, 16790)
            (2017, 46701, 14991), (2018, 41247, 13294), (2019, 37242, 12423)

[ 2] Key:   aaaa
     Value: [337 records] ... +327 earlier, (2010, 2921, 1377), (2011, 4022, 1349), (2012, 3495, 1726)
            (2013, 3113, 1547), (2014, 2891, 1311), (2015, 21412, 1170), (2016, 6345, 1171)
            (2017, 2699, 1255), (2018, 2194, 1040), (2019, 1902, 1073)

[ 3] Key:   aaaaa
     Value: [274 records] ... +264 earlier, (2010, 805, 410), (2011, 726, 412), (2012, 961, 522)
            (2013, 813, 519), (2014, 964, 429), (2015, 652, 358), (2016, 693, 367)
            (2017, 696, 364), (2018, 569, 267), (2019, 393, 310)

[ 4] Key:   aaaaaa
     

## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [5]:
# Processed unigram database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Show five entries beginning at the given start key
db_peek(
    db_path,
    start_key=b"phenomenology",
    n=5,
    key_format="utf-8",
    value_format="packed",
)

5 key-value pairs starting from 7068656e6f6d656e6f6c6f6779:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   phenomenology
     Value: [217 records] ... +207 earlier, (2010, 100154, 17025), (2011, 94673, 16162), (2012, 135952, 24003)
            (2013, 170315, 28592), (2014, 135473, 22509), (2015, 124618, 20352)
            (2016, 131720, 23794), (2017, 120626, 21602), (2018, 113220, 19643)
            (2019, 99773, 17020)

[ 2] Key:   phenomenologyof
     Value: [35 records] ... +25 earlier, (2009, 6, 6), (2010, 5, 4), (2011, 1, 1), (2012, 5, 4), (2013, 5, 5)
            (2014, 2, 2), (2015, 5, 5), (2017, 10, 9), (2018, 12, 8), (2019, 5, 3)

[ 3] Key:   phenomenoloical
     Value: [25 records] ... +15 earlier, (1995, 2, 2), (2001, 1, 1), (2003, 1, 1), (2005, 2, 2), (2007, 3, 3)
            (2008, 2, 2), (2013, 1, 1), (2014, 1, 1), (2015, 1, 1), (2018, 1, 1)

[ 4] Key:   phenomenolooical
     Value: [30 records] ... +20 ea

## `db_peek_prefix`: Print key-value pairs whose keys start with the specified prefix

In [6]:
# Processed unigram database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Show one entry for the given key prefix, with values rendered as a summary
db_peek_prefix(
    db_path,
    prefix=b"unite",
    n=1,
    key_format="utf-8",
    value_format="summary",
)

1 key-value pairs with prefix 756e697465:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   unite
     Value: Total: 55,373,396 occurrences in 15,400,322 volumes (1478-2019, 461 years)

