# **Setup**
## Recompile Cython Extensions

In [1]:
%cd /scratch/edk202/ngram-prep

%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

%pip install -e . --no-build-isolation -q

/scratch/edk202/ngram-prep
env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
Note: you may need to restart the kernel to use updated packages.


## Imports

In [1]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# NLTK resources
from nltk.corpus import stopwords; stopwords = set(stopwords.words("english"))
from nltk.stem import WordNetLemmatizer; lemmatizer = WordNetLemmatizer()

# Ngram acquisition functions
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_prep.ngram_filter.config import PipelineConfig, FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.parallel import SplitMonitorConfig
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix

## Set up logging to file

In [2]:
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db",
    console=False,
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
    force=True
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db/ngram_download_20251014_171702.log')

# **Download Unigrams and Ingest to RocksDB**

In [23]:
download_and_ingest_to_rocksdb(
    ngram_size=1,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    db_path_stub="/vast/edk202/NLP_corpora/Google_Books/",
    file_range=(0, 23),
    random_seed=98,
    workers=25,
    use_threads=False,
    ngram_type="tagged",
    overwrite_db=True,
    write_batch_size=100_000,
    open_type="write:packed24",
    compact_after_ingest=True
)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-12 10:55:53

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/1-
DB path:              /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
File range:           0 to 23
Total files:          24
Files to get:         24
Skipping:             0
Download workers:     25
Batch size:           100,000
Ngram size:           1
Ngram type:           tagged
Overwrite DB:         True
DB Profile:           write:packed24

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed: 100%|█████████████████████████████████████████████████████████| 24/24 [08:00<00:00]


Post-Ingestion Compaction
════════════════════════════════════════════════════════════════════════════════════════════════════
Initial DB size:         51.97 GB





Compaction completed in 0:03:12
Size before:             51.97 GB
Size after:              57.76 GB
Space saved:             -5.79 GB (-11.1%)

Processing complete!

Final Summary
════════════════════════════════════════════════════════════════════════════════════════════════════
Fully processed files:       24
Failed files:                0
Total entries written:       41,783,218
Write batches flushed:       24
Uncompressed data processed: 43.28 GB
Processing throughput:       65.21 MB/sec

End Time: 2025-10-12 11:07:13.060015
Total Runtime: 0:11:19.601267
Time per file: 0:00:28.316719
Files per hour: 127.1


# **Run Processing Pipeline**

In [8]:
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db")
dst_db = src_db.parent / "1grams_processed.db"
tmp_dir = src_db.parent / "processing_tmp"

filter_config = FilterConfig(
    stop_set=stopwords,
    lemma_gen=lemmatizer,
)

pipeline_config = PipelineConfig(
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    num_workers=30,
    num_ingest_workers=10,
    num_initial_work_units=2,
    mode="restart",
    delete_after_ingest=True,
    compact_after_ingest=True,
    progress_every_s=60.0,
    max_items_per_bucket=100_000,
    max_bytes_per_bucket=128 * 1024 * 1024,
    output_whitelist_path=dst_db / "whitelist.txt",
    output_whitelist_top_n=40_000
)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:             restart
Concurrent ingestion: Enabled
Compact after ingest: True

[4mWorkers[0m
Num Workers:        30
Initial work units: 2
Dynamic splitting:  Enabled
Profiles:           read=read:packed24, write=write:packed24
Buffer:             100,000 items, 128.00 MB

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Input whitelist: None
Output whitelist: ...ks/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt (top 40,000 keys)

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean resta

PicklingError: Can't pickle <class 'ngram_prep.ngram_filter.config.FilterConfig'>: it's not the same object as ngram_prep.ngram_filter.config.FilterConfig

In [11]:
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

db_head(db_path, key_format="utf-8", value_format="packed", n=5)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
Database is empty


In [22]:
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

db_peek(db_path, start_key=b"act", key_format="utf-8", value_format="packed", n=5)

5 key-value pairs starting from 616374:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   act
     Value: [504 records] ... +494 earlier, (2010, 10556515, 1099161), (2011, 9229582, 1109018)
            (2012, 10938686, 1383123), (2013, 11459759, 1421585), (2014, 9599801, 1295175)
            (2015, 8323275, 1110554), (2016, 8486488, 1109310), (2017, 8038504, 1063541)
            (2018, 7433187, 1001313), (2019, 6529027, 945410)

[ 2] Key:   acta
     Value: [390 records] ... +380 earlier, (2010, 136674, 20924), (2011, 165094, 21097)
            (2012, 465319, 45151), (2013, 324333, 39196), (2014, 181040, 24479)
            (2015, 142455, 21433), (2016, 167581, 24617), (2017, 167433, 23366)
            (2018, 188806, 23373), (2019, 163520, 21539)

[ 3] Key:   actaa
     Value: [203 records] ... +193 earlier, (2010, 11, 9), (2011, 3, 1), (2012, 15, 13), (2013, 16, 10)
            (2014, 7, 5), (2015, 3, 3), (2016, 3, 3), (201