# **Setup**
## Recompile Cython Extensions

In [1]:
# Work from the project checkout so the editable install below targets it.
%cd /scratch/edk202/ngram-prep

# Set a UTF-8 locale in this kernel's environment (inherited by child processes).
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Quiet, editable install of the project in the current directory.
# --no-build-isolation builds against the packages already installed in this
# environment rather than a temporary isolated build environment.
%pip install -e . --no-build-isolation -q

/scratch/edk202/ngram-prep
env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# NLTK resources
# The corpus reader is imported under an alias so that `stopwords` only ever
# names the derived set of English stop words and `lemmatizer` the lemmatizer
# instance; the NLTK reader object is never rebound to a different type.
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

stopwords = set(nltk_stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Ngram acquisition functions
from ngram_acquire.pipeline.orchestrate import download_and_ingest_to_rocksdb
from ngram_acquire.pipeline.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_filter.config import PipelineConfig, FilterConfig
from ngram_filter.pipeline.orchestrator import build_processed_db

## Set up logging to file

In [3]:
# Send pipeline logging to a file. The call returns the log file's path, which
# the notebook displays as this cell's output; in the recorded run the log was
# created inside the db_path directory with a timestamped name.
# NOTE(review): rotate/max_bytes/backup_count look like size-based rotation
# (~100 MB per file, 5 backups), console=False like "no console handler", and
# force=True like "replace existing handlers" — confirm against
# ngram_acquire.pipeline.logger.setup_logger.
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db",
    console=False,
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
    force=True
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db/ngram_download_20251006_141813.log')

# **Download Unigrams and Ingest to RocksDB**

In [4]:
# Download the 1-gram files of the 20200217 "eng" release and ingest them into
# RocksDB. In the recorded run the database was written to
# <db_path_stub>/20200217/eng/1gram_files/1grams.db.
# NOTE(review): overwrite_db=True and overwrite_checkpoint=True look destructive
# on re-run (the recorded run took about 8.5 minutes) — confirm before Run All.
# NOTE(review): the recorded run summary prints "Overwrite checkpoint: False"
# although overwrite_checkpoint=True is passed here; either the output is stale
# or the flag is not reaching the pipeline — verify.
download_and_ingest_to_rocksdb(
    ngram_size=1,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    db_path_stub="/vast/edk202/NLP_corpora/Google_Books/",
    file_range=(0, 23),  # inclusive on both ends: the recorded run reports 24 files
    random_seed=21,
    workers=30,  # NOTE(review): more workers than the 24 files selected — confirm the extras are not simply idle
    use_threads=False,
    ngram_type="tagged",
    overwrite_db=True,
    overwrite_checkpoint=True,
    write_batch_size=100_000,
    open_type="write:packed24",
    post_compact=False,
)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-06 14:18:13

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/1-
DB path:              /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
File range:           0 to 23
Total files:          24
Files to get:         24
Skipping:             0
Download workers:     30
Batch size:           100,000
Ngram size:           1
Ngram type:           tagged
Overwrite DB:         True
Overwrite checkpoint: False
DB Profile:           write:packed24
Compact:              False

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed: 100%|█████████████████████████████████████████████████████████| 24/24 [08:00<00:00]



Processing complete!

Final Summary
════════════════════════════════════════════════════════════════════════════════════════════════════
Fully processed files:       24
Failed files:                0
Total entries written:       41,783,218
Write batches flushed:       24
Uncompressed data processed: 43.28 GB
Processing throughput:       86.20 MB/sec

End Time: 2025-10-06 14:26:47.869106
Total Runtime: 0:08:34.107396
Time per file: 0:00:21.421142
Files per hour: 168.1


# **Run Processing Pipeline**

In [5]:
# The processed database and the scratch directory are siblings of the source
# database, so derive both from the source database's parent directory.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db")
ngram_dir = src_db.parent
dst_db = ngram_dir / "1grams_processed.db"
tmp_dir = ngram_dir / "processing_tmp"

# Token filtering uses the NLTK stop-word set and lemmatizer from the imports cell.
filter_config = FilterConfig(stop_set=stopwords, lemma_gen=lemmatizer)

pipeline_config = PipelineConfig(
    # Paths
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    # Worker counts and work units
    readers=32,
    ingestors=32,
    work_units_per_reader=1,
    # Remaining pipeline options
    partitioning_sample_rate=0.01,
    prefix_length=4,
    mode="restart",
    force_cache_use=False,
    overwrite_checkpoint=True,
    enable_ingest=True,
    delete_after_ingest=True,
    post_compact=True,
    progress_every_s=60.0,
    # Whitelist output (written under the destination database path)
    output_whitelist_path=dst_db / "whitelist.txt",
    output_whitelist_top_n=40_000,
)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration:
════════════════════════════════════════════════════════════════════════════════════════════════════
Pipeline
Run mode: restart
Ingest after filtering: True
Compact after ingesting: True
  
Workers
Num Workers: 32
Work units: 32
Profiles: read=read:packed24, write=write:packed24
Buffer: 100,000 items, 128MB
  
Files
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Input whitelist: None
Output whitelist: ...ks/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt (top 40,000 keys)

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean restart - resampling and creating new work units
Sampling database at 0.01000 

[unit_0031      8.8s]:  100%|██████████████████████████████████████████████████| 32/32 [03:14<00:00]


Incremental Compaction Summary
————————————————————————————————————————————————————————————————————————————————————————————————————
Units compacted this run:    32
Total units completed:       32/32
Size before:                 26.51 GB
Size after:                  21.96 GB
Space saved:                 4.54 GB (17.1%)
Total runtime:               0:03:14
Time per unit:               0:00:06
Units per hour:              591.6






Phase 4: Generating output whitelist...
════════════════════════════════════════════════════════════════════════════════════════════════════
  Output path: ...LP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt
  Extracting top 40,000 tokens
  Generated whitelist with 40,000 tokens in 145.8s

╭───────────────────────────────────────────────────────────────────────────────────────╮
│ PROCESSING COMPLETE: Final DB contains 17,204,027 items, 29,263.7 MB                  │
│ DB: ...t/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db │
│ Whitelist: ...Google_Books/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt │
╰───────────────────────────────────────────────────────────────────────────────────────╯

