In [None]:
# Work from the project checkout so that "." in the editable install below
# refers to the ngram-prep repository.
%cd /scratch/edk202/ngram-prep

# Set the locale environment variables of the kernel process to C.UTF-8.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Install the project in editable mode (quietly, without build isolation).
# %pip is used so the install targets the environment the kernel runs in.
%pip install -e . --no-build-isolation -q

In [1]:
# Load IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to the editable-installed project packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Standard stuff
from pathlib import Path

# NLTK stuff
# The corpus reader is imported under an alias so that the notebook-level name
# `stopwords` only ever refers to the resulting set, not to the NLTK reader
# that produced it.
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

# Raw n-gram acquisition stuff
from ngram_acquire.pipeline.orchestrate import download_and_ingest_to_rocksdb
from ngram_acquire.pipeline.logger import setup_logger

# Cython utilities
from ngram_filter.config import PipelineConfig, FilterConfig
from ngram_filter.pipeline.orchestrator import build_processed_db

# Shared objects passed to FilterConfig in the processing cell.
stopwords = set(nltk_stopwords.words("english"))  # English stopwords as a set
lemmatizer = WordNetLemmatizer()

In [3]:
# Configure the acquisition logger before the download cell runs. The call
# returns the path of the log file, which the notebook shows as the cell output.
# NOTE(review): the log directory is ".../5gram_files/1grams.db" although this
# notebook ingests 5-grams into "5grams.db" — confirm this is intentional and
# not a leftover from the 1-gram notebook.
logger_options = {
    "db_path": "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db",
    "console": False,
    "rotate": True,
    "max_bytes": 100_000_000,
    "backup_count": 5,
    "force": True,
}
setup_logger(**logger_options)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db/ngram_download_20250917_130759.log')

# **Download 5-Grams and Ingest to a RocksDB Database**

In [5]:
# Download the English ("eng") 5-gram files of release 20200217 and ingest them
# into the RocksDB database at db_path. In the recorded run this cell took about
# 1h05m in total, roughly 35 minutes of which was the post-ingestion compaction.
download_and_ingest_to_rocksdb(
    # --- what to fetch ---
    ngram_size=5,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    ngram_type="tagged",
    file_range=(2001, 4999),  # reported as 2999 files in the recorded run
    random_seed=11,
    # --- where and how to write ---
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db",
    overwrite=False,
    write_batch_size=100_000,
    open_type="write:packed24",
    # --- parallelism (the recorded run reports 40 worker processes) ---
    workers=40,
    use_threads=False,
    # --- compact once ingestion has finished ---
    post_compact=True,
)

[31mStart Time: 2025-09-17 16:14:56[0m
[4m
Download & Ingestion Configuration[0m
Ngram repository:           https://storage.googleapis.com/books/ngrams/books/20200217/eng/eng-5-ngrams_exports.html
RocksDB database path:      /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db
File index range:           2001 to 4999 (count ~ 2999)
Total files available:      19423
Files to process:           2999
First file URL:             http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-03481-of-19423.gz
Last file URL:              http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-03853-of-19423.gz
Ngram size:                 5
Ngram filtering:            tagged
Overwrite mode:             False
Write batch size:           100,000
Worker processes/threads:   40 (processes)



Processing Files: 100%|[34m██████████[0m| 2999/2999 [30:06<00:00,  1.66files/s]  



[33mStarting post-ingestion compaction...[0m
[32mCompaction completed in 0:35:10.479071[0m
[32m
Processing completed![0m
Fully processed files: 2999
Total entries written: 195,574,775
Write batches flushed: 735
Uncompressed data processed: 4.31 TB
Processing throughput: 1153.67 MB/sec
[31m
End Time: 2025-09-17 17:20:15.979724[0m
[31mTotal Runtime: 1:05:19.522277[0m
[34m
Time per file: 0:00:01.306943[0m
[34mFiles per hour: 2754.5[0m


In [4]:
from common_db.api import open_db

# Manually compact the ingested 5-gram database.
# NOTE(review): the download cell already passes post_compact=True and its
# recorded output reports a completed compaction, so this step looks redundant
# on a straight top-to-bottom run — confirm whether it is still needed.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db"

# Keep the database open in "rw" mode only for the duration of the compaction.
with open_db(db_path, mode="rw") as ngram_db:
    ngram_db.compact_all()

# **Filter the 5-Grams and Build the Processed Database**

In [None]:
# Input database produced by the download/ingest stage.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db")

# The output database and the scratch directory sit next to the input database.
dst_db = src_db.with_name("5grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Whitelist file passed to the filter.
# NOTE(review): this points at ".../5gram_files/1grams_processed.db/" — confirm
# the 1-gram whitelist really lives under the 5-gram directory.
wht_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db/whitelist.txt"

# Pipeline settings: database locations, reader counts, restart and progress options.
# NOTE(review): force_restart=True presumably discards any earlier partial
# run — confirm before re-running this cell.
pipeline_config = PipelineConfig(
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    readers=8,
    work_units_per_reader=16,
    force_restart=True,
    progress_every_s=60.0,
)

# Filter settings: the NLTK stopword set and lemmatizer created in the imports
# cell, plus the whitelist file.
filter_config = FilterConfig(
    stop_set=stopwords,
    lemma_gen=lemmatizer,
    whitelist_path=wht_path,
)

# Run the filter pipeline from src_db into dst_db.
build_processed_db(pipeline_config, filter_config)