In [5]:
# Work from the project checkout so that `.` in the install command below
# refers to this directory.
%cd /scratch/edk202/ngram-prep

# Set the locale-related environment variables to C.UTF-8.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Editable, quiet install of the project in the current directory, without
# build isolation. pip notes that a kernel restart may be needed before
# updated packages are picked up.
%pip install -e . --no-build-isolation -q

/scratch/edk202/ngram-prep
env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Auto-reload packages: in mode 2, imported modules are reloaded before each
# cell executes, so source edits take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Standard stuff
from pathlib import Path

# NLTK: English stop-word set and a WordNet lemmatizer instance
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# From here on `stopwords` names a plain set of English stop words, not the
# NLTK corpus reader imported above.
stopwords = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Raw n-gram acquisition stuff
from ngram_acquire.pipeline.orchestrate import download_and_ingest_to_rocksdb
from ngram_acquire.utils.vocab import write_vocab
from ngram_acquire.pipeline.logger import setup_logger
from utilities.save_sample import save_sample_to_db, verify_sample_db

# Downloaded n-gram filtering stuff

# Cython utilities
from ngram_filter.config import PipelineConfig, FilterConfig
from ngram_filter.pipeline.orchestrator import build_processed_db
from utilities.count_items import count_db_items
from utilities.reservoir_sampler import reservoir_sampling

In [3]:
# Configure logging for the acquisition run. The call is kept as the cell's
# last expression so its return value (the path of the log file) is displayed.
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db",
    # Output / handler behaviour
    force=True,
    console=False,
    # Log rotation settings
    rotate=True, max_bytes=100_000_000, backup_count=5,
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db/ngram_download_20250912_005624.log')

## Download Unigrams and Ingest to a RocksDB Database

In [9]:
# Download the unigram files for the 20200217 "eng" release and ingest them
# into a RocksDB database.
download_and_ingest_to_rocksdb(
    # What to fetch
    repo_release_id="20200217",
    repo_corpus_id="eng",
    ngram_size=1,
    ngram_type="tagged",
    file_range=(0, 23),  # reported by the run as indices 0 to 23, i.e. 24 files
    random_seed=42,
    # Where and how to store it
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db",
    overwrite=True,
    open_type="write:packed24",
    write_batch_size=100_000,
    post_compact=False,
    # Parallelism: the run reports these as worker processes, not threads
    workers=25,
    use_threads=False,
)

[31mStart Time: 2025-09-12 00:17:53[0m
[4m
Download & Ingestion Configuration[0m
Ngram repository:           https://storage.googleapis.com/books/ngrams/books/20200217/eng/eng-1-ngrams_exports.html
RocksDB database path:      /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db
File index range:           0 to 23 (count ~ 24)
Total files available:      24
Files to process:           24
First file URL:             http://storage.googleapis.com/books/ngrams/books/20200217/eng/1-00015-of-00024.gz
Last file URL:              http://storage.googleapis.com/books/ngrams/books/20200217/eng/1-00020-of-00024.gz
Ngram size:                 1
Ngram filtering:            tagged
Overwrite mode:             True
Write batch size:           100,000
Worker processes/threads:   25 (processes)



Processing Files: 100%|[34m██████████[0m| 24/24 [11:03<00:00, 27.65s/files]  


[32m
Processing completed![0m
Fully processed files: 24
Total entries written: 41,783,218
Write batches flushed: 24
Uncompressed data processed: 43.28 GB
Processing throughput: 64.90 MB/sec
[31m
End Time: 2025-09-12 00:29:16.368092[0m
[31mTotal Runtime: 0:11:22.891919[0m
[34m
Time per file: 0:00:28.453830[0m
[34mFiles per hour: 126.5[0m


## Count the Raw Records

In [5]:
# Count the items in the raw unigram database, reporting progress every
# 5,000,000 items; the call's return value is kept in `count`.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db"
count = count_db_items(db_path, progress_interval=5_000_000)

Database Item Counter
Progress: 5,000,000 items | 16.1s elapsed | 309,622 items/sec
Progress: 10,000,000 items | 32.8s elapsed | 304,537 items/sec
Progress: 15,000,000 items | 53.2s elapsed | 281,807 items/sec
Progress: 20,000,000 items | 72.4s elapsed | 276,058 items/sec
Progress: 25,000,000 items | 92.1s elapsed | 271,356 items/sec
Progress: 30,000,000 items | 114.2s elapsed | 262,690 items/sec
Progress: 35,000,000 items | 131.7s elapsed | 265,784 items/sec
Progress: 40,000,000 items | 148.6s elapsed | 269,166 items/sec
FINAL COUNT: 41,783,242 items
Total Time:  156.01 seconds


## Sample the Processed Records

In [5]:
# Draw a 10-item reservoir sample from 1grams_processed.db, the same path the
# processing-pipeline cell uses as its destination database.
# `sample` is consumed by the save cell that follows.
# NOTE(review): no seed is passed here; confirm whether reservoir_sampling
# accepts one if a reproducible sample is required.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db"
sample = reservoir_sampling(
    db_path,
    sample_size=10, key_type="byte", return_keys=True,
    progress_interval=1_000_000,
)

RESERVOIR SAMPLING CONFIGURATION
------------------------------------------------------------
Target sample size:     10 items
Key handling strategy:  byte
Progress interval:      1,000,000 items
Database limit:         No limit (full traversal)
[PROGRESS] Processed 1,000,000 items
[PROGRESS] Processed 2,000,000 items
[PROGRESS] Processed 3,000,000 items
[PROGRESS] Processed 4,000,000 items
[PROGRESS] Processed 5,000,000 items
[PROGRESS] Processed 6,000,000 items
[PROGRESS] Processed 7,000,000 items
[PROGRESS] Processed 8,000,000 items
[PROGRESS] Processed 9,000,000 items
[PROGRESS] Processed 10,000,000 items
[PROGRESS] Processed 11,000,000 items
[PROGRESS] Processed 12,000,000 items
[PROGRESS] Processed 13,000,000 items
RESERVOIR SAMPLING RESULTS
------------------------------------------------------------
Items processed:        13,499,702
Metadata entries:       0
Final sample size:      10
Execution time:         73.7197 seconds
-----------------------------------------------------

## Save the Sample to a Testing Database

In [6]:
# Write the sampled entries to a small database of their own.
# The call stays the cell's last expression so its result is displayed.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed_sample.db"
save_sample_to_db(sample, db_path, overwrite=True)

Saving 10 samples to /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed_sample.db
Successfully saved 10 samples


True

In [7]:
# Read the sample database back and print its entries for inspection;
# the call's return value is kept in `valid`.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed_sample.db"
valid = verify_sample_db(db_path, show_count=10, decode_output=True, unpack_ngram=True)


Sample entries from /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed_sample.db:
------------------------------------------------------------
N-gram: 'dousos'
  2014: freq=1, docs=1
  2000: freq=1, docs=1
  1999: freq=1, docs=1
  1986: freq=1, docs=1
  1971: freq=3, docs=3
  ... and 38 more years
------------------------------
N-gram: 'embarkedon'
  2018: freq=1, docs=1
  2016: freq=3, docs=3
  2015: freq=1, docs=1
  2014: freq=4, docs=4
  2013: freq=6, docs=6
  ... and 73 more years
------------------------------
N-gram: 'fortresles'
  2008: freq=2, docs=2
  2004: freq=1, docs=1
  1990: freq=1, docs=1
  1974: freq=1, docs=1
  1966: freq=1, docs=1
  ... and 35 more years
------------------------------
N-gram: 'gentrt'
  2010: freq=2, docs=1
  2008: freq=1, docs=1
  2004: freq=1, docs=1
  2001: freq=1, docs=1
  1992: freq=1, docs=1
  ... and 75 more years
------------------------------
N-gram: 'iflict'
  2014: freq=2, docs=2
  2012: freq=1, docs=1
  2011: 

## Run Processing Pipeline

In [4]:
# Paths: the processed database and the scratch directory are siblings of the
# raw source database.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db")
dst_db = src_db.with_name("1grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Pipeline settings: only the paths and the reader count are given explicitly.
pipeline_config = PipelineConfig(src_db=src_db, dst_db=dst_db, tmp_dir=tmp_dir, readers=24)

# Filter settings: the NLTK stop-word set and lemmatizer created in the import cell.
filter_config = FilterConfig(stop_set=stopwords, lemma_gen=lemmatizer)

# Build the processed database from the raw one.
build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
Configuration:
  Workers: 24
  Work units: 192
  Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db
  Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db
  Buffer: 25,000 items, 16MB
  Profile: write:packed24

Phase 1: Creating work units...
  Creating new work units...
Creating 192 work units using ASCII range...
Created 192 work units covering range 0x21-0x7e
  Validating work units...
Validating 192 work units...
  Validating work unit 0: beginning...
    Found 0 keys in range
  Validating work unit 1: 21...
    Found 0 keys in range
  Validating work unit 2: 21...
    Found 10 keys in range
  Validating work unit 3: 22...
    Found 0 keys in range
  Validating work unit 4: 22...
    Found 100 keys in range
  Validating work unit 5: 23...
    Found 0 keys in range
  Validating work unit 6: 23...
    Found 10 keys in range
  Validating work unit 7: 24...
    Found 0 keys in range
  Valida

In [None]:
# Write a vocabulary file from the processed unigram database, passing
# top=60_000 to write_vocab.
src = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db"
dst = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/vocab.txt"
write_vocab(src, dst, top=60_000)