# **Setup**
## Recompile Cython Extensions

In [None]:
# Work from the project directory so the editable install below targets this checkout.
%cd /scratch/edk202/ngram-prep

# Set a UTF-8 locale in the kernel's environment; subprocesses started afterwards inherit it.
# NOTE(review): presumably required by the build step below — confirm.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Editable install of the project in the current directory; per the section title this is
# the step that rebuilds the Cython extensions. --no-build-isolation builds against the
# packages already installed in the environment, and -q reduces pip's output.
%pip install -e . --no-build-isolation -q

## Imports

In [1]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# NLTK resources
from nltk.corpus import stopwords; stopwords = set(stopwords.words("english"))
from nltk.stem import WordNetLemmatizer; lemmatizer = WordNetLemmatizer()

# Ngram acquisition functions
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_prep.ngram_filter.config import PipelineConfig, FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix

## Set up logging to file

In [2]:
# Configure logging to a file only (console output disabled).
# The call is kept as the cell's last expression so the returned log-file path is displayed.
# NOTE(review): the log is created under the /vast DB directory, while the acquisition cell
# writes its DB under /scratch/edk202/NLP_tmp — confirm this is the intended location.
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db",
    force=True,
    console=False,
    # Log rotation settings (presumably: rotate at ~100 MB, keep 5 old files — verify
    # against setup_logger's documentation).
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db/ngram_download_20251022_135517.log')

# **Download 5-Grams and Ingest to RocksDB**

In [3]:
# Download the tagged English 5-gram files (release 20200217) and ingest them into RocksDB.
#
# NOTE(review): in the recorded run below every file was skipped ("Files to get: 0"), yet
# post-ingestion compaction still ran for about three hours without reclaiming space.
# Consider compact_after_ingest=False when re-running against an already complete DB.
# NOTE(review): this writes under /scratch/edk202/NLP_tmp, whereas the processing section
# reads its source DB from /vast/edk202/NLP_corpora/Google_Books — confirm which is intended.
download_and_ingest_to_rocksdb(
    # --- What to fetch ---
    repo_release_id="20200217",
    repo_corpus_id="eng",
    ngram_size=5,
    ngram_type="tagged",
    file_range=(0, 19422),  # reported as 19423 files in the run log, i.e. both ends included
    random_seed=76,
    # --- Where and how to write ---
    # Alternate stub: "/vast/edk202/NLP_corpora/Google_Books/"
    db_path_stub="/scratch/edk202/NLP_tmp",
    overwrite_db=False,
    open_type="write:packed24",
    write_batch_size=1_000_000,
    compact_after_ingest=True,
    # --- Concurrency ---
    workers=30,
    use_threads=False,
)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-22 13:55:17

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/5-
DB path:              /scratch/edk202/NLP_tmp/20200217/eng/5gram_files/5grams.db
File range:           0 to 19422
Total files:          19423
Files to get:         0
Skipping:             19423
Download workers:     30
Batch size:           1,000,000
Ngram size:           5
Ngram type:           tagged
Overwrite DB:         False
DB Profile:           write:packed24

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed:   0%|                                                               | 0/0 [00:00<?]


Post-Ingestion Compaction
════════════════════════════════════════════════════════════════════════════════════════════════════
Initial DB size:         2.21 TB





Compaction completed in 3:02:23
Size before:             2.21 TB
Size after:              2.21 TB
Space saved:             -41.14 KB (-0.0%)

Processing complete!

Final Summary
════════════════════════════════════════════════════════════════════════════════════════════════════
Fully processed files:       0
Failed files:                0
Total entries written:       0
Write batches flushed:       0
Uncompressed data processed: 0.00 B
Processing throughput:       0.00 MB/sec

End Time: 2025-10-22 16:58:01.020141
Total Runtime: 3:02:43.692410
Time per file: 0:00:00
Files per hour: 0.0


# **Run Processing Pipeline**

In [None]:
# --- Locations -------------------------------------------------------------------------
# The processed DB and the scratch directory are siblings of the source DB;
# with_name() swaps only the final path component.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db")
dst_db = src_db.with_name("5grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Despite its name, this is the path of the whitelist *file* stored inside the processed
# 1-gram DB directory, two levels above the source DB (parents[1] == parent.parent).
whitelist_dir = src_db.parents[1].joinpath(
    "1gram_files", "1grams_processed.db", "whitelist.txt"
)

# --- Filtering options -----------------------------------------------------------------
# Uses the stop-word set and lemmatizer created in the imports cell.
filter_config = FilterConfig(
    whitelist_path=whitelist_dir,
    stop_set=stopwords,
    lemma_gen=lemmatizer,
)

# --- Pipeline options ------------------------------------------------------------------
pipeline_config = PipelineConfig(
    # Paths
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    # Run mode and progress reporting
    mode="restart",
    progress_every_s=30.0,
    # Workers and work units
    num_workers=20,
    num_initial_work_units=40,
    work_unit_claim_order="sequential",
    max_split_depth=30,
    split_check_interval_s=10.0,
    # Bucket limits (byte limit evaluates to 128 MiB)
    max_items_per_bucket=100_000,
    max_bytes_per_bucket=128 * 1024 * 1024,
    # Ingest settings
    ingest_num_readers=20,
    ingest_queue_size=1,
)

# Build the processed DB from the source DB using the two configurations above.
build_processed_db(pipeline_config, filter_config)

# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [None]:
# Processed 5-gram database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show the first 5 key–value pairs
db_head(
    db_path,
    n=5,
    key_format="utf-8",
    value_format="packed",
)

## `db_peek`: Print _N_ key–value pairs starting at the specified key

In [None]:
# Processed 5-gram database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show 5 key–value pairs, starting at the given key
db_peek(
    db_path,
    start_key=b"quick brown <UNK> <UNK> <UNK>",
    n=5,
    key_format="utf-8",
    value_format="packed",
)


## `db_peek_prefix`: Print key–value pairs whose keys start with the specified prefix

In [None]:
# Processed 5-gram database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Look up entries by key prefix, using the "summary" value format
db_peek_prefix(
    db_path,
    prefix=b"<UNK> <UNK> united state <UNK>",
    n=1,
    key_format="utf-8",
    value_format="summary",
)