# **Unigrams: Full Pipeline**
## **Setup**
### Imports

In [24]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from stop_words import get_stop_words
from ngram_prep.ngram_filter.lemmatizer import SpacyLemmatizer
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_filter.config import PipelineConfig as FilterPipelineConfig
from ngram_prep.ngram_filter.config import FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix
from ngram_prep.utilities.notebook_logging import setup_notebook_logging

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Configure Paths

In [25]:
# Directory that holds every unigram artifact for the 20200217 "eng" release.
base_path = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files")

# Databases: raw ingest output, filtered output, and (presumably) the
# output of a later pivot stage.
raw_db = base_path.joinpath("1grams.db")
filtered_db = base_path.joinpath("1grams_processed.db")
pivoted_db = base_path.joinpath("1grams_pivoted.db")

# Temporary working directories, one per stage.
filter_tmp_dir = base_path.joinpath("processing_tmp")
pivot_tmp_dir = base_path.joinpath("pivot_tmp")

# The whitelist file lives inside the filtered database directory.
whitelist_path = filtered_db.joinpath("whitelist.txt")

## **Phase 1: Download and Ingest**

In [None]:
# Configure logging for the acquisition workflow before any work starts.
# NOTE(review): console=False presumably keeps log records out of the cell
# output -- confirm against setup_notebook_logging.
setup_notebook_logging(workflow_name="unigrams_acquire", data_path=str(base_path), console=False)

acquire_kwargs = dict(
    # --- What to fetch ---
    ngram_size=1,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    # NOTE(review): this stub repeats the root of base_path; the recorded run
    # resolved it to <stub>/20200217/eng/1gram_files/1grams.db, i.e. raw_db.
    # Keep the two in sync when relocating the corpus.
    db_path_stub="/vast/edk202/NLP_corpora/Google_Books/",
    # The recorded run reports this range as "0 to 23", 24 files in total.
    file_range=(0, 23),
    random_seed=98,
    # --- Parallelism ---
    workers=20,
    use_threads=False,
    # --- Ingest behaviour ---
    ngram_type="tagged",
    # NOTE(review): presumably replaces any existing database at the target
    # path, so re-running this cell repeats the full download -- confirm.
    overwrite_db=True,
    write_batch_size=100_000,
    open_type="write:packed24",
    compact_after_ingest=True,
)

download_and_ingest_to_rocksdb(**acquire_kwargs)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-11-04 00:14:04

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/1-
DB path:              /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
File range:           0 to 23
Total files:          24
Files to get:         24
Skipping:             0
Download workers:     20
Batch size:           100,000
Ngram size:           1
Ngram type:           tagged
Overwrite DB:         True
DB Profile:           write:packed24

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed:  46%|██████████████████████████▏                              | 11/24 [04:08<03:14]

## **Phase 2: Filter, Normalize, and Generate Whitelist**

In [3]:
# Configure logging for the filter workflow before any work starts.
setup_notebook_logging(workflow_name="unigrams_filter", data_path=str(base_path), console=False)

# Token-level filtering resources: English stop words and a spaCy lemmatizer.
stop_set = {*get_stop_words("english")}
lemmatizer = SpacyLemmatizer(language="en")
filter_config = FilterConfig(lemma_gen=lemmatizer, stop_set=stop_set)

filter_pipeline_kwargs = dict(
    # --- Databases and temporary directory ---
    src_db=raw_db,
    dst_db=filtered_db,
    tmp_dir=filter_tmp_dir,
    # --- Worker pool and work-unit partitioning ---
    num_workers=40,
    use_smart_partitioning=True,
    samples_per_worker=500_000,
    num_initial_work_units=400,
    work_unit_claim_order="random",
    flush_interval_s=5.0,
    # The recorded run logs this mode as a "Clean restart" that creates new
    # work units.
    mode="restart",
    progress_every_s=10.0,
    # --- Shard ingestion ---
    ingest_num_readers=10,
    ingest_batch_items=2_000_000,
    ingest_queue_size=2,
    # --- Whitelist output ---
    # The recorded run reports: top 6,000 tokens, spell checking enabled (en),
    # and an inclusive 1900-2019 year filter.
    output_whitelist_path=whitelist_path,
    output_whitelist_top_n=6_000,
    output_whitelist_year_range=(1900, 2019),
    output_whitelist_spell_check=True,
    output_whitelist_spell_check_language="en",
)
pipeline_config = FilterPipelineConfig(**filter_pipeline_kwargs)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:             restart
Compact after ingest: True

[4mWorkers[0m
Num Workers:        40
Initial work units: 400
Profiles:           read=read:packed24, write=write:packed24
Flush interval:     5.0s

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Input whitelist: None
Output whitelist: ...oks/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt (top 6,000 keys)

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean restart - creating new work units
Sampling database to create 400 density-based w

Shards Ingested: 100%|███████████████████████████████████████████████████████| 400/400 [03:58<00:00]



Ingestion complete: 400 shards, 17,716,031 items in 238.0s (74,431 items/s)

Phase 4: Finalizing database...
════════════════════════════════════════════════════════════════════════════════════════════════════

Post-Ingestion Compaction
────────────────────────────────────────────────────────────────────────────────────────────────────
Initial DB size:         25.07 GB
Compaction completed in 0:01:08
Size before:             25.07 GB
Size after:              21.13 GB
Space saved:             3.94 GB (15.7%)

Phase 5: Generating output whitelist...
════════════════════════════════════════════════════════════════════════════════════════════════════
  Output path: ...LP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt
  Extracting top 6,000 tokens
  Spell checking enabled (en)
  Year range filter: 1900-2019 (inclusive)
  Generated whitelist with 6,000 tokens in 439.0s

┌────────────────────────────────────────────────────────────────────────────────────────

## **Optional: Inspect Database**

### `db_head`: Show first N records

In [11]:
# Show the first five key-value pairs stored in the filtered database.
db_head(str(filtered_db), n=5)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   FALSE
     Value: Total: 124,503 occurrences in 110,465 volumes (1538-2019, 380 years)

[ 2] Key:   TRUE
     Value: Total: 4,241,065 occurrences in 2,916,658 volumes (1501-2019, 440 years)

[ 3] Key:   aaa
     Value: Total: 5,093,481 occurrences in 1,181,441 volumes (1477-2019, 404 years)

[ 4] Key:   aaaa
     Value: Total: 474,351 occurrences in 92,702 volumes (1477-2019, 338 years)

[ 5] Key:   aaaaa
     Value: Total: 54,556 occurrences in 23,113 volumes (1581-2019, 274 years)



### `db_peek`: Show records starting from a key

In [18]:
# Show five key-value pairs from the filtered database, starting at the key
# b"time"; the recorded output lists the keys that follow it in sorted order.
db_peek(str(filtered_db), start_key=b"time", n=5)

5 key-value pairs starting from 74696d65:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   time
     Value: Total: 2,893,115,714 occurrences in 69,504,388 volumes (1470-2019, 517 years)

[ 2] Key:   timea
     Value: Total: 55,191 occurrences in 38,897 volumes (1591-2019, 343 years)

[ 3] Key:   timeable
     Value: Total: 2,076 occurrences in 1,823 volumes (1792-2019, 143 years)

[ 4] Key:   timeabout
     Value: Total: 499 occurrences in 489 volumes (1614-2019, 152 years)

[ 5] Key:   timeabove
     Value: Total: 61 occurrences in 53 volumes (1724-2009, 44 years)



### `db_peek_prefix`: Show records matching a prefix

In [23]:
# Show key-value pairs from the filtered database whose key begins with
# b"tarnation". NOTE(review): n=5 appears to be an upper bound -- the recorded
# run printed only four matching records.
db_peek_prefix(str(filtered_db), prefix=b"tarnation", n=5)

5 key-value pairs with prefix 7461726e6174696f6e:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   tarnation
     Value: Total: 69,016 occurrences in 49,433 volumes (1606-2019, 265 years)

[ 2] Key:   tarnational
     Value: Total: 114 occurrences in 92 volumes (1913-2010, 53 years)

[ 3] Key:   tarnationest
     Value: Total: 53 occurrences in 53 volumes (1840-2019, 30 years)

[ 4] Key:   tarnationly
     Value: Total: 94 occurrences in 94 volumes (1833-2019, 44 years)

