# **Multigrams: Full Pipeline**
## **Setup**
### Imports

In [3]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from stop_words import get_stop_words
from ngram_prep.ngram_filter.lemmatizer import SpacyLemmatizer
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_filter.config import PipelineConfig as FilterPipelineConfig
from ngram_prep.ngram_filter.config import FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.ngram_pivot.config import PipelineConfig as PivotPipelineConfig
from ngram_prep.ngram_pivot.pipeline import run_pivot_pipeline
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix
from ngram_prep.utilities.notebook_logging import setup_notebook_logging

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Configure Paths

In [4]:
# Directory that holds every 5-gram artifact used by this notebook.
base_path = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files")

# One database per stage: raw_db feeds the filter stage, filtered_db is the
# filter output and the pivot input, pivoted_db is the pivot output.
raw_db = base_path.joinpath("5grams.db")
filtered_db = base_path.joinpath("5grams_processed.db")
pivoted_db = base_path.joinpath("5grams_pivoted.db")

# Scratch directories handed to the filter and pivot pipelines as tmp_dir.
filter_tmp_dir = base_path.joinpath("processing_tmp")
pivot_tmp_dir = base_path.joinpath("pivot_tmp")

# Whitelist file located inside the filtered database directory.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it should be passed to the filter configuration.
whitelist_path = filtered_db.joinpath("whitelist.txt")

## **Phase 1: Download and Ingest**

In [3]:
# Set up logging for the acquisition workflow; log data is rooted at base_path.
# NOTE(review): the output saved under this cell reports "Ngram size: 1" and a
# 1gram_files DB path, so it predates these 5-gram settings — re-run to refresh.
setup_notebook_logging(
    workflow_name="multigrams_acquire",
    data_path=str(base_path),
    console=False,
)

download_and_ingest_to_rocksdb(
    # --- What to download ---
    ngram_size=5,
    ngram_type="tagged",
    repo_release_id="20200217",
    repo_corpus_id="eng",
    file_range=(0, 19422),
    random_seed=76,
    # --- Where and how to write ---
    # NOTE(review): this literal repeats the root of base_path; keep the two in sync.
    db_path_stub="/vast/edk202/NLP_corpora/Google_Books/",
    open_type="write:packed24",
    overwrite_db=False,
    write_batch_size=1_000_000,
    compact_after_ingest=True,
    # --- Parallelism ---
    workers=30,
    use_threads=False,
)

N-GRAM ACQUISITION PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-11-03 22:21:30

Download Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Ngram repo:           https://books.storage.googleapis.com/?prefix=ngrams/books/20200217/eng/1-
DB path:              /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
File range:           0 to 23
Total files:          24
Files to get:         24
Skipping:             0
Download workers:     20
Batch size:           100,000
Ngram size:           1
Ngram type:           tagged
Overwrite DB:         True
DB Profile:           write:packed24

Download Progress
════════════════════════════════════════════════════════════════════════════════════════════════════


Files Processed: 100%|█████████████████████████████████████████████████████████| 24/24 [07:30<00:00]



Post-Ingestion Compaction
════════════════════════════════════════════════════════════════════════════════════════════════════
Initial DB size:         46.75 GB
Compaction completed in 0:03:11
Size before:             46.75 GB
Size after:              57.76 GB
Space saved:             -11.02 GB (-23.6%)

Processing complete!

Final Summary
════════════════════════════════════════════════════════════════════════════════════════════════════
Fully processed files:       24
Failed files:                0
Total entries written:       41,783,218
Write batches flushed:       24
Uncompressed data processed: 43.28 GB
Processing throughput:       68.86 MB/sec

End Time: 2025-11-03 22:32:13.598633
Total Runtime: 0:10:43.596855
Time per file: 0:00:26.816536
Files per hour: 134.2


## **Phase 2: Filter, Normalize, and Generate Whitelist**

In [3]:
# Set up logging for the filter stage. The workflow name carries the
# "multigrams_" prefix used by the acquire and pivot cells of this notebook
# (it previously read "unigrams_filter", a leftover from the unigram notebook).
# NOTE(review): the output saved under this cell shows 1gram_files paths and
# 40 workers / 400 work units, so it predates these settings — re-run to refresh.
setup_notebook_logging(
    workflow_name="multigrams_filter",
    data_path=str(base_path),
    console=False
)

# Filtering resources: English stop words and a spaCy-backed lemmatizer.
stop_set = set(get_stop_words("english"))
lemmatizer = SpacyLemmatizer(language="en")

filter_config = FilterConfig(
    stop_set=stop_set,
    lemma_gen=lemmatizer,
)

# Pipeline wiring: read from the raw 5-gram DB, write the filtered DB, and use
# filter_tmp_dir for intermediate files.
# NOTE(review): no whitelist arguments are passed here even though the section
# title mentions whitelist generation and whitelist_path is defined in the
# paths cell — confirm whether a whitelist should be configured.
pipeline_config = FilterPipelineConfig(
    src_db=raw_db,
    dst_db=filtered_db,
    tmp_dir=filter_tmp_dir,
    num_workers=30,
    use_smart_partitioning=True,
    samples_per_worker=500_000,
    num_initial_work_units=600,
    work_unit_claim_order="random",
    flush_interval_s=5.0,
    mode="restart",
    progress_every_s=60.0,
    ingest_num_readers=10,
    ingest_batch_items=1_000_000,
    ingest_queue_size=3,
)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:             restart
Compact after ingest: True

[4mWorkers[0m
Num Workers:        40
Initial work units: 400
Profiles:           read=read:packed24, write=write:packed24
Flush interval:     5.0s

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Input whitelist: None
Output whitelist: ...oks/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt (top 6,000 keys)

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean restart - creating new work units
Sampling database to create 400 density-based w

Shards Ingested: 100%|███████████████████████████████████████████████████████| 400/400 [03:58<00:00]



Ingestion complete: 400 shards, 17,716,031 items in 238.0s (74,431 items/s)

Phase 4: Finalizing database...
════════════════════════════════════════════════════════════════════════════════════════════════════

Post-Ingestion Compaction
────────────────────────────────────────────────────────────────────────────────────────────────────
Initial DB size:         25.07 GB
Compaction completed in 0:01:08
Size before:             25.07 GB
Size after:              21.13 GB
Space saved:             3.94 GB (15.7%)

Phase 5: Generating output whitelist...
════════════════════════════════════════════════════════════════════════════════════════════════════
  Output path: ...LP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt
  Extracting top 6,000 tokens
  Spell checking enabled (en)
  Year range filter: 1900-2019 (inclusive)
  Generated whitelist with 6,000 tokens in 439.0s

┌────────────────────────────────────────────────────────────────────────────────────────

## **Phase 3: Pivot to Yearly Indices**

In [None]:
# Set up logging for the pivot workflow; log data is rooted at base_path.
# NOTE(review): the output saved under this cell shows 1gram_files paths, so it
# comes from a unigram run — re-run to refresh.
setup_notebook_logging(
    workflow_name="multigrams_pivot",
    data_path=str(base_path),
    console=False,
)

pivot_config = PivotPipelineConfig(
    # --- Databases and scratch space ---
    src_db=filtered_db,
    dst_db=pivoted_db,
    tmp_dir=pivot_tmp_dir,
    # --- Run mode ---
    mode="restart",
    # --- Work distribution ---
    num_workers=20,
    num_initial_work_units=600,
    work_unit_claim_order="random",
    use_smart_partitioning=True,
    # --- Ingestion ---
    ingest_mode="direct_sst",
    num_ingest_readers=1,
    ingest_buffer_shards=1,
    # --- Timing ---
    flush_interval_s=15.0,
    progress_every_s=30.0,
)

run_pivot_pipeline(pivot_config)


PARALLEL N-GRAM DATABASE PIVOT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-11-03 23:55:06
Mode:       RESTART

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Source DB:            ...dk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Target DB:            .../edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_pivoted.db
Temp directory:       /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/pivot_tmp

Parallelism
────────────────────────────────────────────────────────────────────────────────────────────────────
Workers:              20
Initial work units:   600

Database Profiles
────────────────────────────────────────────────────────────────────────────────────────────────────
Reader profile:       read:packed24
Writer profile:       write:packed24
Ingest profile:       write:packed24

Flush 

# Inspect Final Database

## `db_head`: First N records

In [None]:
# Show the first five records of the pivoted database.
db_head(
    str(pivoted_db),
    n=5,
)

## `db_peek`: Records starting from a key

In [None]:
# Show five records beginning at the given start key.
# Keys presumably have the form "[year] ngram" — confirm against the pivot output.
db_peek(
    str(pivoted_db),
    start_key="[2000] quick",
    n=5,
)

## `db_peek_prefix`: Records matching a prefix

In [None]:
# Show up to five records whose key matches the given prefix.
db_peek_prefix(
    str(pivoted_db),
    prefix="[2011] unite",
    n=5,
)