In [None]:
# Work from the project checkout so the editable install below resolves "." to it.
%cd /scratch/edk202/ngram-prep

# Set the locale variables to C.UTF-8 for this kernel session.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Editable (-e) install of the project in the current directory into the
# kernel's environment. -q keeps pip quiet; --no-build-isolation makes pip build
# against the packages already installed rather than an isolated build env.
%pip install -e . --no-build-isolation -q

In [1]:
# Auto-reload packages: in mode 2, autoreload re-imports all modules before each
# cell executes, so edits to the editable-installed project are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Standard stuff
from pathlib import Path

# NLTK stuff
# The corpus reader is imported under an alias so that the `stopwords` name
# below (the set later passed to FilterConfig as `stop_set`) does not rebind
# the reader it was built from.
# NOTE(review): `.words("english")` needs the NLTK "stopwords" corpus to be
# available locally - confirm it is installed in this environment.
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

# English stop words as a set.
stopwords = set(nltk_stopwords.words("english"))

# Lemmatizer instance later passed to FilterConfig as `lemma_gen`.
lemmatizer = WordNetLemmatizer()

# Raw n-gram acquisition stuff
from ngram_acquire.pipeline.orchestrate import download_and_ingest_to_rocksdb
from ngram_acquire.utils.vocab import write_vocab
from ngram_acquire.pipeline.logger import setup_logger
from utilities.save_sample import save_sample_to_db, verify_sample_db

# Downloaded n-gram filtering stuff

# Cython utilities
from ngram_filter.config import PipelineConfig, FilterConfig
from ngram_filter.pipeline.orchestrator import build_processed_db
from utilities.count_items import count_db_items
from utilities.reservoir_sampler import reservoir_sampling


Phase 3: Finalizing...
════════════════════════════════════════════════════════════════════════════════════════════════════


In [3]:
# Configure logging for the acquisition run. The call is left as the cell's
# last expression so its return value is displayed; the recorded output shows
# the path of a log file created inside the database directory.
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db",
    force=True,
    console=False,
    rotate=True, max_bytes=100_000_000, backup_count=5,
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db/ngram_download_20250912_164740.log')

## Download Unigrams and Ingest to a RocksDB Database

In [None]:
# Download the unigram files and ingest them into the RocksDB database at
# `db_path`.
# NOTE(review): `overwrite=True` presumably replaces any existing database at
# that path - confirm before re-running against data you want to keep.
download_and_ingest_to_rocksdb(
    # Destination
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db",
    overwrite=True,
    # Which part of the repository to fetch
    repo_release_id="20200217",
    repo_corpus_id="eng",
    ngram_size=1,
    ngram_type="tagged",
    file_range=(0, 23),
    # Execution settings
    random_seed=42,
    workers=25,
    use_threads=False,
    # Database write settings
    write_batch_size=100_000,
    open_type="write:packed24",
    post_compact=False,
)

## Count the Raw Records

In [None]:
# Raw unigram database - the same path the download cell ingests into.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db"

# Keep the result in `count` for later inspection.
count = count_db_items(db_path, progress_interval=5_000_000)

## Sample the Processed Records

In [None]:
# NOTE(review): this reads the *processed* database, i.e. the path that
# build_processed_db writes to in the pipeline cell later in the notebook.
# On a fresh top-to-bottom run that database will not exist yet (or will be
# left over from an earlier run), so run the pipeline cell first.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db"

# Draw 10 records; the result is kept in `sample` for the save cell below.
sample = reservoir_sampling(
    db_path,
    sample_size=10,
    return_keys=True,
    key_type="byte",
    progress_interval=1_000_000,
)

## Save the Sample to a Testing Database

In [None]:
# Destination for the sampled records; `sample` comes from the sampling cell.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed_sample.db"

save_sample_to_db(sample, db_path, overwrite=True)

In [None]:
# Check the sample database written by the previous cell (same path).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed_sample.db"

# The outcome is kept in `valid`.
valid = verify_sample_db(
    db_path,
    show_count=10,
    unpack_ngram=True,
    decode_output=True,
)

In [None]:
# Temporary workaround for the rocks_shim __init__.py problem: overwrite the
# installed package's __init__.py so that it re-exports everything from the
# `rocks_shim` submodule.
# NOTE(review): the project imports at the top of the notebook run before this
# cell - confirm whether this patch has to be applied before them.
shim_init_path = "/ext3/miniforge3/envs/hist_w2v/lib/python3.11/site-packages/rocks_shim/__init__.py"
shim_init_source = "from .rocks_shim import *\n"

with open(shim_init_path, "w") as shim_init:
    shim_init.write(shim_init_source)

## Run Processing Pipeline

In [None]:
# Source database, plus the destination database and scratch directory that
# sit next to it in the same folder.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db")
dst_db = src_db.with_name("1grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Pipeline settings; anything not listed here keeps the PipelineConfig default.
pipeline_config = PipelineConfig(
    src_db=src_db, dst_db=dst_db, tmp_dir=tmp_dir,
    readers=8,
    force_restart=True,
    progress_every_s=60.0,
)

# Filter settings: the stop-word set and lemmatizer built in the imports cell.
filter_config = FilterConfig(stop_set=stopwords, lemma_gen=lemmatizer)

# Launch the pipeline (kept as the final expression of the cell).
build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration:
════════════════════════════════════════════════════════════════════════════════════════════════════
  Workers: 8
  Work units: 64
  Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db
  Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db
  Buffer: 25,000 items, 16MB
  Profile: write:packed24

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
  Force restart requested - clearing existing work units
  Creating 64 work units using ASCII range...
  Created 64 work units covering range 0x21-0x7e
  Validating 64 work units...
  Validated 64 work units: 795 keys over 10 sample units
  Created 64 work units

Phase 2: Processing 64 work units with 8 workers...
═══════════════════════════════════════════

In [None]:
# Input: the processed unigram database. Output: a plain-text vocabulary file
# in the same directory.
src, dst = (
    "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db",
    "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/vocab.txt",
)

# NOTE(review): `top=60_000` presumably caps the vocabulary at 60,000 entries -
# confirm against write_vocab.
write_vocab(src, dst, top=60_000)