# **Setup**
## Recompile Cython Extensions

In [None]:
# Work from the project checkout so that the editable install below
# (`-e .`) targets this directory.
%cd /scratch/edk202/ngram-prep

# Set a UTF-8 locale in the kernel's environment before building.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Editable install of the current directory; per the section heading this is
# what recompiles the Cython extensions. `--no-build-isolation` builds against
# the packages already installed in the environment (build dependencies must
# therefore be present), and `-q` keeps pip's output quiet.
%pip install -e . --no-build-isolation -q

## Imports

In [4]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# NLTK resources
from nltk.corpus import stopwords; stopwords = set(stopwords.words("english"))
from nltk.stem import WordNetLemmatizer; lemmatizer = WordNetLemmatizer()

# Ngram acquisition functions
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_prep.ngram_filter.config import PipelineConfig, FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set up logging to file

In [5]:
# Send log output to a file instead of the notebook. The call is left as the
# cell's last expression so the returned log-file path is displayed.
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db",
    force=True,
    console=False,
    # Log rotation settings
    rotate=True,
    backup_count=5,
    max_bytes=100_000_000,
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db/ngram_download_20251009_233904.log')

# **Download Unigrams and Ingest to RocksDB**

In [None]:
# Download the tagged English unigram files (release 20200217) and ingest them
# into RocksDB under the path stub below.
# CAUTION: overwrite_db=True presumably replaces any existing database at the
# target location — confirm before re-running this cell.
download_and_ingest_to_rocksdb(
    # What to fetch
    repo_release_id="20200217",
    repo_corpus_id="eng",
    ngram_size=1,
    ngram_type="tagged",
    file_range=(0, 23),
    random_seed=98,
    # Where and how to store it
    db_path_stub="/vast/edk202/NLP_corpora/Google_Books/",
    open_type="write:packed24",
    overwrite_db=True,
    write_batch_size=100_000,
    compact_after_ingest=True,
    # Parallelism
    workers=25,
    use_threads=False,
)

# **Run Processing Pipeline**

In [6]:
# Locations: the processed database and the scratch directory are siblings of
# the source database (same parent directory, different final component).
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db")
dst_db = src_db.with_name("1grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Filter settings use the stop-word set and lemmatizer created in the
# imports cell.
filter_config = FilterConfig(stop_set=stopwords, lemma_gen=lemmatizer)

# Pipeline settings, grouped by concern.
pipeline_config = PipelineConfig(
    # Paths
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    # Run mode and parallelism
    mode="restart",
    num_workers=32,
    num_ingest_workers=32,
    # Ingest behaviour
    enable_ingest=True,
    delete_after_ingest=True,
    compact_after_ingest=True,
    # Progress reporting interval (seconds, judging by the `_s` suffix)
    progress_every_s=60.0,
    # Whitelist output, written inside the destination DB directory
    output_whitelist_path=dst_db / "whitelist.txt",
    output_whitelist_top_n=40_000,
)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:               restart
Ingest after filtering: True
Compact after ingest:   True

[4mWorkers[0m
Num Workers:        32
Initial work units: 32
Dynamic splitting:  Enabled
Profiles:           read=read:packed24, write=write:packed24
Buffer:             100,000 items, 128.00 MB

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Input whitelist: None
Output whitelist: ...ks/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt (top 40,000 keys)

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean r

Shards Ingested: 100%|█████████████████████████████████████████████████████████| 74/74 [01:55<00:00]


Phase 3: Finalizing (flush)...
════════════════════════════════════════════════════════════════════════════════════════════════════
Performing final flush...


Shards Ingested: 100%|█████████████████████████████████████████████████████████| 74/74 [02:06<00:00]


Post-Ingestion Compaction
════════════════════════════════════════════════════════════════════════════════════════════════════
Initial DB size:         25.15 GB





Compaction completed in 0:02:26
Size before:             25.15 GB
Size after:              21.91 GB
Space saved:             3.24 GB (12.9%)

┌──────────────────────────────────────────────────────────────────────────────────────────────┐
│ PROCESSING COMPLETE                                                                          │
├──────────────────────────────────────────────────────────────────────────────────────────────┤
│ Items: 16,500,108                                                                            │
│ Size: 26.31 GB                                                                               │
│ Database: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db │
└──────────────────────────────────────────────────────────────────────────────────────────────┘
