# **Setup**

In [1]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# Recompile Cython script
%cd /scratch/edk202/ngram-prep
!python setup.py build_ext --inplace -q

# Stop words
from stop_words import get_stop_words

# Lemmatization
from ngram_prep.ngram_filter.lemmatizer import SpacyLemmatizer

# Ngram acquisition functions
from ngram_prep.ngram_acquire import download_and_ingest_to_rocksdb
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram processing functions
from pathlib import Path
from ngram_prep.ngram_filter.config import PipelineConfig, FilterConfig
from ngram_prep.ngram_filter.pipeline.orchestrator import build_processed_db
from ngram_prep.utilities.peek import db_head, db_peek, db_peek_prefix

/scratch/edk202/ngram-prep
running build_ext


## Log to file

In [2]:
# Send log records to a file rather than the notebook output. The call returns
# the path of the log file it created (inside the db_path directory), which the
# notebook echoes as the cell result.
setup_logger(
    # Where the log file lives
    db_path="/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db",
    # Handler setup: no console echo; force=True presumably replaces any
    # existing logging configuration -- confirm against setup_logger
    console=False,
    force=True,
    # Rotation: presumably ~100 MB per file with 5 backups kept -- confirm
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
)

PosixPath('/scratch/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db/ngram_download_20251101_000639.log')

# **Download 5-Grams and Ingest to RocksDB**

In [None]:
# Download the 5-gram files and ingest them into a RocksDB database.
# Arguments are grouped by concern; parameter semantics are defined by
# ngram_prep.ngram_acquire and are only summarised here.
download_and_ingest_to_rocksdb(
    # Corpus selection
    ngram_size=5,
    ngram_type="tagged",
    repo_release_id="20200217",
    repo_corpus_id="eng",
    file_range=(0, 19422),
    # Destination database
    db_path_stub="/scratch/edk202/NLP_corpora/Google_Books/",
    overwrite_db=False,
    open_type="write:packed24",
    write_batch_size=1_000_000,
    compact_after_ingest=True,
    # Parallelism (use_threads=False presumably selects processes -- confirm)
    workers=30,
    use_threads=False,
    # Reproducibility
    random_seed=76,
)

# **Run Processing Pipeline**

In [None]:
# --- Paths -------------------------------------------------------------------
# NOTE(review): the download and logging cells point at /scratch/edk202/...,
# while this cell reads the database from /vast/edk202/... -- confirm the
# ingested database was relocated to /vast before running.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db")
dst_db = src_db.parent / "5grams_processed.db"
tmp_dir = src_db.parent / "processing_tmp"
# Token whitelist: a text file (not a directory) stored with the processed
# 1-gram database, two levels up from the 5-gram files.
whitelist_path = src_db.parent.parent / "1gram_files" / "1grams_processed.db" / "whitelist.txt"

# --- Filter settings ---------------------------------------------------------
stop_set = set(get_stop_words("english"))
lemmatizer = SpacyLemmatizer(language="en")

filter_config = FilterConfig(
    stop_set=stop_set,
    lemma_gen=lemmatizer,
    whitelist_path=whitelist_path,
)

# --- Pipeline settings -------------------------------------------------------
pipeline_config = PipelineConfig(
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    num_workers=30,
    use_smart_partitioning=True,
    samples_per_worker=500_000,
    num_initial_work_units=600,
    work_unit_claim_order="random",
    flush_interval_s=5.0,
    # The run log reports "Clean restart" in this mode; presumably prior
    # partial output is discarded -- confirm before re-running on real data.
    mode="restart",
    progress_every_s=60.0,
    ingest_num_readers=10,
    ingest_batch_items=1_000_000,
    ingest_queue_size=3,
)

# Run the filter pipeline: reads src_db and builds dst_db using the two configs.
build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:             restart
Compact after ingest: True

[4mWorkers[0m
Num Workers:        30
Initial work units: 600
Profiles:           read=read:packed24, write=write:packed24
Flush interval:     5.0s

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db
Input whitelist: ..._corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt
  All tokens (min count: 1)
Output whitelist: None
Loading whitelist...
Loaded 6,000 tokens

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean restart - c

# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [6]:
# Print the first five records of the processed database, with keys shown as
# UTF-8 text and values rendered by the "packed" formatter.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_head(
    db_path,
    key_format="utf-8",
    value_format="packed",
    n=5,
)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   <UNK> <UNK> <UNK> <UNK> abandon
     Value: [387 records] ... +377 earlier, (2010, 27926, 26498), (2011, 27582, 26509), (2012, 32242, 31044)
            (2013, 34955, 33807), (2014, 33745, 32732), (2015, 29802, 28815), (2016, 28203, 27315)
            (2017, 37094, 35829), (2018, 37506, 36414), (2019, 27181, 26381)

[ 2] Key:   <UNK> <UNK> <UNK> <UNK> abdominal
     Value: [282 records] ... +272 earlier, (2010, 18891, 14483), (2011, 20957, 15540), (2012, 25837, 21188)
            (2013, 23417, 18111), (2014, 26332, 21212), (2015, 26647, 20917), (2016, 23715, 19133)
            (2017, 15718, 12321), (2018, 15650, 12934), (2019, 12355, 10399)

[ 3] Key:   <UNK> <UNK> <UNK> <UNK> abide
     Value: [398 records] ... +388 earlier, (2010, 6488, 6060), (2011, 5526, 5182), (2012, 7163, 6760)
            (2013, 7851, 7301), (2014, 7147, 6697), (2015, 7133, 67

## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [7]:
# Print five records of the processed database beginning at the given start
# key (passed as raw bytes), with keys shown as UTF-8 text.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_peek(
    db_path,
    start_key=b"quick brown <UNK> <UNK> <UNK>",
    key_format="utf-8",
    value_format="packed",
    n=5,
)


5 key-value pairs starting from 717569636b2062726f776e203c554e4b3e203c554e4b3e203c554e4b3e:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   quick brown <UNK> <UNK> <UNK>
     Value: [21 records] ... +11 earlier, (2006, 4, 3), (2007, 1, 1), (2008, 13, 3), (2009, 2, 2)
            (2010, 5, 4), (2011, 2, 2), (2012, 9, 7), (2013, 5, 3), (2014, 2, 1), (2016, 1, 1)

[ 2] Key:   quick brown eye <UNK> <UNK>
     Value: [156 records] ... +146 earlier, (2010, 5, 5), (2011, 6, 6), (2012, 10, 10), (2013, 19, 19)
            (2014, 18, 18), (2015, 22, 22), (2016, 18, 18), (2017, 26, 26), (2018, 99, 99)
            (2019, 16, 16)

[ 3] Key:   quick brown eye <UNK> butler
     Value: [10 records] (1866, 23, 23), (1867, 2, 2), (1869, 2, 2), (1870, 2, 2), (1871, 4, 4), (1875, 3, 3)
            (1891, 1, 1), (1892, 1, 1), (1903, 1, 1), (1908, 1, 1)

[ 4] Key:   quick brown eye take <UNK>
     Value: [36 records] ... +26 earlier, (2007, 1,

## `db_peek_prefix`: Print up to _N_ key–value pairs whose keys begin with the specified prefix

In [17]:
# Print one record whose key starts with the given byte prefix, using the
# "summary" value format (totals rather than per-year records).
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_peek_prefix(
    db_path,
    prefix=b"<UNK> <UNK> <UNK> <UNK> alone",
    key_format="utf-8",
    value_format="summary",
    n=1,
)

1 key-value pairs with prefix 3c554e4b3e203c554e4b3e203c554e4b3e203c554e4b3e20616c6f6e65:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   <UNK> <UNK> <UNK> <UNK> alone
     Value: Total: 7,354,834 occurrences in 7,103,072 volumes (1501-2019, 441 years)

