# **Setup**
## Recompile Cython Extensions

In [1]:
# Work from the repository root so that `pip install -e .` below resolves
# to this project.
%cd /scratch/edk202/ngram-prep

# Set a UTF-8 locale in the kernel's environment.
# NOTE(review): presumably required by the extension build or by child
# processes started later — confirm.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8

# Editable install of the project, which rebuilds its Cython extensions.
# --no-build-isolation builds against the packages already installed in this
# environment; -q keeps pip's output short. Restart the kernel afterwards if
# pip reports that updated packages require it.
%pip install -e . --no-build-isolation -q

/scratch/edk202/ngram-prep
env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
Note: you may need to restart the kernel to use updated packages.


## Imports

In [None]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# NLTK resources
from nltk.corpus import stopwords; stopwords = set(stopwords.words("english"))
from nltk.stem import WordNetLemmatizer; lemmatizer = WordNetLemmatizer()

# Ngram acquisition functions
from ngram_acquire.pipeline.orchestrate import download_and_ingest_to_rocksdb
from ngram_acquire.pipeline.logger import setup_logger

# Ngram processing functions
from ngram_filter.config import PipelineConfig, FilterConfig
from ngram_filter.pipeline.orchestrator import build_processed_db
from common_db.api import open_db
from utilities.peek import *

## Set up logging to file

In [None]:
# Configure file-based logging for the pipeline runs below.
# NOTE(review): the path names `1grams.db` although this notebook works on
# 5-grams — confirm this is the intended anchor for the log location.
# The option meanings below are inferred from their names (console output off,
# size-based rotation at ~100 MB keeping 5 backups, replace existing handlers);
# verify against `setup_logger`.
log_options = dict(
    console=False,
    rotate=True,
    max_bytes=100_000_000,
    backup_count=5,
    force=True,
)

setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams.db",
    **log_options,
)

# **Download 5-Grams and Ingest to RocksDB**

In [None]:
# Download the tagged English 5-gram files (release 20200217) and ingest them
# into a RocksDB database.
# NOTE(review): `overwrite=True` presumably discards any existing database at
# `db_path`, so re-running this cell restarts the ingest — confirm before
# re-executing.
download_and_ingest_to_rocksdb(
    # Which corpus to fetch
    ngram_size=5,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    ngram_type="tagged",
    file_range=(0, 19422),
    random_seed=11,
    # Where and how to write
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db",
    open_type="write:packed24",
    overwrite=True,
    write_batch_size=100_000,
    post_compact=True,
    # Parallelism: 40 workers with use_threads disabled
    workers=40,
    use_threads=False,
)

# **Run Processing Pipeline**

In [None]:
# `Path` is imported here because the notebook's import cell does not import
# it explicitly; without this the cell raises NameError on a fresh kernel
# (unless a wildcard import happens to expose the name).
from pathlib import Path

# Source database produced by the download/ingest step, plus the output
# database and scratch directory, both placed alongside it.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db")
dst_db = src_db.parent / "5grams_processed.db"
tmp_dir = src_db.parent / "processing_tmp"

# Whitelist file passed to the filter configuration.
wht_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1grams_processed.db/whitelist.txt"

# Pipeline settings. Compaction is disabled here (`enable_compact=False`);
# the manual compaction cell further down can be run afterwards.
# NOTE(review): `mode="resume"` presumably continues from the work-unit state
# kept under `tmp_dir` rather than starting over — confirm.
pipeline_config = PipelineConfig(
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
    readers=2,
    ingestors=16,
    partitioning_sample_rate=0.001,
    prefix_length=6,
    mode="resume",
    force_cache_use=False,
    enable_ingest=True,
    delete_after_ingest=True,
    enable_compact=False,
    progress_every_s=60.0,
)

# Filter settings: the NLTK English stop-word set, the WordNet lemmatizer
# created in the imports cell, and the whitelist file.
filter_config = FilterConfig(
    stop_set=stopwords,
    lemma_gen=lemmatizer,
    whitelist_path=wht_path,
)

build_processed_db(pipeline_config, filter_config)

As the pipeline runs, the work-unit status is tracked in a SQLite database. Run the following command to check progress.

Execute from the `processing_tmp` directory:
```
python3 -c "
import sqlite3
conn = sqlite3.connect('work_tracker.db')
cur = conn.cursor()
results = cur.execute('SELECT status, COUNT(*) FROM work_units GROUP BY status').fetchall()
status_dict = dict(results)
print(f\"Completed: {status_dict.get('completed', 0)}, Processing: {status_dict.get('processing', 0)}, Pending: {status_dict.get('pending', 0)}\")
"
```

## Run a Manual Compaction

Compaction improves DB read performance. Run this if compaction was disabled during ingestion.

In [None]:
# Open the processed database in read-write mode and compact it.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

with open_db(
    db_path,
    mode="rw",
    profile="read:packed24",
) as processed_db:
    processed_db.compact_all()

# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [None]:
# Show the first five entries of the processed database.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_head(
    db_path,
    key_format="utf-8",
    value_format="packed",
    n=5,
)

## `db_peek`: Print _N_ key–value pairs starting at the specified key

In [None]:
# Show five entries beginning at the given start key.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_peek(
    db_path,
    start_key=b"quick brown <UNK> <UNK> <UNK>",
    key_format="utf-8",
    value_format="packed",
    n=5,
)


## `db_peek_prefix`: Print up to _N_ key–value pairs whose keys start with the specified prefix

In [None]:
# Look up a single entry by key prefix, using the summary value format.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

db_peek_prefix(
    db_path,
    prefix=b"<UNK> united state <UNK> america",
    key_format="utf-8",
    value_format="summary",
    n=1,
)