# **Download and Ingest Unigrams to Database**

In [None]:
# Fetch the tagged English 1-gram files (release 20200217) and ingest them
# into a RocksDB database located under `db_path_stub`.
ingest_options = dict(
    # --- which corpus files to fetch ---
    ngram_size=1,
    repo_release_id="20200217",
    repo_corpus_id="eng",
    ngram_type="tagged",
    file_range=(0, 23),
    # --- where and how the database is written ---
    db_path_stub="/vast/edk202/NLP_corpora/Google_Books/",
    overwrite_db=True,
    open_type="write:packed24",
    write_batch_size=100_000,
    compact_after_ingest=True,
    # --- parallelism and reproducibility ---
    workers=20,
    use_threads=False,
    random_seed=98,
)

download_and_ingest_to_rocksdb(**ingest_options)

# **Run Processing Pipeline**

In [None]:
# Source database written by the ingest step; the processed database and the
# scratch directory are created as siblings in the same folder.
src_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db")
dst_db = src_db.with_name("1grams_processed.db")
tmp_dir = src_db.with_name("processing_tmp")

# Language resources used by the filter stage and the whitelist spell check.
spell_check_lang = "en"
stop_set = set(get_stop_words("english"))
lemmatizer = SpacyLemmatizer(language="en")

filter_config = FilterConfig(stop_set=stop_set, lemma_gen=lemmatizer)

# Pipeline options, grouped by concern and merged into one PipelineConfig.
path_options = dict(
    src_db=src_db,
    dst_db=dst_db,
    tmp_dir=tmp_dir,
)
worker_options = dict(
    num_workers=40,
    use_smart_partitioning=True,
    samples_per_worker=500_000,
    num_initial_work_units=400,
    work_unit_claim_order="random",
    flush_interval_s=5.0,
)
run_options = dict(
    mode="restart",
    progress_every_s=5.0,
)
ingest_options = dict(
    ingest_num_readers=10,
    ingest_batch_items=2_000_000,
    ingest_queue_size=2,
)
whitelist_options = dict(
    # NOTE(review): the whitelist file is placed inside the destination
    # database directory itself -- confirm this location is intended.
    output_whitelist_path=dst_db / "whitelist.txt",
    output_whitelist_top_n=6_000,
    output_whitelist_spell_check=True,
    output_whitelist_spell_check_language=spell_check_lang,
)

pipeline_config = PipelineConfig(
    **path_options,
    **worker_options,
    **run_options,
    **ingest_options,
    **whitelist_options,
)

build_processed_db(pipeline_config, filter_config)

N-GRAM FILTER PIPELINE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
[4mPipeline[0m
Run mode:             restart
Compact after ingest: True

[4mWorkers[0m
Num Workers:        40
Initial work units: 400
Profiles:           read=read:packed24, write=write:packed24
Flush interval:     5.0s

[4mFiles[0m
Source: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams.db
Destination: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db
Input whitelist: None
Output whitelist: ...oks/20200217/eng/1gram_files/1grams_processed.db/whitelist.txt (top 6,000 keys)

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Clean restart - creating new work units
Sampling database to create 400 density-based w

# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [16]:
# Processed unigram database to inspect.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Print the first five entries, with keys shown as UTF-8 text and values in
# the "packed" display format.
db_head(
    db_path,
    key_format="utf-8",
    value_format="packed",
    n=5,
)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   FALSE
     Value: [380 records] ... +370 earlier, (2010, 841, 723), (2011, 706, 634), (2012, 773, 706)
            (2013, 1294, 952), (2014, 1017, 876), (2015, 1094, 799), (2016, 1047, 734)
            (2017, 1432, 1129), (2018, 1952, 1511), (2019, 1557, 1054)

[ 2] Key:   TRUE
     Value: [440 records] ... +430 earlier, (2010, 46286, 35975), (2011, 42519, 33719), (2012, 48922, 38400)
            (2013, 57484, 43234), (2014, 53077, 41010), (2015, 48476, 36281), (2016, 45082, 34440)
            (2017, 55048, 37723), (2018, 60260, 40195), (2019, 49897, 37191)

[ 3] Key:   aaa
     Value: [404 records] ... +394 earlier, (2010, 73202, 21159), (2011, 70332, 20728), (2012, 89260, 25722)
            (2013, 80191, 21846), (2014, 65586, 19120), (2015, 58974, 16984), (2016, 52096, 16749)
            (2017, 46607, 14927), (2018, 41184, 13241), (2019, 37169, 123

## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [3]:
# Processed unigram database to inspect.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Print five entries beginning at the key b"unhappy", with keys shown as
# UTF-8 text and values in the "packed" display format.
db_peek(
    db_path,
    start_key=b"unhappy",
    key_format="utf-8",
    value_format="packed",
    n=5,
)

5 key-value pairs starting from 756e6861707079:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   unhappy
     Value: [444 records] ... +434 earlier, (2010, 257380, 95642), (2011, 261556, 99718)
            (2012, 304990, 116127), (2013, 343730, 120790), (2014, 325703, 118893)
            (2015, 285862, 99410), (2016, 259175, 93715), (2017, 319228, 93295)
            (2018, 347741, 93195), (2019, 295667, 88524)

[ 2] Key:   unhappyand
     Value: [29 records] ... +19 earlier, (1952, 1, 1), (1970, 1, 1), (1995, 1, 1), (1996, 1, 1), (2001, 1, 1)
            (2008, 2, 2), (2009, 1, 1), (2010, 2, 2), (2011, 1, 1), (2013, 1, 1)

[ 3] Key:   unhappyboy
     Value: [39 records] ... +29 earlier, (2000, 1, 1), (2002, 3, 3), (2003, 1, 1), (2004, 4, 4), (2006, 1, 1)
            (2010, 2, 2), (2013, 2, 2), (2016, 1, 1), (2018, 1, 1), (2019, 1, 1)

[ 4] Key:   unhappycountry
     Value: [44 records] ... +34 earlier, (1939, 1, 1), (1953,

## `db_peek_prefix`: Print key-value pairs whose keys start with the specified prefix

In [13]:
# Processed unigram database to inspect.
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/1grams_processed.db"

# Print a single entry for the prefix b"flank", using the "summary" value
# format (aggregate totals rather than per-record tuples).
db_peek_prefix(
    db_path,
    prefix=b"flank",
    key_format="utf-8",
    value_format="summary",
    n=1,
)

1 key-value pairs with prefix 666c616e6b:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   flank
     Value: Total: 18,977,446 occurrences in 6,278,467 volumes (1520-2019, 420 years)

