# **Setup**
## Imports

In [None]:
# Auto-reload edited scripts
%load_ext autoreload
%autoreload 2

# Logging
from ngram_prep.ngram_acquire.logger import setup_logger

# Ngram pivot functions
from pathlib import Path
from ngram_prep.ngram_pivot.config import PivotConfig
from ngram_prep.ngram_pivot.parallel import pivot_parallel, ParallelPivotConfig
from ngram_prep.ngram_pivot.merge import merge_pivot_shards

# Utilities
from ngram_prep.utilities.peek import *

## Set up logging to file

In [4]:
# Configure file logging for this notebook session. The call is left as the
# cell's final expression so the path it returns is displayed below.
logging_options = {
    "console": False,
    "rotate": True,
    "max_bytes": 100_000_000,
    "backup_count": 5,
    "force": True,
}
setup_logger(
    db_path="/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db",
    **logging_options,
)

PosixPath('/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/ngram_download_20251008_171630.log')

# **Pivot Multigrams to Create Yearly Indices**

In [None]:
# Pivot the processed 5-gram database into shards in parallel, then merge the
# shards into the final pivoted database.
#
# NOTE(review): the recorded run of this cell reports "Disk quota exceeded"
# while writing under pivot_tmp/shards, followed by worker crashes. Confirm
# available quota before re-running, and confirm whether the merge step
# (which deletes the shards) should still run after worker failures.
source_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db")
pivoted_db = Path("/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_pivoted.db")

# Per-database pivot settings
pivot_cfg = PivotConfig(
    source_db_path=source_db,
    target_db_path=pivoted_db,
    write_batch_size=100_000,
    validate=True,
)

# Parallel execution settings
parallel_cfg = ParallelPivotConfig(num_workers=40, mode="restart")

# Step 1: run the parallel pivot; the returned value is handed to the merge as shards_dir
shards_dir = pivot_parallel(pivot_cfg, parallel_cfg)

# Step 2: merge the shards into the target database configured above
merge_options = {
    "shards_dir": shards_dir,
    "target_db_path": pivot_cfg.target_db_path,
    "num_readers": 40,
    "delete_shards": True,
}
total_items, total_bytes = merge_pivot_shards(**merge_options)


PARALLEL N-GRAM DATABASE PIVOT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Start Time: 2025-10-08 17:18:44
Mode:       RESTART

Configuration
════════════════════════════════════════════════════════════════════════════════════════════════════
Workers:              40
Work units:           320
Dynamic splitting:    enabled
Sample rate:          0.001
Prefix length:        2
Write batch size:     100,000
Source profile:       read:packed24
Target profile:       write:packed24

Phase 1: Creating work units...
════════════════════════════════════════════════════════════════════════════════════════════════════
Using cache from 5grams.db for 5grams_processed.db
Forcing cache use and not checking config match
Loaded 107 work units from cache

Phase 2: Processing 107 pending work units with 40 workers...
════════════════════════════════════════════════════════════════════════════════════════════════════


Pivoting [  917.1M recs]:  |█                             | 21.6M/577M [01:22<32:04, 289kngram/s]   



Pivoting [  939.5M recs]:  |█▏                            | 22.1M/577M [01:24<32:24, 286kngram/s]



Pivoting [    1.2B recs]:  |█▍                            | 28.8M/577M [02:01<53:34, 171kngram/s]  

Error processing n-gram b'<UNK> <UNK> <UNK> average tariff': IO error: No such file or directory: While open a file for random read: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L.db/000008.sst: No such file or directory


Pivoting [    1.2B recs]:  |█▌                            | 29.5M/577M [02:05<49:46, 183kngram/s]

Error processing n-gram b'delicate nose <UNK> <UNK> profusion': IO error: No such file or directory: While open a file for random read: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0044_L.db/000008.sst: No such file or directory


Pivoting [   11.0B recs]:  |████████████▏                 | 234M/577M [17:37<31:13, 184kngram/s] 



Pivoting [   15.5B recs]:  |████████████████▎             | 314M/577M [24:36<20:06, 218kngram/s]



Pivoting [   18.1B recs]:  |██████████████████▋           | 361M/577M [28:42<19:16, 187kngram/s]



Pivoting [   18.4B recs]:  |███████████████████           | 367M/577M [29:11<17:17, 203kngram/s]



Pivoting [   19.2B recs]:  |███████████████████▊          | 381M/577M [30:23<17:10, 191kngram/s]



Pivoting [   19.9B recs]:  |████████████████████▍         | 394M/577M [31:32<17:22, 176kngram/s]



Pivoting [   21.4B recs]:  |█████████████████████▉        | 422M/577M [33:54<12:01, 215kngram/s]



Pivoting [   21.5B recs]:  |█████████████████████▉        | 423M/577M [33:59<12:15, 210kngram/s]



Pivoting [   24.1B recs]:  |████████████████████████▋     | 475M/577M [38:06<07:13, 238kngram/s]



Pivoting [   24.4B recs]:  |████████████████████████▉     | 481M/577M [38:34<07:25, 217kngram/s]



Pivoting [   24.8B recs]:  |█████████████████████████▎    | 488M/577M [39:09<07:34, 196kngram/s]



Pivoting [   25.2B recs]:  |█████████████████████████▋    | 495M/577M [39:46<06:19, 216kngram/s]



Pivoting [   26.3B recs]:  |██████████████████████████▉   | 518M/577M [41:33<04:34, 215kngram/s]



Pivoting [   26.7B recs]:  |███████████████████████████▎  | 525M/577M [42:05<04:11, 208kngram/s]



Pivoting [   27.4B recs]:  |████████████████████████████  | 540M/577M [43:18<03:26, 181kngram/s]



Pivoting [   27.8B recs]:  |████████████████████████████▍ | 547M/577M [43:52<02:30, 204kngram/s]



Pivoting [   29.7B recs]:  |                              | 584M/? [46:46<00:00, 218kngram/s]   



Pivoting [   30.0B recs]:  |                              | 590M/? [47:22<00:00, 209kngram/s]



Pivoting [   31.5B recs]:  |                              | 619M/? [49:39<00:00, 222kngram/s]



Pivoting [   32.2B recs]:  |                              | 634M/? [50:53<00:00, 193kngram/s]



Pivoting [   34.1B recs]:  |                              | 671M/? [53:48<00:00, 206kngram/s]



Pivoting [   34.1B recs]:  |                              | 671M/? [53:49<00:00, 200kngram/s]



Pivoting [   34.2B recs]:  |                              | 672M/? [53:55<00:00, 176kngram/s]



Pivoting [   34.5B recs]:  |                              | 679M/? [54:25<00:00, 195kngram/s]



Pivoting [   34.8B recs]:  |                              | 685M/? [54:55<00:00, 186kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 120, in pivot_worker
    _process_work_unit_shard(
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 178, in _process_work_unit_shard
    with open_db(
  File "/ext3/miniforge3/envs/hist_w2v/lib/python3.11/contextlib.py", line 137, in __enter__
    return next(self.gen)
           ^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/common_db/api.py", line 47, in open_db
    db = rs.open(str(path), mode=mode, **kwargs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: IO error: While mkdir if missing: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_L_L.db: Disk quota exceeded
Pivoting [   34.8B recs]:  |                              | 685M/? [54:55<00:00, 186kngram/s]

Worker 27 failed on unit unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_L_L: IO error: While mkdir if missing: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_L_L.db: Disk quota exceeded
Worker 27 crashed: disk I/O error


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 120, in pivot_worker
    _process_work_unit_shard(
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 178, in _process_work_unit_shard
    with open_db(
  File "/ext3/miniforge3/envs/hist_w2v/lib/python3.11/contextlib.py", line 137, in __enter__
    return next(self.gen)
           ^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/common_db/api.py", line 47, in open_db
    db = rs.open(str(path), mode=mode, **kwargs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: IO error: While mkdir if missing: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_L_L.db: Disk quota exceeded

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 144, in pivot_worker
    work

Error processing n-gram b'<UNK> <UNK> consistency <UNK> ought': IO error: While pwrite to file at offset 883949568: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R.db/000035.sst: Disk quota exceeded
Worker 13 failed on unit unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_L_R: IO error: While mkdir if missing: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_L_R.db: Disk quota exceeded


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 128, in claim_work_unit
    with sqlite3.connect(str(self.db_path)) as conn:
sqlite3.OperationalError: disk I/O error
Pivoting [   34.8B recs]:  |                              | 685M/? [54:56<00:00, 187kngram/s]

Error processing n-gram b'<UNK> <UNK> critical body composition': IO error: While pwrite to file at offset 849346560: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L.db/000037.sst: Disk quota exceeded
Worker 13 crashed: disk I/O error
Error processing n-gram b'<UNK> <UNK> consist <UNK> open': IO error: While pwrite to file at offset 848297984: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L.db/000035.sst: Disk quota exceeded


Pivoting [   34.8B recs]:  |                              | 685M/? [54:57<00:00, 206kngram/s]

Error processing n-gram b'<UNK> <UNK> criminal statute even': IO error: While pwrite to file at offset 769654784: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R.db/000037.sst: Disk quota exceeded


Pivoting [   34.8B recs]:  |                              | 685M/? [54:57<00:00, 191kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [54:57<00:00, 202kngram/s]

Worker 20 crashed: unable to open database file


Pivoting [   34.8B recs]:  |                              | 685M/? [54:57<00:00, 190kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [54:58<00:00, 201kngram/s]

Error processing n-gram b'<UNK> <UNK> coffin <UNK> force': IO error: While pwrite to file at offset 761266176: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R.db/000033.sst: Disk quota exceeded
Worker 38 crashed: unable to open database file
Worker 37 crashed: unable to open database file


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [54:58<00:00, 195kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 120, in pivot_worker
    _process_work_unit_shard(
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 178, in _process_work_unit_shard
    with open_db(
  File "/ext3/miniforge3/envs/hist_w2v/lib/python3.11/contextlib.py", line 137, in __enter__
    return next(self.gen)
           ^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/common_db/api.py", li

Error processing n-gram b'<UNK> <UNK> competition make obligatory': IO error: While pwrite to file at offset 889192448: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R_R_R_L_R.db/000034.sst: Disk quota exceeded
Worker 7 failed on unit unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_R_L: IO error: While mkdir if missing: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_R_L.db: Disk quota exceeded


Pivoting [   34.8B recs]:  |                              | 685M/? [54:58<00:00, 197kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 120, in pivot_worker
    _process_work_unit_shard(
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 178, in _process_work_unit_shard
    with open_db(
  File "/ext3/miniforge3/envs/hist_w2v/lib/python3.11/contextlib.py", line 137, in __enter__
    return next(self.gen)
           ^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/common_db/api.py", line 47, in open_db
    db = rs.open(str(path), mode=mode, **kwargs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: IO error: While mkdir if missing: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0027_R_R_R_R_R_R_R_L_R_R_R_L_L_L_R_L_R_L.db: Disk quota exceeded

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File

Worker 7 crashed: unable to open database file
Error processing n-gram b'<UNK> <UNK> double <UNK> large': IO error: While pwrite to file at offset 695205888: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R.db/000041.sst: Disk quota exceeded
Error processing n-gram b'<UNK> <UNK> competent natural enemy': IO error: While pwrite to file at offset 702545920: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R_R_R_L.db/000034.sst: Disk quota exceeded
Error processing n-gram b'<UNK> <UNK> criminal case prosecute': IO error: While pwrite to file at offset 837812224: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L.db/000037.sst: Disk quota exceeded
Error processing n-gram b'<UNK> <UNK> competition <UNK> exchange': IO error: While pwrite to file at offset 687865856: /vast/edk202/NLP_corpora/Google_Books/2020

Pivoting [   34.8B recs]:  |                              | 685M/? [54:58<00:00, 178kngram/s]



Pivoting [   34.8B recs]:  |                              | 685M/? [54:58<00:00, 191kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [54:59<00:00, 181kngram/s]

Worker 23 crashed: unable to open database file
Error processing n-gram b'<UNK> <UNK> competent <UNK> therefore': IO error: While pwrite to file at offset 827326464: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R_R.db/000034.sst: Disk quota exceeded


Pivoting [   34.8B recs]:  |                              | 685M/? [54:59<00:00, 181kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [54:59<00:00, 167kngram/s]

Worker 5 crashed: unable to open database file


Pivoting [   34.8B recs]:  |                              | 685M/? [54:59<00:00, 167kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [54:59<00:00, 159kngram/s]

Worker 33 crashed: unable to open database file


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [54:59<00:00, 137kngram/s]

Worker 24 crashed: unable to open database file
Worker 12 crashed: unable to open database file


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [55:00<00:00, 146kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                             

Worker 31 crashed: unable to open database file
Worker 3 crashed: unable to open database file
Worker 8 crashed: unable to open database file


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [55:00<00:00, 130kngram/s]Traceback (most recent call last):
  File "/scratch/edk20

Worker 6 crashed: unable to open database file
Worker 26 crashed: unable to open database file


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 685M/? [55:00<00:00, 134kngram/s]

Worker 4 crashed: unable to open database file


Pivoting [   34.8B recs]:  |                              | 685M/? [55:01<00:00, 140kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.8B recs]:  |                              | 686M/? [55:01<00:00, 136kngram/s]

Worker 0 crashed: unable to open database file


Pivoting [   34.8B recs]:  |                              | 686M/? [55:01<00:00, 129kngram/s]

Error processing n-gram b'<UNK> <UNK> doctor <UNK> represent': IO error: While open a file for appending: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L.db/000041.sst: Disk quota exceeded


Pivoting [   34.9B recs]:  |                              | 686M/? [55:02<00:00, 134kngram/s]

Error processing n-gram b'<UNK> <UNK> endless vortex <UNK>': IO error: While pwrite to file at offset 538968064: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007.db/000043.sst: Disk quota exceeded


Pivoting [   34.9B recs]:  |                              | 686M/? [55:02<00:00, 131kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.9B recs]:  |                              | 686M/? [55:03<00:00, 127kngram/s]

Worker 2 crashed: unable to open database file


Pivoting [   34.9B recs]:  |                              | 686M/? [55:03<00:00, 123kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.9B recs]:  |                              | 686M/? [55:04<00:00, 128kngram/s]

Worker 25 crashed: unable to open database file


Pivoting [   34.9B recs]:  |                              | 686M/? [55:05<00:00, 130kngram/s]

Error processing n-gram b'<UNK> <UNK> democratic member <UNK>': IO error: While open a file for appending: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R.db/000039.sst: Disk quota exceeded
Error processing n-gram b'<UNK> <UNK> communist act <UNK>': IO error: While open a file for appending: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R_R_R_L_R_L.db/000034.sst: Disk quota exceeded


Pivoting [   34.9B recs]:  |                              | 686M/? [55:06<00:00, 132kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.9B recs]:  |                              | 686M/? [55:07<00:00, 132kngram/s]

Worker 28 crashed: unable to open database file
Worker 17 failed on unit unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R_R_R_L_R_L: IO error: While open a file for appending: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R_R_R_L_R_L.db/000034.sst: Disk quota exceeded
Worker 17 crashed: unable to open database file


Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 120, in pivot_worker
    _process_work_unit_shard(
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 211, in _process_work_unit_shard
    _flush_batch(target_db, write_buffer, stats, target_counter)
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 232, in _flush_batch
    with db.write_batch(disable_wal=True, sync=False) as batch:
RuntimeError: IO error: While open a file for appending: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R_L_R_L_L_L_L_R_R_R_L_R_L.db/000034.sst: Disk quota exceeded
Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 120, in pivot_worker
    _process_work_unit_shard(
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 211, in _process_work_unit_shard
    _flush_batch(target_db

Error processing n-gram b'<UNK> <UNK> convert <UNK> subdue': IO error: While open a file for appending: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L_R_R_R_L_L_R_L_R_L_R.db/000036.sst: Disk quota exceeded


Pivoting [   34.9B recs]:  |                              | 687M/? [55:11<00:00, 121kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.9B recs]:  |                              | 687M/? [55:11<00:00, 120kngram/s]

Error processing n-gram b'<UNK> <UNK> democratic member <UNK>': IO error: While open a file for appending: /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/pivot_tmp/shards/unit_0007_L_R_L_L_L.db/000039.sst: Disk quota exceeded
Worker 30 crashed: unable to open database file


Pivoting [   34.9B recs]:  |                              | 687M/? [55:13<00:00, 104kngram/s]Traceback (most recent call last):
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/parallel.py", line 109, in pivot_worker
    work_unit = work_tracker.claim_work_unit(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/ngram-prep/src/ngram_pivot/work_tracker.py", line 131, in claim_work_unit
    cursor = conn.execute(
             ^^^^^^^^^^^^^
sqlite3.OperationalError: unable to open database file
Pivoting [   34.9B recs]:  |                              | 687M/? [55:13<00:00, 103kngram/s]

Worker 1 crashed: unable to open database file


Pivoting [   35.6B recs]:  |                              | 700M/? [57:40<00:00, 79.2kngram/s]



Pivoting [   36.3B recs]:  |                              | 713M/? [1:00:02<00:00, 79.8kngram/s]



Pivoting [   36.3B recs]:  |                              | 713M/? [1:00:04<00:00, 76.2kngram/s]

# **Inspect the Processed Database**
## `db_head`: Print the first _N_ key–value pairs

In [4]:
# Database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show the first five entries, with keys rendered as UTF-8 text
db_head(
    db_path,
    n=5,
    key_format="utf-8",
    value_format="packed",
)

First 5 key-value pairs:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   <UNK> <UNK> <UNK> <UNK> aaa
     Value: [166 records] ... +156 earlier, (2010, 471, 403), (2011, 413, 327), (2012, 362, 299)
            (2013, 267, 224), (2014, 309, 256), (2015, 220, 190), (2016, 232, 204)
            (2017, 194, 184), (2018, 132, 107), (2019, 139, 115)

[ 2] Key:   <UNK> <UNK> <UNK> <UNK> aac
     Value: [122 records] ... +112 earlier, (2010, 124, 103), (2011, 131, 97), (2012, 138, 114)
            (2013, 178, 87), (2014, 176, 134), (2015, 58, 53), (2016, 96, 86), (2017, 121, 74)
            (2018, 115, 74), (2019, 29, 27)

[ 3] Key:   <UNK> <UNK> <UNK> <UNK> aachen
     Value: [143 records] ... +133 earlier, (2010, 216, 152), (2011, 235, 173), (2012, 1900, 1363)
            (2013, 1181, 944), (2014, 223, 169), (2015, 132, 120), (2016, 207, 162)
            (2017, 257, 141), (2018, 192, 140), (2019, 144, 121)

[ 4] Key:   <UNK> <U

## `db_peek`: Print _N_ key-value pairs starting at the specified key

In [5]:
# Database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show five entries beginning at the given start key
db_peek(
    db_path,
    n=5,
    start_key=b"quick brown <UNK> <UNK> <UNK>",
    key_format="utf-8",
    value_format="packed",
)


5 key-value pairs starting from 717569636b2062726f776e203c554e4b3e203c554e4b3e203c554e4b3e:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   quick brown <UNK> <UNK> <UNK>
     Value: [21 records] ... +11 earlier, (2006, 4, 3), (2007, 1, 1), (2008, 13, 3), (2009, 2, 2)
            (2010, 5, 4), (2011, 2, 2), (2012, 9, 7), (2013, 5, 3), (2014, 2, 1), (2016, 1, 1)

[ 2] Key:   quick brown eye <UNK> <UNK>
     Value: [156 records] ... +146 earlier, (2010, 5, 5), (2011, 6, 6), (2012, 10, 10), (2013, 19, 19)
            (2014, 18, 18), (2015, 22, 22), (2016, 18, 18), (2017, 26, 26), (2018, 99, 99)
            (2019, 16, 16)

[ 3] Key:   quick brown eye <UNK> butler
     Value: [10 records] (1866, 23, 23), (1867, 2, 2), (1869, 2, 2), (1870, 2, 2), (1871, 4, 4), (1875, 3, 3)
            (1891, 1, 1), (1892, 1, 1), (1903, 1, 1), (1908, 1, 1)

[ 4] Key:   quick brown eye take <UNK>
     Value: [36 records] ... +26 earlier, (2007, 1,

## `db_peek_prefix`: Print key-value pairs whose keys start with the specified prefix

In [9]:
# Database to inspect
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams_processed.db"

# Show one entry matching the prefix, using the "summary" value format
db_peek_prefix(
    db_path,
    n=1,
    prefix=b"<UNK> united state <UNK> <UNK>",
    key_format="utf-8",
    value_format="summary",
)

1 key-value pairs with prefix 3c554e4b3e20756e69746564207374617465203c554e4b3e203c554e4b3e:
────────────────────────────────────────────────────────────────────────────────────────────────────
[ 1] Key:   <UNK> united state <UNK> <UNK>
     Value: Total: 163,384,713 occurrences in 96,901,345 volumes (1472-2019, 384 years)

