In [1]:
import os

# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload

# Mode 2: reload all (non-excluded) modules before executing each cell.
%autoreload 2

# Switch the working directory for the rest of the notebook.
# NOTE(review): presumably this is the project root, so that the project-local
# imports and the `setup.py` invoked in later cells resolve — confirm.
os.chdir('/scratch/edk202/hist_w2v')

In [None]:
import orjson
import rocksdict

from ngram_tools.download_and_ingest_to_rocksdb import download_and_ingest_to_rocksdb
from reservoir_sampler_python import reservoir_sampling_python
from utils.resource_summary import print_resource_summary

E0000 00:00:1755148249.410853  639809 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755148249.416276  639809 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755148249.430396  639809 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755148249.430411  639809 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755148249.430413  639809 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755148249.430414  639809 computation_placer.cc:177] computation placer already registered. Please check linka

In [None]:
# Rebuild the project's extension modules from scratch: first remove all
# previous build artifacts, then force an in-place rebuild so the built modules
# land next to their sources.
# NOTE(review): `!python` is resolved by the shell's PATH and may not be the
# interpreter running this kernel — confirm both point at the same environment.
!python setup.py clean --all
!python setup.py build_ext --inplace --force

# Import the freshly built modules used by the inspection cells below.
# NOTE(review): if these are compiled extension modules that were already
# imported in this kernel, Python cannot reload them in place — a kernel
# restart may be needed for a rebuild to take effect; confirm.
from src.reservoir_sampler import reservoir_sampling
from src.count_db_items import count_db_items

In [None]:
# Report the resources available to this session (host, job allocation, GPU
# visibility — see the captured output below) before starting heavy work.
print_resource_summary()

<pre>SYSTEM RESOURCE SUMMARY
=============================================
Hostname: cm047.hpc.nyu.edu

Job Allocation:
   CPUs: 40
   Memory: 293.0 GB
   Partition: short
   Job ID: 64944309
   Node list: cm047

Physical GPU Hardware:
   No physical GPUs allocated to this job

TensorFlow GPU Recognition:
   TensorFlow can access 0 GPU(s)
   Built with CUDA support: True
=============================================</pre>

# Download

In [None]:
# Location of the 5-gram working directory and the RocksDB database inside it.
proj_dir = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/"
db_path = os.path.join(proj_dir, "5grams.db")

# All ingestion settings in one place; `file_range` is intentionally left
# unset here (e.g. file_range=(0, 999) would restrict the run to a subset).
ingest_options = dict(
    ngram_size=5,
    ngram_type='tagged',
    repo_release_id="20200217",
    repo_corpus_id="eng",
    db_path=db_path,
    workers=16,
    write_batch_size=20_000_000,
    use_threads=False,
    overwrite=False,
    random_seed=42,
)

# Download the ngram files and write them into the RocksDB database.
download_and_ingest_to_rocksdb(**ingest_options)


[31mStart Time: 2025-08-14 01:02:33.212120
[0m
[4mDownload & Ingestion Configuration[0m
Ngram repository:           https://storage.googleapis.com/books/ngrams/books/20200217/eng/eng-5-ngrams_exports.html
RocksDB database path:      /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db
File index range:           0 to 19422
Total files available:      19423
Files to process:           12816
First file URL:             https://storage.googleapis.com/books/ngrams/books/20200217/eng/5-14825-of-19423.gz
Last file URL:              https://storage.googleapis.com/books/ngrams/books/20200217/eng/5-16116-of-19423.gz
Ngram size:                 5
Ngram filtering:            tagged
Overwrite mode:             False
Files to skip (processed):  6607
Worker processes/threads:   16 (processes)
Write batch size:           20,000,000



Processing Files:   1%|[34m▏         [0m| 174/12816 [05:10<94:02:23, 26.78s/files]

# Inspect Database Contents

In [None]:
import struct

db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db"
db = rocksdict.Rdict(db_path)

# Draw a small random sample of (key, value) pairs and pretty-print them.
# The try/finally guarantees the database handle is released even when
# sampling or unpacking raises (or the cell is interrupted); otherwise the
# open handle would linger in the kernel.
try:
    # return_keys=True requests (key, value) tuples; the value payload is
    # packed binary.
    sample = reservoir_sampling(
        db,
        sample_size=10,
        key_type="string",
        progress_interval=100_000_000,
        max_items=2_500_000_000,
        return_keys=True,
    )

    print("\nSAMPLE")
    print("-" * 60)
    for i, (key, value) in enumerate(sample, 1):
        # Each 12-byte record is (year, match_count, volume_count), stored as
        # three little-endian uint32 values.
        freq_tuples = struct.iter_unpack('<III', value)
        print(f"{i}. {key}")
        for j, (year, match_count, volume_count) in enumerate(freq_tuples, 1):
            print(f"    {j}: year={year}, match_count={match_count}, volume_count={volume_count}")
finally:
    db.close()


In [None]:
import struct

db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db"
db = rocksdict.Rdict(db_path)

# Same inspection as the reservoir_sampling cell, but using
# reservoir_sampling_python (presumably the pure-Python counterpart — confirm).
# The try/finally guarantees the database handle is released even when
# sampling or unpacking raises (or the cell is interrupted).
try:
    # return_keys=True requests (key, value) tuples; the value payload is
    # packed binary.
    sample = reservoir_sampling_python(
        db,
        sample_size=10,
        key_type="string",
        progress_interval=100_000_000,
        max_items=2_500_000_000,
        return_keys=True,
    )

    print("\nSAMPLE (Python)")
    print("-" * 60)
    for i, (key, value) in enumerate(sample, 1):
        # Each 12-byte record is (year, match_count, volume_count), stored as
        # three little-endian uint32 values.
        freq_tuples = struct.iter_unpack('<III', value)
        print(f"{i}. {key}")
        for j, (year, match_count, volume_count) in enumerate(freq_tuples, 1):
            print(f"    {j}: year={year}, match_count={match_count}, volume_count={volume_count}")
finally:
    db.close()


In [None]:
# NOTE(review): stand-alone cleanup cell — presumably kept to release the
# RocksDB handle by hand after a sampling cell was interrupted before reaching
# its own db.close(). Requires `db` to exist from an earlier cell; confirm that
# calling close() on an already-closed handle is safe before using Run All.
db.close()

In [None]:
db_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5grams.db"
db = rocksdict.Rdict(db_path)

# Count the items in the database, reporting progress every 100M items.
# The try/finally guarantees the handle is released even if the (long) count
# raises or the cell is interrupted part-way through.
try:
    count = count_db_items(db, progress_interval=100_000_000)
finally:
    db.close()

In [None]:
# Decode and display the most recently sampled entry. Relies on `sample`
# (a sequence of (key, value) pairs with packed-binary values) left behind by
# an earlier sampling cell.
import struct

if sample:
    print("Sample unpacked structure:", "=" * 50, sep="\n")

    # Last sampled item: the value holds consecutive little-endian uint32
    # triples of (year, match_count, volume_count).
    last_item = sample[-1]
    key, value = last_item
    freq_tuples = list(struct.iter_unpack("<III", value))

    print(
        f"Key: {key}",
        f"Number of frequency records: {len(freq_tuples)}",
        "Frequencies (first 5 shown):",
        sep="\n",
    )
    for freq in freq_tuples[:5]:
        (year, match_count, volume_count) = freq
        print(f"  year={year}, match_count={match_count}, volume_count={volume_count}")
