In [1]:
from ngram_embeddings.download_ngrams import download_ngram_files
from ngram_embeddings.convert_to_jsonl import convert_to_jsonl_files
from ngram_embeddings.lowercase_ngrams import lowercase_ngrams
from ngram_embeddings.lemmatize_ngrams import lemmatize_ngrams
from ngram_embeddings.filter_ngrams import filter_ngrams
from ngram_embeddings.sort_ngrams import sort_ngrams
from ngram_embeddings.helpers.verify_sort import check_file_sorted
from ngram_embeddings.consolidate_ngrams import consolidate_duplicate_ngrams
from ngram_embeddings.helpers.print_jsonl_lines import print_jsonl_lines
from ngram_embeddings.index_and_create_vocab import index_and_create_vocab_files

# **Process Multigrams for Training Word-Embedding Models**

## **Goal**: Download and preprocess multigrams for use in training `Word2Vec` models.

This workflow is resource-intensive and is probably only practical when run on a computing cluster. On my university's High Performance Computing (HPC) cluster, I request the maximum 14 cores (48 logical processors) and 128G of memory and use a 2T fast-I/O NVMe SSD filespace. I still run up against time and resource limits and have designed the code to contend with them.

The code affords some options to keep things efficient. Throughout the workflow you can specify `compress=True`, which tells the code to compress its output files. Downstream scripts will see the `.lz4` extensions and handle the files accordingly. If you know your workflow runs correctly and wish to further conserve space, you can specify `delete_input=True` for many of the scripts; this will delete the source files for a given step once that step is complete. The scripts are fairly memory-efficient—with the exception of `sort_ngrams` and `index_and_create_vocab_files`, which sort multiple files in memory at once. When processing multigrams, I've found that allocating more than ~10 workers in these scripts leads to memory exhaustion (with 128G!) and slow processing.

**NOTE:** You'll probably want to have run the unigram workflow before processing multigrams. That workflow allows you to create a vocabulary file for filtering out uncommon tokens from the multigrams. Although you can run the `filter_ngrams` module without a vocab file, most use cases will benefit from one.

### Download multigrams
Here, I'm using the `download_ngrams` module to fetch 5grams appended with part-of-speech (POS) tags (e.g., `_VERB`). Although you can specify `ngram_type='untagged'`, POS tags are necessary to lemmatize the tokens. Specify the number of parallel processes you wish to use by setting `workers` (the default is all available processors). I've specified `compress=True` because 5gram files are _big_.

`[Runtime: 4:19:11.661609]`

In [3]:
# Fetch the POS-tagged 5gram files into the project directory. Output files
# are compressed, and files already on disk are left alone (overwrite=False).
# `workers` is not passed, so the module's default worker count is used.
download_ngram_files(
    proj_dir='/vast/edk202/NLP_corpora/Google_Books/20200217/eng',
    ngram_type='tagged',
    ngram_size=5,
    overwrite=False,
    compress=True,
)

[31mStart Time:                2025-01-04 17:43:25.557908
[0m
[4mDownload Info[0m
Ngram repository:          https://storage.googleapis.com/books/ngrams/books/20200217/eng/eng-5-ngrams_exports.html
Output directory:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1download
File index range:          0 to 19422
File URLs available:       19423
File URLs to use:          19423
First file to get:         https://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00000-of-19423.gz
Last file to get:          https://storage.googleapis.com/books/ngrams/books/20200217/eng/5-19422-of-19423.gz
Ngram size:                5
Ngram type:                tagged
Number of workers:         48
Compress saved files:      True
Overwrite existing files:  False

Downloading     |[32m██████████████████████████████████████████████████[0m| 100.0% 19423       /19423      [0m
[31m
End Time:                  2025-01-04 22:02:37.219517[0m
[31mTotal runtime:             4:19:

### Convert files from TXT to JSONL
This module converts the original multigram files' text data to a more flexible JSON Lines (JSONL) format. Although this increases storage demands, it makes downstream processing more efficient.

`[Total runtime: 1:30:56.083010]`

In [3]:
# Convert the downloaded tagged 5gram text files to JSONL. With
# delete_input=True the download-step files are removed once this step is done.
convert_to_jsonl_files(
    proj_dir='/vast/edk202/NLP_corpora/Google_Books/20200217/eng',
    ngram_type='tagged',
    ngram_size=5,
    delete_input=True,
    overwrite=False,
    compress=True,
)

[31mStart Time:                2025-01-05 13:47:05.912405
[0m
[4mConversion Info[0m
Input directory:           /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1download
Output directory:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/2convert
File index range:          0 to 19422
Files available:           19423
Files to use:              19423
First file to get:         /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1download/5-00000-of-19423.txt.lz4
Last file to get:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/1download/5-19422-of-19423.txt.lz4
Ngram size:                5
Ngram type:                tagged
Number of workers:         48
Compress output files:     True
Overwrite existing files:  False
Delete input directory:    True

Converting      |[32m██████████████████████████████████████████████████[0m| 100.0% 19423       /19423      [0m
[31m
End Time:                  2025-01-05 15:19:30.9

### Make multigrams all lowercase
This module lowercases all characters in the multigrams. Most use cases benefit from this.

`[Total runtime: 0:47:33.859616]`

In [4]:
# Lowercase every 5gram in the converted JSONL files. Output is compressed and
# the conversion-step input is deleted afterwards (delete_input=True).
lowercase_ngrams(
    proj_dir='/vast/edk202/NLP_corpora/Google_Books/20200217/eng',
    ngram_size=5,
    delete_input=True,
    overwrite=False,
    compress=True,
)

[31mStart Time:                2025-01-05 15:19:31.052375
[0m
[4mLowercasing Info[0m
Input directory:           /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/2convert
Output directory:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/3lowercase
File index range:          0 to 6520
Files available:           6521
Files to use:              6521
First file to get:         /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/2convert/5-00000-of-19423.jsonl.lz4
Last file to get:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/2convert/5-19422-of-19423.jsonl.lz4
Ngram size:                5
Number of workers:         48
Compress output files:     True
Overwrite existing files:  False
Delete input directory:    True

Lowercasing     |[32m██████████████████████████████████████████████████[0m| 100.0% 6521        /6521       [0m
[31m
End Time:                  2025-01-05 16:07:13.575473[0m
[31mTotal runtime:    

### Lemmatize the multigrams
Likewise, most use cases will benefit from multigrams that are lemmatized—that is, reduced to their base form. This requires POS-tagged multigrams. Example: `people_NOUN` ("the people of this land") will be converted to `person` in the output; `people_VERB` ("to people this land") will not. The POS tag will then be discarded as it is no longer useful.

`[Total runtime: 1:39:30.015704]`

In [2]:
# Lemmatize the lowercased, POS-tagged 5grams. Output is compressed and the
# lowercase-step input is deleted afterwards (delete_input=True).
lemmatize_ngrams(
    proj_dir='/vast/edk202/NLP_corpora/Google_Books/20200217/eng',
    ngram_size=5,
    delete_input=True,
    overwrite=False,
    compress=True,
)

[31mStart Time:                2025-01-05 16:40:58.283173
[0m
[4mLemmatizing Info[0m
Input directory:           /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/3lowercase
Output directory:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/4lemmatize
File index range:          0 to 6520
Files available:           6521
Files to use:              6521
First file to get:         /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/3lowercase/5-00000-of-19423.jsonl.lz4
Last file to get:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/3lowercase/5-19422-of-19423.jsonl.lz4
Ngram size:                5
Number of workers:         48
Compress output files:     True
Overwrite existing files:  False
Delete input directory:    True

Lemmatizing     |[32m██████████████████████████████████████████████████[0m| 100.0% 6521        /6521       [0m
[31m
End Time:                  2025-01-05 18:18:46.729100[0m
[31mTotal runtim

### Filter the multigrams
This module removes tokens that provide little information about words' semantic context—specifically, those that contain numerals (`numerals=True`), nonalphabetic characters (`nonalpha=True`), stopwords (high-frequency, low information tokens like "the" and "into"; `stops=True`), or short words (those below a certain user-specified character count; here, `min_token_length=3`). You can also specify a **vocabulary file** like the one illustrated in the unigram workflow. A vocabulary file is simply a list of the _N_ most common words in the unigram corpus; the multigram tokens are checked against this list and those that don't appear in it are dropped.

The filtering process will inevitably turn some longer ngrams (e.g., 5grams) into shorter ones (e.g., 3grams) after unwanted tokens are dropped. The training of word-embedding models requires _linguistic context_—which in turn requires ngrams containing more than one token. A unigram isn't useful for helping a model learn what "company" a word keeps. Thus, the `min_tokens` option allows you to drop ngrams that fall below a specified length during filtering. If filtering results in an ngram with fewer than the minimum tokens, all data for that ngram is dropped entirely. Here, I've set `min_tokens=2`, since two tokens (and higher) provide at least some contextual information.

`[Total runtime: 0:49:53.902454]`

In [3]:
# Drop uninformative tokens from the lemmatized 5grams, then discard any ngram
# left with fewer than `min_tokens` tokens.
# NOTE(review): no vocabulary file is passed in this call even though the
# accompanying text recommends one — confirm that this is intended.
filter_ngrams(
    proj_dir='/vast/edk202/NLP_corpora/Google_Books/20200217/eng',
    ngram_size=5,
    # Token-level filters
    stops=True,
    numerals=True,
    nonalpha=True,
    min_token_length=3,
    # Ngram-level filter
    min_tokens=2,
    # File handling
    delete_input=True,
    overwrite=False,
    compress=True,
)

[31mStart Time:                   2025-01-05 18:18:46.742270
[0m
[4mFiltering Info[0m
Input directory:              /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/4lemmatize
Output directory:             /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5filter
File index range:             0 to 6520
Files available:              6521
Files to use:                 6521
First file to get:            /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/4lemmatize/5-00000-of-19423.jsonl.lz4
Last file to get:             /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/4lemmatize/5-19422-of-19423.jsonl.lz4
Ngram size:                   5
Number of workers:            48
Compress output files:        True
Overwrite existing files:     False
Delete input directory:       True

[4mFiltering Options[0m
Drop stopwords:               True
Drop tokens under:            3 chars
Drop tokens with numerals:    True
Drop non-alphabetic:       

### Sort and combine the multigram files
This module creates a single, fully-sorted multigram file out of the filtered files. This is crucial for the next step (ngram consolidation; see below).

Sorting a giant file is a resource-hungry process and I've tried to implement an efficient approach that leverages parallelism: We first sort the filtered files in parallel using Python's standard sorting algorithm [Timsort](https://en.wikipedia.org/wiki/Timsort); then, we incrementally [heapsort](https://en.wikipedia.org/wiki/Heapsort) the files in parallel until we get down to 2 files. Finally, we heapsort the final 2 files (necessarily using one processor) to arrive at a single combined and sorted multigram file.

Because this step can take a _very_ long time for larger multigrams (e.g., 5grams), we can run it in sessions using the `start_iteration` and `end_iteration` options. Iteration 1 comes after the initial file sort. If you only have time to complete, say, iterations 1–3, you can set `end_iteration=3`. During a later session, you can specify `start_iteration=4` to pick up where you left off.

`[Sort + Iteration 1 runtime: 1:44:39.319527]`

`[Iteration 2 runtime:        0:47:37.115564]`

`[Iteration 3 runtime:        1:01:02.813422]`

`[Iteration 4 runtime:        1:14:05.632074]`

`[Iteration 5 runtime:        2:21:00.000000]`

`[Iteration 6 runtime:        4:26:40.465811]`

`[Iteration 7 runtime:        7:00:51.409004]`

In [2]:
# Sort and merge the filtered 5gram files on the `ngram` field, ascending.
# As saved, this cell runs merge iteration 7 only (start_iteration=7,
# end_iteration=7); change both values to match the session you are running.
# workers=10 keeps the in-memory sorts within the memory limit described above.
sort_ngrams(
    proj_dir='/vast/edk202/NLP_corpora/Google_Books/20200217/eng',
    ngram_size=5,
    sort_key='ngram',
    sort_order='ascending',
    start_iteration=7,
    end_iteration=7,
    workers=10,
    delete_input=True,
    overwrite=False,
    compress=True,
)

[31mStart Time:                2025-01-07 14:48:30.163232
[0m
[4mSort Info[0m
Input directory:           /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5filter
Sorted directory:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/temp
Temp directory:            /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/tmp
Merged file:               /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-merged.jsonl.lz4
Files available:           6520
First file to get:         /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5filter/5-00069-of-19423.jsonl.lz4
Last file to get:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5filter/5-19422-of-19423.jsonl.lz4
Files to use:              6520
Ngram size:                5
Number of workers:         10
Compress output files:     True
Overwrite existing files:  False
Sort key:                  ngram
Sort order:                ascending
He

### Verify sort [OPTIONAL]
If we want, we can verify that the output file is correctly sorted. If the script outputs `True`, then the file is sorted. Bear in mind that you need to specify the file path manually here; be sure to use the right file extension based on whether `sort_ngrams` was run with `compress=True`.

In [None]:
# Optional sanity check: confirm the merged 5gram file is in ascending order
# on its `ngram` field. The path is given explicitly and carries the `.lz4`
# extension because the sort step above was run with compress=True.
check_file_sorted(
    field='ngram',
    sort_order='ascending',
    input_file='/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-merged.jsonl.lz4',
)

### Consolidate duplicate multigrams
This module consolidates the sorted multigram file. Lowercasing and lemmatizing produce duplicate multigrams. Now that the file is sorted, we can scan through it and consolidate consecutive identical duplicates. This involves summing their overall and yearly frequencies and document counts. It also leads to a much smaller file.

`[Runtime: 0:12:33.459340]`

In [None]:
# Collapse consecutive duplicate 5grams in the sorted, merged file into single
# records. Output is compressed; an existing output file is not overwritten.
consolidate_duplicate_ngrams(
    proj_dir='/vast/edk202/NLP_corpora/Google_Books/20200217/eng',
    ngram_size=5,
    overwrite=False,
    compress=True,
)

[31mStart Time:                2025-01-07 21:56:56.400661
[0m
[4mConsolidation Info[0m
Merged file:               /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-merged.jsonl.lz4
Corpus file:               /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-corpus.jsonl.lz4
Ngram size:                5
Compress output files:     True
Overwrite existing files:  False



### View line [OPTIONAL]
If we want, we can inspect a line in the file.

In [27]:
# Print lines 1000000-1000001 of the compressed unigram corpus, parsed as JSON.
# NOTE(review): this reads the *1gram* file and depends on `base_dir`, which
# this notebook only assigns in a later cell, so it fails on a fresh
# Restart & Run All. It looks carried over from the unigram workflow —
# TODO: confirm, and point it at the 5gram corpus if that is what is meant.
!python print_jsonl_lines.py \
    --file_path "{base_dir}/1gram_files/6corpus/1gram-consolidated.jsonl.lz4" \
    --start 1000000 \
    --end 1000001 \
    --parse

Line 1000000: {'ngram': 'bandageing', 'freq_tot': 71, 'doc_tot': 51, 'freq': {'1804': 1, '1817': 1, '1845': 2, '1870': 1, '1872': 1, '1873': 1, '1874': 1, '1878': 1, '1885': 5, '1891': 1, '1907': 1, '1911': 2, '1914': 2, '1916': 1, '1917': 24, '1921': 1, '1964': 1, '1966': 1, '1973': 3, '1974': 1, '1988': 1, '1991': 1, '2000': 4, '2004': 2, '2005': 3, '2007': 2, '2012': 2, '2014': 4}, 'doc': {'1804': 1, '1817': 1, '1845': 2, '1870': 1, '1872': 1, '1873': 1, '1874': 1, '1878': 1, '1885': 5, '1891': 1, '1907': 1, '1911': 2, '1914': 2, '1916': 1, '1917': 12, '1921': 1, '1964': 1, '1966': 1, '1973': 2, '1974': 1, '1988': 1, '1991': 1, '2000': 2, '2004': 1, '2005': 2, '2007': 1, '2012': 2, '2014': 2}}
Line 1000001: {'ngram': 'bandageless', 'freq_tot': 79, 'doc_tot': 66, 'freq': {'1907': 2, '1909': 10, '1910': 1, '1921': 20, '1926': 1, '1927': 3, '1934': 2, '1935': 1, '1945': 2, '1958': 4, '1969': 5, '1982': 1, '1985': 1, '1987': 1, '1998': 1, '2001': 1, '2003': 1, '2004': 1, '2005': 2, '200

### Index unigrams and create vocabulary file
Most use cases will require an indexed list of "valid" (i.e., reasonably common) vocabulary words. This indexing script serves the dual function of (1) mapping each unigram to an index number (saved in `6corpus/1gram-consolidated-indexed.jsonl`) and (2) culling this file into a vocabulary list consisting of the _n_ most frequent unigrams (saved in `6corpus/1gram-consolidated-vocab_list_match.txt`). Unlike files upstream in the workflow, the vocabulary files are not very large and are therefore not compressed.

In [25]:
# Index the consolidated unigram corpus and build a vocabulary list.
# `--vocab_file 80000` is echoed in the output below as "Vocab size (top N)".
# NOTE(review): this operates on 1grams (--ngram_size 1) and depends on
# `base_dir`, which this notebook only assigns in a later cell — it looks
# carried over from the unigram workflow; confirm whether it belongs here.
!python 7_index.py \
    --ngram_size 1 \
    --proj_dir {base_dir} \
    --input_file {base_dir}/1gram_files/6corpus/1gram-consolidated.jsonl.lz4 \
    --overwrite \
    --vocab_file 80000 \
    --workers 48

[31mStart Time:                2025-01-01 22:32:01.094067
[0m
[4mIndexing Info[0m
Project directory:         /vast/edk202/NLP_corpora/Google_Books/20200217/eng
Output directory:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/6corpus
Input file:                /vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/6corpus/1gram-consolidated.jsonl.lz4
Ngram size:                1
Overwrite existing files:  True
Workers:                   48
Vocab size (top N):        80000


[4mIndexing Info[0m
Chunking        |[32m██████████████████████████████████████████████████[0m| 100.0% 13499384    /13499384   [0m
Sorting         |[32m██████████████████████████████████████████████████[0m| 100.0% 135         /135        [0m
Merging         |[32m██████████████████████████████████████████████████[0m| 100.0% 13499384    /13499384   [0m
Indexing        |[32m██████████████████████████████████████████████████[0m| 100.0% 13499384    /13499384   [0m

Indexe

## Process Multigrams

In [1]:
# Project root for the Google Books ngram data; the shell cells interpolate it
# as {base_dir}.
base_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng'

In [None]:
# Script-based download of tagged 5grams into {base_dir}, compressed.
# Unlike the download_ngram_files(...) cell earlier in this notebook, this
# passes --overwrite.
# NOTE(review): appears to be an older CLI form of that step — confirm whether
# this cell is still needed.
!python 1_download.py \
    --ngram_size 5 \
    --ngram_type tagged \
    --proj_dir {base_dir} \
    --overwrite \
    --compress

In [None]:
# Script-based conversion step for tagged 5grams under {base_dir}, compressed.
# NOTE(review): appears to be an older CLI form of the convert_to_jsonl_files(...)
# cell earlier in this notebook — confirm whether this cell is still needed.
!python 2_convert.py \
    --ngram_size 5 \
    --ngram_type tagged \
    --proj_dir {base_dir} \
    --compress

In [None]:
# Script-based lowercasing step for 5grams under {base_dir}, compressed.
# NOTE(review): appears to be an older CLI form of the lowercase_ngrams(...)
# cell earlier in this notebook — confirm whether this cell is still needed.
!python 3_lowercase.py \
    --ngram_size 5 \
    --proj_dir {base_dir} \
    --compress

In [None]:
# Script-based lemmatizing step for 5grams under {base_dir}, compressed.
# NOTE(review): appears to be an older CLI form of the lemmatize_ngrams(...)
# cell earlier in this notebook — confirm whether this cell is still needed.
!python 4_lemmatize.py \
    --ngram_size 5 \
    --proj_dir {base_dir} \
    --compress

In [None]:
# Script-based filtering step for 5grams under {base_dir}, with the same
# numerals / nonalpha / stopword / length options as the filter_ngrams(...)
# cell earlier in this notebook. Unlike that cell, this one also passes a
# vocabulary file taken from the 1gram corpus directory.
# NOTE(review): appears to be an older CLI form of that step — confirm
# whether this cell is still needed.
!python 5_filter.py \
    --ngram_size 5 \
    --proj_dir {base_dir} \
    --numerals \
    --nonalpha \
    --stopwords \
    --min_token_length 3 \
    --min_tokens 2 \
    --compress \
    --vocab_file {base_dir}/1gram_files/6corpus/1-00000-to-00017-vocab_list_match.txt

In [None]:
# Script-based sort of the filtered 5grams on `ngram`, ascending, with 10
# workers, stopping after iteration 2 (--end_iteration 2).
# `base_dir` is re-assigned here with the same value an earlier cell sets.
# NOTE(review): appears to be an older CLI form of the sort_ngrams(...) cell
# earlier in this notebook — confirm whether this cell is still needed.
base_dir = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng"

!python 6_sort2.py \
    --ngram_size 5 \
    --proj_dir {base_dir} \
    --workers 10 \
    --sort_key ngram \
    --compress \
    --sort_order ascending \
    --end_iteration 2

[31mStart Time:                2025-01-01 15:54:18.573790
[0m
[4mSort Info[0m
Input directory:           /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5filter
Sorted directory:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/temp
Temp directory:            /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/tmp
Merged file:               /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-merged.jsonl.lz4
Files available:           6520
Files to use:              6520
First file to get:         /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5filter/5-00069-of-19423.jsonl.lz4
Last file to get:          /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/5filter/5-19422-of-19423.jsonl.lz4
Ngram size:                5
Number of workers:         10
Compress output files:     True
Overwrite existing files:  False
Sort key:                  ngram
Sort order:                ascending
He

In [2]:
# Script-based consolidation of the merged 5gram file, with --overwrite and
# --compress. The output below reports 276470316 lines before and 75107076
# lines after consolidation.
# NOTE(review): appears to be an older CLI form of the
# consolidate_duplicate_ngrams(...) cell earlier in this notebook — confirm
# whether this cell is still needed.
!python 7_consolidate.py \
    --ngram_size 5 \
    --proj_dir {base_dir} \
    --overwrite \
    --compress

[31mStart Time:                2024-12-31 17:52:08.091974
[0m
[4mConsolidation Info[0m
Merged file:               /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-merged.jsonl.lz4
Consolidated directory:    /vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-consolidated.jsonl.lz4
Ngram size:                5
Compress output files:     True
Overwrite existing files:  True

Consolidating   |[32m██████████████████████████████████████████████████[0m| 100.0% 276470316   /276470316  [0m

Lines before consolidation:  276470316
Lines after consolidation:   75107076
[31m
End Time:                  2024-12-31 19:10:05.741234
[0m


In [3]:
import lz4.frame

input_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-consolidated.jsonl.lz4"
output_path = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng/5gram_files/6corpus/5gram-consolidated.jsonl"

def decompress_lz4_file(input_path, output_path, chunk_size=16 * 1024 * 1024):
    """Decompress an LZ4-frame file to ``output_path`` in bounded memory.

    The data is streamed in pieces of ``chunk_size`` bytes instead of being
    read in one call, so peak memory does not grow with the size of the
    decompressed file.

    Args:
        input_path: Path of the ``.lz4`` file to read.
        output_path: Path of the decompressed file to write. It is opened in
            ``"wb"`` mode, so an existing file is overwritten.
        chunk_size: Number of decompressed bytes to copy per read. Must be
            positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    import shutil  # local import so this cell does not depend on another cell

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    with lz4.frame.open(input_path, "rb") as compressed_file, \
            open(output_path, "wb") as decompressed_file:
        # copyfileobj reads at most chunk_size bytes at a time.
        shutil.copyfileobj(compressed_file, decompressed_file, chunk_size)

decompress_lz4_file(input_path, output_path)

In [4]:
# Print lines 50000-50100 of an uncompressed unigram corpus file.
# NOTE(review): the saved output of this cell is a "No such file or directory"
# error. It asks for the uncompressed *1gram* file, whereas the decompression
# cell above writes the *5gram* file — TODO confirm the intended path.
!python print_jsonl_lines.py \
    --file_path "{base_dir}/1gram_files/6corpus/1gram-consolidated.jsonl" \
    --start 50000 \
    --end 50100

Error reading the file '/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/6corpus/1gram-consolidated.jsonl': [Errno 2] No such file or directory: '/vast/edk202/NLP_corpora/Google_Books/20200217/eng/1gram_files/6corpus/1gram-consolidated.jsonl'


In [4]:
# Check that the uncompressed, consolidated 5gram file is in ascending order
# on its `ngram` field; the output below reports "The file is sorted."
# NOTE(review): appears to be an older CLI form of the check_file_sorted(...)
# cell earlier in this notebook — confirm whether this cell is still needed.
!python verify_sort.py \
    --input_file "{base_dir}/5gram_files/6corpus/5gram-consolidated.jsonl" \
    --field ngram \
    --sort_order ascending

Lines: 75107076line [08:47, 142445.80line/s]

The file is sorted.

Processing complete.


In [None]:
# Run simulate_merge.py against the 5gram `temp` directory with 48 workers.
# NOTE(review): what this script does is not visible from this notebook and no
# output was saved; it looks like a scratch experiment — confirm whether to keep.
!python simulate_merge.py \
    --file_dir "{base_dir}/5gram_files/temp" \
    --workers 48

In [None]:
# Run simulate_merge2.py on the 5gram `temp` directory, using `tmp` as the
# scratch directory, keyed on `ngram` in ascending order, with 48 workers.
# `base_dir` is re-assigned here with the same value an earlier cell sets.
# NOTE(review): what this script does is not visible from this notebook and no
# output was saved; it looks like a scratch experiment — confirm whether to keep.
base_dir = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng"

!python simulate_merge2.py \
    --ngram_size 5 \
    --file_dir "{base_dir}/5gram_files/temp" \
    --tmp_dir "{base_dir}/5gram_files/tmp" \
    --compress \
    --sort_key ngram \
    --sort_order ascending \
    --workers 48

In [22]:
# Integrity-test (`lz4 -t`) the compressed unigram corpus without writing any
# decompressed output; the result below reports the decoded byte count.
!lz4 -t {base_dir}/1gram_files/6corpus/1gram-consolidated.jsonl.lz4

/vast/edk202/NLP_corpora/Googl : decoded 18959398668 bytes                     
