# Training an N-Gram Language Model Using KenLM Software

In [None]:
# Mount Google Drive so that the cleaned data can be copied from it
import sys
from google.colab import drive
drive.mount('/content/drive')
# Project folder on the mounted Drive
experiment_folder = '/content/drive/My Drive/my_projects/PSU_language_models_session/'
# Add the project folder to Python's module search path
sys.path.append(experiment_folder)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Change the paths below to fit your needs. They should be the same paths that
# you used in notebook #1 (1- Prepare Data.ipynb).
# The first three commands copy the two cleaned corpora and the vocabulary list
# from Google Drive to /content; the last one concatenates the two corpora
# (SaudiYoum first, then Ryiadh) into a single text file.
!cp '/content/drive/My Drive/my_projects/PSU_language_models_session/data/Clean_Ryiadh_text.txt' /content/Clean_Ryiadh_text.txt
!cp '/content/drive/My Drive/my_projects/PSU_language_models_session/data/Clean_SaudiYoum_text.txt' /content/Clean_SaudiYoum_text.txt
!cp '/content/drive/My Drive/my_projects/PSU_language_models_session/data/vocab_list.txt' /content/vocab_list.txt
!cat /content/Clean_SaudiYoum_text.txt /content/Clean_Ryiadh_text.txt > /content/Clean_SaudiYoum_n_Ryiadh_text.txt

In [None]:
# This library is a prerequisite for KenLM language model builder
!git clone https://gitlab.com/libeigen/eigen.git
!export EIGEN3_ROOT=$HOME/eigen

Cloning into 'eigen'...
remote: Enumerating objects: 117036, done.[K
remote: Counting objects: 100% (129/129), done.[K
remote: Compressing objects: 100% (82/82), done.[K
remote: Total 117036 (delta 67), reused 102 (delta 47), pack-reused 116907[K
Receiving objects: 100% (117036/117036), 102.59 MiB | 21.19 MiB/s, done.
Resolving deltas: 100% (96642/96642), done.


In [None]:
# Download KenLM source code and compile it
!wget -O - https://kheafield.com/code/kenlm.tar.gz |tar xz
!mkdir kenlm/build
%cd kenlm/build
!cmake ..
!make -j $(nproc)
%cd /content

--2022-09-17 20:10:11--  https://kheafield.com/code/kenlm.tar.gz
Resolving kheafield.com (kheafield.com)... 35.196.63.85
Connecting to kheafield.com (kheafield.com)|35.196.63.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 491888 (480K) [application/x-gzip]
Saving to: ‘STDOUT’


2022-09-17 20:10:11 (2.72 MB/s) - written to stdout [491888/491888]

/content/kenlm/build
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Could NOT find Eigen3 (missing: Eigen3_DIR)
-- Looking for pthread.h
-- Looking for pthread.h -

In [None]:
# This function is used to select the top K most frequent words from the vocab
# list that was already prepared in the previous notebook

def load_vocab(filepath, topk=None, include_freqs=False):
    """Load a word-frequency list and return it sorted by descending frequency.

    Every non-blank line of the file must hold exactly two
    whitespace-separated fields: the word and its integer count.
    Blank (or whitespace-only) lines are ignored.

    Args:
        filepath: Path of the vocabulary file to read.
        topk: If not None, keep only the first ``topk`` entries after sorting.
        include_freqs: If True, return ``(word, freq)`` tuples; otherwise
            return only the words.

    Returns:
        A list of words, or of ``(word, freq)`` tuples, most frequent first.
        Entries with equal counts keep their relative order from the file.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            its count is not an integer.
    """
    entries = []
    # Explicit UTF-8 so reading does not depend on the platform's default
    # encoding. NOTE(review): assumes notebook #1 wrote the list as UTF-8.
    with open(filepath, encoding='utf-8') as f1:
        for line in f1:
            fields = line.split()
            if not fields:
                # Blank line, e.g. an extra newline at the end of the file
                continue
            word, freq = fields
            entries.append((word, int(freq)))
    # list.sort is stable, so ties keep their file order
    entries.sort(key=lambda entry: -entry[1])
    if topk is not None:
        entries = entries[:topk]
    if include_freqs:
        return entries
    return [word for word, _ in entries]

In [None]:
# Keep the 400K most frequent words of the given dataset, with their counts,
# and display how many entries were actually selected
unique_words = load_vocab(
    '/content/vocab_list.txt',
    topk=400000,
    include_freqs=True,
)
len(unique_words)

In [None]:
# Save the selected words in a file, one word per line (the counts are dropped).
# The encoding is set explicitly so the written file does not depend on the
# platform's default encoding.
with open('/content/vocab_for_lm.txt', 'w', encoding='utf-8') as f1:
    for word, freq in unique_words:
        f1.write(f'{word}\n')

In [None]:
# Use the compiled KenLM program (lmplz) to build the N-gram language model.
# Here, we are building a 4-gram language model (specified by the option -o 4).
# We also remove (prune) bigrams, trigrams and 4-grams whose frequency is
# less than or equal to 1 (specified by the option --prune 0 1 1 1): the first
# value is the threshold for unigrams (0, so no unigram is pruned), the second
# value is the threshold for bigrams, ... and so on.
# We restrict the language model to the words listed in the "vocab_for_lm.txt"
# file and replace every other word with the unknown token "<unk>".
# The training text is fed through stdin and the ARPA model is written to stdout.
!/content/kenlm/build/bin/lmplz -o 4 --prune 0 1 1 1 --limit_vocab_file /content/vocab_for_lm.txt < /content/Clean_SaudiYoum_n_Ryiadh_text.txt > /content/Clean_SaudiYoum_n_Ryiadh_text_lm.arpa


=== 1/5 Counting and sorting n-grams ===
Reading /content/Clean_SaudiYoum_n_Ryiadh_text.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
tcmalloc: large alloc 2173591552 bytes == 0x556471ebc000 @  0x7f04743a51e7 0x55646f4ea7e2 0x55646f4854fe 0x55646f4642eb 0x55646f450066 0x7f047253ec87 0x55646f451baa
tcmalloc: large alloc 8694341632 bytes == 0x5564f37a2000 @  0x7f04743a51e7 0x55646f4ea7e2 0x55646f4d980a 0x55646f4da248 0x55646f464308 0x55646f450066 0x7f047253ec87 0x55646f451baa
=== 1/5 Counting and sorting n-grams ===
Reading /content/Clean_SaudiYoum_n_Ryiadh_text.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
tcmalloc: large alloc 2173591552 bytes == 0x556471ebc000 @  0x7f04743a51e7 0x55646f4ea7e2 0x55646f4854fe 0x55646f4642eb 0x55646f450066 0x7f047253ec87 0x55646f451baa
tcmalloc: large alloc 8694341632 bytes == 0x5564f37a2000 @  0x7f04743a51e7 0x55646f4ea7e2 0x55646f4d980a

In [None]:
# To load the LM into memory faster and to store it more efficiently, we
# convert it from the standard ARPA format to KenLM's binary format.
# Arguments: the input .arpa file, then the output .bin file.
!/content/kenlm/build/bin/build_binary /content/Clean_SaudiYoum_n_Ryiadh_text_lm.arpa /content/Clean_SaudiYoum_n_Ryiadh_text_lm.bin

In [None]:
# copy the LM and the vocab list to Google Drive to be used in the next notebook
!cp /content/Clean_SaudiYoum_n_Ryiadh_text_lm.bin '/content/drive/My Drive/my_projects/PSU_language_models_session/lm_models'
!cp /content/vocab_for_lm.txt '/content/drive/My Drive/my_projects/PSU_language_models_session/lm_models'