References
- [Boosting Wav2Vec2 with n-grams in Transformers](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Boosting_Wav2Vec2_with_n_grams_in_Transformers.ipynb)
- [N Gram Language Model with KenLM + Tranformers](https://www.kaggle.com/code/umongsain/n-gram-language-model-with-kenlm-tranformers)

In [38]:
from datasets import load_dataset
import re
import pandas as pd
# from datasets import load_metric
from dataclasses import dataclass, field

from IPython.display import display


In [30]:
# Dataset directory and the CSV splits stored inside it
DATA_PATH = "data/"
TRAIN_CSV = f"{DATA_PATH}train.csv"
DEV_CSV = f"{DATA_PATH}dev.csv"
TEST_CSV = f"{DATA_PATH}test_release.csv"

# Target audio sample rate (presumably in Hz — confirm against where it is used)
TARGET_SAMPLE_RATE = 16_000


In [31]:
# Load the training split into a DataFrame.
# The dev split (DEV_CSV) is not loaded in this notebook.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_CSV)

In [32]:
# Collect the "transcript" column of the training split as a plain Python list.
# Only train_df contributes here; no other CSV is merged in.
transcripts = train_df["transcript"].to_list()

In [33]:
# Dump every transcript into text.txt as one space-separated stream; each
# transcript (including the last) is followed by a single space.
# NOTE(review): no newline is ever written, so the whole corpus ends up on a
# single line — confirm that this is what the language-model tool should see.
with open("text.txt", mode="w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(transcript + " " for transcript in transcripts)

In [35]:
# Estimate an order-2 n-gram model from text.txt with KenLM's lmplz and save it
# as LM/2gram.arpa (the shell redirect needs the LM/ directory to exist already).
!kenlm/build/bin/lmplz -o 2 < text.txt > LM/2gram.arpa

=== 1/5 Counting and sorting n-grams ===
Reading /m/home/home1/16/pix1/data/Desktop/geo_ASR_challenge_2024/text.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 50371 types 12508
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:150096 2:6665342976
Statistics:
1 12507 D1=0.697428 D2=1.12887 D3+=1.78536
2 38509 D1=0.880434 D2=1.29385 D3+=1.35129
Memory estimate for binary LM:
type      kB
probing  994 assuming -p 1.5
probing 1043 assuming -r models -p 1.5
trie     504 without quantization
trie     397 assuming -q 8 -b 8 quantization 
trie     504 assuming -a 22 array pointer compression
trie     397 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:150084 2:616144
----5---10---15---20---25---30---35---40---45---

In [36]:
# Estimate an order-3 n-gram model from text.txt with KenLM's lmplz and save it
# as LM/3gram.arpa (the shell redirect needs the LM/ directory to exist already).
!kenlm/build/bin/lmplz -o 3 < text.txt > LM/3gram.arpa

=== 1/5 Counting and sorting n-grams ===
Reading /m/home/home1/16/pix1/data/Desktop/geo_ASR_challenge_2024/text.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 50371 types 12508
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:150096 2:2318380288 3:4346962944
Statistics:
1 12507 D1=0.697428 D2=1.12887 D3+=1.78536
2 38509 D1=0.88797 D2=1.31199 D3+=1.49201
3 48214 D1=0.966904 D2=1.36581 D3+=1.19804
Memory estimate for binary LM:
type      kB
probing 2067 assuming -p 1.5
probing 2342 assuming -r models -p 1.5
trie     995 without quantization
trie     641 assuming -q 8 -b 8 quantization 
trie     954 assuming -a 22 array pointer compression
trie     600 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:150084 2:

In [24]:
# Estimate an order-4 n-gram model from text.txt and save it as LM/4gram.arpa.
# --discount_fallback is passed here; the captured output shows lmplz
# "Substituting fallback discounts" for one of the orders.
!kenlm/build/bin/lmplz -o 4 --discount_fallback < text.txt > LM/4gram.arpa

=== 1/5 Counting and sorting n-grams ===
Reading /m/home/home1/16/pix1/data/Desktop/geo_ASR_challenge_2024/text.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 50371 types 12508
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:150096 2:1134526464 3:2127237120 4:3403579392
Substituting fallback discounts for order 3: D1=0.5 D2=1 D3+=1.5
Statistics:
1 12507 D1=0.697428 D2=1.12887 D3+=1.78536
2 38509 D1=0.88797 D2=1.31199 D3+=1.49201
3 48214 D1=0.970681 D2=1.39554 D3+=1.32098
4 49884 D1=0.5 D2=1 D3+=1.5
Memory estimate for binary LM:
type      kB
probing 3227 assuming -p 1.5
probing 3784 assuming -r models -p 1.5
trie    1551 without quantization
trie     919 assuming -q 8 -b 8 quantization 
trie    1457 assuming -a 22 array pointer compression
trie     825 assuming -a 22 -q 8 -b 8 array pointer compr

In [25]:
# Estimate an order-5 n-gram model from text.txt and save it as LM/5gram.arpa.
# --discount_fallback is passed here; the captured output shows lmplz
# "Substituting fallback discounts" for one of the orders.
!kenlm/build/bin/lmplz -o 5 --discount_fallback < text.txt > LM/5gram.arpa

=== 1/5 Counting and sorting n-grams ===
Reading /m/home/home1/16/pix1/data/Desktop/geo_ASR_challenge_2024/text.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 50371 types 12508
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:150096 2:650277312 3:1219270144 4:1950832000 5:2844963584
Substituting fallback discounts for order 4: D1=0.5 D2=1 D3+=1.5
Statistics:
1 12507 D1=0.697428 D2=1.12887 D3+=1.78536
2 38509 D1=0.88797 D2=1.31199 D3+=1.49201
3 48214 D1=0.970681 D2=1.39554 D3+=1.32098
4 49884 D1=0.993918 D2=1.45073 D3+=1.15415
5 50221 D1=0.5 D2=1 D3+=1.5
Memory estimate for binary LM:
type      kB
probing 4402 assuming -p 1.5
probing 5251 assuming -r models -p 1.5
trie    2119 without quantization
trie    1202 assuming -q 8 -b 8 quantization 
trie    1971 assuming -a 22 array pointer compression
tr

In [37]:
# Copy LM/2gram.arpa to LM/2gram_correct.arpa with an extra </s> unigram:
#   * the "ngram 1=N" header count becomes N + 1, and
#   * the first line mentioning <s> is written twice, the second time with
#     <s> swapped for </s> (so </s> reuses the <s> entry's values).
# Assumes the input does not already list a </s> unigram (not checked here).
# Both files are opened explicitly as UTF-8, matching how text.txt was written,
# so non-ASCII vocabulary does not depend on the platform's default encoding.
with open("LM/2gram.arpa", "r", encoding="utf-8") as read_file, \
        open("LM/2gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            # Rebuild the header from its two halves instead of str.replace:
            # replacing the digits anywhere in the line would also rewrite the
            # order ("ngram 1=1" -> "ngram 2=2") when the count is itself "1".
            header, _, count = line.strip().rpartition("=")
            write_file.write(f"{header}={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            # Keep the <s> entry and append its </s> twin right after it.
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [26]:
# Copy LM/3gram.arpa to LM/3gram_correct.arpa with an extra </s> unigram:
#   * the "ngram 1=N" header count becomes N + 1, and
#   * the first line mentioning <s> is written twice, the second time with
#     <s> swapped for </s> (so </s> reuses the <s> entry's values).
# Assumes the input does not already list a </s> unigram (not checked here).
# Both files are opened explicitly as UTF-8, matching how text.txt was written,
# so non-ASCII vocabulary does not depend on the platform's default encoding.
with open("LM/3gram.arpa", "r", encoding="utf-8") as read_file, \
        open("LM/3gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            # Rebuild the header from its two halves instead of str.replace:
            # replacing the digits anywhere in the line would also rewrite the
            # order ("ngram 1=1" -> "ngram 2=2") when the count is itself "1".
            header, _, count = line.strip().rpartition("=")
            write_file.write(f"{header}={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            # Keep the <s> entry and append its </s> twin right after it.
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [27]:
# Copy LM/4gram.arpa to LM/4gram_correct.arpa with an extra </s> unigram:
#   * the "ngram 1=N" header count becomes N + 1, and
#   * the first line mentioning <s> is written twice, the second time with
#     <s> swapped for </s> (so </s> reuses the <s> entry's values).
# Assumes the input does not already list a </s> unigram (not checked here).
# Both files are opened explicitly as UTF-8, matching how text.txt was written,
# so non-ASCII vocabulary does not depend on the platform's default encoding.
with open("LM/4gram.arpa", "r", encoding="utf-8") as read_file, \
        open("LM/4gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            # Rebuild the header from its two halves instead of str.replace:
            # replacing the digits anywhere in the line would also rewrite the
            # order ("ngram 1=1" -> "ngram 2=2") when the count is itself "1".
            header, _, count = line.strip().rpartition("=")
            write_file.write(f"{header}={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            # Keep the <s> entry and append its </s> twin right after it.
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [28]:
# Copy LM/5gram.arpa to LM/5gram_correct.arpa with an extra </s> unigram:
#   * the "ngram 1=N" header count becomes N + 1, and
#   * the first line mentioning <s> is written twice, the second time with
#     <s> swapped for </s> (so </s> reuses the <s> entry's values).
# Assumes the input does not already list a </s> unigram (not checked here).
# Both files are opened explicitly as UTF-8, matching how text.txt was written,
# so non-ASCII vocabulary does not depend on the platform's default encoding.
with open("LM/5gram.arpa", "r", encoding="utf-8") as read_file, \
        open("LM/5gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            # Rebuild the header from its two halves instead of str.replace:
            # replacing the digits anywhere in the line would also rewrite the
            # order ("ngram 1=1" -> "ngram 2=2") when the count is itself "1".
            header, _, count = line.strip().rpartition("=")
            write_file.write(f"{header}={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            # Keep the <s> entry and append its </s> twin right after it.
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [44]:
# Show the first 10 lines of the patched 2-gram model to eyeball the header
# counts and the unigram entries for <s> and </s>.
!head -n 10 LM/2gram_correct.arpa

\data\
ngram 1=12508
ngram 2=38509

\1-grams:
-4.611464	<unk>	0
0	<s>	-0.055303354
0	</s>	-0.055303354
-3.623821	dangon	-0.124110006
-2.6553433	pro	-0.33915424
