
### In this section, we build a tokenizer. To do that, we want to:

    - create a list of words by splitting the text on whitespace
    - make all words lower case
    - filter out rare words by keeping only the N most frequent words in the corpus
    - set the vocab length to the number of unique words that remain after filtering


In [50]:
from collections import Counter
import re

# Relative path to the raw text file the tokenizer is built from.
text_file = "../data/wiki_text_data.txt"

# Read the whole file into memory. The encoding is given explicitly so that
# decoding does not depend on the platform's default locale.
with open(text_file, "r", encoding="utf-8") as file:
    text = file.read()

def tokenizer(text: str, vocab_size: int = 30000):
    """Build a frequency-ranked vocabulary from raw text.

    Punctuation is removed, the text is lower-cased and split on any run of
    whitespace. The ``vocab_size`` most frequent words are kept and numbered
    from 0 in order of decreasing frequency. Summary statistics are printed
    along the way.

    Args:
        text: Raw text to tokenize.
        vocab_size: Maximum number of distinct words to keep, most frequent
            first. Defaults to 30000.

    Returns:
        A ``(word_to_id, id_to_word, corpus)`` tuple where ``word_to_id``
        maps each kept word to its integer id, ``id_to_word`` is the inverse
        mapping, and ``corpus`` is the list of kept words ordered by id.
    """
    cleaned_text = re.sub(r'[^\w\s]', '', text).lower()

    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings, so leading or repeated spaces
    # do not create a bogus '' token in the vocabulary.
    words = cleaned_text.split()

    print(f"Number of words before filtering: {len(words)}")

    word_counts = Counter(words)

    # most_common() returns (word, count) pairs sorted by decreasing count.
    vocab = [word for word, _ in word_counts.most_common(vocab_size)]

    print("Number of words after filtering", len(vocab))

    # Frequencies are reported over all words seen, before filtering.
    # default=0 keeps empty input from raising ValueError.
    print(f"Highest word frequency: {max(word_counts.values(), default=0)}")
    print(f"Lowest word frequency: {min(word_counts.values(), default=0)}")

    # Ids are the positions in the frequency-ranked vocabulary, so both
    # mappings have exactly len(vocab) entries.
    word_to_id = {word: i for i, word in enumerate(vocab)}
    id_to_word = dict(enumerate(vocab))

    # corpus is the list of kept vocabulary words, in id order.
    corpus = vocab

    return word_to_id, id_to_word, corpus


In [None]:
# Usage: build the vocabulary from the loaded text. As the last expression in
# the cell, the returned (word_to_id, id_to_word, corpus) tuple is displayed.
tokenizer(text)




Number of words before filtering: 17005208
Number of words after filtering 30000
Highest word frequency: 1061396
Lowest word frequency: 1


({'the': 0,
  'of': 1,
  'and': 2,
  'one': 3,
  'in': 4,
  'a': 5,
  'to': 6,
  'zero': 7,
  'nine': 8,
  'two': 9,
  'is': 10,
  'as': 11,
  'eight': 12,
  'for': 13,
  's': 14,
  'five': 15,
  'three': 16,
  'was': 17,
  'by': 18,
  'that': 19,
  'four': 20,
  'six': 21,
  'seven': 22,
  'with': 23,
  'on': 24,
  'are': 25,
  'it': 26,
  'from': 27,
  'or': 28,
  'his': 29,
  'an': 30,
  'be': 31,
  'this': 32,
  'which': 33,
  'at': 34,
  'he': 35,
  'also': 36,
  'not': 37,
  'have': 38,
  'were': 39,
  'has': 40,
  'but': 41,
  'other': 42,
  'their': 43,
  'its': 44,
  'first': 45,
  'they': 46,
  'some': 47,
  'had': 48,
  'all': 49,
  'more': 50,
  'most': 51,
  'can': 52,
  'been': 53,
  'such': 54,
  'many': 55,
  'who': 56,
  'new': 57,
  'used': 58,
  'there': 59,
  'after': 60,
  'when': 61,
  'into': 62,
  'american': 63,
  'time': 64,
  'these': 65,
  'only': 66,
  'see': 67,
  'may': 68,
  'than': 69,
  'world': 70,
  'i': 71,
  'b': 72,
  'would': 73,
  'd': 74,
  'no