In [78]:
# use small language model for setence generation
# build a basic chat model 
import os
import random 
import nltk 
from nltk.corpus import reuters
from nltk import bigrams, FreqDist, ConditionalFreqDist

In [59]:
# nltk.download('punkt')
nltk.download('reuters')

[nltk_data] Downloading package reuters to /Users/user/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [60]:
# create a directory to store the text files 
os.makedirs('data', exist_ok=True)

In [61]:
# iterate through the fileids in the reuters corpus 
for fileid in reuters.fileids():
    article_text = ' '.join(reuters.words(fileid))
    filename = f'data/{fileid.replace("/", "_")}.txt'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(article_text)

In [62]:
!ls data | wc

   10788   10788  186592


In [63]:
#
import os 
import string

In [64]:
# keep training documents in a folder named 'data'
input_data_dir = 'data'

# string punction without the full stop
# or remove full stop sign from punctuation 
punctuation = string.punctuation.replace('.', '')

# check hidden files
def is_hidden(filepath):
    return os.path.basename(filepath).startswith('.')

text_data = ''
for filename in os.listdir(input_data_dir):
    filepath = os.path.join(input_data_dir, filename)
    if not is_hidden(filepath):
        with open(filepath) as infile:
            for line in infile:
                if line.strip():   # skip empty line
                    for char in punctuation:
                        line = line.replace(char, '')
                    text_data += line 


In [65]:
# combined text has 8.4 million chars
len(text_data)

8441833

In [66]:
# tokenize the text into words
# lowercasing for consistency
words = nltk.word_tokenize(text_data.lower())

# generate bigrams
bi_grams = list(bigrams(words))

# calculate frequency distribution for each bigrams
bi_gram_freq_dist = FreqDist(bi_grams)

In [67]:
from itertools import islice
# print the first five elements of the dictionary
first_five_items = list(islice(bi_gram_freq_dist.items(), 5))
for item in first_five_items:
    print(item)

(('weinberger', 'opposes'), 2)
(('opposes', 'fujitsu'), 1)
(('fujitsu', 'buying'), 1)
(('buying', 'u'), 7)
(('u', '.'), 5763)


In [68]:
# compute conditional frequency distribution of bigrams
bi_gram_freq = ConditionalFreqDist(bi_grams)

In [71]:
# bigram first word is 'natural', and second word is printed
# define the first word in bigram
bi_gram_freq['natural']

FreqDist({'gas': 216, 'rubber': 39, 'resources': 9, 'float': 3, 'for': 3, 'lt': 2, 'disasters': 2, 'that': 2, 'source': 1, 'lower': 1, ...})

In [72]:
# only keep top 3 words
import heapq

topk = 3 
# create a dictionary to hold the top topk bigrams for each first word
top_bigrams_per_first_word = {}

# iterate over the bigram frequency distribution 
for (first_word, second_word), freq in bi_gram_freq_dist.items():
    # initialize an empty heap for the first_word if not exist
    if first_word not in top_bigrams_per_first_word:
        top_bigrams_per_first_word[first_word] = [] 
        
    # add to the heap aand maintain top topk 
    heapq.heappush(top_bigrams_per_first_word[first_word],
                  (freq, second_word))
    if len(top_bigrams_per_first_word[first_word]) > topk:
        heapq.heappop(top_bigrams_per_first_word[first_word])

In [73]:
top_bigrams_per_first_word['natural']

[(9, 'resources'), (216, 'gas'), (39, 'rubber')]

In [74]:
# convert the heap to a simple list for each first word 
for first_word in top_bigrams_per_first_word:
    sorted_bigrams = sorted(
        top_bigrams_per_first_word[first_word], reverse=True
        )
    top_bigrams_list = []
    for freq, second_word in sorted_bigrams:
        top_bigrams_list.append(second_word)
    top_bigrams_per_first_word[first_word] = top_bigrams_list
    
# use these filtered bigram to create a ConditionalFreqDist
filtered_bi_grams = [] 
for first_word in top_bigrams_per_first_word:
    for second_word in top_bigrams_per_first_word[first_word]:
        filtered_bi_grams.append((first_word, second_word))
                                 
bi_gram_freq = ConditionalFreqDist(filtered_bi_grams)                                 

In [75]:
def generate_sentence(word, num_words):
    word = word.lower()
    for _ in range(num_words):
        print(word, end=' ')
        next_words = [item for item, freq in bi_gram_freq[word].items()]
        if len(next_words) > 0:
            # randomly choose a next word
            word = random.choice(next_words)
        else:
            break    # break if the word has no following words
    print()

In [77]:
# generate 30 words after "natural"
generate_sentence('natural', 30)

natural gas reserves to the company said . s . 5 . s . the dollar . the dollar and other countries and other than the u . 5 . 
