# 1. Load the 'reuters' corpus

In [1]:
# Necessary installations 
#!pip install nltk
#!pip install numpy

# Start by importing the necessary libraries
import nltk
import string
from nltk.util import ngrams
from nltk.probability import FreqDist
from collections import defaultdict

# Additionally used libraries
import re
import numpy as np

In [2]:
# Fetch the NLTK data packages used by this notebook (each call returns a status flag;
# the value of the last call is what the cell displays).
# 'reuters' is the corpus imported further down via nltk.corpus.
nltk.download('reuters')
# 'punkt' is presumably the tokenizer data behind nltk.word_tokenize, which preprocessing() calls.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package reuters to /home/lisa/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/lisa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the reuters corpus
from nltk.corpus import reuters

# 2. Preprocess the corpus

In [4]:
def preprocessing(corpus):
    """
    Normalise raw text and split it into word tokens.

    The steps run in this order: every run of digits is replaced by a single
    space, the text is lowercased, every character listed in
    string.punctuation is deleted, and the result is tokenized with
    nltk.word_tokenize.
    Parameters:
        corpus (str): the raw text to be preprocessed
    Returns:
        tokens (list): the tokens of the cleaned text
    """
    # digits become a space (not an empty string), so 'a1b' yields two tokens
    without_digits = re.sub(r'\d+', ' ', corpus)
    # deletion table for str.translate: punctuation is removed outright,
    # which is why a string like 'u.s.-japan' collapses into 'usjapan'
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.lower().translate(drop_punctuation)
    return nltk.word_tokenize(cleaned)

In [5]:
# Preprocess the whole Reuters corpus once; prep_reuters is the token list the n-gram cells work on.
prep_reuters = preprocessing(reuters.raw()) # .raw() to extract text from corpus 

# Sanity check: show the first 30 tokens
print(prep_reuters[:30])

['asian', 'exporters', 'fear', 'damage', 'from', 'usjapan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'us', 'and', 'japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'asias', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'farreaching']


# 3. N-grams & 4. their frequencies

In [6]:
def ngrams(corpus, n: int):
    """
    Build the n-grams of a preprocessed corpus by delegating to nltk.ngrams.

    Note: defining this function rebinds the name `ngrams`, which the import
    cell also binds through `from nltk.util import ngrams`.
    Parameters:
        corpus (list): a preprocessed (lowercase, no punctuation, no numbers, tokenized) corpus
        n (int): the length of each ngram
    Returns:
        the object produced by nltk.ngrams(corpus, n), i.e. the ngrams of the corpus
    """
    return nltk.ngrams(corpus, n)

In [7]:
def freq_ngrams(ngrams):
    """
    Count how often each ngram occurs.

    The function only builds and returns the distribution; nothing is printed.
    Parameters:
        ngrams (iterable): the ngrams to count the frequencies of
    Returns:
        (FreqDist): each distinct ngram mapped to its frequency
    """
    return nltk.FreqDist(ngrams)

In [17]:
# Build trigrams (n = 3) from the preprocessed Reuters tokens
ngrams_reuters = ngrams(prep_reuters, 3)

# Count how often each trigram occurs.
# NOTE(review): if ngrams() hands back a one-shot iterator (its docstring says zip),
# ngrams_reuters is exhausted by this call - rebuild it before reusing it; confirm.
freq_ngrams_reuters = freq_ngrams(ngrams_reuters)

# Last expression of the cell: displays the frequency distribution
freq_ngrams_reuters

FreqDist({('mln', 'vs', 'mln'): 3402, ('cts', 'vs', 'cts'): 1779, ('revs', 'mln', 'vs'): 1515, ('shr', 'cts', 'vs'): 1446, ('the', 'company', 'said'): 1180, ('vs', 'cts', 'net'): 1169, ('cts', 'net', 'vs'): 1082, ('of', 'mln', 'dlrs'): 1049, ('net', 'vs', 'revs'): 887, ('mln', 'dlrs', 'in'): 819, ...})

# 5. Predict the next word

In [9]:
def predict_word_model(input_sentence, freq_ngrams): 
    """
    Predict the next word of an input sentence. 

    The last n-1 preprocessed words of the sentence form the context, where n
    is the length of the ngrams in freq_ngrams. Among all ngrams that start
    with this context, the word following the context in the most frequent
    ngram is returned; on a tie the ngram that comes first in freq_ngrams
    wins. For unigrams (n = 1) the context is empty, so the most frequent
    unigram is returned.
    Parameters:
        input_sentence (string): the sentence to predict a new word for
        freq_ngrams (FreqDist): the ngrams and their frequencies to base the prediction on;
            all ngrams are assumed to have the same length (n is read from the first one)
    Returns:
        next_word (string): the next word or a no-next-word message. The message is
            returned when freq_ngrams is empty, when the preprocessed sentence has
            fewer than n-1 words, or when no ngram starts with the context.
    """
    no_next_word = 'No next word based on this corpus'

    # an empty distribution has no ngram to read n from
    if not freq_ngrams:
        return no_next_word

    # n is taken from the first ngram; the context is the n-1 words before the prediction
    len_ngrams = len(next(iter(freq_ngrams)))
    context_size = len_ngrams - 1

    # preprocess input sequence the same way as the corpus
    prep_input_sentence = preprocessing(input_sentence)
    if len(prep_input_sentence) < context_size:
        return no_next_word

    # slice from an explicit non-negative start: a negative-index slice such as
    # [-context_size:] turns into [0:] (the whole sentence) when context_size is 0
    context = tuple(prep_input_sentence[len(prep_input_sentence) - context_size:])

    # collect every ngram whose first n-1 words equal the context
    candidates = [
        (ngram, count)
        for ngram, count in freq_ngrams.items()
        if tuple(ngram[:context_size]) == context
    ]
    if not candidates:
        return no_next_word

    # max() keeps the first of several equally frequent candidates
    best_ngram, _ = max(candidates, key=lambda candidate: candidate[1])
    return best_ngram[context_size]

In [10]:
# Example input: preprocessing() strips the digits and the punctuation, and
# predict_word_model() only looks at the final words of what remains.
input_sentence = 'hi du 55 ... panterleun automotive technologies of the ' 

In [11]:
# Predict the word that follows input_sentence from the Reuters trigram frequencies;
# the returned value is shown as the cell output.
predict_word_model(input_sentence, freq_ngrams_reuters)

'company'