# Load Corpus

In [1]:
## Import the necessary libraries
import nltk
import string
from nltk.util import ngrams
from nltk.probability import FreqDist
from collections import defaultdict
import re
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize

In [2]:
# Ensure the reuters corpus is downloaded
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to
[nltk_data]     /home/dradcenko/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/dradcenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the reuters corpus
from nltk.corpus import reuters

# Preprocess Corpus

In [4]:
# Retrieve the text from the Reuters Corpus
rawtext = reuters.raw()

# Transform all letters to lowercase
lowertext = rawtext.lower()

# Remove anything that's not a letter from the corpus
text = re.sub('[^a-zA-Z]+', ' ', lowertext)

# Tokenize the modified corpus
corpus = word_tokenize(text, language='english')

# Transform corpus into a list for later use 
list_ = ' '.join(corpus)
corpus

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'u',
 's',
 'japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'u',
 's',
 'and',
 'japan',
 'has',
 'raised',
 'fears',
 'among',
 'many',
 'of',
 'asia',
 's',
 'exporting',
 'nations',
 'that',
 'the',
 'row',
 'could',
 'inflict',
 'far',
 'reaching',
 'economic',
 'damage',
 'businessmen',
 'and',
 'officials',
 'said',
 'they',
 'told',
 'reuter',
 'correspondents',
 'in',
 'asian',
 'capitals',
 'a',
 'u',
 's',
 'move',
 'against',
 'japan',
 'might',
 'boost',
 'protectionist',
 'sentiment',
 'in',
 'the',
 'u',
 's',
 'and',
 'lead',
 'to',
 'curbs',
 'on',
 'american',
 'imports',
 'of',
 'their',
 'products',
 'but',
 'some',
 'exporters',
 'said',
 'that',
 'while',
 'the',
 'conflict',
 'would',
 'hurt',
 'them',
 'in',
 'the',
 'long',
 'run',
 'in',
 'the',
 'short',
 'term',
 'tokyo',
 's',
 'loss',
 'might',
 'be',
 'their',
 'gain',
 'the',
 'u',
 's',
 'has',
 'said',
 'it',
 'will',
 'imp

# Create Function to generate N-Grams

In [5]:
def generate_ngrams(text, n): 
    words = text.split()
    alist = [] 
    for i in range (len(words)- n+1):
        alist.append(words[i:i+n])
    return alist

Here, we split a given list into words and create sublists of n words which we then store in the empty list alist. 

In [6]:
# Generate exemplary N-Gram for n = 7
generate_ngrams(list_, 7)

[['asian', 'exporters', 'fear', 'damage', 'from', 'u', 's'],
 ['exporters', 'fear', 'damage', 'from', 'u', 's', 'japan'],
 ['fear', 'damage', 'from', 'u', 's', 'japan', 'rift'],
 ['damage', 'from', 'u', 's', 'japan', 'rift', 'mounting'],
 ['from', 'u', 's', 'japan', 'rift', 'mounting', 'trade'],
 ['u', 's', 'japan', 'rift', 'mounting', 'trade', 'friction'],
 ['s', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between'],
 ['japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the'],
 ['rift', 'mounting', 'trade', 'friction', 'between', 'the', 'u'],
 ['mounting', 'trade', 'friction', 'between', 'the', 'u', 's'],
 ['trade', 'friction', 'between', 'the', 'u', 's', 'and'],
 ['friction', 'between', 'the', 'u', 's', 'and', 'japan'],
 ['between', 'the', 'u', 's', 'and', 'japan', 'has'],
 ['the', 'u', 's', 'and', 'japan', 'has', 'raised'],
 ['u', 's', 'and', 'japan', 'has', 'raised', 'fears'],
 ['s', 'and', 'japan', 'has', 'raised', 'fears', 'among'],
 ['and', 'japan', 'has', 'rais

# Create Function for the Frequency

In [7]:
def frequency(n, text):
    n_grams = list(ngrams(text, n))
    fdist = FreqDist(n_grams)
    return fdist

To calculate the frequency of the n grams, we split the text into n grams and use the FreqDist() function. 

In [8]:
# Calculate exemplary frequency for n = 2
frequency(3, corpus)

FreqDist({('mln', 'vs', 'mln'): 3402, ('the', 'u', 's'): 2105, ('cts', 'vs', 'cts'): 1779, ('revs', 'mln', 'vs'): 1515, ('shr', 'cts', 'vs'): 1446, ('the', 'company', 'said'): 1180, ('vs', 'cts', 'net'): 1169, ('cts', 'net', 'vs'): 1082, ('of', 'mln', 'dlrs'): 1049, ('net', 'vs', 'revs'): 887, ...})