In [1]:
import nltk

In [2]:
# Build a sample string out of several multi-word names; it is used below to
# demonstrate n-gram identification.

text = (
    "New York City "
    ".NET Core "
    "Microsoft Azure "
    "Natural Language Processing"
)
text

'New York City .NET Core Microsoft Azure Natural Language Processing'

In [3]:
# Split the sample string into word tokens with NLTK's word tokenizer; the
# resulting list is echoed as the cell output.

tokens = nltk.word_tokenize(text)
tokens

['New',
 'York',
 'City',
 '.NET',
 'Core',
 'Microsoft',
 'Azure',
 'Natural',
 'Language',
 'Processing']

In [4]:
# Find bi-grams (2-grams): every pair of adjacent tokens.
# nltk.bigrams returns a one-shot iterator, so it is materialized into a list
# once. Displaying it then no longer leaves `bigrams` exhausted (and silently
# empty) for any later cell that reuses the name.

bigrams = list(nltk.bigrams(tokens))
bigrams

[('New', 'York'),
 ('York', 'City'),
 ('City', '.NET'),
 ('.NET', 'Core'),
 ('Core', 'Microsoft'),
 ('Microsoft', 'Azure'),
 ('Azure', 'Natural'),
 ('Natural', 'Language'),
 ('Language', 'Processing')]

In [5]:
# Find tri-grams (3-grams): every run of three adjacent tokens.
# nltk.trigrams returns a one-shot iterator, so it is materialized into a list
# once. Displaying it then no longer leaves `trigrams` exhausted (and silently
# empty) for any later cell that reuses the name.

trigrams = list(nltk.trigrams(tokens))
trigrams

[('New', 'York', 'City'),
 ('York', 'City', '.NET'),
 ('City', '.NET', 'Core'),
 ('.NET', 'Core', 'Microsoft'),
 ('Core', 'Microsoft', 'Azure'),
 ('Microsoft', 'Azure', 'Natural'),
 ('Azure', 'Natural', 'Language'),
 ('Natural', 'Language', 'Processing')]

In [6]:
# Find 4-grams: every run of four adjacent tokens.
# nltk.ngrams returns a one-shot iterator, so it is materialized into a list
# once. Displaying it then no longer leaves `four_grams` exhausted (and
# silently empty) for any later cell that reuses the name.

four_grams = list(nltk.ngrams(tokens, 4))
four_grams

[('New', 'York', 'City', '.NET'),
 ('York', 'City', '.NET', 'Core'),
 ('City', '.NET', 'Core', 'Microsoft'),
 ('.NET', 'Core', 'Microsoft', 'Azure'),
 ('Core', 'Microsoft', 'Azure', 'Natural'),
 ('Microsoft', 'Azure', 'Natural', 'Language'),
 ('Azure', 'Natural', 'Language', 'Processing')]

In [7]:
# Read the full text of The War of the Worlds into a single string.
# The encoding is passed explicitly so the decoded text does not depend on the
# platform's default codec (which differs between Windows and Linux/macOS).
# NOTE(review): assumes wotw.txt is stored as UTF-8 - confirm, and change the
# codec here if the file uses a different encoding.

with open("wotw.txt", encoding="utf-8") as f:
    wotw = f.read()

In [8]:
# code for preprocessing 

import string
import re

# Fetch the NLTK data packages used in this notebook: 'stopwords' backs the
# nltk.corpus.stopwords lookup below; 'punkt' is presumably required by
# nltk.word_tokenize (verify against the installed NLTK version).
# When the packages are already installed NLTK just logs that they are
# up-to-date, as the cell output shows.
nltk.download('punkt')
nltk.download('stopwords')

def lower_tokens(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for tok in tokens:
        lowered.append(tok.lower())
    return lowered

def remove_punctuation_tokens(tokens):
    """Drop tokens that have nothing left once punctuation is removed.

    A token is discarded when it consists only of characters from
    ``string.punctuation`` (or is empty). Surviving tokens are returned
    unchanged - punctuation inside them is kept - in their original order.

    Args:
        tokens: iterable of string tokens. It is traversed exactly once, so
            one-shot iterators and generators are handled correctly.

    Returns:
        A new list with the surviving tokens.
    """
    punct_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
    # Single pass: test each token directly instead of building a stripped
    # copy of the whole input and zipping it against a second traversal.
    return [token for token in tokens if punct_regex.sub('', token) != '']

def get_cleaned_tokens(tokens):
    """Lower-case the tokens, then discard those made up only of punctuation."""
    lowered = lower_tokens(tokens)
    return remove_punctuation_tokens(lowered)

def remove_stopword_tokens(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    Matching is by exact string equality, so tokens are expected to be
    normalized (e.g. lower-cased) by the caller beforehand. Order is kept.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list without the stopword tokens.
    """
    # Use a set for constant-time membership tests; scanning the list that
    # NLTK returns once per token is needlessly slow on a whole book.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in stopwords]

def remove_punctuation_in_tokens(tokens):
    """Strip every punctuation character from inside each token.

    The result has one entry per input token; a token consisting solely of
    punctuation becomes an empty string rather than being dropped.
    """
    punct_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
    stripped = []
    for tok in tokens:
        stripped.append(punct_regex.sub('', tok))
    return stripped

def preprocess_sentence(sentence):
    """Tokenize a raw string and run the tokens through the cleaning pipeline.

    The text is split with nltk.word_tokenize and the tokens are handed to
    preprocess_tokens, which lower-cases them, drops punctuation-only tokens,
    strips punctuation inside the remaining tokens and removes stopwords.
    """
    raw_tokens = nltk.word_tokenize(sentence)
    return preprocess_tokens(raw_tokens)

def preprocess_tokens(tokens):
    """Run already-tokenized text through the cleaning pipeline.

    Steps, in order: lower-case and drop punctuation-only tokens, strip
    punctuation inside the remaining tokens, then remove stopwords.
    """
    cleaned = get_cleaned_tokens(tokens)
    without_punct = remove_punctuation_in_tokens(cleaned)
    return remove_stopword_tokens(without_punct)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Preprocess the whole novel text with preprocess_sentence (tokenize, clean,
# strip punctuation, drop stopwords).
# NOTE: this rebinds `tokens`, replacing the sample-sentence tokens created
# earlier in the notebook.

tokens = preprocess_sentence(wotw)

In [10]:
# Number of tokens remaining after preprocessing.
len(tokens)

29662

In [11]:
# Association measures: the statistical scoring functions used to rank
# candidate bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Minimum number of occurrences a bigram needs to remain a candidate.
MIN_BIGRAM_COUNT = 3

# Collect every bigram in the preprocessed tokens as a potential collocation,
# then discard the ones seen fewer than MIN_BIGRAM_COUNT times.
finder = nltk.BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(MIN_BIGRAM_COUNT)

In [12]:
# Rank the remaining candidate bigrams by raw frequency and show the best ones.

TOP_N_BIGRAMS = 15
matches = finder.nbest(bigram_measures.raw_freq, TOP_N_BIGRAMS)
matches

[('red', 'weed'),
 ('black', 'smoke'),
 ('could', 'see'),
 ('ulla', 'ulla'),
 ('came', 'upon'),
 ('far', 'away'),
 ('along', 'road'),
 ('another', 'moment'),
 ('ca', 'nt'),
 ('pine', 'trees'),
 ('one', 'another'),
 ('first', 'time'),
 ('hundred', 'yards'),
 ('one', 'two'),
 ('edge', 'pit')]