## Text Tokenization

### Sentence Tokenization

In [2]:
import nltk
from nltk.corpus import gutenberg
from pprint import pprint

In [12]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/fellipe/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [13]:
austen = gutenberg.raw(fileids='austen-emma.txt')

sample_text = 'We will discuss briefly about the basic syntax, structure and\
              design philosophies. There is a defined hierarchical syntax for Python code\
              which you should remember when writing code! Python is a really powerful\
              programming language!'

print(len(austen))

887071


In [14]:
print(austen[0:100])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a


In [16]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/fellipe/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [19]:
default_st = nltk.sent_tokenize
austen_sentences = default_st(text=austen)
sample_sentences = default_st(text=sample_text)

print('Total sentences in sample_text: {}'.format(len(sample_sentences))),
print('Sample text sentences :-')
pprint(sample_sentences)
print('\nTotal sentences in austen: {}'.format(len(austen_sentences))),
print('First 5 sentences in austen:-')
pprint(austen_sentences[0:5])

Total sentences in sample_text: 3
Sample text sentences :-
['We will discuss briefly about the basic syntax, structure and              '
 'design philosophies.',
 'There is a defined hierarchical syntax for Python code              which '
 'you should remember when writing code!',
 'Python is a really powerful              programming language!']

Total sentences in austen: 7456
First 5 sentences in austen:-
['[Emma by Jane Austen 1816]\n'
 '\n'
 'VOLUME I\n'
 '\n'
 'CHAPTER I\n'
 '\n'
 '\n'
 'Emma Woodhouse, handsome, clever, and rich, with a comfortable home\n'
 'and happy disposition, seemed to unite some of the best blessings\n'
 'of existence; and had lived nearly twenty-one years in the world\n'
 'with very little to distress or vex her.',
 'She was the youngest of the two daughters of a most affectionate,\n'
 "indulgent father; and had, in consequence of her sister's marriage,\n"
 'been mistress of his house from a very early period.',
 'Her mother\n'
 'had died too long ago f

In [21]:
nltk.download('europarl_raw')

[nltk_data] Downloading package europarl_raw to
[nltk_data]     /home/fellipe/nltk_data...
[nltk_data]   Unzipping corpora/europarl_raw.zip.


True

In [23]:
from nltk.corpus import europarl_raw

german_text = europarl_raw.german.raw(fileids='ep-00-01-17.de')

print('Total characters in the corpus: {}'.format(len(german_text)))
print('First 100 characters in the corpus ->')
print(german_text[0:100])

Total characters in the corpus: 157171
First 100 characters in the corpus ->
 
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sit


In [24]:
german_sentences_def = default_st(text=german_text, language='german')

# loading german text tokenizer into a PunktSentenceTokenizer instance
german_tokenizer = nltk.data.load(resource_url='tokenizers/punkt/german.pickle')
german_sentences = german_tokenizer.tokenize(german_text)

# verify the type of german_tokenizer
# should be PunktSentenceTokenizer
print(type(german_tokenizer))

<class 'nltk.tokenize.punkt.PunktSentenceTokenizer'>


In [25]:
print(german_sentences_def == german_sentences)

True


In [26]:
# print first 5 sentences of the corpus
for sent in german_sentences[0:5]:
  print(sent)

 
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der gefürchtete " Millenium-Bug " nicht eingetreten .
Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der Stürme , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schweigeminute zu gedenken .


In [27]:
punkt_st = nltk.tokenize.PunktSentenceTokenizer()
sample_sentences = punkt_st.tokenize(sample_text)
pprint(sample_sentences)

['We will discuss briefly about the basic syntax, structure and              '
 'design philosophies.',
 'There is a defined hierarchical syntax for Python code              which '
 'you should remember when writing code!',
 'Python is a really powerful              programming language!']


### Word Tokenizer

In [3]:
sentence = "The brown fox wasn't that quick and he couldn't win the race"

default_wt = nltk.word_tokenize
words = default_wt(sentence)

print(words)

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


In [4]:
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sentence)

print(words)

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


In [5]:
# pattern to identify tokens themselves
TOKEN_PATTERN = r'\w+'
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=False)
words = regex_wt.tokenize(sentence)

print(words)

['The', 'brown', 'fox', 'wasn', 't', 'that', 'quick', 'and', 'he', 'couldn', 't', 'win', 'the', 'race']


In [6]:
# pattern to identify gaps in tokens
GAP_PATTERN = r'\s+'
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN, gaps=True)
words = regex_wt.tokenize(sentence)

print(words)

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


In [7]:
# get start and end indices of each token and then print them
word_indices = list(regex_wt.span_tokenize(sentence))

print(word_indices)
print([sentence[start:end] for start, end in word_indices])

[(0, 3), (4, 9), (10, 13), (14, 20), (21, 25), (26, 31), (32, 35), (36, 38), (39, 47), (48, 51), (52, 55), (56, 60)]
['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


In [8]:
wordpunkt_wt = nltk.WordPunctTokenizer()
words = wordpunkt_wt.tokenize(sentence)

print(words)

['The', 'brown', 'fox', 'wasn', "'", 't', 'that', 'quick', 'and', 'he', 'couldn', "'", 't', 'win', 'the', 'race']


In [9]:
whitespace_wt = nltk.WhitespaceTokenizer()
words = whitespace_wt.tokenize(sentence)

print(words)

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


## Text Normalization

In [10]:
import nltk
import re
import string
from pprint import pprint

In [11]:
corpus = [
  "The brown fox wasn't that quick and he couldn't win the race",\
  "Hey that's a great deal! I just bought a phone for $199",\
  "@@You'll (learn) a **lot** in the book. Python is an amazing\
  language !@@"
]

### Tokenizer Text

In [12]:
# takes in textual data, extracts sentences from it, and splits each sentence into further tokens
def tokenize_text(text):
  sentences = nltk.sent_tokenize(text)
  word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
  return word_tokens

In [14]:
token_list = [tokenize_text(text) for text in corpus]

pprint(token_list)

[[['The',
   'brown',
   'fox',
   'was',
   "n't",
   'that',
   'quick',
   'and',
   'he',
   'could',
   "n't",
   'win',
   'the',
   'race']],
 [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
 [['@',
   '@',
   'You',
   "'ll",
   '(',
   'learn',
   ')',
   'a',
   '*',
   '*',
   'lot',
   '*',
   '*',
   'in',
   'the',
   'book',
   '.'],
  ['Python', 'is', 'an', 'amazing', 'language', '!'],
  ['@', '@']]]


### Removing Special Characters

In [13]:
def remove_characters_after_tokenization(tokens):
  pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
  filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
  return filtered_tokens

In [15]:
filtered_list_1 = [filter(None,[remove_characters_after_tokenization(tokens) for tokens in sentence_tokens]) for sentence_tokens in token_list]

print(filtered_list_1)

[<filter object at 0x7fad993ebdc0>, <filter object at 0x7fad993ebb50>, <filter object at 0x7fad993eb0a0>]


In [16]:
def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
  sentence = sentence.strip()

  if keep_apostrophes:
    PATTERN = r'[?|$|&|*|%|@|(|)|~]' # add other characters here to remove them
    filtered_sentence = re.sub(PATTERN, r'', sentence)
  else:
    PATTERN = r'[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
    filtered_sentence = re.sub(PATTERN, r'', sentence)

  return filtered_sentence

In [17]:
filtered_list_2 = [remove_characters_before_tokenization(sentence) for sentence in corpus]

print(filtered_list_2)

['The brown fox wasnt that quick and he couldnt win the race', 'Hey thats a great deal I just bought a phone for 199', 'Youll learn a lot in the book Python is an amazing  language ']


In [18]:
cleaned_corpus = [remove_characters_before_tokenization(sentence, keep_apostrophes=True) for sentence in corpus]

print(cleaned_corpus)

["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for 199", "You'll learn a lot in the book. Python is an amazing  language !"]


### Removing Stopwords

In [19]:
def remove_stopwords(tokens):
  stopword_list = nltk.corpus.stopwords.words('english')
  filtered_tokens = [token for token in tokens if token not in stopword_list]

  return filtered_tokens

In [22]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fellipe/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
expanded_corpus_tokens = [tokenize_text(text) for text in cleaned_corpus]
filtered_list_3 = [[remove_stopwords(tokens) for tokens in sentence_tokens] for sentence_tokens in expanded_corpus_tokens]

print(filtered_list_3)

[[['The', 'brown', 'fox', "n't", 'quick', 'could', "n't", 'win', 'race']], [['Hey', "'s", 'great', 'deal', '!'], ['I', 'bought', 'phone', '199']], [['You', "'ll", 'learn', 'lot', 'book', '.'], ['Python', 'amazing', 'language', '!']]]


In [24]:
# stopwords removed
nltk.corpus.stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Correcting Words

### Correcting Repeating Characters

In [27]:
old_word = 'finalllyyy'
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1

while True:
  # remove one repeated character
  new_word = repeat_pattern.sub(match_substitution, old_word)

  if new_word != old_word:
    print('Step: {} Word: {}'.format(step, new_word))
    step += 1 # update step

    # update old word to last substituted state
    old_word = new_word
    continue
  else:
    print('Final word: {}'.format(new_word))

    break

Step: 1 Word: finalllyy
Step: 2 Word: finallly
Step: 3 Word: finally
Step: 4 Word: finaly
Final word: finaly


In [31]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/fellipe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/fellipe/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [32]:
from nltk.corpus import wordnet

old_word = 'finalllyyy'
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1

while True:
  # check for semantically correct word
  if wordnet.synsets(old_word):
    print('Final correct word: {}'.format(old_word))
    break

  # remove one repeated character
  new_word = repeat_pattern.sub(match_substitution, old_word)

  if new_word != old_word:
    print('Step: {} Word: {}'.format(step, new_word))
    step += 1 # update step

    # update old word to last substituted state
    old_word = new_word
    continue
  else:
    print('Final word: {}'.format(new_word))
    break

Step: 1 Word: finalllyy
Step: 2 Word: finallly
Step: 3 Word: finally
Final correct word: finally


In [33]:
from nltk.corpus import wordnet

def remove_repeated_characters(tokens):
  repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
  match_substitution = r'\1\2\3'

  def replace(old_word):
    if wordnet.synsets(old_word):
      return old_word

    new_word = repeat_pattern.sub(match_substitution, old_word)

    return replace(new_word) if new_word != old_word else new_word

  correct_tokens = [replace(word) for word in tokens]

  return correct_tokens

In [34]:
sample_sentence = 'My schooool is realllllyyy amaaazingggg'
sample_sentence_tokens = tokenize_text(sample_sentence)[0]

print(sample_sentence_tokens)

['My', 'schooool', 'is', 'realllllyyy', 'amaaazingggg']


In [35]:
print(remove_repeated_characters(sample_sentence_tokens))

['My', 'school', 'is', 'really', 'amazing']


### Correcting Spellings

In [38]:
import re, collections

def tokens(text):
  """
  Get all words from the corpus
  """
  return re.findall('[a-z]+', text.lower())

f = open('big_text.txt', 'r')

WORDS = tokens(f.read())
WORD_COUNTS = collections.Counter(WORDS)

In [39]:
# top 10 words in the corpus
print(WORD_COUNTS.most_common(10))

[('the', 80030), ('of', 40025), ('and', 38313), ('to', 28766), ('in', 22050), ('a', 21155), ('that', 12512), ('he', 12401), ('was', 11410), ('it', 10681)]


In [40]:
def edits0(word):
  """
  Return all strings that are zero edits away
  from the input word (i.e., the word itself).
  """
  return {word}

def edits1(word):
  """
  Return all strings that are one edit away
  from the input word.
  """
  alphabet = 'abcdefghijklmnopqrstuvwxyz'

  def splits(word):
    """
    Return a list of all possible (first, rest) pairs
    that the input word is made of.
    """
    return [(word[:i], word[i:]) for i in range(len(word)+1)]
  
  pairs = splits(word)
  deletes = [a+b[1:] for (a, b) in pairs if b]
  transposes = [a+b[1]+b[0]+b[2:] for (a, b) in pairs if len(b) > 1]
  replaces = [a+c+b[1:] for (a, b) in pairs for c in alphabet if b]
  inserts = [a+c+b for (a, b) in pairs for c in alphabet]

  return set(deletes + transposes + replaces + inserts)

def edits2(word):
  """Return all strings that are two edits away
  from the input word.
  """
  return {e2 for e1 in edits1(word) for e2 in edits1(e1)}

In [41]:
def known(words):
  """
  Return the subset of words that are actually
  in our WORD_COUNTS dictionary.
  """
  return {w for w in words if w in WORD_COUNTS}

In [42]:
# input word
word = 'fianlly'

# zero edit distance from input word
edits0(word)

{'fianlly'}

In [43]:
# returns null set since it is not a valid word
known(edits0(word))

set()

In [44]:
# one edit distance from input word
edits1(word)

{'afianlly',
 'aianlly',
 'bfianlly',
 'bianlly',
 'cfianlly',
 'cianlly',
 'dfianlly',
 'dianlly',
 'efianlly',
 'eianlly',
 'faanlly',
 'faianlly',
 'fainlly',
 'fanlly',
 'fbanlly',
 'fbianlly',
 'fcanlly',
 'fcianlly',
 'fdanlly',
 'fdianlly',
 'feanlly',
 'feianlly',
 'ffanlly',
 'ffianlly',
 'fganlly',
 'fgianlly',
 'fhanlly',
 'fhianlly',
 'fiaally',
 'fiaanlly',
 'fiablly',
 'fiabnlly',
 'fiaclly',
 'fiacnlly',
 'fiadlly',
 'fiadnlly',
 'fiaelly',
 'fiaenlly',
 'fiaflly',
 'fiafnlly',
 'fiaglly',
 'fiagnlly',
 'fiahlly',
 'fiahnlly',
 'fiailly',
 'fiainlly',
 'fiajlly',
 'fiajnlly',
 'fiaklly',
 'fiaknlly',
 'fiallly',
 'fially',
 'fialnlly',
 'fialnly',
 'fiamlly',
 'fiamnlly',
 'fianally',
 'fianaly',
 'fianblly',
 'fianbly',
 'fianclly',
 'fiancly',
 'fiandlly',
 'fiandly',
 'fianelly',
 'fianely',
 'fianflly',
 'fianfly',
 'fianglly',
 'fiangly',
 'fianhlly',
 'fianhly',
 'fianilly',
 'fianily',
 'fianjlly',
 'fianjly',
 'fianklly',
 'fiankly',
 'fianlaly',
 'fianlay',
 'fi

In [45]:
# get correct words from above set
known(edits1(word))

{'finally'}

In [46]:
# two edit distances from input word
edits2(word)

{'cfianmlly',
 'fjazlly',
 'fianlhll',
 'rfiaflly',
 'fianlfll',
 'fianwlg',
 'xfianfly',
 'fianllei',
 'fiacnllly',
 'faiablly',
 'fianslle',
 'fianlblyd',
 'fianyllb',
 'sfianlyy',
 'fibnllg',
 'frianlyly',
 'fiankaly',
 'fidmanlly',
 'fnianqly',
 'fifanllyh',
 'xianllyy',
 'fiarnllye',
 'fiaqnley',
 'fianolnly',
 'hlfianlly',
 'sianlty',
 'fiavolly',
 'fianzoy',
 'finnllh',
 'fimanhly',
 'fianylliy',
 'gfiaolly',
 'fsanlmly',
 'fiadnloly',
 'fijanlay',
 'fianlvjy',
 'fjanlgly',
 'afisnlly',
 'fzanilly',
 'fiaxnllxy',
 'fiannll',
 'feianlsly',
 'flmnlly',
 'fqianllhy',
 'ffanloy',
 'fuannlly',
 'yianllz',
 'fzicanlly',
 'fifankly',
 'faianlny',
 'frianlvly',
 'fikanfly',
 'fikuanlly',
 'cfianllp',
 'lijnlly',
 'fianllybh',
 'afianplly',
 'fziangly',
 'tianllx',
 'fiqanlqly',
 'fiarnsly',
 'fianevlly',
 'fianxxlly',
 'fiamley',
 'fdanoly',
 'fiaawlly',
 'fivnllmy',
 'fikanllyb',
 'cfianblly',
 'gfianlily',
 'fxianllyv',
 'fianlaily',
 'fibanlaly',
 'ffantlly',
 'jianlaly',
 'finnll',


In [47]:
# get correct words from above set
known(edits2(word))

{'faintly', 'finally', 'finely', 'frankly'}

In [48]:
candidates = (known(edits0(word)) or known(edits1(word)) or known(edits2(word)) or [word])

candidates

{'finally'}

In [49]:
def correct(word):
  """
  Get the best correct spelling for the input word
  """
  # Priority is for edit distance 0, then 1, then 2
  # else defaults to the input word itself.
  candidates = (known(edits0(word)) or known(edits1(word)) or known(edits2(word)) or [word])

  return max(candidates, key=WORD_COUNTS.get)

In [50]:
correct('fianlly')

'finally'

In [51]:
correct('FIANLLY')

'FIANLLY'

In [52]:
def correct_match(match):
  """
  Spell-correct word in match,
  and preserve proper upper/lower/title case.
  """
  word = match.group()

  def case_of(text):
    """
    Return the case-function appropriate
    for text: upper, lower, title, or just str.:
    """
    return (str.upper if text.isupper() else str.lower if text.islower() else str.title if text.istitle() else str)

  return case_of(word)(correct(word.lower()))

def correct_text_generic(text):
  """
  Correct all the words within a text,
  returning the corrected text.
  """
  return re.sub('[a-zA-Z]+', correct_match, text)

In [53]:
correct_text_generic('fianlly')

'finally'

In [54]:
correct_text_generic('FIANLLY')

'FINALLY'

### Stemming

In [55]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [56]:
print(ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped'))

jump jump jump


In [57]:
print(ps.stem('lying'))

lie


In [58]:
print(ps.stem('strange'))

strang


In [59]:
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

In [60]:
print(ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped'))

jump jump jump


In [62]:
print(ls.stem('lying'))

lying


In [63]:
print(ls.stem('strange'))

strange


In [64]:
# Regex based stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)

In [65]:
print(rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped'))

jump jump jump


In [66]:
print(rs.stem('lying'))

ly


In [67]:
print(rs.stem('strange'))

strange


### Lemmatization

In [68]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [69]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('men', 'n'))

car
men


In [70]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

run
eat


In [71]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

sad
fancy
