In [1]:
import nltk
import re
import string
from nltk.corpus import inaugural
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## 1 - Loading the text

In [18]:
# load text
text=inaugural.raw("2013-obama.txt")

## 2 - Cleaning

### a) Getting the punctuation

In [17]:
print ("PUNCTUATION: ", string.punctuation)

PUNCTUATION:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


### b) Defining a function to use regex to strip words into tokens

Note that this is just one of the possible methods to tokenize. Other could be used as well and compared to get a broader view on which one would work better for a given future task.

In [20]:
# split the input on anything other than a word
def regwords(text):
    clean_tokens = re.split(r'\W+', text)
    return clean_tokens

### c) Get the tokens

In [24]:
# only words with regex based function
clean_tokens = regwords(text)
print ("This many tokens are gained: \n", len(clean_tokens))

This many tokens are gained: 
 2126


In [None]:
# # in case we would like to see the results:
# print ("Split the input by any non-alphanumeric character.:\n", sorted(clean_tokens))

## 3 - Filter the stopwords out

In [25]:
# get the stop words
stop_words = stopwords.words('english')

# get the rest after filtering out the stop words
nonstop_words = [t for t in clean_tokens if not t in stop_words]

## 4 - Normalization

In [26]:
# normalization by lowering the capitals

words = [word.lower() for word in nonstop_words]

## 5 - Stemming

In [27]:
# stemming | lemmatization
# here we apply stemming

porter = PorterStemmer()
stemmed = [porter.stem(word) for word in words]

# Conclusion: Stemmed Tokens

In [33]:
print ("This many stemmed tokens we have:\n ", len(stemmed))
print ("\n\n\n")
print ("List of stemmed tokens:\n\n ", sorted(stemmed))


This many stemmed tokens we have:
  1127




List of stemmed tokens:

  ['', '1776', '200', '4', '40', '400', 'a', 'abroad', 'absolut', 'act', 'act', 'act', 'act', 'act', 'act', 'action', 'advanc', 'advanc', 'affirm', 'afford', 'africa', 'ago', 'agre', 'allegi', 'allianc', 'alon', 'alon', 'alon', 'along', 'also', 'also', 'alway', 'alway', 'america', 'america', 'america', 'america', 'america', 'america', 'america', 'america', 'america', 'american', 'american', 'american', 'american', 'american', 'american', 'american', 'american', 'american', 'americanâ', 'among', 'an', 'anchor', 'ancient', 'and', 'and', 'and', 'and', 'anew', 'anoth', 'answer', 'anybodi', 'anyon', 'appalachia', 'arm', 'articul', 'asia', 'author', 'avoid', 'away', 'awesom', 'bare', 'basic', 'battl', 'be', 'bear', 'began', 'begun', 'behalf', 'belief', 'believ', 'believ', 'believ', 'believ', 'believ', 'betray', 'better', 'biden', 'bind', 'birthright', 'bleakest', 'bless', 'bless', 'blood', 'blood', 'born', 'bound', 'bounda

In [10]:
print ("# vocabulary: ", len(set(stemmed)))
print ("-------------")
print ("-------------")
print ("Vocabulary: ", sorted(set(stemmed)))


# vocabulary:  652
-------------
-------------
Vocabulary:  ['', '1776', '200', '4', '40', '400', 'a', 'abroad', 'absolut', 'act', 'action', 'advanc', 'affirm', 'afford', 'africa', 'ago', 'agre', 'allegi', 'allianc', 'alon', 'along', 'also', 'alway', 'america', 'american', 'americanâ', 'among', 'an', 'anchor', 'ancient', 'and', 'anew', 'anoth', 'answer', 'anybodi', 'anyon', 'appalachia', 'arm', 'articul', 'asia', 'author', 'avoid', 'away', 'awesom', 'bare', 'basic', 'battl', 'be', 'bear', 'began', 'begun', 'behalf', 'belief', 'believ', 'betray', 'better', 'biden', 'bind', 'birthright', 'bleakest', 'bless', 'blood', 'born', 'bound', 'boundari', 'brave', 'bridg', 'bright', 'bring', 'brink', 'broad', 'brother', 'build', 'built', 'busi', 'but', 'call', 'cannot', 'cap', 'capac', 'capitol', 'care', 'carri', 'cast', 'cede', 'celebr', 'central', 'centuri', 'certain', 'challeng', 'chanc', 'chang', 'charact', 'chariti', 'cherish', 'chief', 'child', 'children', 'choic', 'choos', 'citizen', 'claim