##  Machine Learning Master Tutorial

####  from Jason Brownlee 
####  Ch 5  Manualy cleaning with NLTK

In [1]:
# file from: http://www.gutenberg.org/cache/epub/5200/pg5200.txt

filename = 'metamorphosis_clean.txt'
filepath = 'D:\\OneDrive - QJA\\My Files\\DataScience\\DataSets'

file = open(filepath + '\\' + filename, 'rt')

text = file.read()
file.close()

###  Manual Tokenization

In [2]:
# Split by whitespace

words = text.split()
print(words[:50])
print(len(words))

['One', 'morning,', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams,', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin.', 'He', 'lay', 'on', 'his', 'armour-like', 'back,', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly,', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections.', 'The']
22086


In [3]:
# Select words (exclude punctuation)
# selects alphanumeric (a-z, A-Z, 0-9, '_')
# see better method below

import re

words2 = re.split(r'\W+', text)

print(words2[:50])
print(len(words2))

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections']
22376


In [4]:
# Split by whitespace and remove punctuation
# using string.punctuation
# note: this is preferrd to re.split(r'\W+', text) above

import string

# set variable to act with desired function
re_punc = re.compile('[%s]' % re.escape(string.punctuation))

# use sub() with compile variable to replace 
strip_pun = [re_punc.sub('', w) for w in words]

stripped = strip_pun



# Remove non-printable characters
# use similar approach as above
re_print = re.compile('[^%s]' % re.escape(string.printable))
strip_pri = [re_print.sub('', w) for w in stripped]

# see if there were any non-printable characters
print(len(strip_pun)); print(len(strip_pri))

print(stripped[:50])

22086
22086
['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armourlike', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The']


In [5]:
# Normalizing Case

words3 = [word.lower() for word in stripped]

print(words3[:100])

['one', 'morning', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'he', 'lay', 'on', 'his', 'armourlike', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'the', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'his', 'many', 'legs', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', 'whats', 'happened', 'to', 'me', 'he', 'thought', 'it', 'wasnt', 'a', 'dream', 'his', 'room', 'a', 'proper', 'human']


### Tokenization and Cleaning using NLTK

In [6]:
# this imports nltk and allows you to download corpora, 
# documents and tools from nltk

# import nltk
# nltk.download()

In [7]:
from nltk import sent_tokenize

filename = 'metamorphosis_clean.txt'
filepath = 'D:\\OneDrive - QJA\\My Files\\DataScience\\DataSets'

file = open(filepath + '\\' + filename, 'rt')

text = file.read()
file.close()

sentences = sent_tokenize(text)
print(sentences[0])

One morning, when Gregor Samsa woke from troubled dreams, he found
himself transformed in his bed into a horrible vermin.


In [8]:
# NLTK split into words and filter out punctuation

from nltk import word_tokenize

tokens = word_tokenize(text)

# for each word in token, keep if is alphanumeric
words = [word for word in tokens if word.isalpha()]

# print(tokens[:50])
print(words[:50])

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The', 'bedding']


In [9]:
# NLTK filter out stop words

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

print(stop_words)
print(len(stop_words))

# notice they are lower case with no punctuation
# which requires your text to have same attributes

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
# Prepare data to use stopwords function

# split to tokens > convert to lower case >
# remove punctuation > filter out non-alphabetic >
# filter out stop words  > filter nonsense words

# break text into tokens
tokens = word_tokenize(text)

# for each word in tokens, convert to lower case
tokens = [w.lower() for w in tokens]

# create object: filter out punctuation and non-printable words
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
re_prin = re.compile('[^%s]' % re.escape(string.printable))

# strip punctuation and non-printable characters
strip_pun = [re_punc.sub('', w) for w in tokens]
strip_pri = [re_prin.sub('', w) for w in strip_pun]

stripped = strip_pri

# remove non-alphabetic words
words = [word for word in stripped if word.isalpha()]

# filter out stop words
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]


# filter out strings that aren't words such as short words with 
#  no vowels that might result from earlier cleaning
#  ex: contractions using NLTK produce tokens like 'nt'. Notice
#  that this didn't happen with manual proccess above

# uses isdisjoint

vowels = set('aeiouAEIOU')
words = [w for w in words 
         if not vowels.isdisjoint(w) and len(w) > 2]

print(words[:100])



['one', 'morning', 'gregor', 'samsa', 'woke', 'troubled', 'dreams', 'found', 'transformed', 'bed', 'horrible', 'vermin', 'lay', 'armourlike', 'back', 'lifted', 'head', 'little', 'could', 'see', 'brown', 'belly', 'slightly', 'domed', 'divided', 'arches', 'stiff', 'sections', 'bedding', 'hardly', 'able', 'cover', 'seemed', 'ready', 'slide', 'moment', 'many', 'legs', 'pitifully', 'thin', 'compared', 'size', 'rest', 'waved', 'helplessly', 'looked', 'happened', 'thought', 'dream', 'room', 'proper', 'human', 'room', 'although', 'little', 'small', 'lay', 'peacefully', 'four', 'familiar', 'walls', 'collection', 'textile', 'samples', 'lay', 'spread', 'table', 'samsa', 'travelling', 'salesman', 'hung', 'picture', 'recently', 'cut', 'illustrated', 'magazine', 'housed', 'nice', 'gilded', 'frame', 'showed', 'lady', 'fitted', 'fur', 'hat', 'fur', 'boa', 'sat', 'upright', 'raising', 'heavy', 'fur', 'muff', 'covered', 'whole', 'lower', 'arm', 'towards', 'viewer', 'gregor']


In [14]:
## Stem Words

# Using Porter Stemmer from NLTK

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
# using tokenize words from above
stemmed_words = [porter.stem(w) for w in words]

print(stemmed_words[:100])

# note much of the 'stemming' is simply removing the 'e' 
# at end of word which doesn't have a lot of value unless
# this is done to all words that will be used in analysis

['one', 'morn', 'gregor', 'samsa', 'woke', 'troubl', 'dream', 'found', 'transform', 'bed', 'horribl', 'vermin', 'lay', 'armourlik', 'back', 'lift', 'head', 'littl', 'could', 'see', 'brown', 'belli', 'slightli', 'dome', 'divid', 'arch', 'stiff', 'section', 'bed', 'hardli', 'abl', 'cover', 'seem', 'readi', 'slide', 'moment', 'mani', 'leg', 'piti', 'thin', 'compar', 'size', 'rest', 'wave', 'helplessli', 'look', 'happen', 'thought', 'dream', 'room', 'proper', 'human', 'room', 'although', 'littl', 'small', 'lay', 'peac', 'four', 'familiar', 'wall', 'collect', 'textil', 'sampl', 'lay', 'spread', 'tabl', 'samsa', 'travel', 'salesman', 'hung', 'pictur', 'recent', 'cut', 'illustr', 'magazin', 'hous', 'nice', 'gild', 'frame', 'show', 'ladi', 'fit', 'fur', 'hat', 'fur', 'boa', 'sat', 'upright', 'rais', 'heavi', 'fur', 'muff', 'cover', 'whole', 'lower', 'arm', 'toward', 'viewer', 'gregor']


In [None]:
## The example text was already pretty clean
# Other considerations:
# extracting html or pdf markup, decode unicode to UTF8 
# characters, domain specific words, phrases, numbers 
# such as dates, amounts, etc, typos/misspellings, etc