In [36]:
# https://machinelearningmastery.com/clean-text-machine-learning-python/

In [20]:
import re
import string
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import  PorterStemmer

In [5]:
# relative path to the plain-text file that is loaded and cleaned below
fname = './data/metamorphosis.txt'

In [12]:
# load the entire file into one string; the context manager closes the handle
with open(fname, mode='rt') as source_file:
    text = source_file.read()

In [23]:
# splitting only by whitespace: str.split() with no arguments cuts on runs of
# whitespace, so punctuation stays attached to the neighbouring word
words = text.split()

In [24]:
# preview the first 50 whitespace-delimited tokens
print(words[:50])

['One', 'morning,', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams,', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin.', 'He', 'lay', 'on', 'his', 'armour-like', 'back,', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly,', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections.', 'The']


In [25]:
# Extracting words only: collect every run of word characters
# (letters, digits, underscore).
# re.findall is used instead of re.split(r'\W+', text) because splitting
# produces an empty-string token whenever the text starts or ends with a
# non-word character (for example a final full stop or newline).
words1 = re.findall(r'\w+', text)

In [26]:
# preview the first 50 regex-derived tokens
print(words1[:50])

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections']


In [27]:
# Show single-character tokens that are not genuine one-letter words.
# A set is used for the exclusion list: `word not in 'aAI'` is a substring
# test, which only behaved correctly because the empty string is a substring
# of every string. Requiring exactly one character states the intent directly.
print([word for word in words1 if len(word) == 1 and word not in {'a', 'A', 'I'}][:50])

['s', 't', 't', 't', 's', 't', 'd', 't', 'd', 'd', 'd', 's', 's', 's', 's', 'o', 's', 'o', 's', 's', 's', 's', 's', 't', 'm', 's', 's', 't', 'm', 't', 't', 'o', 'o', 's', 's', 'o', 's', 's', 't', 't', 't', 's', 's', 's', 's', 't', 't', 'm', 't', 't']


So the built-in `str.split` method splits the text on whitespace and preserves punctuation, which keeps words such as “wasn’t” and “armour-like” together, while `re.split` (depending on the pattern) splits those words apart as well, which creates a lot of redundant stray letters like the ones above.

In [28]:
string.punctuation  # the punctuation characters provided by the string module (shown below)

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [29]:
# translation table for str.translate: every punctuation character maps to
# None, which means "delete this character"
table = str.maketrans(dict.fromkeys(string.punctuation))

In [30]:
# remove punctuation from each whitespace-delimited token using the table
stripped = [
    token.translate(table)
    for token in words
]

In [33]:
# preview the first 50 tokens after punctuation removal
print(stripped[:50])

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armourlike', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The']


#### Normalizing Text Case
* Lower-casing shrinks the vocabulary, but some meaning is lost (e.g. a proper noun can no longer be told apart from a common word).

In [34]:
# lower-case every token; this rebinds `words`, which still carries its punctuation
words = list(map(str.lower, words))

In [35]:
# preview the first 50 lower-cased tokens
print(words[:50])

['one', 'morning,', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubled', 'dreams,', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin.', 'he', 'lay', 'on', 'his', 'armour-like', 'back,', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly,', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections.', 'the']


###### Note:
* Cleaning text is really hard, problem specific, and full of tradeoffs.

###### Remember, simple is better.
- Simpler text data, simpler models, smaller vocabularies. You can always make things more complex later to see if it results in better model skill.

In [1]:
# Using nltk
# nltk.download()

In [16]:
# splitting the raw text into sentences with NLTK's sent_tokenize
sentences = sent_tokenize(text)

In [17]:
# show only the first sentence
print(sentences[:1])

['One morning, when Gregor Samsa woke from troubled dreams, he found\nhimself transformed in his bed into a horrible vermin.']


In [23]:
# splitting into words with NLTK's word_tokenize (contractions are split apart;
# quotes and punctuation are kept as separate tokens); this rebinds `words`
words = word_tokenize(text)

In [24]:
# preview the first 50 NLTK tokens
print(words[:50])

['One', 'morning', ',', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', ',', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', '.', 'He', 'lay', 'on', 'his', 'armour-like', 'back', ',', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', ',', 'slightly', 'domed', 'and', 'divided', 'by']


In [25]:
# keep only purely alphabetic tokens: this removes the punctuation tokens, but
# it also drops hyphenated words such as 'armour-like'
words = list(filter(str.isalpha, words))

In [26]:
# preview the first 50 alphabetic-only tokens
print(words[:50])

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The', 'bedding']


In [30]:
# filtering out the stop words: load NLTK's English stop-word list
stop_words = stopwords.words('english')
# print(stop_words)  # uncomment to inspect the list

In [33]:
# Stop-word filtering expects clean, lower-cased tokens, so normalise case first.
words = [word.lower() for word in words]
print(f'Length before filtering {len(words)}, length of unique {len(set(words))}')
# Build the lookup once from the list loaded in the previous cell. The original
# called stopwords.words('english') again for every token and then scanned the
# resulting list; a set makes each membership test a hash lookup.
stop_word_set = set(stop_words)
words = [word for word in words if word not in stop_word_set]
print(f'Length after filtering: {len(words)}, length of unique {len(set(words))}')

Length before filtering 21904, length of unique 2539
Length after filtering: 9919, length of unique 2421


In [34]:
# Stemming: a good fit when the goal is something like sentiment rather than
# a deeper meaning. The raw text is tokenized again here, so punctuation
# tokens are passed through the stemmer as well.
porter = PorterStemmer()
words = word_tokenize(text)
stemmed = list(map(porter.stem, words))

In [35]:
# preview the first 50 stemmed tokens
print(stemmed[:50])

['one', 'morn', ',', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubl', 'dream', ',', 'he', 'found', 'himself', 'transform', 'in', 'hi', 'bed', 'into', 'a', 'horribl', 'vermin', '.', 'he', 'lay', 'on', 'hi', 'armour-lik', 'back', ',', 'and', 'if', 'he', 'lift', 'hi', 'head', 'a', 'littl', 'he', 'could', 'see', 'hi', 'brown', 'belli', ',', 'slightli', 'dome', 'and', 'divid', 'by']
