# Text Cleanup

## Strip Markup Tags

In [1]:
from bs4 import BeautifulSoup

# Sample HTML fragment. Named `text` (not `str`) so the builtin `str` type
# stays usable in the rest of the notebook session.
text = """It was the <b>best of times</b>. It was the <b>worst of times</b>! It was the age of wisdom!!
<br>
$123"""

# Name the parser explicitly: without it, bs4 picks whichever parser happens
# to be installed and emits a GuessedAtParserWarning.
cleantext = BeautifulSoup(text, "html.parser").get_text()
print(cleantext)

It was the best of times. It was the worst of times! It was the age of wisdom!!

$123


## Lowercase

In [2]:
# Sample input. Named `text` (not `str`) so the builtin `str` type is not
# overwritten for the rest of the notebook session.
text = """It was the best of times. It was the worst of times! It was the age of wisdom!!
$123"""

# str.lower() returns a new string; `text` itself is left unchanged.
str_lower = text.lower()
# Bare expression: the notebook displays its repr as the cell output.
str_lower

'it was the best of times. it was the worst of times! it was the age of wisdom!!\n$123'

## Tokenize

In [8]:
import nltk

# Sample input. Named `text` (not `str`) so the builtin `str` type is not
# overwritten for the rest of the notebook session.
text = """It was the best of times. It was the worst of times! It was the age of wisdom!!
$123"""

# Lowercase before tokenizing.
str_lower = text.lower()

# word_tokenize emits punctuation and symbols ('.', '!', '$') as their own
# tokens alongside the words.
words = nltk.word_tokenize(str_lower)

print(words)

['it', 'was', 'the', 'best', 'of', 'times', '.', 'it', 'was', 'the', 'worst', 'of', 'times', '!', 'it', 'was', 'the', 'age', 'of', 'wisdom', '!', '!', '$', '123']


## Strip Punctuation/Numbers

In [3]:
import nltk

# Sample input. Named `text` (not `str`) so the builtin `str` type is not
# overwritten for the rest of the notebook session.
text = """It was the best of times. It was the worst of times! It was the age of wisdom!!
$123"""

str_lower = text.lower()

words = nltk.word_tokenize(str_lower)

print('before:\n', words)
# str.isalpha() is True only for tokens made entirely of letters, so
# punctuation ('.', '!', '$') and numeric tokens ('123') are dropped.
words = [word for word in words if word.isalpha()]
print('after:\n', words)

before:
 ['it', 'was', 'the', 'best', 'of', 'times', '.', 'it', 'was', 'the', 'worst', 'of', 'times', '!', 'it', 'was', 'the', 'age', 'of', 'wisdom', '!', '!', '$', '123']
after:
 ['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst', 'of', 'times', 'it', 'was', 'the', 'age', 'of', 'wisdom']


In [4]:
from nltk.tokenize import RegexpTokenizer

# Sample input. Named `text` (not `str`) so the builtin `str` type is not
# overwritten for the rest of the notebook session.
text = """It was the best of times. It was the worst of times! It was the age of wisdom!!
$123"""
# r'\w+' keeps runs of word characters: punctuation is discarded, but digits
# count as word characters, so '123' is kept. Case is left as-is because the
# text is not lowercased here.
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(text)
print(words)

['It', 'was', 'the', 'best', 'of', 'times', 'It', 'was', 'the', 'worst', 'of', 'times', 'It', 'was', 'the', 'age', 'of', 'wisdom', '123']


## Remove Stop Words

In [9]:
## print out stopwords

import nltk
from nltk.corpus import stopwords

# Load NLTK's English stop-word list and sort it for display.
stopwords_en = sorted(stopwords.words('english'))

# Report the count followed by a blank line, then dump the full list.
print('number of stop words:', len(stopwords_en), end='\n\n')
print(stopwords_en)


number of stop words: 179

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',

In [6]:
import nltk
from nltk.corpus import stopwords

# Sample input. Named `text` (not `str`) so the builtin `str` type is not
# overwritten for the rest of the notebook session.
text = """It was the best of times. It was the worst of times! It was the age of wisdom!!
$123"""

## Tokenize
str_lower = text.lower()
words = nltk.word_tokenize(str_lower)
# Keep only all-letter tokens (drops punctuation and numbers).
words = [word for word in words if word.isalpha()]

# grab stopwords
stopwords_en = stopwords.words('english')
# Build a set once so each membership test below is a hash lookup rather
# than a scan of the whole list.
stopword_set = set(stopwords_en)

print('before:\n', words)
## filter out stopwords
words = [w for w in words if w not in stopword_set]
print('after:\n', words)

before:
 ['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst', 'of', 'times', 'it', 'was', 'the', 'age', 'of', 'wisdom']
after:
 ['best', 'times', 'worst', 'times', 'age', 'wisdom']


## Lemmatization

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer

# Sample words: a base form, an -ing form, and regular/irregular plurals.
words = ['run', 'running', 'books', 'people', 'leaves', 'mice']

lemmatizer = WordNetLemmatizer()

# lemmatize() is called with the word only (no part-of-speech argument).
# NOTE(review): NLTK documents the default POS as noun, which would explain
# why 'running' comes back unchanged in the output - confirm against the docs.
lemmas = map(lemmatizer.lemmatize, words)
for word, lemma in zip(words, lemmas):
    print(word, ':', lemma)

run : run
running : running
books : book
people : people
leaves : leaf
mice : mouse
