## Text Normalization (Revision)

In [20]:
import nltk, re, pprint
from nltk import word_tokenize
from bs4 import BeautifulSoup
from nltk import regexp_tokenize

In [33]:
raw = """In earlier program examples we have often converted text to lowercase before doing anything with its words, e.g. set(w.lower() for w in text). 
By using lower(), we have normalized the text to lowercase so that the distinction between The and the is ignored."""

tokens = word_tokenize(raw)

## Different Kinds of Stemming (Porter, Lancaster) and Lemmatization

porter = nltk.PorterStemmer() #initializing porter stemmer
lancaster = nltk.LancasterStemmer() #initializing lancaster stemmer
wnl = nltk.WordNetLemmatizer() #initializing a lemmatizer

from_porter = [porter.stem(t) for t in tokens]
print(f"PORTER STEMMING: {from_porter}")
from_lancaster = [lancaster.stem(t) for t in tokens]
print(f"LANCASTER STEMMING: {from_lancaster}")
from_lemmatization = [wnl.lemmatize(t) for t in tokens]
print("LEMMATIZATION:", from_lemmatization)


PORTER STEMMING: ['In', 'earlier', 'program', 'exampl', 'we', 'have', 'often', 'convert', 'text', 'to', 'lowercas', 'befor', 'do', 'anyth', 'with', 'it', 'word', ',', 'e.g', '.', 'set', '(', 'w.lower', '(', ')', 'for', 'w', 'in', 'text', ')', '.', 'By', 'use', 'lower', '(', ')', ',', 'we', 'have', 'normal', 'the', 'text', 'to', 'lowercas', 'so', 'that', 'the', 'distinct', 'between', 'the', 'and', 'the', 'is', 'ignor', '.']
LANCASTER STEMMING: ['in', 'ear', 'program', 'exampl', 'we', 'hav', 'oft', 'convert', 'text', 'to', 'lowercas', 'bef', 'doing', 'anyth', 'with', 'it', 'word', ',', 'e.g', '.', 'set', '(', 'w.lower', '(', ')', 'for', 'w', 'in', 'text', ')', '.', 'by', 'us', 'low', '(', ')', ',', 'we', 'hav', 'norm', 'the', 'text', 'to', 'lowercas', 'so', 'that', 'the', 'distinct', 'between', 'the', 'and', 'the', 'is', 'ign', '.']
LEMMATIZATION: ['In', 'earlier', 'program', 'example', 'we', 'have', 'often', 'converted', 'text', 'to', 'lowercase', 'before', 'doing', 'anything', 'with', 

Note that both Porter and Lancaster stemmers fall short in giving tokens with real meaning. For example, the word "example" is stemmed as "exampl" and "exampl" is not a valid word. Therefore, we need to use Lemmatization to make sure our resulting tokens are valid dictionary words (lemmas).

EXCELLENTTTTTTTTTTTTTTTTTTT!!!!!!!!! lemmatization is AMAZING!!!!

## 3.7 Use Regex to tokenize text

In [11]:
## Splitting words on white space

regex_tokens = re.split(" ", raw) 
print("NEW TOKENS:", regex_tokens)
print("OLD TOKENS:", tokens)

NEW TOKENS: ['In', 'earlier', 'program', 'examples', 'we', 'have', 'often', 'converted', 'text', 'to', 'lowercase', 'before', 'doing', 'anything', 'with', 'its', 'words,', 'e.g.', 'set(w.lower()', 'for', 'w', 'in', 'text).', '\nBy', 'using', 'lower(),', 'we', 'have', 'normalized', 'the', 'text', 'to', 'lowercase', 'so', 'that', 'the', 'distinction', 'between', 'The', 'and', 'the', 'is', 'ignored.', 'Often', 'we', 'want', 'to', 'go', 'further', 'than', 'this,', '\nand', 'strip', 'off', 'any', 'affixes,', 'a', 'task', 'known', 'as', 'stemming.', 'A', 'further', 'step', 'is', 'to', 'make', 'sure', 'that', 'the', 'resulting', 'form', 'is', 'a', 'known', 'word', 'in', 'a', 'dictionary,', 'a', 'task', 'known', '\nas', 'lemmatization.', 'We', 'discuss', 'each', 'of', 'these', 'in', 'turn.']
OLD TOKENS: ['In', 'earlier', 'program', 'examples', 'we', 'have', 'often', 'converted', 'text', 'to', 'lowercase', 'before', 'doing', 'anything', 'with', 'its', 'words', ',', 'e.g', '.', 'set', '(', 'w.lo

Note that by doing this both tokens are closely similar but in the NEW TOKENS, where there is a period or a comma, they go with the preceding token. 

In [12]:
## Splitting on anything other than letters, digits, or underscore
other_tokens = re.split(r"\W+", raw)
print(other_tokens)

['In', 'earlier', 'program', 'examples', 'we', 'have', 'often', 'converted', 'text', 'to', 'lowercase', 'before', 'doing', 'anything', 'with', 'its', 'words', 'e', 'g', 'set', 'w', 'lower', 'for', 'w', 'in', 'text', 'By', 'using', 'lower', 'we', 'have', 'normalized', 'the', 'text', 'to', 'lowercase', 'so', 'that', 'the', 'distinction', 'between', 'The', 'and', 'the', 'is', 'ignored', 'Often', 'we', 'want', 'to', 'go', 'further', 'than', 'this', 'and', 'strip', 'off', 'any', 'affixes', 'a', 'task', 'known', 'as', 'stemming', 'A', 'further', 'step', 'is', 'to', 'make', 'sure', 'that', 'the', 'resulting', 'form', 'is', 'a', 'known', 'word', 'in', 'a', 'dictionary', 'a', 'task', 'known', 'as', 'lemmatization', 'We', 'discuss', 'each', 'of', 'these', 'in', 'turn', '']


In [13]:
"xx".split('x')

['', '', '']

In [15]:
print(re.split(r"\w+", raw)) #this splits on words, digits, or underscores, which will give us punctuations and white spaces. 

['', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ', ', '.', '. ', '(', '.', '() ', ' ', ' ', ' ', '). \n', ' ', ' ', '(), ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '. ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ', \n', ' ', ' ', ' ', ' ', ', ', ' ', ' ', ' ', ' ', '. ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ', ', ' ', ' ', ' \n', ' ', '. ', ' ', ' ', ' ', ' ', ' ', ' ', '.']


In [17]:
print(re.findall(r"\w+|\S\w*", raw)) #finds all words, digits, or underscores first. Then finds all non-white spaces (S), then any number of words
# (1) a sequence of word characters. (2) if no match, any non-white spaces. (3) then a sequence of word charactters

['In', 'earlier', 'program', 'examples', 'we', 'have', 'often', 'converted', 'text', 'to', 'lowercase', 'before', 'doing', 'anything', 'with', 'its', 'words', ',', 'e', '.g', '.', 'set', '(w', '.lower', '(', ')', 'for', 'w', 'in', 'text', ')', '.', 'By', 'using', 'lower', '(', ')', ',', 'we', 'have', 'normalized', 'the', 'text', 'to', 'lowercase', 'so', 'that', 'the', 'distinction', 'between', 'The', 'and', 'the', 'is', 'ignored', '.', 'Often', 'we', 'want', 'to', 'go', 'further', 'than', 'this', ',', 'and', 'strip', 'off', 'any', 'affixes', ',', 'a', 'task', 'known', 'as', 'stemming', '.', 'A', 'further', 'step', 'is', 'to', 'make', 'sure', 'that', 'the', 'resulting', 'form', 'is', 'a', 'known', 'word', 'in', 'a', 'dictionary', ',', 'a', 'task', 'known', 'as', 'lemmatization', '.', 'We', 'discuss', 'each', 'of', 'these', 'in', 'turn', '.']


In [19]:
client_info = """The client's email is really_hot_summer@client.com and their phone number is (531)-100-0000 or it can also be written as 531-100-0000. 
            The client paid at least $12.4 and their zipcode is 11004"""

### I WILL BE BACK TO THIS LATTER, I PRETTY MUCH WANT TO CLIENT'S EXTRACT EMAIL, PHONE NUMBER, AMOUNT PAID, AND ZIPCODE...THESE THINGS ARE FREAKING HARDDDDDDDDDDDDDDDD


In [21]:
## Sentence Segmentation: breaking your text into different sentences

sents = nltk.sent_tokenize(raw)
pprint.pprint(sents)

['In earlier program examples we have often converted text to lowercase before '
 'doing anything with its words, e.g.',
 'set(w.lower() for w in text).',
 'By using lower(), we have normalized the text to lowercase so that the '
 'distinction between The and the is ignored.',
 'Often we want to go further than this, \n'
 'and strip off any affixes, a task known as stemming.',
 'A further step is to make sure that the resulting form is a known word in a '
 'dictionary, a task known \n'
 'as lemmatization.',
 'We discuss each of these in turn.']


In [31]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])

for word in sorted(fdist):
    print(word, ":", fdist[word], end= ", ")

cat : 3, dog : 4, snake : 1, 

In [30]:
fdist["dog"]

4

# End of chapter exercises

☼ Define a string s = 'colorless'. Write a Python statement that changes this to "colourless" using only the slice and concatenation operations.

In [40]:
s = 'colorless'
s[:4] + "u" + s[4:]

'colourless'

☼ We can use the slice notation to remove morphological endings on words. For example, 'dogs'[:-1] removes the last character of dogs, leaving dog. Use slice notation to remove the affixes from these words (we've inserted a hyphen to indicate the affix boundary, but omit this from your strings): dish-es, run-ning, nation-ality, un-do, pre-heat.



In [42]:
string1 = "dishes"
print(string1[:-2])
string2 = "running"
print(string2[:-4])

dish
run


☼ We saw how we can generate an IndexError by indexing beyond the end of a string. Is it possible to construct an index that goes too far to the left, before the start of the string?

In [49]:
string1[-8] #NOPE

IndexError: string index out of range

'dishes'

☼ We can specify a "step" size for the slice. The following returns every second character within the slice: monty[6:11:2]. It also works in the reverse direction: monty[10:5:-2] Try these for yourself, then experiment with different step values.

In [50]:
monty = "Robin Monty"
monty[6:11:2]

'Mny'

In [64]:
## odd or even

def odd_or_even(number):
    """Function to determine if a 
    number is odd or even"""
    if number % 2 == 0:
        num = "eovdedn"[::2]
    else:
        num = "eovdedn"[1::2]
    return num

print(odd_or_even(50000))
print(odd_or_even(5))

even
odd


☼ What happens if you ask the interpreter to evaluate monty[::-1]? Explain why this is a reasonable result.



In [65]:
monty[::-1] #reverses the string

'ytnoM niboR'

☼ Write a utility function that takes a URL as its argument, and returns the contents of the URL, with all HTML markup removed. Use from urllib import request and then request.urlopen('http://nltk.org/').read().decode('utf8') to access the contents of the URL.



In [77]:
from urllib import request
import feedparser

def cleans_url_content(url):
    """ Reads content from the link, removes HTML markup"""
    raw = request.urlopen(url).read().decode('utf8')
    parsed = BeautifulSoup(raw, "html.parser").get_text()

    return parsed


conts = cleans_url_content(url = 'http://nltk.org/')
conts[:100]

'\n\n\n\n\n\nNLTK :: Natural Language Toolkit\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNLTK\n\n\n\nDocumentation\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNLTK Docume'