### Tokenization
### Stemming & Lemmatization

In [1]:
#import nltk
#nltk.download('popular', halt_on_error=False)
#nltk.download('all', halt_on_error=False)

In [2]:
#! pip install --user textblob

In [1]:
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import sys

In [23]:
# Record the Python interpreter version this notebook was executed with.
print(sys.version)

3.6.4 |Anaconda custom (64-bit)| (default, Mar 12 2018, 20:20:50) [MSC v.1900 64 bit (AMD64)]


In [3]:
# Sample headline used to demonstrate word-level tokenization.
text = "Flu season hitting earlier, with dozens more outbreaks — and more severe symptoms"
# word_tokenize returns a list of tokens; as the printed result shows,
# punctuation marks (',' and '—') come back as separate tokens.
tokens = nltk.tokenize.word_tokenize(text)
print(tokens)

['Flu', 'season', 'hitting', 'earlier', ',', 'with', 'dozens', 'more', 'outbreaks', '—', 'and', 'more', 'severe', 'symptoms']


In [4]:
# Folder holding the input text files. It is joined to `book` by plain string
# concatenation (directory + book), so the trailing slash is required.
directory = './data/'

In [5]:
# File name of the book to analyse; it is opened as directory + book.
book = '3boat10.txt'

#### Get most frequent words in a book

In [9]:
# Read the whole book into memory. The context manager guarantees the file
# handle is closed as soon as the read finishes (or fails); the original bare
# open() left the handle open for the lifetime of the kernel.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# this matches the encoding of the data file.
with open(directory + book) as f:
    bk_3boat = f.read()

# Split the raw text into word and punctuation tokens.
words = nltk.tokenize.word_tokenize(bk_3boat)

# FreqDist gives each distinct token together with the number of times it
# occurs, essentially like a dictionary of counts.
fdist = nltk.FreqDist(words)

# Summary line: number of distinct samples and total outcomes.
print(fdist)

# fdist.items() would give all words; here only the 10 most frequent are shown.
fdist.most_common(10)

<FreqDist with 7773 samples and 79641 outcomes>


[(',', 5702),
 ('the', 3338),
 ('and', 3215),
 ('.', 3081),
 ('to', 1748),
 ('a', 1621),
 ('of', 1425),
 ('I', 1208),
 ('it', 1159),
 ('in', 931)]

## Clean words in a book
## Get most frequent words

In [10]:
# Start again from the raw token stream of the book.
words = nltk.tokenize.word_tokenize(bk_3boat)

# Stop words are filler words such as "the", "and", "to"; removing them lets
# the frequency count reflect the content of the document.
# NOTE: this binds the name `stopwords` to a plain set of strings, so it must
# not be combined with `from nltk.corpus import stopwords`.
stopwords = set(nltk.corpus.stopwords.words('english'))

# One pass over the raw tokens, keeping a token only if it
#   - has more than one character (drops most punctuation and "a" / "I"),
#   - is not purely numeric,
#   - consists only of alphabetic characters (drops remaining punctuation),
# and lowercasing whatever survives (the stop-word list is lowercase too).
words = [
    token.lower()
    for token in words
    if len(token) > 1 and not token.isnumeric() and token.isalpha()
]

# Drop the stop words from the lowercased tokens.
words = [token for token in words if token not in stopwords]

# Count the cleaned tokens; fdist.items() would give every word.
fdist = nltk.FreqDist(words)
fdist.most_common(10)

[('said', 378),
 ('would', 362),
 ('harris', 316),
 ('george', 308),
 ('one', 246),
 ('us', 228),
 ('boat', 186),
 ('get', 179),
 ('could', 175),
 ('got', 163)]

#### We can invoke RegexpTokenizer to eliminate punctuation

In [10]:
# The pattern r'\w+' matches maximal runs of word characters, stopping at the
# first non-word character (a space, punctuation, ...). Punctuation therefore
# never becomes a token of its own.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
# Note: this rebinds `tokens`, which earlier held the tokens of the sample headline.
tokens = tokenizer.tokenize(bk_3boat)

# Number of tokens in the book when only word-character runs are kept.
len(tokens)

68364

## Text normalization with stemming and lemmatization

The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For instance:

    am, are, is =>  be
    dog, dogs, dog's, dogs' => dog

The result of this mapping of text will be something like:

    the girl's dogs are different breeds => the girl dog be differ breed 

In [11]:
# Create two different stemmers so their results can be compared on the same
# list of words further down.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [12]:
# Peek at the first 20 cleaned words.
words[:20]

['three',
 'men',
 'boat',
 'say',
 'nothing',
 'dog',
 'three',
 'men',
 'boat',
 'jerome',
 'jerome',
 'chapter',
 'three',
 'invalids',
 'sufferings',
 'george',
 'harris',
 'victim',
 'one',
 'hundred']

#### Converting lists to strings to simplify displaying / visualization

In [14]:
# First 50 cleaned words, kept as a list ...
words_l = words[:50]
# ... and the same words flattened into one comma-separated string.
words_s = ', '.join(words_l)
type(words_s)

str

In [15]:
# The slice itself is still a list.
type(words_l)

list

In [16]:
# Display the first 50 cleaned words as a single comma-separated string.
', '.join(words[:50])

'three, men, boat, say, nothing, dog, three, men, boat, jerome, jerome, chapter, three, invalids, sufferings, george, harris, victim, one, hundred, seven, fatal, maladies, useful, prescriptions, cure, liver, complaint, children, agree, overworked, need, rest, week, rolling, deep, george, suggests, river, montmorency, lodges, objection, original, motion, carried, majority, three, one, four, us'

#### Or using "print"

In [16]:
# print() shows the list on one line instead of one element per line.
print(words_l)

['three', 'men', 'boat', 'say', 'nothing', 'dog', 'three', 'men', 'boat', 'jerome', 'jerome', 'chapter', 'three', 'invalids', 'sufferings', 'george', 'harris', 'victim', 'one', 'hundred', 'seven', 'fatal', 'maladies', 'useful', 'prescriptions', 'cure', 'liver', 'complaint', 'children', 'agree', 'overworked', 'need', 'rest', 'week', 'rolling', 'deep', 'george', 'suggests', 'river', 'montmorency', 'lodges', 'objection', 'original', 'motion', 'carried', 'majority', 'three', 'one', 'four', 'us']


#### Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. 

In [17]:
# Porter-stem each of the first 50 cleaned words and print the result as a list.
print(list(map(porter.stem, words[:50])))

['three', 'men', 'boat', 'say', 'noth', 'dog', 'three', 'men', 'boat', 'jerom', 'jerom', 'chapter', 'three', 'invalid', 'suffer', 'georg', 'harri', 'victim', 'one', 'hundr', 'seven', 'fatal', 'maladi', 'use', 'prescript', 'cure', 'liver', 'complaint', 'children', 'agre', 'overwork', 'need', 'rest', 'week', 'roll', 'deep', 'georg', 'suggest', 'river', 'montmor', 'lodg', 'object', 'origin', 'motion', 'carri', 'major', 'three', 'one', 'four', 'us']


In [18]:
# Lancaster-stem the same 50 words for comparison with the Porter output.
print(list(map(lancaster.stem, words[:50])))

['three', 'men', 'boat', 'say', 'noth', 'dog', 'three', 'men', 'boat', 'jerom', 'jerom', 'chapt', 'three', 'invalid', 'suff', 'georg', 'har', 'victim', 'on', 'hundr', 'sev', 'fat', 'malady', 'us', 'prescrib', 'cur', 'liv', 'complaint', 'childr', 'agr', 'overwork', 'nee', 'rest', 'week', 'rol', 'deep', 'georg', 'suggest', 'riv', 'montm', 'lodg', 'object', 'origin', 'mot', 'carry', 'maj', 'three', 'on', 'four', 'us']


#### Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

The WordNet lemmatizer only removes affixes if the resulting word is in its dictionary. This dictionary lookup makes lemmatizers significantly slower than stemmers.

In [17]:
# Show the unmodified words again as a reference for the lemmatized output.
print(words_l)

['three', 'men', 'boat', 'say', 'nothing', 'dog', 'three', 'men', 'boat', 'jerome', 'jerome', 'chapter', 'three', 'invalids', 'sufferings', 'george', 'harris', 'victim', 'one', 'hundred', 'seven', 'fatal', 'maladies', 'useful', 'prescriptions', 'cure', 'liver', 'complaint', 'children', 'agree', 'overworked', 'need', 'rest', 'week', 'rolling', 'deep', 'george', 'suggests', 'river', 'montmorency', 'lodges', 'objection', 'original', 'motion', 'carried', 'majority', 'three', 'one', 'four', 'us']


In [18]:
# Lemmatization is the sophisticated, English knowledge holding version of stemming
wnl = nltk.WordNetLemmatizer()

In [19]:
# Lemmatize the same first 50 cleaned words used in the stemming examples.
# NOTE(review): no part of speech is passed, so every word is handled with
# lemmatize()'s default POS (presumably noun — confirm against the installed
# NLTK version). That would explain why verb forms such as 'suggests' and
# 'carried' come back unchanged and why 'us' is reduced to 'u' in the output.
print([wnl.lemmatize(t) for t in words[0:50]])

['three', 'men', 'boat', 'say', 'nothing', 'dog', 'three', 'men', 'boat', 'jerome', 'jerome', 'chapter', 'three', 'invalid', 'suffering', 'george', 'harris', 'victim', 'one', 'hundred', 'seven', 'fatal', 'malady', 'useful', 'prescription', 'cure', 'liver', 'complaint', 'child', 'agree', 'overworked', 'need', 'rest', 'week', 'rolling', 'deep', 'george', 'suggests', 'river', 'montmorency', 'lodge', 'objection', 'original', 'motion', 'carried', 'majority', 'three', 'one', 'four', 'u']
