##### Tokenize words and sentences using NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Sample text. The literal is not raw, so each \n below is a real newline; the
# newline after the opening quotes and the indentation before the closing
# quotes are also part of the string.
s = '''
    Good muffins cost $3.88\nin New York. Please buy me two of them.\n\nThanks.
    '''

`word_tokenize` splits the text into a list of word and punctuation tokens.

In [3]:
# Punctuation and the currency symbol come out as their own tokens
# ('$', '3.88', '.'); whitespace is discarded.
word_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

sent_tokenize separates the text by _sent_ence.

In [4]:
# The text's leading whitespace stays attached to the first sentence, and
# newlines inside a sentence are preserved.
sent_tokenize(s)

['\n    Good muffins cost $3.88\nin New York.',
 'Please buy me two of them.',
 'Thanks.']

##### Tokenize words and sentences using Spacy

In [5]:
import spacy
# Load spaCy's small English pipeline. The model package must be installed
# first, e.g. with `python -m spacy download en_core_web_sm`.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Run the pipeline over the sample text; the resulting Doc is used by the
# tokenization and sentence-splitting cells that follow.
doc = nlp(s)

In [7]:
# Iterate over the Doc to get one string per token. Unlike the NLTK result
# above, spaCy keeps runs of whitespace (e.g. '\n    ', '\n\n') as tokens.
[token.text for token in doc]

['\n    ',
 'Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 '\n',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 '\n\n',
 'Thanks',
 '.',
 '\n    ']

#### NB Errata in book
The original code has `sent.string.strip()`, but the `text` attribute is needed instead, as per [stackoverflow](https://stackoverflow.com/questions/67646070/attributeerror-spacy-tokens-span-span-object-has-no-attribute-string).

In [8]:
# strip() trims the whitespace around each sentence span. Spans that contain
# only whitespace therefore show up as '' in the result.
[sent.text.strip() for sent in doc.sents]

['Good muffins cost $3.88\nin New York.',
 'Please buy me two of them.',
 '',
 'Thanks.',
 '']

In [9]:
# doc.sents is a generator rather than a list, so displaying it shows only the
# generator object; iterate over it (or wrap it in list()) to see the sentences.
doc.sents

<generator at 0x144021d60>

#### Stemming
Stemming is not used so much today in NLP applications; it just chops words down and doesn't consider context.<br>
It looks as if lemmatization may be better.

In [12]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer created with its default arguments.
stemmer = PorterStemmer()

In [13]:
# Sample vocabulary shared by the stemming and lemmatization cells.
# Spellings (including 'siezing') are kept exactly as given, because the
# outputs shown in this notebook depend on them.
words = [
    # -s / -es endings
    'caresses', 'flies', 'dies', 'mules',
    # -ed endings
    'denied', 'died', 'agreed', 'owned', 'humbled', 'sized',
    # -ings / -ing endings
    'meetings', 'stating', 'siezing',
    # longer suffixes
    'itemization', 'sensational', 'traditional', 'reference', 'colonizer',
    # doubled consonant before -ed
    'plotted',
]

In [14]:
# Stem each word independently. The results are truncated stems, not
# necessarily dictionary words (e.g. 'flies' -> 'fli', 'agreed' -> 'agre').
[stemmer.stem(word) for word in words]

['caress',
 'fli',
 'die',
 'mule',
 'deni',
 'die',
 'agre',
 'own',
 'humbl',
 'size',
 'meet',
 'state',
 'siez',
 'item',
 'sensat',
 'tradit',
 'refer',
 'colon',
 'plot']

#### Lemmatization

Lemmatize using nltk

In [15]:
from nltk.stem import WordNetLemmatizer
# WordNet-backed lemmatizer. It needs the WordNet corpus to be available
# locally (e.g. via nltk.download('wordnet')).
lemmatizer = WordNetLemmatizer()

In [16]:
# lemmatize() defaults to pos='n', so every word is looked up as a noun. That
# is why verb forms such as 'denied' or 'plotted' come back unchanged; pass
# pos='v' to lemmatize them as verbs.
[ lemmatizer.lemmatize(word) for word in words]

['caress',
 'fly',
 'dy',
 'mule',
 'denied',
 'died',
 'agreed',
 'owned',
 'humbled',
 'sized',
 'meeting',
 'stating',
 'siezing',
 'itemization',
 'sensational',
 'traditional',
 'reference',
 'colonizer',
 'plotted']

##### Lemmatize using Spacy
spaCy considers context (so is presumably better on the whole)

In [17]:
# Join the words into one space-separated string so spaCy processes them as a
# single text. NOTE: this rebinds `doc`, replacing the Doc built from `s`
# earlier in the notebook.
doc = nlp(' '.join(words))

In [18]:
# `lemma_` (with the trailing underscore) is the lemma as a string, one per token.
[token.lemma_ for token in doc]

['caress',
 'fly',
 'die',
 'mule',
 'deny',
 'died',
 'agree',
 'own',
 'humble',
 'sized',
 'meeting',
 'state',
 'sieze',
 'itemization',
 'sensational',
 'traditional',
 'reference',
 'colonizer',
 'plot']