In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

## Download 'punkt'
Without this download, tokenizing fails with the error `Resource punkt not found`. On macOS you might also have to run the `Install Certificates.command` script inside your Python 3 installation folder before the download works.

In [11]:
# The tokenizers imported above need the 'punkt' resource; download() fetches
# it into the local nltk_data directory and returned True in this run.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead — confirm
# against the installed version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/twe/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Sample text for the tokenizing examples. Because the literal breaks the line
# right after the opening triple quote, the string starts with "\n" (visible
# at the start of the first sentence returned by sent_tokenize below).
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

## Get a list of sentences

In [12]:
# Split the text into a list of sentence strings. Note in the output that the
# third and fourth lines of the text come back as ONE sentence: the line break
# after the comma is not treated as a sentence boundary.
sent_tokenize(example_string)

["\nMuad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

## Get a list of words

In [13]:
# Split the text into word tokens. Note in the output that punctuation marks
# become separate tokens and "It's" is split into 'It' and "'s", while
# "Muad'Dib" is kept as a single token.
word_tokenize(example_string)

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

# Filtering stopwords

In [14]:
# Fetch NLTK's stop-word corpus into nltk_data; presumably required before
# stopwords.words(...) can be called below — TODO confirm.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/twe/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
from nltk.corpus import stopwords

In [21]:
from nltk.tokenize import word_tokenize

In [22]:
worf_quote = "Sir, I protest. I am not a merry man!"

In [23]:
words_in_quote = word_tokenize(worf_quote)

In [24]:
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [25]:
# English stop words held as a set: the filtering step below only performs
# membership checks (`not in stop_words`) against it.
stop_words = set(stopwords.words("english"))

In [26]:
filtered_list = []

In [27]:
# Keep only the tokens whose case-folded form is not an English stop word
# (casefold() makes the check case-insensitive, so 'I' matches 'i').
# The list is rebuilt in a single expression rather than appended to the list
# created in the previous cell, so re-running this cell no longer duplicates
# every entry.
filtered_list = [
    word for word in words_in_quote if word.casefold() not in stop_words
]

# Stemming

**Stemming** is a text processing task in which you reduce words to their root, which is the core part of a word. For example, the words “helping” and “helper” share the root “help.” 

In [28]:
from nltk.stem import PorterStemmer

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
stemmer = PorterStemmer()

In [31]:
string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [32]:
words = word_tokenize(string_for_stemming)

In [33]:
words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [34]:
# Run every token through the Porter stemmer, preserving token order.
stemmed_words = list(map(stemmer.stem, words))

In [35]:
stemmed_words

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

# Tagging Parts of Speech

In [36]:
sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe."""

In [37]:
words_in_sagan_quote = word_tokenize(sagan_quote)

In [38]:
words_in_sagan_quote

['If',
 'you',
 'wish',
 'to',
 'make',
 'an',
 'apple',
 'pie',
 'from',
 'scratch',
 ',',
 'you',
 'must',
 'first',
 'invent',
 'the',
 'universe',
 '.']

In [43]:
# nltk is already imported in an earlier cell; repeating the import is harmless.
# The tagger model is downloaded first (presumably pos_tag needs it — confirm).
# pos_tag returns one (token, tag) tuple per input token, as the output shows;
# tag meanings are printed by nltk.help.upenn_tagset() further down.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(words_in_sagan_quote)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/twe/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

## List of tags and their meaning

In [45]:
# Download the tag documentation, then print each tag with a short description
# and example words. The captured output below is cut off part-way through.
nltk.download('tagsets')
nltk.help.upenn_tagset()

[nltk_data] Downloading package tagsets to /Users/twe/nltk_data...


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data]   Unzipping help/tagsets.zip.


# Lemmatizing

**Lemmatizing**, like stemming, reduces words to their core meaning, but it gives you a complete English word that makes sense on its own instead of just a fragment of a word like 'discoveri'.

In [50]:
from nltk.stem import WordNetLemmatizer

In [51]:
# Fetch the WordNet corpus into nltk_data; presumably WordNetLemmatizer needs
# it before lemmatize() can run — TODO confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/twe/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [47]:
lemmatizer = WordNetLemmatizer()

In [52]:
lemmatizer.lemmatize("scarves")

'scarf'

In [53]:
string_for_lemmatizing = "The friends of DeSoto love scarves."

In [54]:
words = word_tokenize(string_for_lemmatizing)

In [55]:
# Lemmatize each token with the default settings (no part of speech supplied).
lemmatized_words = list(map(lemmatizer.lemmatize, words))

In [56]:
lemmatized_words

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']

In [59]:
lemmatizer.lemmatize("worst")

'worst'

In [60]:
# Same word as in the previous cell, now with an explicit part of speech: the
# result changes from 'worst' to 'bad'. "a" presumably selects the adjective
# reading — confirm against the WordNetLemmatizer documentation.
lemmatizer.lemmatize("worst", pos="a")

'bad'

# Chunking

While tokenizing allows you to identify words and sentences, chunking allows you to identify phrases.

In [62]:
lotr_quote = "It's a dangerous business, Frodo, going out your door."

In [63]:
words_in_lotr_quote = word_tokenize(lotr_quote)

In [64]:
# Already downloaded for the sagan_quote tagging example; per the output this
# call only confirms that the package is up to date.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/twe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [65]:
lotr_pos_tags = nltk.pos_tag(words_in_lotr_quote)

In [66]:
lotr_pos_tags

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('a', 'DT'),
 ('dangerous', 'JJ'),
 ('business', 'NN'),
 (',', ','),
 ('Frodo', 'NNP'),
 (',', ','),
 ('going', 'VBG'),
 ('out', 'RP'),
 ('your', 'PRP$'),
 ('door', 'NN'),
 ('.', '.')]

In [67]:
# Chunk rule for noun phrases (NP): an optional determiner (<DT>?), then any
# number of <JJ> tokens (e.g. 'dangerous' in the tags above), ending in one
# <NN> token (e.g. 'business', 'door').
grammar = "NP: {<DT>?<JJ>*<NN>}"

In [68]:
chunk_parser = nltk.RegexpParser(grammar)

In [69]:
tree = chunk_parser.parse(lotr_pos_tags)

In [70]:
# Render the chunk tree produced by the parser.
# NOTE(review): draw() presumably opens a separate GUI window instead of
# rendering inline, so it may block or fail on a headless machine — confirm,
# and consider print(tree) as a text fallback.
tree.draw()