In [1]:
# Importing libraries for tokenizing by sentence and words
import nltk
# Installing NLTK data for the first time
# nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Sample text used for the sentence- and word-tokenizing examples.
# (The stray ">>> " REPL prompt was removed: it is not valid Python outside
# of IPython's automatic prompt stripping. The string value is unchanged,
# including its leading newline.)
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

In [3]:
# Tokenize by sentence: split the text into a list of sentence strings.
# Line breaks inside the text are kept in the resulting sentences.
sent_tokenize(example_string)

["\nMuad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

In [4]:
# Tokenize by word: split the text into a list of word-level tokens
li = word_tokenize(example_string)

In [5]:
from nltk.corpus import stopwords

In [6]:
# Sample sentence used to demonstrate stopword filtering
worf_quote = "Sir, I protest. I am not a merry man!"

In [7]:
# Split the quote into word and punctuation tokens, then display them
words_in_quote = word_tokenize(worf_quote)
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [8]:
# Build a set (not a list) of the English stopwords for quick membership tests
stop_words = set(stopwords.words("english"))
filtered_list = []  # will collect the tokens that are not stopwords

In [9]:
# Filter out every tokenized word that is considered a stopword.
# casefold() makes the membership test case-insensitive, so capitalised
# tokens such as 'I' are still recognised as stopwords.
# The list is (re)created here so that re-running this cell does not append
# the same tokens a second time.
filtered_list = []
for word in words_in_quote:
    if word.casefold() not in stop_words:
        filtered_list.append(word)

In [10]:
# Show the surviving tokens: stopwords such as 'I', 'am', 'not' and 'a' are gone
print(filtered_list)

['Sir', ',', 'protest', '.', 'merry', 'man', '!']


In [11]:
# Alternative: build the same filtered list with a list comprehension
filtered_list = [
    token for token in words_in_quote if token.casefold() not in stop_words
]

In [12]:
from nltk.stem import PorterStemmer

In [13]:
# Creating a stemmer instance from PorterStemmer
Stemmer = PorterStemmer()

In [14]:
# Sample text containing several inflections of "discover", used for stemming
string_for_stemming = """
    The crew of the USS Discovery discovered many discoveries.
    Discovering is what explorers do."""

In [15]:
# Tokenize the sample text into words before stemming, then display the tokens
words = word_tokenize(string_for_stemming)
words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [16]:
# Reduce every token to its Porter stem and display the result
stemmed_words = list(map(Stemmer.stem, words))
stemmed_words

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

In [17]:
# Sample text for part-of-speech tagging
sagan_quote = """
    If you wish to make an apple pie from scratch,
    you must first invent the universe."""

In [18]:
# Tokenize the quote into words; the tagger below works on this token list
words_in_sagan_quote = word_tokenize(sagan_quote)
words_in_sagan_quote

['If',
 'you',
 'wish',
 'to',
 'make',
 'an',
 'apple',
 'pie',
 'from',
 'scratch',
 ',',
 'you',
 'must',
 'first',
 'invent',
 'the',
 'universe',
 '.']

In [19]:
# Tagging parts of speech on the tokenized words; yields (token, tag) pairs
nltk.pos_tag(words_in_sagan_quote)

[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [20]:
# Uncomment to print the list of POS tags and their meanings:
# nltk.help.upenn_tagset()

In [21]:
# Importing WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

# Creating the lemmatizer instance used to lemmatize words
lemmatizer = WordNetLemmatizer()

In [22]:
# Sample sentence for lemmatizing (contains the plurals 'friends' and 'scarves')
string_for_lemmatizing = "The friends of DeSoto love scarves."

In [23]:
# Tokenize the sentence before lemmatizing.
# Note: this rebinds `words`, which previously held the stemming tokens.
words = word_tokenize(string_for_lemmatizing)
words

['The', 'friends', 'of', 'DeSoto', 'love', 'scarves', '.']

In [24]:
# Lemmatize each token: plural nouns are reduced to their singular form
# ('friends' -> 'friend', 'scarves' -> 'scarf'); other tokens stay as they are.
lemmatized_words = [lemmatizer.lemmatize(token) for token in words]
lemmatized_words

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']

In [25]:
# With pos="a" the word is looked up as an adjective, so 'worst' maps to 'bad'
lemmatizer.lemmatize("worst", pos="a")

'bad'

In [26]:
# Sample sentence used for the chunking and chinking examples
lotr_quote = "It's a dangerous business, Frodo, going out your door."

In [27]:
# Tokenize the quote and tag each token with its part of speech;
# the chunk parsers below take this list of (token, tag) pairs as input.
words_in_lotr = word_tokenize(lotr_quote)
lotr_pos_tag = nltk.pos_tag(words_in_lotr)
lotr_pos_tag

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('a', 'DT'),
 ('dangerous', 'JJ'),
 ('business', 'NN'),
 (',', ','),
 ('Frodo', 'NNP'),
 (',', ','),
 ('going', 'VBG'),
 ('out', 'RP'),
 ('your', 'PRP$'),
 ('door', 'NN'),
 ('.', '.')]

In [28]:
# Chunk grammar with one regular-expression rule. According to this rule,
# a noun-phrase (NP) chunk:
#   - starts with an optional (?) determiner (<DT>)
#   - can have any number (*) of adjectives (<JJ>)
#   - ends with a noun (<NN>)
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Creating a chunk parser with this grammar rule
chunk_parser = nltk.RegexpParser(grammar)

In [29]:
# Trying the parser on our POS-tagged quote; the result is a tree of chunks
tree = chunk_parser.parse(lotr_pos_tag)

In [32]:
# One-time setup: uncomment to download the tagger model used by nltk.pos_tag
# nltk.download("averaged_perceptron_tagger")

In [33]:
# Draw the noun-phrase chunk tree (per the NLTK docs this opens a separate window)
tree.draw()

In [34]:
# Chunk grammar combined with a chink rule:
#   {<.*>+}  chunks every run of one or more tagged tokens
#   }<JJ>{   then chinks (excludes) adjectives from those chunks
grammar = """
 Chunk: {<.*>+}
        }<JJ>{"""

In [35]:
# Rebuild the parser from the chunk-plus-chink grammar (rebinds `chunk_parser`)
chunk_parser = nltk.RegexpParser(grammar)

In [37]:
# Parse the same tagged quote with the new grammar (rebinds `tree`)
tree = chunk_parser.parse(lotr_pos_tag)

In [39]:
# Print the text form of the tree for a quick inspection.
# (A bare `tree.draw` without parentheses never calls the method; it only
# echoes the bound-method object. The actual drawing happens in the next cell.)
print(tree)

In [40]:
# Draw the chinked tree: the adjective 'dangerous' sits outside the two chunks
tree.draw()