# NLTK Notebook

Tokenizer
     - Word Tokenizer
     - Sentence Tokenizer

Corpora
    - body of text, e.g. english language
lexicon
    - word and their meaning

In [9]:
#importing nltk.tokenize
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
text = "Dear Mr. Francisc, Today we will be going to study about NLTK, a natural language processing library. Built for python"

In [22]:
#prints out the tokenized text by word
textReturn = word_tokenize(text)
print(word_tokenize(text))

['Dear', 'Mr.', 'Francisc', ',', 'Today', 'we', 'will', 'be', 'going', 'to', 'study', 'about', 'NLTK', ',', 'a', 'natural', 'language', 'processing', 'library', '.', 'Built', 'for', 'python']


In [23]:
words = sent_tokenize(text)

#prints out the tokenized text by sentence
print(sent_tokenize(text))



['Dear Mr. Francisc, Today we will be going to study about NLTK, a natural language processing library.', 'Built for python']


# Stop Words

 - stop words are words that will not give out information and may be neglected and not added to the feature set of the analysis

In [24]:
from nltk.corpus import stopwords
stop_word = set(stopwords.words('english'))

In [25]:
filtered_words = [w for w in textReturn if not w in stop_word]

In [26]:
#printing filtered words
print filtered_words

['Dear', 'Mr.', 'Francisc', ',', 'Today', 'going', 'study', 'NLTK', ',', 'natural', 'language', 'processing', 'library', '.', 'Built', 'python']


# Stemming

- stemming is finding the word closest to the meaning of the actual word being used in the sentence

In [29]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
new_word = "Good Morning, we are hear to learn about stemming, today we must know what is stemming and how it works"
related = ['python', 'pythoner', 'pythonista', 'pythonly', 'pythoning']

for pyn in related:
    print(ps.stem(pyn))

python
python
pythonista
pythonli
python


In [30]:
words = word_tokenize(new_word)
for pyn in words:
    print(ps.stem(pyn))

Good
Morn
,
we
are
hear
to
learn
about
stem
,
today
we
must
know
what
is
stem
and
how
it
work


# Part of Speech Tagging

- part of speech tagging is tagging the different words in a sentence or paragraph on what they are.

POS tag list:

- CC	coordinating conjunction
- CD	cardinal digit
- DT	determiner
- EX    existential there (like: "there is" ... think of it like "there exists")
- FW	foreign word
- IN	preposition/subordinating conjunction
- JJ	adjective	'big'
- JJR	adjective, comparative	'bigger'
- JJS	adjective, superlative	'biggest'
- LS	list marker	1)
- MD	modal	could, will
- NN	noun, singular 'desk'
- NNS	noun plural	'desks'
- NNP	proper noun, singular	'Harrison'
- NNPS	proper noun, plural	'Americans'
- PDT	predeterminer	'all the kids'
- POS	possessive ending	parent's
- PRP	personal pronoun	I, he, she
- PRP(DollarSign)	possessive pronoun	my, his, hers
- RB	adverb	very, silently,
- RBR	adverb, comparative	better
- RBS	adverb, superlative	best
- RP	particle	give up
- TO	to	go 'to' the store.
- UH	interjection	errrrrrrrm
- VB	verb, base form	take
- VBD	verb, past tense	took
- VBG	verb, gerund/present participle	taking
- VBN	verb, past participle	taken
- VBP	verb, sing. present, non-3d	take
- VBZ	verb, 3rd person sing. present	takes
- WDT	wh-determiner	which
- WP	wh-pronoun	who, what
- WP$	possessive wh-pronoun	whose
- WRB	wh-abverb	where, when

In [31]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [32]:
#free contents from nltk, the following are from the state of the union address of George W. Bush
train_set = state_union.raw('2005-GWBush.txt')
test_set = state_union.raw('2006-GWBush.txt')

In [33]:
#tokenizing the training set, splitting them according to the punktsentencetokenizer algorithm
custom_sent_tokenizer = PunktSentenceTokenizer(train_set)

In [36]:
#tokenizing and testing with the test set
tokenized = custom_sent_tokenizer.tokenize(test_set)

In [40]:
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()

[(u'PRESIDENT', 'NNP'), (u'GEORGE', 'NNP'), (u'W.', 'NNP'), (u'BUSH', 'NNP'), (u"'S", 'POS'), (u'ADDRESS', 'NNP'), (u'BEFORE', 'IN'), (u'A', 'NNP'), (u'JOINT', 'NNP'), (u'SESSION', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'CONGRESS', 'NNP'), (u'ON', 'NNP'), (u'THE', 'NNP'), (u'STATE', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'UNION', 'NNP'), (u'January', 'NNP'), (u'31', 'CD'), (u',', ','), (u'2006', 'CD'), (u'THE', 'NNP'), (u'PRESIDENT', 'NNP'), (u':', ':'), (u'Thank', 'NNP'), (u'you', 'PRP'), (u'all', 'DT'), (u'.', '.')]
[(u'Mr.', 'NNP'), (u'Speaker', 'NNP'), (u',', ','), (u'Vice', 'NNP'), (u'President', 'NNP'), (u'Cheney', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'Congress', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'the', 'DT'), (u'Supreme', 'NNP'), (u'Court', 'NNP'), (u'and', 'CC'), (u'diplomatic', 'JJ'), (u'corps', 'NN'), (u',', ','), (u'distinguished', 'JJ'), (u'guests', 'NNS'), (u',', ','), (u'and', 'CC'), (u'fellow', 'JJ'), (u'citizens', 'NN

# Chunking

- chunking is creating a set of conditions on which chunks all of the data the conforms with the conditions then outputs them into a list.


# Chinking

- chinking also works with chunking, it is the reverse process of chunking, it simply means we are going to remove something.


In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [3]:
#free contents from nltk, the following are from the state of the union address of George W. Bush
train_set = state_union.raw('2005-GWBush.txt')
test_set = state_union.raw('2006-GWBush.txt')

In [4]:
#tokenizing the training set, splitting them according to the punktsentencetokenizer algorithm
custom_sent_tokenizer = PunktSentenceTokenizer(train_set)

In [5]:
#tokenizing and testing with the test set
tokenized = custom_sent_tokenizer.tokenize(test_set)

In [10]:
%nltk inline
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            
            #This means we're removing from the chink one or more verbs, prepositions, determiners, or the word 'to'.
            chunGramwithChink = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
            
    except Exception as e:
        print(str(e))

process_content()

ERROR: Line magic function `%nltk` not found.


# Named Entity Recognition

- named entity recognition sets a specific rules on which if this tokenized and POS tagged word adheres to this it is a named entity

- sample: Name George W. Bush

NE Type and Examples

- ORGANIZATION - Georgia-Pacific Corp., WHO
- PERSON       - Eddy Bonte, President Obama
- LOCATION     - Murray River, Mount Everest
- DATE         - June, 2008-06-29
- TIME         - two fifty a m, 1:30 p.m.
- MONEY        - 175 million Canadian Dollars, GBP 10.40
- PERCENT      - twenty pct, 18.75 %
- FACILITY     - Washington Monument, Stonehenge
- GPE          - South East Asia, Midlothian

In [13]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [14]:
#free contents from nltk, the following are from the state of the union address of George W. Bush
train_set = state_union.raw('2005-GWBush.txt')
test_set = state_union.raw('2006-GWBush.txt')

In [15]:
#tokenizing the training set, splitting them according to the punktsentencetokenizer algorithm
custom_sent_tokenizer = PunktSentenceTokenizer(train_set)

In [16]:
#tokenizing and testing with the test set
tokenized = custom_sent_tokenizer.tokenize(test_set)

In [18]:
%nltk
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            #Create a variable holding the named entity chunked
            namedEnt = nltk.ne_chunk(tagged, binary = True)
            namedEnt.draw()
            
    except Exception as e:
        print(str(e))

process_content()

ERROR: Line magic function `%nltk` not found.


# Lemmatizing

- lemmatizing is knowing the simplest word that is associated with the word being tested without losing information and making it more useful.
- We could also use lemmatizing as a means of chunking or grouping a specific set and making it more compact not like stemming
- lemmatizing can be configured using the part of speech keyword. sample pos='a'

In [19]:
from nltk.stem import WordNetLemmatizer

In [20]:
lemmatizer = WordNetLemmatizer()

In [21]:
print(lemmatizer.lemmatize('better'))

better


In [22]:
print(lemmatizer.lemmatize('better', pos='a'))

good
