# NLTK Notebook

Tokenizer
     - Word Tokenizer
     - Sentence Tokenizer

Corpora
    - body of text, e.g. english language
lexicon
    - word and their meaning

In [9]:
#importing nltk.tokenize
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
text = "Dear Mr. Francisc, Today we will be going to study about NLTK, a natural language processing library. Built for python"

In [22]:
#prints out the tokenized text by word
textReturn = word_tokenize(text)
print(word_tokenize(text))

['Dear', 'Mr.', 'Francisc', ',', 'Today', 'we', 'will', 'be', 'going', 'to', 'study', 'about', 'NLTK', ',', 'a', 'natural', 'language', 'processing', 'library', '.', 'Built', 'for', 'python']


In [23]:
words = sent_tokenize(text)

#prints out the tokenized text by sentence
print(sent_tokenize(text))



['Dear Mr. Francisc, Today we will be going to study about NLTK, a natural language processing library.', 'Built for python']


# Stop Words

 - stop words are words that will not give out information and may be neglected and not added to the feature set of the analysis

In [24]:
from nltk.corpus import stopwords
stop_word = set(stopwords.words('english'))

In [25]:
filtered_words = [w for w in textReturn if not w in stop_word]

In [26]:
#printing filtered words
print filtered_words

['Dear', 'Mr.', 'Francisc', ',', 'Today', 'going', 'study', 'NLTK', ',', 'natural', 'language', 'processing', 'library', '.', 'Built', 'python']


# Stemming

- stemming is finding the word closest to the meaning of the actual word being used in the sentence

In [29]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
new_word = "Good Morning, we are hear to learn about stemming, today we must know what is stemming and how it works"
related = ['python', 'pythoner', 'pythonista', 'pythonly', 'pythoning']

for pyn in related:
    print(ps.stem(pyn))

python
python
pythonista
pythonli
python


In [30]:
words = word_tokenize(new_word)
for pyn in words:
    print(ps.stem(pyn))

Good
Morn
,
we
are
hear
to
learn
about
stem
,
today
we
must
know
what
is
stem
and
how
it
work


# Part of Speech Tagging

- part of speech tagging is tagging the different words in a sentence or paragraph on what they are.

POS tag list:

- CC    ==> coordinating conjunction
- CD	==> cardinal digit
- DT	==> determiner
- EX    ==> existential there (like: "there is" ... think of it like "there exists")
- FW	==> foreign word
- IN	==> preposition/subordinating conjunction
- JJ	==> adjective	'big'
- JJR	==> adjective, comparative	'bigger'
- JJS	==> adjective, superlative	'biggest'
- LS	==> list marker	1)
- MD	==> modal	could, will
- NN	==> noun, singular 'desk'
- NNS	==> noun plural	'desks'
- NNP	==> proper noun, singular	'Harrison'
- NNPS	==> proper noun, plural	'Americans'
- PDT	==> predeterminer	'all the kids'
- POS	==> possessive ending	parent's
- PRP	==> personal pronoun	I, he, she
- PRP(DollarSign)	==> possessive pronoun	my, his, hers
- RB	==> adverb	very, silently,
- RBR	==> adverb, comparative	better
- RBS	==> adverb, superlative	best
- RP	==> particle	give up
- TO	==> to	go 'to' the store.
- UH	==> interjection	errrrrrrrm
- VB	==> verb, base form	take
- VBD	==> verb, past tense	took
- VBG	==> verb, gerund/present participle	taking
- VBN	==> verb, past participle	taken
- VBP	==> verb, sing. present, non-3d	take
- VBZ	==> verb, 3rd person sing. present	takes
- WDT	==> wh-determiner	which
- WP	==> wh-pronoun	who, what
- WP(DollarSign)	==> possessive wh-pronoun	whose
- WRB	==> wh-abverb	where, when

In [78]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [32]:
#free contents from nltk, the following are from the state of the union address of George W. Bush
train_set = state_union.raw('2005-GWBush.txt')
test_set = state_union.raw('2006-GWBush.txt')

In [33]:
#tokenizing the training set, splitting them according to the punktsentencetokenizer algorithm
custom_sent_tokenizer = PunktSentenceTokenizer(train_set)

In [36]:
#tokenizing and testing with the test set
tokenized = custom_sent_tokenizer.tokenize(test_set)

In [40]:
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()

[(u'PRESIDENT', 'NNP'), (u'GEORGE', 'NNP'), (u'W.', 'NNP'), (u'BUSH', 'NNP'), (u"'S", 'POS'), (u'ADDRESS', 'NNP'), (u'BEFORE', 'IN'), (u'A', 'NNP'), (u'JOINT', 'NNP'), (u'SESSION', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'CONGRESS', 'NNP'), (u'ON', 'NNP'), (u'THE', 'NNP'), (u'STATE', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'UNION', 'NNP'), (u'January', 'NNP'), (u'31', 'CD'), (u',', ','), (u'2006', 'CD'), (u'THE', 'NNP'), (u'PRESIDENT', 'NNP'), (u':', ':'), (u'Thank', 'NNP'), (u'you', 'PRP'), (u'all', 'DT'), (u'.', '.')]
[(u'Mr.', 'NNP'), (u'Speaker', 'NNP'), (u',', ','), (u'Vice', 'NNP'), (u'President', 'NNP'), (u'Cheney', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'Congress', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'the', 'DT'), (u'Supreme', 'NNP'), (u'Court', 'NNP'), (u'and', 'CC'), (u'diplomatic', 'JJ'), (u'corps', 'NN'), (u',', ','), (u'distinguished', 'JJ'), (u'guests', 'NNS'), (u',', ','), (u'and', 'CC'), (u'fellow', 'JJ'), (u'citizens', 'NN

# Chunking

- chunking is creating a set of conditions on which chunks all of the data the conforms with the conditions then outputs them into a list.


# Chinking

- chinking also works with chunking, it is the reverse process of chunking, it simply means we are going to remove something.


In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [3]:
#free contents from nltk, the following are from the state of the union address of George W. Bush
train_set = state_union.raw('2005-GWBush.txt')
test_set = state_union.raw('2006-GWBush.txt')

In [4]:
#tokenizing the training set, splitting them according to the punktsentencetokenizer algorithm
custom_sent_tokenizer = PunktSentenceTokenizer(train_set)

In [5]:
#tokenizing and testing with the test set
tokenized = custom_sent_tokenizer.tokenize(test_set)

In [10]:
%nltk inline
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            
            #This means we're removing from the chink one or more verbs, prepositions, determiners, or the word 'to'.
            chunGramwithChink = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
            
    except Exception as e:
        print(str(e))

process_content()

ERROR: Line magic function `%nltk` not found.


# Named Entity Recognition

- named entity recognition sets a specific rules on which if this tokenized and POS tagged word adheres to this it is a named entity

- sample: Name George W. Bush

NE Type and Examples

- ORGANIZATION - Georgia-Pacific Corp., WHO
- PERSON       - Eddy Bonte, President Obama
- LOCATION     - Murray River, Mount Everest
- DATE         - June, 2008-06-29
- TIME         - two fifty a m, 1:30 p.m.
- MONEY        - 175 million Canadian Dollars, GBP 10.40
- PERCENT      - twenty pct, 18.75 %
- FACILITY     - Washington Monument, Stonehenge
- GPE          - South East Asia, Midlothian

In [13]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [14]:
#free contents from nltk, the following are from the state of the union address of George W. Bush
train_set = state_union.raw('2005-GWBush.txt')
test_set = state_union.raw('2006-GWBush.txt')

In [15]:
#tokenizing the training set, splitting them according to the punktsentencetokenizer algorithm
custom_sent_tokenizer = PunktSentenceTokenizer(train_set)

In [16]:
#tokenizing and testing with the test set
tokenized = custom_sent_tokenizer.tokenize(test_set)

In [18]:
%nltk
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            #Create a variable holding the named entity chunked
            namedEnt = nltk.ne_chunk(tagged, binary = True)
            namedEnt.draw()
            
    except Exception as e:
        print(str(e))

process_content()

ERROR: Line magic function `%nltk` not found.


# Lemmatizing

- lemmatizing is knowing the simplest word that is associated with the word being tested without losing information and making it more useful.
- We could also use lemmatizing as a means of chunking or grouping a specific set and making it more compact not like stemming
- lemmatizing can be configured using the part of speech keyword. sample pos='a'

In [19]:
from nltk.stem import WordNetLemmatizer

In [20]:
lemmatizer = WordNetLemmatizer()

In [21]:
print(lemmatizer.lemmatize('better'))

better


In [22]:
print(lemmatizer.lemmatize('better', pos='a'))

good


# Corpora

- again, we will be discussing about corpora, it is a collection of data sets that can be used from the natural language tool kit downloadable library

In [None]:
#Sample import of a corpora downloaded from the nltk.download()
from nltk.corpus import gutenberg

# Wordnet

- is a way on which we can access the different possibilities of information given by a word, information like antonyms, synonyms, definition / description, examples and a lot more

# Synsets

- enables us to get the different information of the word which we passed through the function
- a set of synonyms from a single information source or a single meaning

# Lemmas

- enables us to get the synonymous words of the word that we passed in, not only the synonymous word of one meaning but also the synoynmous word of a different meaning, after passing synsets.

- sample: Good may have a synonymous word of good

# Similarity

- similarity gives us a percentage on which 2 words are similar

In [59]:
#importing wordnet
from nltk.corpus import wordnet

In [60]:
#creating a variable that will hold the synsets of the word program
#We can change the program inside the module synsets
syns = wordnet.synsets("able")

In [61]:
#prints out the complete synset value of the first object on the synset list
print(syns[0])

Synset('able.a.01')


In [62]:
#prints out the name first synsets of the word program
print(syns[0].name())

able.a.01


In [63]:
#prints out the lemmas of the synset of the first value from the synset list
print(syns[0].lemmas())

[Lemma('able.a.01.able')]


In [64]:
#prints out the name of the first lemmas of the synset of the first value form the synset list
print(syns[0].lemmas()[0].name())

able


In [65]:
#prints out the antonym with its properties of the word being passed through
print(syns[0].lemmas()[0].antonyms())

#prints out the antonym of the word being passed through
print(syns[0].lemmas()[0].antonyms()[0].name())

[Lemma('unable.a.01.unable')]
unable


In [66]:
#arranging everything into synonyms, antonyms and 

In [68]:
#wup_similarity will rate the word being compared on how related they are using wu and p's taxonomy algorithm
word1 = wordnet.synsets('ship')
word2 = wordnet.synsets('cactus')

print(word1[0].wup_similarity(word2[0]))

0.380952380952


In [77]:
#wup_similarity will rate the word being compared on how related they are using wu and p's taxonomy algorithm
word1 = wordnet.synsets('cat')
word2 = wordnet.synsets('dog')

print(word1[0].wup_similarity(word2[0]))

0.857142857143


# Text Classification (Bag of Words)

Process:
1. tokenizing the words
2. classifying using frequency distribution
3. testing
4. removing stop words
5. testing if how many positive and negative words are in a sentence



- text classification is the process of classifying the text or corpus into positives or negatives, the user can say what will be the outcome of the classification.

In [81]:
import nltk
import random
from nltk.corpus import movie_reviews

In [None]:
# Ternary 
# aa = [1,2,3]
# bb = [0,0,0]

# #foo = [(i.j) for i in aa for j in bb if i==1]
# print foo

# for i in aa:
#     for j in bb:
#         if i == 1:
#             print (i,j)



In [83]:
#creates a documents variable holding a list from words within the categories of the movie_review module
documents = [(list(movie_reviews.words(fileid)), category) 
             for category in movie_reviews.categories() 
             for fileid in movie_reviews.fileids(category)]

In [84]:
#randomizes the positioning of the items inside the list
random.shuffle(documents)

In [86]:
#lower cases all the items inside the list.
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [88]:
#Creates a frequency distribution of all the items
all_words = nltk.FreqDist(all_words)

In [89]:
#prints the most common words according to frequency
print(all_words.most_common(15))

[(u',', 77717), (u'the', 76529), (u'.', 65876), (u'a', 38106), (u'and', 35576), (u'of', 34123), (u'to', 31937), (u"'", 30585), (u'is', 25195), (u'in', 21822), (u's', 18513), (u'"', 17612), (u'it', 16107), (u'that', 15924), (u'-', 15595)]


In [92]:
#prints the number of times the word stupid is mentioned
print(all_words["stupid"])

253


# Converting words to Features

- converting words to features means that we must select the words that gives us information then remove all the words that doesn't give any information, we could also neglect words if they are not part of a given sample size on which we would train our Machine Learning Algorithm, the sample size is dependent to the user/analyst.

In [2]:
import nltk
from nltk.corpus import movie_reviews
import random

In [3]:
#creates a documents variable holding a list from words within the categories of the movie_review module
documents = [(list(movie_reviews.words(fileid)), category) 
             for category in movie_reviews.categories() 
             for fileid in movie_reviews.fileids(category)]

In [4]:
#randomizes the positioning of the items inside the list
random.shuffle(documents)

In [5]:
#lower cases all the items inside the list.
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [6]:
#Creates a frequency distribution of all the items
all_words = nltk.FreqDist(all_words)

In [7]:
word_features = list(all_words.keys())[:3000]

In [13]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
        
    return features

In [17]:
#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Naive Bayes Classifier with NLTK

- nltk has a built in naive bayes classifier like the one from scikit-learn

In [19]:
#Training sets will act as the main information on which the machine will learn using labels and features
training_set = featuresets[:1900]

#Testing sets will act as the checker of our machine learning algorithm it. The machine will test
#the learned algorithm on the testing set to know the accuracy of the learned algorithm
testing_set = featuresets[1900:]

In [20]:
#We created a Naive Bayes Classifier algorithm that will train on the training_set variable
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [21]:
#We the print out the
print("Classifier Accuracy Percentage: ", (nltk.classify.accuracy(classifier, testing_set))*100)

('Classifier Accuracy Percentage: ', 67.0)


In [23]:
classifier.show_most_informative_features(15)

Most Informative Features
               insulting = True              neg : pos    =      9.8 : 1.0
                    sans = True              neg : pos    =      8.4 : 1.0
            refreshingly = True              pos : neg    =      8.3 : 1.0
              mediocrity = True              neg : pos    =      7.7 : 1.0
                    hugo = True              pos : neg    =      7.6 : 1.0
                  wasted = True              neg : pos    =      6.7 : 1.0
             bruckheimer = True              neg : pos    =      6.4 : 1.0
             overwhelmed = True              pos : neg    =      6.3 : 1.0
               uplifting = True              pos : neg    =      5.8 : 1.0
                   wires = True              neg : pos    =      5.7 : 1.0
               dismissed = True              pos : neg    =      5.6 : 1.0
                    lang = True              pos : neg    =      5.6 : 1.0
                 topping = True              pos : neg    =      5.6 : 1.0

# Saving Classifiers in NLTK (using Pickle)

- saving classifiers could save us huge amount of time in traing and retraining our machine learning algorithm

In [24]:
#import pickle a standard library from python
import pickle

creating a pickle file

In [25]:
#open a naivebeyes.pickle file on the path your on, 'wb' means write in bytes - Python 3.x.x
save_classifier = open('naivebeyes.pickle', 'wb')

#create a dump, pass in the classifier we created before and the file to save into
pickle.dump(classifier, save_classifier)

#close the file 
save_classifier.close()

opening file

In [None]:
#open up the file
classifier_f = open('naivebeyes.pickle', 'rb')

#load the pickle file
classifier_f = pickle.load(classifier_f)

#close the file
classifier_f.close()

# Scikit-Learn with NLTK

In [29]:
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.svm import NuSVC,SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB