# Natural Language Processing using NLTK

In [1]:
import nltk

### Word Tokenization: Breaking a sentence into words

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize a single sentence into its individual words.
data = "This is a Hello World example of using NLTK"
tokens = word_tokenize(data)
print(tokens)

['This', 'is', 'a', 'Hello', 'World', 'example', 'of', 'using', 'NLTK']


### Sentence Tokenization: Splitting a paragraph into individual sentences

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Split a short paragraph into sentences; here "!" and "?" each close a sentence.
data = "My name is Dipanjan! Who are you?"
sentences = sent_tokenize(data)
print(sentences)

['My name is Dipanjan!', 'Who are you?']


In [4]:
# Tokenize the same text at two granularities: sentences and words.
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
phrases = sent_tokenize(data)
words = word_tokenize(data)

# One print call, one line per result (sentences first, then words).
print(phrases, words, sep="\n")

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']
['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']


### Stop Words: Stop words are natural-language words that carry very little meaning on their own, such as “and”, “the”, “a”, “an”, and similar words. They add little value to Natural Language Understanding (NLU) and are therefore usually removed before processing.

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)

# The English stop-word list is lowercase, so compare the lowercased token;
# a case-sensitive test lets capitalised stop words such as "All" slip through.
# The original token (with its casing) is what gets kept.
wordsFiltered = [w for w in words if w.lower() not in stopWords]

print(wordsFiltered)

print(words)

['work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']
['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']


### Stemming: It is the process of reducing derived words to their word stem or root form.

In [6]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
words = ["game", "gaming", "gamed", "games", "match", "tournament"]
ps = PorterStemmer()

# Print each stem next to the word it was derived from, as "stem<--word".
for word in words:
    stem = ps.stem(word)
    print(f"{stem}<--{word}")


game<--game
game<--gaming
game<--gamed
game<--games
match<--match
tournament<--tournament


### Parts of Speech (POS) Tagging: It simply means labeling each word with its appropriate part of speech.

In [19]:
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer


#### PunktSentenceTokenizer is the class behind the default sentence tokenizer, i.e. sent_tokenize(). You can also train the Punkt tokenizer on your own corpus of data by passing the training text to the constructor.

In [37]:
sentence = "the little yellow dog barked at the cat."

# An untrained Punkt tokenizer is used to split the text into sentences.
tokenizer = PunktSentenceTokenizer()
tokenized = tokenizer.tokenize(sentence)
print(tokenized)

# Tag each sentence separately. After the loop, `words` and `tagged` hold the
# values for the last sentence processed.
for sent in tokenized:
    words = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(words)
    print(tagged)






['the little yellow dog barked at the cat.']
[('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('cat', 'NN'), ('.', '.')]


#### Chunking - It is the process of extracting phrases from unstructured text. Simple tokens may not capture the actual meaning of the text, so it is advisable to treat a phrase such as “South Africa” as a single unit instead of the separate words ‘South’ and ‘Africa’. Chunking works on top of POS tagging: it takes POS tags as input and produces chunks as output. Just as there is a standard set of POS tags, there is a standard set of chunk tags such as Noun Phrase (NP), Verb Phrase (VP), etc. Chunking is very important when you want to extract information such as locations and person names from text; in NLP this task is called Named Entity Extraction.

In [41]:
# Define the chunk grammar as a regular expression over POS tags.
# Only noun-phrase (NP) chunking is considered here: an NP chunk is formed
# from an optional determiner (DT), followed by any number of adjectives (JJ),
# followed by a noun (NN).
grammer = '''
    NP: {<DT>?<JJ>*<NN>} #NP
    '''

chunkParser = nltk.RegexpParser(grammer)

# `tagged` is the (word, tag) list left over from the POS-tagging loop.
tree = chunkParser.parse(tagged)

# The whole sentence tree (S) is printed first, then each NP chunk.
for subtree in tree.subtrees():
    print(subtree)


(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN)
  ./.)
(NP the/DT little/JJ yellow/JJ dog/NN)
(NP the/DT cat/NN)


In [42]:
# Render the chunk tree graphically.
# NOTE(review): presumably opens a separate GUI window rather than producing
# inline notebook output — confirm before running headless.
tree.draw()