# Text Preprocessing in Python | Set 2

In [9]:
import re
import string

import nltk

# NLTK resources required by the cells in this notebook:
#   punkt                      -> word_tokenize
#   averaged_perceptron_tagger -> pos_tag
#   maxent_ne_chunker, words   -> ne_chunk
# 'punkt' is downloaded here as well: every later cell calls word_tokenize,
# which raises LookupError when the tokenizer models are not installed.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab'); if a LookupError names another resource, download that one.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eosindo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/eosindo/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/eosindo/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


## Part of Speech Tagging

In [3]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def pos_tagging(text):
    """Tokenize ``text`` and return its tokens paired with part-of-speech tags.

    The text is split with ``word_tokenize`` and the resulting tokens are
    handed to ``pos_tag``; whatever ``pos_tag`` produces is returned as-is.
    """
    return pos_tag(word_tokenize(text))

# Example: as the cell's last expression, the returned (token, tag) pairs are echoed.
pos_tagging('You just gave me a scare')

[('You', 'PRP'),
 ('just', 'RB'),
 ('gave', 'VBD'),
 ('me', 'PRP'),
 ('a', 'DT'),
 ('scare', 'NN')]

In [4]:
# Download the 'tagsets' resource first; the tag lookup below is expected
# to read its documentation from it.
nltk.download('tagsets')

# Print the description and example words for the 'NN' tag.
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data] Downloading package tagsets to /home/eosindo/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


### Chunking

In [5]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def chunking(text, grammar, draw=True):
    """Chunk ``text`` with a regular-expression grammar and print the result.

    Args:
        text: Sentence to tokenize, POS-tag and chunk.
        grammar: Chunk grammar handed to ``nltk.RegexpParser``,
            e.g. ``"NP: {<DT>?<JJ>*<NN>}"``.
        draw: When true (the default, which keeps the original behaviour),
            also call ``tree.draw()`` to show the tree in a GUI window.
            Pass ``False`` on kernels without a display, where opening
            the window would fail.

    Returns:
        None. The tree and its chunks are printed, not returned.
    """
    # label every token with its part of speech
    tagged_tokens = pos_tag(word_tokenize(text))

    # build a chunk parser from the grammar and run it on the tagged tokens
    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(tagged_tokens)

    # subtrees() yields the whole tree first, then each chunk found inside it
    for subtree in tree.subtrees():
        print(subtree)

    # drawing needs a graphical display, so it can be switched off
    if draw:
        tree.draw()
    
sentence = 'the little yellow bird is flying in the sky'
# Noun-phrase rule: an optional determiner (<DT>?), any number of
# adjectives (<JJ>*), followed by a singular noun (<NN>).
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunking(sentence, grammar)

(S
  (NP the/DT little/JJ yellow/JJ bird/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN))
(NP the/DT little/JJ yellow/JJ bird/NN)
(NP the/DT sky/NN)


### Named Entity Recognition 

In [None]:
Named Entity Recognition (NER) extracts information from unstructured text by classifying the entities mentioned in it into categories such as person, organization, event, and place. This gives us detailed knowledge about the text and about the relationships between its different entities.

In [10]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
  
def named_entity_recognition(text):
    """Print the named-entity chunk tree for ``text``.

    The text is tokenized, the tokens are POS-tagged, and the tagged
    tokens are passed to ``ne_chunk``. The resulting tree is printed;
    nothing is returned.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    entity_tree = ne_chunk(tagged_tokens)
    print(entity_tree)
  
# Sample sentence mentioning a person, an organization and a place;
# the function prints the entity tree rather than returning it.
text = 'Bill works for GeeksforGeeks so he went to Delhi for a meetup.'
named_entity_recognition(text)

(S
  (PERSON Bill/NNP)
  works/VBZ
  for/IN
  (ORGANIZATION GeeksforGeeks/NNP)
  so/RB
  he/PRP
  went/VBD
  to/TO
  (GPE Delhi/NNP)
  for/IN
  a/DT
  meetup/NN
  ./.)
