### Part-of-Speech Tagging 

<br>
<hr>

In [1]:
import nltk
#nltk.download('all')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from nltk.tag import pos_tag
from nltk.tag import pos_tag_sents
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [2]:
text = "Time to start with natural language processing. Python will make our life easier!"

In [3]:
# tokenization 
sent_tokens = sent_tokenize(text)
print(sent_tokens)

['Time to start with natural language processing.', 'Python will make our life easier!']


In [4]:
word_tokens = word_tokenize(text)
print(word_tokens)

['Time', 'to', 'start', 'with', 'natural', 'language', 'processing', '.', 'Python', 'will', 'make', 'our', 'life', 'easier', '!']


In [5]:
# apply POS-tagging 
tags = pos_tag(word_tokens)

In [6]:
# word gramatical classes 
print(tags)

[('Time', 'NNP'), ('to', 'TO'), ('start', 'VB'), ('with', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.'), ('Python', 'NNP'), ('will', 'MD'), ('make', 'VB'), ('our', 'PRP$'), ('life', 'NN'), ('easier', 'JJR'), ('!', '.')]


In [7]:
# what mean POS-tagger codes 
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [8]:
# list of POS-taggers identificated at text 
list_of_tags = []
for pair in tags:
    list_of_tags.append(pair[1])
list_of_tags = list(set(list_of_tags))
list_of_tags

['MD', 'IN', 'PRP$', 'NN', 'JJ', 'NNP', '.', 'JJR', 'VB', 'TO']

In [9]:
# POS-tagger explanation presenting in text 
for pos in list_of_tags:
  print(nltk.help.upenn_tagset(pos))

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would
None
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
None
PRP$: pronoun, possessive
    her his mine my our ours their thy your
None
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
None
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
None
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvet

<br>
<hr>
<br>


### POS-tagger on sentence 



In [10]:
text = "San francisco is the best city on the california state. On the other hand Colorado is bored."
sent_token = sent_tokenize(text)

In [11]:
print(sent_token)

['San francisco is the best city on the california state.', 'On the other hand Colorado is bored.']


In [12]:
# POS-tagger on sentence
sent_pos = pos_tag_sents(sent_token)

In [13]:
print(sent_pos)

[[('S', 'VB'), ('a', 'DT'), ('n', 'JJ'), (' ', 'NN'), ('f', 'NN'), ('r', 'VBZ'), ('a', 'DT'), ('n', 'JJ'), ('c', 'NN'), ('i', 'NN'), ('s', 'VBP'), ('c', 'NN'), ('o', 'IN'), (' ', 'NN'), ('i', 'NN'), ('s', 'VBP'), (' ', 'JJ'), ('t', 'NN'), ('h', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('b', 'NN'), ('e', 'NN'), ('s', 'NN'), ('t', 'NN'), (' ', 'NNP'), ('c', 'NN'), ('i', 'NN'), ('t', 'VBP'), ('y', 'NN'), (' ', 'NNP'), ('o', 'VBZ'), ('n', 'JJ'), (' ', 'NNP'), ('t', 'NN'), ('h', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('c', 'VBZ'), ('a', 'DT'), ('l', 'NN'), ('i', 'NN'), ('f', 'VBP'), ('o', 'JJ'), ('r', 'NN'), ('n', 'NN'), ('i', 'VBZ'), ('a', 'DT'), (' ', 'JJ'), ('s', 'NN'), ('t', 'VBD'), ('a', 'DT'), ('t', 'NN'), ('e', 'NN'), ('.', '.')], [('O', 'NNP'), ('n', 'CC'), (' ', 'NNP'), ('t', 'VBP'), ('h', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('o', 'VBZ'), ('t', 'JJ'), ('h', 'NN'), ('e', 'NN'), ('r', 'NN'), (' ', 'NNP'), ('h', 'VBZ'), ('a', 'DT'), ('n', 'JJ'), ('d', 'NN'), (' ', 'NNP'), ('C', 'NNP'), ('o', 'MD'),

In [14]:
# POS-tagger content 
list_pos_sentence = []

for pair in sent_pos:
  list_pos_sentence.append(pair[1])

list_content_pos = list(set(list_pos_sentence))
list_content_pos

[('n', 'CC'), ('a', 'DT')]