<a href="https://colab.research.google.com/github/divya-r-kamat/END/blob/main/Session1/POS_Tagging_based_on_Heuristics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import necessary libraries

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
# Models required by word_tokenize (Punkt) and pos_tag (averaged perceptron).
# NLTK 3.9+ looks these up under the "punkt_tab" and
# "averaged_perceptron_tagger_eng" names, while older releases use the
# original names, so request both sets to keep a fresh-kernel run working
# on either version.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## A sentence under consideration for Information Extraction (NER)

In [2]:
# Example text (two sentences) used throughout the first half of the notebook.
sentence = (
    'Virat Kohli is an Indian cricketer who currently captains the India national team. '
    'A right-handed top-order batsman, Kohli is regarded as one of the best batsmen in the world'
)

## Apply word tokenization and part-of-speech tagging to the sentence

In [3]:
def preprocess(sent):
    """Tokenize raw text and attach a part-of-speech tag to every token.

    Args:
        sent: Raw text as a single string.

    Returns:
        The list of ``(token, tag)`` pairs produced by ``nltk.pos_tag``.
    """
    tokens = word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [4]:
# Tokenize and POS-tag the example text; `sent` holds (token, tag) pairs
# and is the input to the chunker in the following cells.
sent = preprocess(sentence)
# Bare last expression so the notebook displays the tagged tokens.
sent

[('Virat', 'NNP'),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('Indian', 'JJ'),
 ('cricketer', 'NN'),
 ('who', 'WP'),
 ('currently', 'RB'),
 ('captains', 'VBZ'),
 ('the', 'DT'),
 ('India', 'NNP'),
 ('national', 'JJ'),
 ('team', 'NN'),
 ('.', '.'),
 ('A', 'DT'),
 ('right-handed', 'JJ'),
 ('top-order', 'NN'),
 ('batsman', 'NN'),
 (',', ','),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('regarded', 'VBN'),
 ('as', 'IN'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('batsmen', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('world', 'NN')]

## Plot a Parse Tree

In [5]:
# Chunk grammar: a noun phrase (NP) is an optional determiner (DT), any
# number of adjectives (JJ), then exactly one noun tagged NN.
# Other tags (NNP, NNS, JJS, ...) never match, which is why "Virat Kohli"
# and "best batsmen" stay outside any chunk in the printed tree.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
NPChunker =  nltk.RegexpParser(pattern)
result = NPChunker.parse(sent)
print(result)
# NOTE(review): draw() opens a separate GUI window; presumably left disabled
# because the hosted runtime has no display — confirm before re-enabling.
#result.draw()

(S
  Virat/NNP
  Kohli/NNP
  is/VBZ
  (NP an/DT Indian/JJ cricketer/NN)
  who/WP
  currently/RB
  captains/VBZ
  the/DT
  India/NNP
  (NP national/JJ team/NN)
  ./.
  (NP A/DT right-handed/JJ top-order/NN)
  (NP batsman/NN)
  ,/,
  Kohli/NNP
  is/VBZ
  regarded/VBN
  as/IN
  one/CD
  of/IN
  the/DT
  best/JJS
  batsmen/NNS
  in/IN
  (NP the/DT world/NN))


In [6]:
from nltk.tree import Tree
# Render the chunk tree as ASCII art. `result` is first converted to its
# bracketed string form and re-parsed, so every leaf becomes the plain
# "word/TAG" text seen in print(result).
# NOTE(review): presumably done so leaves are labelled word/TAG in the
# drawing — confirm whether result.pretty_print() alone would suffice.
Tree.fromstring(str(result)).pretty_print()

                                                                                                                                  S                                                                                                                                                          
     _____________________________________________________________________________________________________________________________|_______________________________________________________________________________________________________________________________________________            
    |         |       |      |         |            |         |        |      |   |      |       |         |         |     |      |     |       |          |        |             NP                              NP                     NP                        NP             NP         
    |         |       |      |         |            |         |        |      |   |      |       |         |         |     |      |     |    

## IOB chunk tags (CoNLL format)

In [7]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, chunk tag) triples.
# Chunk tags follow the IOB scheme: B-NP begins a noun phrase, I-NP
# continues it, and O marks a token outside any chunk.
iob_tagged = tree2conlltags(result)
pprint(iob_tagged)

[('Virat', 'NNP', 'O'),
 ('Kohli', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('an', 'DT', 'B-NP'),
 ('Indian', 'JJ', 'I-NP'),
 ('cricketer', 'NN', 'I-NP'),
 ('who', 'WP', 'O'),
 ('currently', 'RB', 'O'),
 ('captains', 'VBZ', 'O'),
 ('the', 'DT', 'O'),
 ('India', 'NNP', 'O'),
 ('national', 'JJ', 'B-NP'),
 ('team', 'NN', 'I-NP'),
 ('.', '.', 'O'),
 ('A', 'DT', 'B-NP'),
 ('right-handed', 'JJ', 'I-NP'),
 ('top-order', 'NN', 'I-NP'),
 ('batsman', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('Kohli', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('regarded', 'VBN', 'O'),
 ('as', 'IN', 'O'),
 ('one', 'CD', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('best', 'JJS', 'O'),
 ('batsmen', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('world', 'NN', 'I-NP')]


In [8]:
# Inverse of tree2conlltags: rebuild the chunk tree from the IOB triples.
# The printed tree is the same as print(result), so nothing is lost in
# the round trip for this example.
tree=conlltags2tree(iob_tagged)
print(tree)

(S
  Virat/NNP
  Kohli/NNP
  is/VBZ
  (NP an/DT Indian/JJ cricketer/NN)
  who/WP
  currently/RB
  captains/VBZ
  the/DT
  India/NNP
  (NP national/JJ team/NN)
  ./.
  (NP A/DT right-handed/JJ top-order/NN)
  (NP batsman/NN)
  ,/,
  Kohli/NNP
  is/VBZ
  regarded/VBN
  as/IN
  one/CD
  of/IN
  the/DT
  best/JJS
  batsmen/NNS
  in/IN
  (NP the/DT world/NN))


**This is how information is extracted using heuristics-based techniques. Try using another pattern.**

In [9]:
import pandas as pd

# Second example sentence. It is tokenized and tagged with preprocess()
# (word_tokenize + pos_tag) rather than str.split(), so it goes through the
# same pipeline as the first example; a plain whitespace split would leave
# punctuation glued to neighbouring words on other inputs.
sentence = 'No new emoji may be released in 2021 due to COVID-19 pandemic word'
nltk_pos_tagged = preprocess(sentence)
# One row per token; bare last expression so the notebook renders the table.
pd.DataFrame(nltk_pos_tagged, columns=['word', 'POS tag'])

Unnamed: 0,word,POS tag
0,No,DT
1,new,JJ
2,emoji,NN
3,may,MD
4,be,VB
5,released,VBN
6,in,IN
7,2021,CD
8,due,JJ
9,to,TO


In [10]:
# Apply the NP chunker built in an earlier cell to the newly tagged tokens.
chunk_tree=NPChunker.parse(nltk_pos_tagged)
print(chunk_tree)

(S
  (NP No/DT new/JJ emoji/NN)
  may/MD
  be/VB
  released/VBN
  in/IN
  2021/CD
  due/JJ
  to/TO
  (NP COVID-19/JJ pandemic/JJ word/NN))


In [11]:
# Render the second chunk tree as ASCII art via its bracketed string form.
# Relies on `Tree` having been imported in an earlier cell.
Tree.fromstring(str(chunk_tree)).pretty_print()

                                                 S                                                        
   ______________________________________________|__________________________________________               
  |      |        |         |      |      |      |           NP                             NP            
  |      |        |         |      |      |      |      _____|_______            ___________|_________     
may/MD be/VB released/VBN in/IN 2021/CD due/JJ to/TO No/DT new/JJ emoji/NN COVID-19/JJ pandemic/JJ word/NN

