<h1 align="center"> Generate Part-of-Speech (POS) tags using various Python libraries </h1>

<h2 align="center"> Generating POS tags using the Polyglot library </h2>

### List the languages supported by polyglot's POS tagger


In [1]:
from polyglot.downloader import downloader

# "pos2" is the task name under which polyglot publishes its POS models;
# the returned table lists every language that has one.
pos_languages_table = downloader.supported_languages_table("pos2")
print(pos_languages_table)

  1. Slovene                    2. French                     3. Hungarian                
  4. Swedish                    5. Spanish; Castilian         6. Portuguese               
  7. Indonesian                 8. English                    9. German                   
 10. Danish                    11. Czech                     12. Bulgarian                
 13. Italian                   14. Irish                     15. Dutch                    
 16. Finnish                  


### Download the English embeddings and POS model

In [2]:
from polyglot.downloader import downloader
# Fetch the English packages into the local polyglot_data directory; a package
# that is already present is reported as up-to-date instead of re-downloaded.
# NOTE(review): embeddings2.en is presumably a prerequisite of the pos2.en
# tagger -- confirm against the polyglot documentation.
downloader.download("embeddings2.en")
# Kept as the last expression so the notebook echoes the call's return value.
downloader.download("pos2.en")

[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /home/jalaj/polyglot_data...
[polyglot_data]   Package embeddings2.en is already up-to-date!
[polyglot_data] Downloading package pos2.en to
[polyglot_data]     /home/jalaj/polyglot_data...
[polyglot_data]   Package pos2.en is already up-to-date!


True

###  Import dependencies

In [3]:
import polyglot
from polyglot.text import Text, Word

### Detect the language

In [4]:
# Language detection happens on the Text object itself; no language hint is
# passed in.
text = Text("Bonjour, Mesdames.")
detected_language = text.language
print("Language Detected: Code={}, Name={}\n".format(detected_language.code,
                                                     detected_language.name))


Language Detected: Code=fr, Name=French



### Tokenization of the sentences

In [5]:
# Three short sentences joined into one string by implicit literal concatenation.
zen_excerpt = (
    "Beautiful is better than ugly. "
    "Explicit is better than implicit. "
    "Simple is better than complex."
)
# Despite its name, words_list is a Text object; .words holds the tokens.
words_list = Text(zen_excerpt)
print(words_list.words)

['Beautiful', 'is', 'better', 'than', 'ugly', '.', 'Explicit', 'is', 'better', 'than', 'implicit', '.', 'Simple', 'is', 'better', 'than', 'complex', '.']


### Generate POS tags for a given sentence


In [6]:
# Sentence to tag; double quotes let the apostrophe in "o'clock" stay unescaped.
sent = "We will meet at eight o'clock on Thursday morning."
text = Text(sent)

In [7]:
# Bare last expression: the notebook displays the list of (word, tag) tuples.
text.pos_tags

[('We', 'PRON'),
 ('will', 'AUX'),
 ('meet', 'VERB'),
 ('at', 'ADP'),
 ('eight', 'NUM'),
 ("o'clock", 'NOUN'),
 ('on', 'ADP'),
 ('Thursday', 'PROPN'),
 ('morning', 'NOUN'),
 ('.', 'PUNCT')]

In [8]:
# Render the tagged words of `text` as a two-column table:
# the word padded to 16 characters, followed by its POS tag.
print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
for word, tag in text.pos_tags:
    # The tag is the final column, so it takes no width. The previous `{:>2}`
    # spec right-aligned it to two characters, which indented one-character
    # tags such as "X" by a space and broke the column alignment.
    print(u"{:<16}{}".format(word, tag))

Word            POS Tag
------------------------------
We              PRON
will            AUX
meet            VERB
at              ADP
eight           NUM
o'clock         NOUN
on              ADP
Thursday        PROPN
morning         NOUN
.               PUNCT


In [9]:
text = Text("This is a car")
# Two-column table: the word padded to 16 characters, then its POS tag.
# (A bare `text.pos_tags` statement used to sit here; only the last expression
# of a cell is displayed, so its result was computed and thrown away.)
print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
for word, tag in text.pos_tags:
    # No width on the final column: `{:>2}` would right-align to two
    # characters and indent one-character tags such as "X".
    print(u"{:<16}{}".format(word, tag))

Word            POS Tag
------------------------------
This            DET
is              VERB
a               DET
car             NOUN


In [10]:
# Input with a run of punctuation; each mark comes back as its own token.
text = Text("Alexander the Great...!")
# Two-column table: the word padded to 16 characters, then its POS tag.
# (A bare `text.pos_tags` statement used to sit here; only the last expression
# of a cell is displayed, so its result was computed and thrown away.)
print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
for word, tag in text.pos_tags:
    # No width on the final column: `{:>2}` would right-align to two
    # characters and indent one-character tags such as "X".
    print(u"{:<16}{}".format(word, tag))

Word            POS Tag
------------------------------
Alexander       PROPN
the             DET
Great           PROPN
.               PUNCT
.               PUNCT
.               PUNCT
!               PUNCT


In [11]:
text = Text("Alexander the Great, was a king of the ancient Greek kingdom of Macedon.")
# Two-column table: the word padded to 16 characters, then its POS tag.
# (A bare `text.pos_tags` statement used to sit here; only the last expression
# of a cell is displayed, so its result was computed and thrown away.)
print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
for word, tag in text.pos_tags:
    # No width on the final column: `{:>2}` would right-align to two
    # characters and indent one-character tags such as "X".
    print(u"{:<16}{}".format(word, tag))

Word            POS Tag
------------------------------
Alexander       PROPN
the             DET
Great           PROPN
,               PUNCT
was             VERB
a               DET
king            NOUN
of              ADP
the             DET
ancient         ADJ
Greek           ADJ
kingdom         NOUN
of              ADP
Macedon         PROPN
.               PUNCT


<h2 align="center"> Generating POS tags using the spaCy library </h2>

###  Import dependencies

In [18]:
import spacy

###  Load model

In [19]:
# Load the pipeline once; the cells below call `nlp(...)` to produce tagged docs.
# NOTE(review): assumes the 'en_core_web_sm' model package is already installed.
nlp = spacy.load('en_core_web_sm')

### Generate POS tags for a given sentence

In [20]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# Header row, then an 80-character rule.
print("{:<32}{:<32}{}".format("Word", "Word Category", "POS Tag"))
print("-"*80)

# One row per token: token.pos_ goes under "Word Category" and token.tag_
# under "POS Tag".
for token in doc:
    print(u"{:<32}{:<32}{}".format(token.text, token.pos_, token.tag_))

Word                            Word Category                   POS Tag
--------------------------------------------------------------------------------
Apple                           PROPN                           NNP
is                              VERB                            VBZ
looking                         VERB                            VBG
at                              ADP                             IN
buying                          VERB                            VBG
U.K.                            PROPN                           NNP
startup                         NOUN                            NN
for                             ADP                             IN
$                               SYM                             $
1                               NUM                             CD
billion                         NUM                             CD


<h2 align="center"> Why do we need to develop our own POS tagger? </h2>


* Dealing with domain specific terminology

* Dealing with ambiguity 



In [21]:
# Sentence containing domain-specific vocabulary (a drug name and a dosage).
doc = nlp(u'The name of your medicine is Paracetamol 500mg Tablets (called paracetamol throughout this leaflet). ')

# Header row, then an 80-character rule.
print("{:<32}{:<32}{}".format("Word", "Word Category", "POS Tag"))
print("-"*80)

# One row per token: token.pos_ goes under "Word Category" and token.tag_
# under "POS Tag".
for token in doc:
    print(u"{:<32}{:<32}{}".format(token.text, token.pos_, token.tag_))

Word                            Word Category                   POS Tag
--------------------------------------------------------------------------------
The                             DET                             DT
name                            NOUN                            NN
of                              ADP                             IN
your                            ADJ                             PRP$
medicine                        NOUN                            NN
is                              VERB                            VBZ
Paracetamol                     PROPN                           NNP
500                             NUM                             CD
mg                              ADJ                             JJ
Tablets                         NOUN                            NNS
(                               PUNCT                           -LRB-
called                          VERB                            VBN
paracetamol                     NO