<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Part of Speech Tagging</H1></u></center>

In [None]:
import nltk

In [None]:
from nltk import word_tokenize
from nltk import pos_tag

In [None]:
# Example sentence used to demonstrate the default NLTK POS tagger.
string = 'I was watching movies'

In [None]:
# Split the sentence into tokens, then attach a POS tag to each token.
tokens = word_tokenize(string)
print(pos_tag(tokens))

In [None]:
# PRP: Personal pronoun
# VBD: Verb, past tense
# VBG: Verb, gerund
# NNS: Noun plural

In [None]:
# Tokenize and POS-tag a sample sentence so that its nouns can be extracted.
s = 'My favourite scientist is Carl Sagan'
s_tokens = word_tokenize(s)
tagged = pos_tag(s_tokens)

In [None]:
# Keep every token whose tag marks a noun. All Penn Treebank noun tags share
# the 'NN' prefix (NN, NNS, NNP, NNPS), so a prefix test also catches the
# plural forms that an exact match on 'NN'/'NNP' would miss.
allnoun = [word for word, pos in tagged if pos.startswith('NN')]
allnoun

## Stanford tagger

In [None]:
from nltk.tag.stanford import StanfordPOSTagger

In [None]:
# Locations of the Stanford POS tagger jar and of the English model it loads.
jar = '../Resources/stanford-postagger/stanford-postagger.jar'
model = '../Resources/stanford-postagger/models/english-bidirectional-distsim.tagger'

# The tagger takes the model file first and the jar second.
pos_tagger = StanfordPOSTagger(model, path_to_jar=jar)

# Tag a whitespace-tokenized sample sentence; the result is the cell output.
sample_words = 'The life is beautiful'.split()
pos_tagger.tag(sample_words)

In [None]:
# Second Stanford tagger instance, built from the "-full" distribution directory.
# NOTE(review): the previous cell loads the same model/jar names from
# '../Resources/stanford-postagger/' — confirm that both directories exist.
st = StanfordPOSTagger(
    '../Resources/stanford-postagger-full/models/english-bidirectional-distsim.tagger',
    '../Resources/stanford-postagger-full/stanford-postagger.jar',
)

# Tag a whitespace-tokenized question; the result is the cell output.
question_words = 'What is the airspeed of an unladen swallow ?'.split()
st.tag(question_words)

### The Brown Corpus was the first million-word electronic corpus of English, created in 1961 at Brown University. This corpus contains text from 500 sources, and the sources have been categorized by genre, such as news, editorial, and so on

In [None]:
from nltk.corpus import brown

In [None]:
# List the categories (genres) into which the Brown corpus texts are grouped.
brown.categories()

In [None]:
# Collect just the POS tag of every (word, tag) pair in the 'news' category.
tags = [pos for _word, pos in brown.tagged_words(categories='news')]

In [None]:
import operator

# Count how often each tag occurs among the collected tags.
freq = nltk.FreqDist(tags)

# Order the (tag, count) pairs by count, smallest first.
tags_freq = list(freq.items())
tags_freq.sort(key=operator.itemgetter(1))

# Because the order is ascending, the last ten entries are the ten most frequent tags.
tags_freq[-10:]

## Default Tagger:

In [None]:
# Baseline: tag every token as 'NN' and score that against the tagged
# sentences of the 'news' category.
# NOTE(review): recent NLTK releases deprecate `evaluate` in favour of
# `accuracy` — confirm against the installed version before switching.
brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger = nltk.DefaultTagger('NN')
default_score = default_tagger.evaluate(brown_tagged_sents)
print(default_score)

## N-gram Tagger:

### An N-gram tagger predicts the POS tag of a token from the token itself together with the tags of the n-1 preceding tokens.

In [None]:
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

In [None]:
# Train on the first 90% of the tagged sentences and hold out the last 10% for testing.
split_at = int(len(brown_tagged_sents) * 0.9)
train_data = brown_tagged_sents[:split_at]
test_data = brown_tagged_sents[split_at:]

In [None]:
# A unigram tagger assigns each token the tag it carried most often in the
# training data; tokens it cannot tag are handed to default_tagger.
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
unigram_score = unigram_tagger.evaluate(test_data)
print(unigram_score)

In [None]:
# A bigram tagger chooses the tag from the current word together with the tag
# of the preceding token; contexts it has not seen are handed to unigram_tagger.
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
bigram_score = bigram_tagger.evaluate(test_data)
print(bigram_score)

In [None]:
# A trigram tagger works the same way but conditions on the tags of the two
# preceding tokens; contexts it has not seen are handed to bigram_tagger.
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
trigram_score = trigram_tagger.evaluate(test_data)
print(trigram_score)

In [None]:
#The three n-gram taggers are chained through their backoff parameters:
#the TrigramTagger is tried first; if it cannot tag a token it backs off to
#the BigramTagger, then to the UnigramTagger, and finally to the default 'NN' tag.

## Regex tagger

In [None]:
from nltk.tag.sequential import RegexpTagger

In [None]:
# Rule-based tagger over Brown-style tags. The (pattern, tag) pairs are tried
# in order, so the specific rules must stay ahead of the catch-all at the end.
regexp_tagger = RegexpTagger([
    # cardinal numbers; the dot is escaped so that only a literal decimal
    # point is accepted (an unescaped '.' would also match e.g. '1a2')
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'(The|the|A|a|An|an)$', 'AT'),  # articles
    (r'.*able$', 'JJ'),               # adjectives
    (r'.*ness$', 'NN'),               # nouns formed from adjectives
    (r'.*ly$', 'RB'),                 # adverbs
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'.*ing$', 'VBG'),               # gerunds ('VBG' is the tag used by the corpus)
    (r'.*ed$', 'VBD'),                # past tense verbs
    (r'.*', 'NN'),                    # nouns (default)
])

In [None]:
# Score the rule-based tagger on the held-out sentences.
regexp_score = regexp_tagger.evaluate(test_data)
print(regexp_score)

## Named Entity Recognition (NER)

In [None]:
from nltk import ne_chunk

In [None]:
# The ne_chunk function recognizes named entities such as people (names),
# places (locations), and organizations.

In [None]:
# Sample sentence for the named-entity chunker.
text = 'Stephen Hawking teach maths at the Oxford University in England'

In [None]:
# POS-tag the tokens first, then chunk them; binary=False asks the chunker to
# label each entity with its category rather than a single generic label.
tagged_tokens = nltk.pos_tag(word_tokenize(text))
print(ne_chunk(tagged_tokens, binary=False))

In [None]:
# With binary=True the chunker only marks which spans are named entities
# (a single generic NE label) instead of assigning a category to each one.
tagged_tokens = nltk.pos_tag(word_tokenize(text))
print(ne_chunk(tagged_tokens, binary=True))

## Stanford NER

In [None]:
from nltk.tag.stanford import StanfordNERTagger

In [None]:
# Locations of the Stanford NER jar and of the classifier model it loads.
jar_ner = '../Resources/stanford-ner/stanford-ner.jar'
model_ner = '../Resources/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'

# The tagger takes the classifier model first and the jar second.
st_ner = StanfordNERTagger(model_ner, path_to_jar=jar_ner)

# Tag a whitespace-tokenized sample sentence; the result is the cell output.
ner_words = 'Carl Sagan taught at the Cornell University in USA'.split()
st_ner.tag(ner_words)

## References: 

https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

http://www.nltk.org/book/ch02.html

https://nlp.stanford.edu/software/

http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford

https://en.wikipedia.org/wiki/Brown_Corpus

https://nlp.stanford.edu/software/CRF-NER.html