# Usando um POS tagger 

In [1]:
import nltk
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/ar/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [2]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [3]:
# será que existe forma de usar o pos_tag passando idioma? ou seja, existe algum modelo pre-treinado para POR?
text = nltk.word_tokenize('O gato branco fugiu do dono.')
nltk.pos_tag(text)

[('O', 'NNP'),
 ('gato', 'NN'),
 ('branco', 'NN'),
 ('fugiu', 'NN'),
 ('do', 'VBP'),
 ('dono', 'NNS'),
 ('.', '.')]

# comentários

1. nota sobre POS tags!
2. slides
3. documentação de UD vs tags MacMorpho (artigo sobre MacMorpho)

The text.similar() method takes a word w, finds all contexts w1 w w2, then finds all words w' that appear in the same context, i.e. w1 w' w2

In [4]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

man time day year car moment world house family child country boy
state job place way war girl work word


In [5]:
text.similar('over')

in on to of and for with from at by that into as up out down through
is all about


In [6]:
text.similar('the')

a his this their its her an that our any all one these my in your no
some other and


# lendo corpus tageados

In [7]:
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [8]:
nltk.corpus.mac_morpho.tagged_words()

[('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ...]

In [9]:
from nltk.corpus import brown

# https://stackoverflow.com/questions/29419379/nltk-typeerror-tagged-words-got-an-unexpected-keyword-argument-simplify-ta
# como mapeamento é implementado? usa contexto ou apenas 1-1 tags?
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.keys()

dict_keys(['DET', 'NOUN', 'ADJ', 'VERB', 'ADP', '.', 'ADV', 'CONJ', 'PRT', 'PRON', 'NUM', 'X'])

In [10]:
word_tag_pairs = nltk.bigrams(brown_news_tagged)
list(nltk.FreqDist(a[1] for (a,b) in word_tag_pairs if b[1] == 'NOUN'))[:10]

['NOUN', 'DET', 'ADJ', 'ADP', '.', 'VERB', 'CONJ', 'NUM', 'ADV', 'PRT']

In [11]:
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
word_tag_fd = nltk.FreqDist(wsj)
[word + '/' + tag for (word,tag) in word_tag_fd if tag == 'VERB'][:15]

['is/VERB',
 'said/VERB',
 'was/VERB',
 'are/VERB',
 'be/VERB',
 'has/VERB',
 'have/VERB',
 'will/VERB',
 'says/VERB',
 'would/VERB',
 'were/VERB',
 'had/VERB',
 'been/VERB',
 'could/VERB',
 "'s/VERB"]

In [12]:
cfd1 = nltk.ConditionalFreqDist(wsj)
cfd1['yield'].keys()

dict_keys(['NOUN', 'VERB'])

In [13]:
from nltk.corpus import mac_morpho

mac_tagged = mac_morpho.tagged_words()
mac_fd = nltk.FreqDist(tag for (word, tag) in mac_tagged)
mac_fd.keys()

dict_keys(['N', 'V', 'PREP', 'CUR', 'NUM', 'PREP|+', 'ART', 'NPROP', 'PROADJ', ',', 'PRO-KS-REL', 'N|AP', 'ADJ', 'KC', 'VAUX', '(', ')', 'KS', '"', 'PCP', 'ADV', 'PROPESS', 'V|+', 'PDEN', 'PROSUB', '-', ':', 'PRO-KS', 'VAUX|+', 'ADV-KS-REL', '$', 'N|HOR', ';', 'ART|+', '[', 'ADJ|EST', 'N|EST', 'N|TEL', 'N|DAT', 'KC|[', 'KC|]', '?', '!', 'ADV|[', 'ADV|]', 'ADV-KS', 'ADV|+', 'NUM|TEL', '.', 'IN', "'", '/', 'PREP|[', 'PREP|]', 'NPROP|+', 'PREP|', 'NPRO', 'ADV|EST', '...', 'N|DAD', '=', 'VAUX|!', 'ADV|HOR', 'ADJ|+', 'ART|EST', 'PREP|+]', 'PROP', 'V|EST', 'PREP|EST', 'KC|EST', 'PROADJ|+', 'KS|[', 'KS|]', 'IN|EST', 'PROPESS|EST', 'PDEN|EST', '((', '))', '`', 'PROPESS|+', 'KC|+', 'V|!'])

In [14]:
cfd1 = nltk.ConditionalFreqDist(mac_tagged)

In [15]:
cfd1['visto'] # can use .keys()

FreqDist({'PCP': 50, 'N': 6, 'KS': 4, 'V': 3})

In [16]:
cfd2 = nltk.ConditionalFreqDist((tag,word) for (word,tag) in mac_tagged)

In [17]:
cfd2['KS']

FreqDist({'que': 8857, 'se': 877, 'como': 569, 'quando': 549, 'Se': 537, 'porque': 520, 'do': 441, 'Quando': 253, 'Como': 182, 'enquanto': 175, ...})

In [18]:
tags = [a[1] for (a, b) in nltk.bigrams(mac_tagged) if b[0] == 'visto']
fd = nltk.FreqDist(tags)
fd.tabulate()

   VAUX       V       ,     ADV     ART   NPROP      KC     ADJ PROPESS 
     30      12       7       4       4       3       1       1       1 


# automatic tagging

In [19]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories = 'news')
brown_sents = brown.sents(categories = 'news')

## default tagger

In [20]:
tags = [tag for (word,tag) in brown.tagged_words(categories = 'news')]
nltk.FreqDist(tags).max()

'NN'

In [21]:
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [22]:
default_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  default_tagger.evaluate(brown_tagged_sents)


0.13089484257215028

## regex tagger

OBS: como poderiamos adaptar para o POR e avaliar com o MacMorpho?

In [23]:
patterns = [
 (r'.*ing$', 'VBG'),               # gerunds
 (r'.*ed$', 'VBD'),                # simple past
 (r'.*es$', 'VBZ'),                # 3rd singular present
 (r'.*ould$', 'MD'),               # modals
 (r'.*\'s$', 'NN$'),               # possessive nouns
 (r'.*s$', 'NNS'),                 # plural nouns
 (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
 (r'.*', 'NN')                     # nouns (default)
]

In [24]:
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])

[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD'),
 ("''", 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('jury', 'NN'),
 ('said', 'NN'),
 (',', 'NN'),
 ('``', 'NN'),
 ('considering', 'VBG'),
 ('the', 'NN'),
 ('widespread', 'NN'),
 ('interest', 'NN'),
 ('in', 'NN'),
 ('the', 'NN'),
 ('election', 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('number', 'NN'),
 ('of', 'NN'),
 ('voters', 'NNS'),
 ('and', 'NN'),
 ('the', 'NN'),
 ('size', 'NN'),
 ('of', 'NN'),
 ('this', 'NNS'),
 ('city', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

In [25]:
regexp_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  regexp_tagger.evaluate(brown_tagged_sents)


0.20326391789486245

## lookup (UnigramTagger)

In [26]:
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = list(fd.keys())[:500] # try 100
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)

baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff = nltk.DefaultTagger('NN')) # remove backoff
baseline_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  baseline_tagger.evaluate(brown_tagged_sents)


0.5912942299659885

In [27]:
sent = brown.sents(categories='news')[3]
sent = nltk.word_tokenize("The DELPH-IN Consortium is a collaboration among computational linguists from research sites world-wide.")
baseline_tagger.tag(sent)

[('The', 'AT'),
 ('DELPH-IN', 'NN'),
 ('Consortium', 'NN'),
 ('is', 'BEZ'),
 ('a', 'AT'),
 ('collaboration', 'NN'),
 ('among', 'IN'),
 ('computational', 'NN'),
 ('linguists', 'NN'),
 ('from', 'IN'),
 ('research', 'NN'),
 ('sites', 'NN'),
 ('world-wide', 'NN'),
 ('.', '.')]

### comentários

1. avaliação sobre anotação golden
2. guidelines (ex participio/adj https://www.conjugacao.com.br/formas-nominais-do-verbo/)
3. splitting dev, test and train

“Developing an annotated corpus is a major undertaking. Apart from the data, it generates sophisticated tools, documentation, and practices for ensuring high-quality annotation. The tagsets and other coding schemes inevitably depend on some theoretical position that is not shared by all. ”

Excerpt From: Steven Bird. “Natural Language Processing with Python.” Apple Books. 

## N-Gram tagging

### UniGram Tagging

In [28]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# http://www.nltk.org/api/nltk.tag.html 
# not using model 
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [29]:
unigram_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_tagger.evaluate(brown_tagged_sents)


0.9349006503968017

In [30]:
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_tagger.evaluate(test_sents)


0.8121200039868434

### general N-Gram tagging

In [31]:
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'CS'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [32]:
sent = nltk.word_tokenize("The DELPH-IN Consortium is a collaboration among computational linguists.")
bigram_tagger.tag(sent)

[('The', 'AT'),
 ('DELPH-IN', None),
 ('Consortium', None),
 ('is', None),
 ('a', None),
 ('collaboration', None),
 ('among', None),
 ('computational', None),
 ('linguists', None),
 ('.', None)]

In [33]:
bigram_tagger.evaluate(test_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram_tagger.evaluate(test_sents)


0.10206319146815508

In [34]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1) # contexts que apareceram pelo menos X vezes cutoff=2
t2.evaluate(test_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  t2.evaluate(test_sents)


0.8452108043456593

In [35]:
sent = nltk.word_tokenize("The DELPH-IN Consortium is a collaboration among computational linguists.")
t2.tag(sent)

[('The', 'AT'),
 ('DELPH-IN', 'NN'),
 ('Consortium', 'NN'),
 ('is', 'BEZ'),
 ('a', 'AT'),
 ('collaboration', 'NN'),
 ('among', 'IN'),
 ('computational', 'NN'),
 ('linguists', 'NN'),
 ('.', '.')]

#### comentários

1. tagging unknown words with UNK
2. “Thus, 1 out of 20 trigrams is ambiguous. Given the current word and the previous two tags, in 5% of cases there is more than one tag that could be legitimately assigned to the current word according to the training data.” Excerpt From: Steven Bird. “Natural Language Processing with Python.” Apple Books. 
3. confusion matrix ~> revise tagset
4. coarse-grained vs fine-grained tagsets
5. como determinar categoria (POS) de uma palavra? 
6. What kinds of linguistic phenomena are captured in these bigram statistics? 
7. Para saber mais https://web.stanford.edu/~jurafsky/slp3/3.pdf, nota rodapé pagina 4, ver  https://skeptric.com/ngram-sentence-boundaries/ 
8. Como implementar? See opensource codes such as Freeling?! Book tem considerações interessantes sobre performance de modelos n-grams.
9. Sources from https://www.nltk.org/book/ch05.html (stupid backoff from the slp3 book)
10. the HMM is implemented in http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.hmm

In [36]:
cfd = nltk.ConditionalFreqDist(((x[1], y[1], z[0]), z[1]) 
                               for sent in brown_tagged_sents 
                               for x, y, z in nltk.trigrams(sent))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.049297702068029296

In [37]:
test_tags = [tag for sent in brown.sents(categories='editorial') 
             for (word, tag) in t2.tag(sent)]
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
nltk.ConfusionMatrix(gold_tags, test_tags)

<ConfusionMatrix: 52073/61604 correct>