### Part of Speech (POS) tagging

In [15]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/fxr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fxr/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Labelling text in English

In [16]:
text = word_tokenize('As cities grow, they can often become more unequal as increased economic activity pushes up land value.')
nltk.pos_tag(text)

[('As', 'IN'),
 ('cities', 'NNS'),
 ('grow', 'VBP'),
 (',', ','),
 ('they', 'PRP'),
 ('can', 'MD'),
 ('often', 'RB'),
 ('become', 'VB'),
 ('more', 'RBR'),
 ('unequal', 'JJ'),
 ('as', 'IN'),
 ('increased', 'JJ'),
 ('economic', 'JJ'),
 ('activity', 'NN'),
 ('pushes', 'VBZ'),
 ('up', 'RP'),
 ('land', 'NN'),
 ('value', 'NN'),
 ('.', '.')]

In [17]:
nltk.download('tagsets')
for _, tag in nltk.pos_tag(text):
     print(nltk.help.upenn_tagset(tag))

IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
None
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...
None
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
    appear tend stray glisten obtain comprise detest tease attract
    emphasize mold postpone sever return wag ...
None
,: comma
    ,
None
PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us
None
MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't wi

[nltk_data] Downloading package tagsets to /home/fxr/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [6]:
text = word_tokenize('They do not permit other people to get residence permit')
nltk.pos_tag(text)

[('They', 'PRP'),
 ('do', 'VBP'),
 ('not', 'RB'),
 ('permit', 'VB'),
 ('other', 'JJ'),
 ('people', 'NNS'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('residence', 'NN'),
 ('permit', 'NN')]

### Labelling text in Spanish

In [3]:
nltk.download('cess_esp')
from nltk.corpus import cess_esp as cess
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt

[nltk_data] Downloading package cess_esp to /home/fxr/nltk_data...
[nltk_data]   Package cess_esp is already up-to-date!


### Train a model to tag with unigrams in Spanish

In [11]:
cess_sents = cess.tagged_sents()
fraction = int(len(cess_sents)*90/100)
uni_tagger = ut(cess_sents[:fraction])
uni_tagger.evaluate(cess_sents[fraction+1:])

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  uni_tagger.evaluate(cess_sents[fraction+1:])


0.8069484240687679

In [12]:
uni_tagger.tag('Robot humanoide cautiva a visitantes en feria de tecnología'.split(" "))

[('Robot', None),
 ('humanoide', None),
 ('cautiva', 'vmip3s0'),
 ('a', 'sps00'),
 ('visitantes', 'nccp000'),
 ('en', 'sps00'),
 ('feria', 'ncfs000'),
 ('de', 'sps00'),
 ('tecnología', 'ncfs000')]

### Train a model to tag with bigrams in Spanish

In [13]:
fraction = int(len(cess_sents)*90/100)
bi_tagger = bt(cess_sents[:fraction])
bi_tagger.evaluate(cess_sents[fraction+1:])

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bi_tagger.evaluate(cess_sents[fraction+1:])


0.1095272206303725

In [14]:
bi_tagger.tag('Robot humanoide cautiva a visitantes en feria de tecnología'.split(" "))

[('Robot', None),
 ('humanoide', None),
 ('cautiva', None),
 ('a', None),
 ('visitantes', None),
 ('en', None),
 ('feria', None),
 ('de', None),
 ('tecnología', None)]

### POS Tagging with Stanza

In [18]:
# !pip install stanza
import stanza
stanza.download('es')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json: 142kB [00:00, 15.4MB/s]                    
2022-01-16 23:02:06 INFO: Downloading default packages for language: es (Spanish)...
Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.3.0/models/default.zip: 100%|██████████| 566M/566M [01:29<00:00, 6.29MB/s] 
2022-01-16 23:03:41 INFO: Finished downloading models and saved to /home/fxr/stanza_resources.


In [23]:
nlp = stanza.Pipeline('es', processors ='tokenize, pos')
doc = nlp('Robot humanoide cautiva a visitantes en feria de tecnología')
doc

2022-01-16 23:08:04 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |

2022-01-16 23:08:04 INFO: Use device: cpu
2022-01-16 23:08:04 INFO: Loading: tokenize
2022-01-16 23:08:04 INFO: Loading: mwt
2022-01-16 23:08:04 INFO: Loading: pos
2022-01-16 23:08:04 INFO: Done loading processors!


[
  [
    {
      "id": 1,
      "text": "Robot",
      "upos": "NOUN",
      "feats": "Gender=Masc|Number=Sing",
      "start_char": 0,
      "end_char": 5
    },
    {
      "id": 2,
      "text": "humanoide",
      "upos": "ADJ",
      "feats": "Gender=Masc|Number=Sing",
      "start_char": 6,
      "end_char": 15
    },
    {
      "id": 3,
      "text": "cautiva",
      "upos": "VERB",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
      "start_char": 16,
      "end_char": 23
    },
    {
      "id": 4,
      "text": "a",
      "upos": "ADP",
      "start_char": 24,
      "end_char": 25
    },
    {
      "id": 5,
      "text": "visitantes",
      "upos": "NOUN",
      "feats": "Number=Plur",
      "start_char": 26,
      "end_char": 36
    },
    {
      "id": 6,
      "text": "en",
      "upos": "ADP",
      "start_char": 37,
      "end_char": 39
    },
    {
      "id": 7,
      "text": "feria",
      "upos": "NOUN",
      "feats": "Gender=Fem|Number=Si

In [24]:
for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.pos)

Robot NOUN
humanoide ADJ
cautiva VERB
a ADP
visitantes NOUN
en ADP
feria NOUN
de ADP
tecnología NOUN


### Bibliography:
- [Stanza](https://stanfordnlp.github.io/stanza/)
- [Stanza Online Demo](http://stanza.run/)
- [A Python Natural Language Processing Toolkit for Many Human Languages](https://arxiv.org/pdf/2003.07082.pdf)