## POS-Tagger

### NLTK Tagger

#### A part-of-speech tagger, or POS-tagger, processes a sequence of words, and attaches a part of speech tag to each word.

In [6]:
import nltk
nltk.download('tagsets')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('tagsets_json')


[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets_json to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets_json.zip.


True

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

text1 = word_tokenize("And now for something completely different")
print(nltk.pos_tag(text1))
print()

text2 = word_tokenize("They refuse to permit us to obtain the refuse permit")
print(nltk.pos_tag(text2))
print()

# to get the meaning of the tags
nltk.help.upenn_tagset('RB')

text3 = word_tokenize("The back door.")
print(nltk.pos_tag(text3))
print()

text4 = word_tokenize("I couldn’t get back to sleep")
print(nltk.pos_tag(text4))
print()

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]

[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
[('The', 'DT'), ('back', 'JJ'), ('door', 'NN'), ('.', '.')]

[('I', 'PRP'), ('couldn', 'VBP'), ('’', 'JJ'), ('t', 'NN'), ('get', 'VB'), ('back', 'RB'), ('to', 'TO'), ('sleep', 'VB')]



#### The text.similar() method takes a word w, finds all contexts w1 w w2, then finds all words w' that appear in the same context, i.e. w1 w' w2.

In [9]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [10]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
word_list = ['woman', 'bought', 'over', 'the']
for w in word_list:
    print("\nwords in text similar to '"+ w + "' are: ")
    text.similar(w)


words in text similar to 'woman' are: 
man time day year car moment world house family child country boy
state job place way war girl work word

words in text similar to 'bought' are: 
made said done put had seen found given left heard was been brought
set got that took in told felt

words in text similar to 'over' are: 
in on to of and for with from at by that into as up out down through
is all about

words in text similar to 'the' are: 
a his this their its her an that our any all one these my in your no
some other and


## Universal Part-of-Speech Tagset

#### Let's see which of these tags are the most common in the news category of the Brown corpus:

In [12]:
from nltk.corpus import brown
nltk.download('universal_tagset')
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]

## Nouns

#### Nouns generally refer to people, places, things, or concepts, e.g.: woman, Scotland, book, intelligence. Nouns can appear after determiners and adjectives, and can be the subject or object of the verb

In [13]:
word_tag_pairs = nltk.bigrams(brown_news_tagged)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]

['NOUN',
 'DET',
 'ADJ',
 'ADP',
 '.',
 'VERB',
 'CONJ',
 'NUM',
 'ADV',
 'PRT',
 'PRON',
 'X']

## Verbs

#### Verbs are words that describe events and actions, e.g. fall, eat. In the context of a sentence, verbs typically express a relation involving the referents of one or more noun phrases.

In [15]:
nltk.download('treebank')
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
word_tag_fd = nltk.FreqDist(wsj)
[wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\laric\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


['is',
 'said',
 'was',
 'are',
 'be',
 'has',
 'have',
 'will',
 'says',
 'would',
 'were',
 'had',
 'been',
 'could',
 "'s",
 'can',
 'do',
 'say',
 'make',
 'may',
 'did',
 'rose',
 'made',
 'does',
 'expected',
 'buy',
 'take',
 'get',
 'might',
 'sell',
 'added',
 'sold',
 'help',
 'including',
 'should',
 'reported',
 'according',
 'pay',
 'compared',
 'being',
 'fell',
 'began',
 'based',
 'used',
 'closed',
 "'re",
 'want',
 'see',
 'took',
 'yield',
 'offered',
 'set',
 'priced',
 'approved',
 'come',
 'noted',
 'cut',
 'ended',
 'found',
 'increased',
 'become',
 'think',
 'named',
 'go',
 'trying',
 'proposed',
 'received',
 'growing',
 'declined',
 'held',
 'give',
 'came',
 'use',
 'put',
 'making',
 'continue',
 'raise',
 'estimated',
 'called',
 'paid',
 'designed',
 'going',
 'expects',
 'seeking',
 'must',
 'plans',
 'wo',
 'increasing',
 'saying',
 'got',
 'owns',
 'trading',
 'acquired',
 'gained',
 'fined',
 'reached',
 'holding',
 'announced',
 'filed',
 'became',


#### Note that the items being counted in the frequency distribution are word-tag pairs. Since words and tags are paired, we can treat the word as a condition and the tag as an event, and initialize a conditional frequency distribution with a list of condition-event pairs. This lets us see a frequency-ordered list of tags given a word:

In [16]:
cfd1 = nltk.ConditionalFreqDist(wsj)
cfd1['yield'].most_common()
cfd1['cut'].most_common()

[('VERB', 25), ('NOUN', 3)]

#### We can reverse the order of the pairs, so that the tags are the conditions, and the words are the events. Now we can see likely words for a given tag. We will do this for the WSJ tagset rather than the universal tagset:

In [17]:
wsj = nltk.corpus.treebank.tagged_words()
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
list(cfd2['VBN'])

['been',
 'expected',
 'made',
 'compared',
 'based',
 'used',
 'priced',
 'sold',
 'named',
 'designed',
 'held',
 'fined',
 'taken',
 'paid',
 'traded',
 'increased',
 'said',
 'filed',
 'reached',
 'called',
 'scheduled',
 'disclosed',
 'reported',
 'proposed',
 'estimated',
 'set',
 'known',
 'built',
 'approved',
 'given',
 'acquired',
 'found',
 'offered',
 'received',
 'caused',
 'considered',
 'ordered',
 'required',
 'preferred',
 'led',
 'issued',
 'fixed',
 'listed',
 'prepared',
 'involved',
 'aimed',
 'needed',
 'launched',
 'produced',
 'put',
 'planned',
 'seen',
 'alleged',
 'valued',
 'barred',
 'become',
 'related',
 'improved',
 'changed',
 'provided',
 'come',
 'got',
 'allowed',
 'mixed',
 'suspended',
 'owned',
 'elected',
 'worried',
 'completed',
 'combined',
 'raised',
 'left',
 'placed',
 'invested',
 'fallen',
 'failed',
 'helped',
 'run',
 'opposed',
 'quoted',
 'continued',
 'threatened',
 'offset',
 'shipped',
 'eliminated',
 'followed',
 'sought',
 'hurt'

#### To clarify the distinction between VBD (past tense) and VBN (past participle), let's find words which can be both VBD and VBN, and see some surrounding text:

In [19]:
[w for w in cfd1.conditions() if 'VBD' in cfd1[w] and 'VBN' in cfd1[w]]
idx1 = wsj.index(('kicked', 'VBD'))
print(wsj[idx1-4:idx1+1])
idx2 = wsj.index(('kicked', 'VBN'))
print(wsj[idx2-4:idx2+1])

[('While', 'IN'), ('program', 'NN'), ('trades', 'NNS'), ('swiftly', 'RB'), ('kicked', 'VBD')]
[('head', 'NN'), ('of', 'IN'), ('state', 'NN'), ('has', 'VBZ'), ('kicked', 'VBN')]


## The Regular Expression Tagger

#### The regular expression tagger assigns tags to tokens on the basis of matching patterns. For instance, we might guess that any word ending in ed is the past participle of a verb, and any word ending with 's is a possessive noun. We can express these as a list of regular expressions:

In [20]:
patterns = [
     (r'.*ing$', 'VBG'),               # gerunds
     (r'.*ed$', 'VBD'),                # simple past
     (r'.*es$', 'VBZ'),                # 3rd singular present
     (r'.*ould$', 'MD'),               # modals
     (r'.*\'s$', 'NN$'),               # possessive nouns
     (r'.*s$', 'NNS'),                 # plural nouns
     (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'.*', 'NN'),                    # nouns (default)
     (r'^\d+$', 'CD'),
     (r'.*ing$', 'VBG'),               # gerunds, i.e. wondering
     (r'.*ment$', 'NN'),               # i.e. wonderment
     (r'.*ful$', 'JJ')                 # i.e. wonderful
 ]

regexp_tagger = nltk.RegexpTagger(patterns)
tagger = nltk.tag.sequential.RegexpTagger(patterns)

import nltk
from nltk.tokenize import word_tokenize
text1 = word_tokenize('Python is a high-level, general-purpose programming language')
print(tagger.tag(text1))
print()
print(nltk.pos_tag(text1)) # using NLTK


[('Python', 'NN'), ('is', 'NNS'), ('a', 'NN'), ('high-level', 'NN'), (',', 'NN'), ('general-purpose', 'NN'), ('programming', 'VBG'), ('language', 'NN')]

[('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), (',', ','), ('general-purpose', 'JJ'), ('programming', 'NN'), ('language', 'NN')]


## Text Blob Tagger

In [21]:
from textblob import TextBlob
wiki = TextBlob("Python is a high-level, general-purpose programming language. Python is a high-level, general-purpose programming language.")
print(wiki.tags)

import nltk
from nltk.tokenize import word_tokenize
print()
text1 = word_tokenize("Python is a high-level, general-purpose programming language. Python is a high-level, general-purpose programming language.")
print(nltk.pos_tag(text1))


[('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), ('general-purpose', 'JJ'), ('programming', 'NN'), ('language', 'NN'), ('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), ('general-purpose', 'JJ'), ('programming', 'NN'), ('language', 'NN')]

[('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), (',', ','), ('general-purpose', 'JJ'), ('programming', 'NN'), ('language', 'NN'), ('.', '.'), ('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), (',', ','), ('general-purpose', 'JJ'), ('programming', 'NN'), ('language', 'NN'), ('.', '.')]


In [22]:
import nltk
from textblob import TextBlob
wiki = TextBlob("Programming skill is very important for a analytics person.")
print(wiki.tags)
print(wiki.noun_phrases)
print()

import nltk
from nltk.tokenize import word_tokenize
text1 = word_tokenize("Programming skill is very important for a analytics person.")
print(nltk.pos_tag(text1))
print()

text2 = word_tokenize("what men?")
print(nltk.pos_tag(text2))
print()
print(nltk.help.upenn_tagset("VBZ"))
print(nltk.help.upenn_tagset("WP"))

[('Programming', 'VBG'), ('skill', 'NN'), ('is', 'VBZ'), ('very', 'RB'), ('important', 'JJ'), ('for', 'IN'), ('a', 'DT'), ('analytics', 'NNS'), ('person', 'NN')]
['programming', 'analytics person']

[('Programming', 'VBG'), ('skill', 'NN'), ('is', 'VBZ'), ('very', 'RB'), ('important', 'JJ'), ('for', 'IN'), ('a', 'DT'), ('analytics', 'NNS'), ('person', 'NN'), ('.', '.')]

[('what', 'WP'), ('men', 'NNS'), ('?', '.')]

VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...
None
WP: WH-pronoun
    that what whatever whatsoever which who whom whosoever
None


#### Lemmatization with POS Tags Specifications

In [23]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize Single Word with the appropriate POS tag
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# Lemmatize a Sentence with the appropriate POS tag
sentence = "The striped bats are hanging on their feet for best"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

foot
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']
