<a href="https://colab.research.google.com/github/ejini6969/Text-Analytics/blob/main/Lecture_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import nltk
nltk.download("punkt")
nltk.download("tagsets")
nltk.download("averaged_perceptron_tagger")

from nltk.tokenize import word_tokenize
text = "i am learning python"
tokens = word_tokenize(text)        # tokenization first
print(nltk.pos_tag(tokens))         # then do post tagging (puncutation marks in sentence should not get tagged)

text = "I am learning Python"
tokens = word_tokenize(text) 
print(nltk.pos_tag(tokens) )        # `i` (noun) -> `I` (Personal pronoun) ; `python` (noun) -> `Python` (proper noun, singular)

text = "I am Andrew"
tokens = word_tokenize(text) 
print(nltk.pos_tag(tokens) )        # If word not found in dictionary, will be tagged depending on adjacent word type (typically noun or adjective)

[('i', 'NN'), ('am', 'VBP'), ('learning', 'VBG'), ('python', 'NN')]
[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Python', 'NNP')]
[('I', 'PRP'), ('am', 'VBP'), ('Andrew', 'RB')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
for x in ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']:
    print(nltk.help.upenn_tagset(x)) # give all details regarding specific tag


CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
None
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
None
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
None
EX: existential there
    there
None
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
None
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
None
JJ: adjective or nume

Regular expression tagging has lower performance compared to NLTK POS tagging as NLTK checks with dictionary and handles 36 types of tags while Regular expression requires users to specify each of them manually.

In [13]:
patterns = [
     (r'.*ing$', 'VBG'),               # gerunds
     (r'.*ed$', 'VBD'),                # simple past
     (r'.*es$', 'VBZ'),                # 3rd singular present
     (r'.*ould$', 'MD'),               # modals
     (r'.*\'s$', 'NN$'),               # possessive nouns
     (r'.*s$', 'NNS'),                 # plural nouns
     (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'.*', 'NN'),                    # nouns (default)
     (r'^\d+$', 'CD'),
     (r'.*ing$', 'VBG'),               # gerunds, i.e. wondering
     (r'.*ment$', 'NN'),               # i.e. wonderment
     (r'.*ful$', 'JJ')                 # i.e. wonderful
 ]

regexp_tagger = nltk.RegexpTagger(patterns)
tagger = nltk.tag.sequential.RegexpTagger(patterns)

text1 = word_tokenize('Python is a high-level, general-purpose programming language')
print(tagger.tag(text1))             # punctuations, adjectives, verb, determiner tagged as noun  -> inaccurate

[('Python', 'NN'), ('is', 'NNS'), ('a', 'NN'), ('high-level', 'NN'), (',', 'NN'), ('general-purpose', 'NN'), ('programming', 'VBG'), ('language', 'NN')]


In [14]:
from textblob import TextBlob
text2 = TextBlob('Python is a high-level, general-purpose programming language') 
text2.tags  # punctuation mark is missing since tokenization is not performed (same POS tagging results)

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('high-level', 'JJ'),
 ('general-purpose', 'JJ'),
 ('programming', 'NN'),
 ('language', 'NN')]

Lemmatization with POS Tags Specifications

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# specify POS
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN) 

# Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize Single Word with the appropriate POS tag (relevant lemma will be returned based on POS)
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# Lemmatize a Sentence with the appropriate POS tag
sentence = "The striped bats are hanging on their feet for best"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])