# PoS-conditioned KENTstemmer
1. import text
2. PoS-tag text
3. lemmatize text
4. evaluate lemmatization

In [None]:
# import nltk resources for lemmatizer evaluation
import nltk
from nltk.book import *

In [None]:
# PoS-tag the tokens of nltk.book.text1; the result is iterated as
# (word, PoS) pairs by the evaluation code further down
tag = nltk.pos_tag(nltk.book.text1)
# show the first ten tagged tokens as a quick sanity check
tag[:10]

In [None]:
# KENTstemmer3: read a tuple (word, PoS) return stem
import re

# PoS tag -> regex matching the suffix that is stripped from a token with that tag.
# Tags that are not listed here leave the (lower-cased) token unchanged.
_KENT3_SUFFIX_RULES = {
    "VBD": r'(ed)$',
    "VBN": r'(ed)$',
    "VBG": r'(ing)$',
    "VBZ": r'(s)$',
    "NN":  r'(ment|ence|ation)$',
    "NNS": r'(s|ies)$',
    "NNP": r'(ed)$',
    "JJ":  r'(est|ing|ly)$',
    "JJR": r'(er)$',
    "JJS": r'(est)$',
    "RBR": r'(er)$',
    "RBS": r'(est)$',
    "RB":  r'(ly|ed)$',
    "FW":  r'(o)$',
}


def KENTstemmer3(t, dic):
    """Return the stem of a PoS-tagged token.

    Args:
        t: a (word, PoS) tuple.
        dic: exception lexicon mapping a lower-cased token to its lemma.

    Returns:
        The lemma from `dic` if the lower-cased word is listed there;
        otherwise the lower-cased word with the suffix for its PoS tag
        removed (unchanged if the tag has no rule or the suffix is absent).
    """
    (wrd, pos) = t
    stem = wrd.lower()

    # An exception-lexicon entry is already the final lemma. Running the
    # suffix rules on it as well would damage it (e.g. "needed" -> "need" -> "ne").
    if stem in dic:
        return dic[stem]

    pattern = _KENT3_SUFFIX_RULES.get(pos)
    if pattern is None:
        return stem
    return re.sub(pattern, '', stem)

In [None]:
# read an exception dictionary

def readDictionary(dictionary) :
    """Read a lemma lexicon file into a token -> lemma dictionary.

    Each usable line holds a lemma, then whitespace, then the token; everything
    after the first run of whitespace (minus surrounding whitespace) is taken
    as the token. Both parts are lower-cased. Lines without two fields
    (blank lines, a lemma with no token) are skipped.

    Args:
        dictionary: path of the UTF-8 encoded lexicon file.

    Returns:
        dict mapping lower-cased token to lower-cased lemma; if a token occurs
        on several lines the last one wins.
    """
    Dic = {}
    with open(dictionary, "r", encoding="utf8") as file:
        for entry in file:
            # split on the first whitespace run only, so the token keeps any
            # inner whitespace
            fields = entry.split(None, 1)
            if len(fields) < 2:
                continue
            lem, tok = fields[0], fields[1].strip()
            Dic[tok.lower()] = lem.lower()
    return Dic

In [None]:
# uncomment to download additional packages if required
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

# Import the Porter and Snowball stemmers and the WordNet lemmatizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re

# set up the three baseline systems
wordNet = WordNetLemmatizer()
porter = PorterStemmer()
snowball = SnowballStemmer("english")

# reference lemmas, token -> lemma (adjust the path)
BNClemma = readDictionary("lemmaLexicon/BNC_lemmafile5-unix.txt")

# token sequence of NLTK text1
tok = nltk.book.text1

# the KENTstemmer needs a PoS tag for every token
tag = nltk.pos_tag(tok)

# counters: tokens found in the reference lexicon (bncM) and, per system,
# how many of those were mapped to the reference lemma
kentM = portM = snowM = wnM = bncM = 0

# exception lexicon used by the KENTstemmer
dic = readDictionary("lemmaLexicon/tokenLemma.txt")

# only tokens that have a reference lemma take part in the evaluation
for w, p in tag:
    if w not in BNClemma:
        continue
    if KENTstemmer3((w, p), dic) == BNClemma[w]:
        kentM += 1
    if porter.stem(w) == BNClemma[w]:
        portM += 1
    if snowball.stem(w) == BNClemma[w]:
        snowM += 1
    if wordNet.lemmatize(w) == BNClemma[w]:
        wnM += 1
    bncM += 1

# report token count, evaluated tokens and matches per system
print("Tokens:{} Match:{} KENT:{} PORT:{} SNOW:{} WN:{}".format(len(tag), bncM, kentM, portM, snowM, wnM))


In [None]:
# store lists of produced lemmas
# All seven lists are aligned position by position and cover only the tagged
# tokens that have a reference lemma in BNClemma. The filter is evaluated once
# here instead of once per list.
covered = [(w, p) for w, p in tag if w in BNClemma]

kentL = [KENTstemmer3((w, p), dic) for w, p in covered]   # KENTstemmer output
portL = [porter.stem(w)            for w, p in covered]   # Porter stems
snowL = [snowball.stem(w)          for w, p in covered]   # Snowball stems
wnL   = [wordNet.lemmatize(w)      for w, p in covered]   # WordNet lemmas
bncL  = [BNClemma[w]               for w, p in covered]   # reference lemmas
word  = [w                         for w, p in covered]   # the tokens themselves
pos   = [p                         for w, p in covered]   # their PoS tags


In [None]:
# accuracy of a lemmatizer: share of positions at which its output equals the
# reference (used below to compare the four stemmers)

def accuracy(l1, l2):
    """Return the fraction of positions at which l1 and l2 hold equal items.

    Args:
        l1: sequence of produced lemmas / stems.
        l2: sequence of reference lemmas, aligned with l1.

    Returns:
        A float between 0.0 and 1.0; 0.0 when both sequences are empty.

    Raises:
        ValueError: if the two sequences differ in length, because they are
            then not aligned and a position-wise comparison is meaningless.
    """
    if len(l1) != len(l2):
        raise ValueError(
            "accuracy() needs sequences of equal length, got {} and {}".format(len(l1), len(l2)))
    if not l1:
        return 0.0
    hits = sum(1 for a, b in zip(l1, l2) if a == b)
    return hits / len(l1)



In [None]:
# 
# print, for each system, the share of evaluated tokens whose output equals
# the BNC reference lemma (four significant digits)
print("accuracy KENT:{:4.4} PORT:{:4.4} SNOW:{:4.4} WN:{:4.4}".
      format(accuracy(kentL, bncL), 
             accuracy(portL, bncL),
             accuracy(snowL, bncL),
             accuracy(wnL, bncL)))

In [None]:
#tasks: 
#  add better lemmatization rules 
#  use different text material (nltk.book.text1, nltk.book.text2, nltk.book.text3, ...)
#  compute type / token ratio for each lemmatizer
#  compare with accuracy of KENTstemmer0
#  check frequency distribution of lemmatized / non-lemmatized words
#  check which words are not lemmatized and why


In [None]:
import pandas as pd
# one row per evaluated token: the token, its reference lemma, the output of
# each system and the PoS tag
df = pd.DataFrame({'token': word, 'BNC' : bncL, 'kent3' : kentL, 'porter': portL, 'snow':snowL, 'WN' : wnL, 'pos':pos})

# delete duplicates from the list of words / lemmas / stems
df1 = df.drop_duplicates()

# keep only the rows in which the KENTstemmer output differs from the correct
# (BNC) lemma; a boolean mask also copes with the case of no errors at all
df2 = df1[df1["kent3"] != df1["BNC"]]

# sort the errors by PoS and print out the first 50
result = df2.sort_values(by="pos")
result.head(50)
