## STEMMING AND LEMMATIZATION

#### Demonstrating COCA

In [22]:
# Stemming
# NOTE(review): the next line is an exercise prompt kept from the original notebook; nothing in this cell touches COCA.
# Explore coca for three different search strings - include three personality each
import nltk
from nltk.stem import PorterStemmer
# Porter stemmer instance reused by the cells that follow.
stemmerporter = PorterStemmer()

In [23]:
# Porter-stem five sample words in one pass; a..e are printed by the next cell.
a, b, c, d, e = map(
    stemmerporter.stem,
    ('sleeping', 'eating', 'craziness', 'laziness', 'dazzling'),
)

In [24]:
# Show the Porter stems, one per line.
print(a, b, c, d, e, sep="\n")

sleep
eat
crazi
lazi
dazzl


In [25]:
import nltk
from nltk.stem import LancasterStemmer
# Lancaster stemmer instance; the next cells run it on the same sample words used for the Porter stemmer.
stemmerLC = LancasterStemmer()

In [5]:
# Lancaster-stem the five sample words; a1..e1 are printed by the next cell.
a1, b1, c1, d1, e1 = (
    stemmerLC.stem(sample)
    for sample in ('sleeping', 'eating', 'craziness', 'laziness', 'dazzling')
)

In [26]:
# Show the Lancaster stems, one per line.
print(a1, b1, c1, d1, e1, sep="\n")

sleep
eat
crazy
lazy
dazzl


In [27]:
import nltk
from nltk.stem import RegexpStemmer
# Regex-driven stemmer: removes a trailing "ing", "s", "e" or "able" from a word.
# `min=4` is the minimum word length to stem; shorter words are returned unchanged (per the NLTK docs).
st = RegexpStemmer('ing$|s$|e$|able$', min=4)

In [28]:
# Regexp-stem the five sample words; a0..e0 are printed by the next cell.
a0, b0, c0, d0, e0 = [
    st.stem(sample)
    for sample in ('sleeping', 'eating', 'craziness', 'laziness', 'dazzling')
]

In [29]:
# Show the regexp stems, one per line.
print(a0, b0, c0, d0, e0, sep="\n")

sleep
eat
crazines
lazines
dazzl


In [10]:
 # Preview the tokens in the 'hindi.pos' file of NLTK's Indian-languages corpus.
 # NOTE(review): relies on `nltk` imported by an earlier cell and presumably needs the corpus data downloaded first - confirm.
 nltk.corpus.indian.words('hindi.pos')

['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक', 'संयुक्त', ...]

In [30]:
# Demonstrate Snowball stemming. The original intent was Hindi/Tamil/French, but the
# language tuple displayed below lists French and neither Hindi nor Tamil.
import nltk
from nltk.stem.snowball import SnowballStemmer
# Class attribute holding the names of the languages the Snowball stemmer supports.
SnowballStemmer.languages


('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [31]:
# Build a French Snowball stemmer and stem one sample token; the stem is the cell's displayed value.
French_stemmer = SnowballStemmer('french')
French_stemmer.stem('Bonjoura')

'bonjour'

In [32]:
from nltk.stem import PorterStemmer

# Stem a whole sentence token by token: split on single spaces, stem each
# piece with the Porter stemmer, then stitch the stems back together.
stemmer = PorterStemmer()
ex = "An quick brown fox jumped over a lazy dog"
ex = list(map(stemmer.stem, ex.split(" ")))
print(" ".join(ex))

An quick brown fox jump over a lazi dog


### What is Lemmatization?
Lemmatization is similar to stemming. The output of lemmatization is called a ‘lemma’, which is a root word, whereas the output of stemming is a root stem that need not be a word. After lemmatization, we always get a valid word.

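NLTK provides the WordNetLemmatizer class, which is a thin wrapper around the WordNet corpus. This class uses the morphy() function of the WordNet CorpusReader class to find a lemma. Note that lemmatize() treats a word as a noun unless a part of speech is passed (e.g. pos='v'), which is why 'eating' is returned unchanged in the example. Let us understand it with an example −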

In [33]:
# Lemmatizer
import nltk

from nltk.stem import WordNetLemmatizer
# WordNet-backed lemmatizer instance reused by the cells that follow.
lemmatizer = WordNetLemmatizer()

In [34]:
# lemmatize() is called without a part of speech, so each word is treated as a noun (the NLTK default):
# 'books' reduces to 'book' while 'eating' comes back unchanged.
q1, w1 = (lemmatizer.lemmatize(term) for term in ('eating', 'books'))

In [35]:
# Show the two lemmas, one per line.
print(q1, w1, sep="\n")

eating
book


In [36]:
# Side-by-side comparison of the three stemmers and the lemmatizer.
# Every column is left-aligned in a 20-character field.
word_list = [
    "friend", "friendship", "friends", "friendships",
    "stabil", "destabilize", "misunderstanding",
    "railroad", "moonlight", "football",
    "sleeping", "eating", "walking", "drinking",
    "staring", "glancing", "steering", "handling",
    "estimation",
]
row_format = "{0:20}{1:20}{2:20}{3:20}{4:20}"
print(row_format.format("Word", "Porter Stemmer", "lancaster Stemmer", "RE stemmer", "Lemmatizer"))
for word in word_list:
    # One reduced form per tool, in the same order as the header columns.
    reduced_forms = (
        stemmerporter.stem(word),
        stemmerLC.stem(word),
        st.stem(word),
        lemmatizer.lemmatize(word),
    )
    print(row_format.format(word, *reduced_forms))

Word                Porter Stemmer      lancaster Stemmer   RE stemmer          Lemmatizer          
friend              friend              friend              friend              friend              
friendship          friendship          friend              friendship          friendship          
friends             friend              friend              friend              friend              
friendships         friendship          friend              friendship          friendship          
stabil              stabil              stabl               stabil              stabil              
destabilize         destabil            dest                destabiliz          destabilize         
misunderstanding    misunderstand       misunderstand       misunderstand       misunderstanding    
railroad            railroad            railroad            railroad            railroad            
moonlight           moonlight           moonlight           moonlight           moonlight  

In [37]:
# Chinese word segmentation using jieba
import jieba

# cut_all=True selects jieba's "full mode"; the printed result contains
# overlapping candidate words, separated here by single spaces.
seg = jieba.cut(
    "全模式，把句子中所有的可以成词的词语都扫描出来, 速度非常快，但是不能解决歧义",
    cut_all=True,
)
print(" ".join(seg))

全 模式 ， 把 句子 中所 所有 的 可以 成 词 的 词语 都 扫描 描出 描出来 出来 ,    速度 非常 快 ， 但是 不能 能解 解决 歧义


In [38]:
# Basic text processing pipelining
# Run NLTK's word tokenizer on a Chinese sentence; the printed result below is a
# single token containing the whole sentence, i.e. it is not segmented into words.
import nltk
sent = "支持三种分词模式：精确模式，试图将句子最精确地切开，适合文本分析；全模式，"

words = nltk.word_tokenize(sent)
print(words)

['支持三种分词模式：精确模式，试图将句子最精确地切开，适合文本分析；全模式，']


In [39]:
# Sample corpus: a one-element list holding a single multi-sentence paragraph about lemmatization.
text = ["Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meaning to one word. Text preprocessing includes both Stemming as well as Lemmatization. Many times people find these two terms confusing. Some treat these two as same. Actually, lemmatization is preferred over Stemming because lemmatization does morphological analysis of the words."]
import nltk

# Split every paragraph into sentences, then print each sentence's token list.
for texts in text:
    sentences = nltk.sent_tokenize(texts)
    for words in map(nltk.word_tokenize, sentences):
        print(words)

['Lemmatization', 'is', 'the', 'process', 'of', 'grouping', 'together', 'the', 'different', 'inflected', 'forms', 'of', 'a', 'word', 'so', 'they', 'can', 'be', 'analysed', 'as', 'a', 'single', 'item', '.']
['Lemmatization', 'is', 'similar', 'to', 'stemming', 'but', 'it', 'brings', 'context', 'to', 'the', 'words', '.']
['So', 'it', 'links', 'words', 'with', 'similar', 'meaning', 'to', 'one', 'word', '.']
['Text', 'preprocessing', 'includes', 'both', 'Stemming', 'as', 'well', 'as', 'Lemmatization', '.']
['Many', 'times', 'people', 'find', 'these', 'two', 'terms', 'confusing', '.']
['Some', 'treat', 'these', 'two', 'as', 'same', '.']
['Actually', ',', 'lemmatization', 'is', 'preferred', 'over', 'Stemming', 'because', 'lemmatization', 'does', 'morphological', 'analysis', 'of', 'the', 'words', '.']


In [40]:
# Second sample corpus: a one-element list holding a single multi-sentence paragraph about Thanjavur.
text1 = ["Thanjavur, formerly Tanjore, is a city in the Indian state of Tamil Nadu. Thanjavur is an important center of South Indian religion, art, and architecture. Most of the Great Living Chola Temples, which are UNESCO World Heritage Monuments, are located in and around Thanjavur. The foremost among these, the Brihadeeswara Temple, is located in the centre of the city. Thanjavur is also home to Tanjore painting, a painting style unique to the region. Thanjavur is the headquarters of the Thanjavur District. The city is an important agricultural centre located in the Cauvery Delta and is known as the Rice bowl of Tamil Nadu. Thanjavur is administered by a municipal corporation covering an area of 36.33 km2 (14.03 sq mi) and had a population of 222,943 in 2011. Roadways are the major means of transportation, while the city also has rail connectivity. The nearest airport is Tiruchirapalli International Airport, located 59.6 km (37.0 mi) away from the city. The nearest seaport is Karaikal, which is 94 km (58 mi) away from Thanjavur."]
# Split every paragraph of text1 into sentences, then print each sentence's token list.
for texts in text1:
    sentences = nltk.sent_tokenize(texts)
    for words1 in map(nltk.word_tokenize, sentences):
        print(words1)

['Thanjavur', ',', 'formerly', 'Tanjore', ',', 'is', 'a', 'city', 'in', 'the', 'Indian', 'state', 'of', 'Tamil', 'Nadu', '.']
['Thanjavur', 'is', 'an', 'important', 'center', 'of', 'South', 'Indian', 'religion', ',', 'art', ',', 'and', 'architecture', '.']
['Most', 'of', 'the', 'Great', 'Living', 'Chola', 'Temples', ',', 'which', 'are', 'UNESCO', 'World', 'Heritage', 'Monuments', ',', 'are', 'located', 'in', 'and', 'around', 'Thanjavur', '.']
['The', 'foremost', 'among', 'these', ',', 'the', 'Brihadeeswara', 'Temple', ',', 'is', 'located', 'in', 'the', 'centre', 'of', 'the', 'city', '.']
['Thanjavur', 'is', 'also', 'home', 'to', 'Tanjore', 'painting', ',', 'a', 'painting', 'style', 'unique', 'to', 'the', 'region', '.']
['Thanjavur', 'is', 'the', 'headquarters', 'of', 'the', 'Thanjavur', 'District', '.']
['The', 'city', 'is', 'an', 'important', 'agricultural', 'centre', 'located', 'in', 'the', 'Cauvery', 'Delta', 'and', 'is', 'known', 'as', 'the', 'Rice', 'bowl', 'of', 'Tamil', 'Nadu', 