# Stemming

## Test out Porter stemmer

In [2]:
import nltk

# Create one Porter stemmer instance; the cells below call its stem() method.
ps = nltk.PorterStemmer()

In [3]:
# List the stemmer's attributes; `stem` is the method used in the cells below.
dir(ps)

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_apply_rule_list',
 '_contains_vowel',
 '_ends_cvc',
 '_ends_double_consonant',
 '_has_positive_measure',
 '_is_consonant',
 '_measure',
 '_replace_suffix',
 '_step1a',
 '_step1b',
 '_step1c',
 '_step2',
 '_step3',
 '_step4',
 '_step5a',
 '_step5b',
 'mode',
 'pool',
 'stem',
 'vowels']

In [5]:
# The regular forms all reduce to 'grow'; the irregular past tense 'grew' is left unchanged.
print(ps.stem('grows'), ps.stem('growing'), ps.stem('grow'), ps.stem('grew'))

grow grow grow grew


In [6]:
# 'running' reduces to 'run', but 'runner' keeps its own stem.
print(ps.stem('run'), ps.stem('running'), ps.stem('runner'))

run run runner


## Read in raw text

In [8]:
import pandas as pd
import re
import string
pd.set_option('display.max_colwidth', 100)

# Read in the raw text (file downloaded from kaggle and renamed https://www.kaggle.com/uciml/sms-spam-collection-dataset)
# A context manager closes the file handle as soon as the text has been read.
with open("SMSSpamCollection.csv") as raw_file:
    rawData = raw_file.read()

# Turn the comma that follows each label into a tab, then turn every tab into
# a newline, so that labels and message bodies alternate in the split list.
# NOTE(review): str.replace matches "ham," / "spam," anywhere in the text, so a
# message body containing either string is split as well — confirm this is
# acceptable for this dataset.
parsedData = rawData.replace("ham,", "ham\t")
parsedData = parsedData.replace("spam,", "spam\t")
parsedData = parsedData.replace("\t", "\n").split("\n")
# Drop the first entry (presumably the CSV header row — verify against the file).
parsedData = parsedData[1:]
# Even positions hold the labels, odd positions hold the message bodies.
labelList = parsedData[0::2]
textList = parsedData[1::2]
data = pd.DataFrame({'label': labelList,
                     'body_text': textList})
data.head()

Unnamed: 0,label,body_text
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,"U dun say so early hor... U c already then say...,,,"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,"


In [10]:
# English stopword list from NLTK; clean_text() below filters tokens against it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stopwords = nltk.corpus.stopwords.words('english')


def clean_text(text):
    """Strip punctuation from ``text``, tokenize it, and drop stopwords.

    Tokens are compared to the module-level ``stopwords`` exactly as given;
    no case folding happens inside this function.

    Args:
        text: The string to clean.

    Returns:
        A list of the remaining non-empty tokens, in their original order.
    """
    # Remove punctuation one character at a time (so "don't" becomes "dont").
    text = "".join(char for char in text if char not in string.punctuation)
    # Raw string: '\W' inside a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text is empty or starts/ends with a
    # separator; skip those so no blank tokens reach later steps.
    return [word for word in tokens if word and word not in stopwords]

# Lowercase every message, then tokenize it and remove stopwords.
data['body_text_nonstop'] = data['body_text'].apply(str.lower).apply(clean_text)
data.head()

Unnamed: 0,label,body_text,body_text_nonstop
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,"Ok lar... Joking wif u oni...,,,","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,"U dun say so early hor... U c already then say...,,,","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,","[nah, dont, think, goes, usf, lives, around, though]"


## Stem text

In [12]:
def stemming(tokenized_text):
    """Stem every token using the module-level stemmer ``ps``.

    Args:
        tokenized_text: An iterable of word tokens.

    Returns:
        A new list holding ``ps.stem(token)`` for each token, order preserved.
    """
    return [ps.stem(token) for token in tokenized_text]

# Stem each row's stopword-free token list.
data['body_text_stemmed'] = data['body_text_nonstop'].apply(stemming)
data.head()

Unnamed: 0,label,body_text,body_text_nonstop,body_text_stemmed
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,"Ok lar... Joking wif u oni...,,,","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,"U dun say so early hor... U c already then say...,,,","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


## Lemmatize

### Test out WordNet lemmatizer

In [13]:
import nltk

# WordNet lemmatizer, compared against the Porter stemmer in the cells below.
wn = nltk.WordNetLemmatizer()
# Fresh Porter stemmer instance (presumably so this section can run on its own).
ps = nltk.PorterStemmer()

In [15]:
# List the lemmatizer's attributes; `lemmatize` is the only public method shown.
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'lemmatize']

In [16]:
# The stemmer collapses both words to the same stem, 'mean'.
print(ps.stem('meanness'))
print(ps.stem('meaning'))

mean
mean


In [17]:
# The lemmatizer returns both words unchanged, keeping them distinct.
print(wn.lemmatize('meanness'))
print(wn.lemmatize('meaning'))

meanness
meaning


In [18]:
# The stemmer gives the singular and the plural two different stems.
print(ps.stem('goose'))
print(ps.stem('geese'))

goos
gees


In [19]:
# The lemmatizer maps both the singular and the irregular plural to 'goose'.
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))

goose
goose


### Lemmatize text

In [22]:
def lemmatizing(tokenized_text):
    """Lemmatize every token using the module-level lemmatizer ``wn``.

    Args:
        tokenized_text: An iterable of word tokens.

    Returns:
        A new list holding ``wn.lemmatize(token)`` for each token, order
        preserved.
    """
    return [wn.lemmatize(token) for token in tokenized_text]

# Lemmatize each row's stopword-free token list.
data['body_text_lemmatized'] = data['body_text_nonstop'].apply(lemmatizing)
data.head(10)

Unnamed: 0,label,body_text,body_text_nonstop,body_text_stemmed,body_text_lemmatized
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,"Ok lar... Joking wif u oni...,,,","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,"U dun say so early hor... U c already then say...,,,","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"
5,spam,"""FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for...","[freemsg, hey, darling, 3, weeks, word, back, id, like, fun, still, tb, ok, xxx, std, chgs, send...","[freemsg, hey, darl, 3, week, word, back, id, like, fun, still, tb, ok, xxx, std, chg, send, å, ...","[freemsg, hey, darling, 3, week, word, back, id, like, fun, still, tb, ok, xxx, std, chgs, send,..."
6,ham,"Even my brother is not like to speak with me. They treat me like aids patent.,,,","[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]","[even, brother, like, speak, treat, like, aid, patent]"
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr...","[per, request, mell, mell, oru, minnaminungint, nurungu, vettam, set, callertun, caller, press, ...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre..."
8,spam,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To ...,"[winner, valued, network, customer, selected, receivea, å, 900, prize, reward, claim, call, 0906...","[winner, valu, network, custom, select, receivea, å, 900, prize, reward, claim, call, 0906170146...","[winner, valued, network, customer, selected, receivea, å, 900, prize, reward, claim, call, 0906..."
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,"[mobile, 11, months, u, r, entitled, update, latest, colour, mobiles, camera, free, call, mobile...","[mobil, 11, month, u, r, entitl, updat, latest, colour, mobil, camera, free, call, mobil, updat,...","[mobile, 11, month, u, r, entitled, update, latest, colour, mobile, camera, free, call, mobile, ..."
