# Stemming
* How to create Stemmers using NLTK & Gensim libraries
* How to stem words and documents
* Advantages & Disadvantages of stemming

In [None]:
import pandas as pd, nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [None]:
# Fetch the 'punkt' tokenizer resource into the local NLTK data directory
punkt_resource = 'punkt'
nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Create stemmer using nltk

In [None]:
# Build the NLTK Porter stemmer through the `nltk` namespace: the bare name
# `PorterStemmer` is re-imported from gensim further down, so re-running this
# cell after that import would otherwise silently create a gensim stemmer.
stemr = nltk.stem.PorterStemmer()

In [None]:
# Sample sentence used by the tokenizing and stemming cells below
doc = (
    'I visited my grandparents last week; '
    'We had a good time together'
)

In [None]:
# Tokenize the document: lower-case it, then keep every run of word characters.
# The pattern is a raw string so that `\w` reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(doc.lower())
tokens


['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [None]:
# Print every token next to the stem the NLTK stemmer produces for it
for token in tokens:
  print(token, stemr.stem(token))

i i
visited visit
my my
grandparents grandpar
last last
week week
we we
had had
a a
good good
time time
together togeth


In [None]:
# Same stems as above, collected into a list instead of printed
list(map(stemr.stem, tokens))

['i',
 'visit',
 'my',
 'grandpar',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'togeth']

In [None]:
# Stem every token and glue the stems back together into one sentence
doc_stemmed = ' '.join(map(stemr.stem, tokens))
doc_stemmed

'i visit my grandpar last week we had a good time togeth'

In [None]:
# Two raw documents (a list of strings) to clean in one pass
docs = [
    'I visited my grandparents last week; We had a good time together',
    'nlp engineers spend most of their time on text cleaning',
]
docs_list = []

# Lower-case, tokenize and stem each document, then rebuild it as a string
for text in docs:
  toks = tokenizer.tokenize(text.lower())
  doc_cleaned = ' '.join(map(stemr.stem, toks))
  docs_list.append(doc_cleaned)

docs_list

['i visit my grandpar last week we had a good time togeth',
 'nlp engin spend most of their time on text clean']

# Stemming using gensim

In [None]:
# Import gensim's stemmer under an alias so it does not shadow the NLTK
# `PorterStemmer` imported at the top of the notebook (both classes share
# the same name, and the last import would otherwise win).
from gensim.parsing.porter import PorterStemmer as GensimPorterStemmer
gp = GensimPorterStemmer()

In [None]:
# Stem individual tokens: three inflections of the same verb
words = ['visited', 'visiting', 'visits']

for word in words:
  print(gp.stem(word))

visit
visit
visit


In [None]:
# Stem a whole document with a single call; no manual tokenizing needed
doc = (
    'I visited my grandparents last week; '
    'We had a good time together'
)
gp.stem_sentence(doc)

'i visit my grandpar last week; we had a good time togeth'

In [None]:
# Stem several documents together: `docs` is a list of strings, one per document
docs = [
    'I visited my grandparents last week; We had a good time together',
    'nlp engineers spend most of their time on text cleaning',
]

doc_stemmed = gp.stem_documents(docs)
doc_stemmed


['i visit my grandpar last week; we had a good time togeth',
 'nlp engin spend most of their time on text clean']

* Here gensim library will go through one document at a time 
* Then stem one token at a time internally
* So we can avoid writing an explicit `for` loop ourselves

## Performing on csv data

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate through google.colab's `auth` helper first; the statements
# below depend on this order (presumably it makes the default credentials
# available — confirm against the Colab docs).
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default Google credentials to the PyDrive auth object
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the next cell uses to fetch the CSV file
drive = GoogleDrive(gauth)

In [None]:
# Drive id of the file to fetch — replace the id with id of file you want to access
imdb_file_id = '12CUjW29tTTxYAcPhxuKb_qSn0UTzc4BR'
downloaded = drive.CreateFile({'id': imdb_file_id})
# Write the Drive file's content to a local CSV in the working directory
downloaded.GetContentFile('imdb_sentiment.csv')

In [None]:
# Load the downloaded reviews into a DataFrame and preview the first rows
csv_path = 'imdb_sentiment.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# Stem all the rows in the text column together.
# Reviews are lower-cased and stripped of every character that is not a word
# character, '+' or whitespace before stemming.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default (which would leave the punctuation in place); the raw
# string keeps `\w` / `\s` from being parsed as invalid string escapes.
reviews_clean = data['review'].str.lower().str.replace(r'[^\w+\s]', '', regex=True)
docs_stemmed = gp.stem_documents(reviews_clean)

In [None]:
# Preview the first ten stemmed reviews
docs_stemmed[0:10]

['a veri veri veri slowmov aimless movi about a distress drift young man',
 'not sure who wa more lost the flat charact or the audienc nearli half of whom walk out',
 'attempt arti with black white and clever camera angl the movi disappoint becam even more ridicul as the act wa poor and the plot and line almost nonexist',
 'veri littl music or anyth to speak of',
 'the best scene in the movi wa when gerardo is try to find a song that keep run through hi head',
 'the rest of the movi lack art charm mean if it about empti it work i guess becaus it empti',
 'wast two hour',
 'saw the movi todai and thought it wa a good effort good messag for kid',
 'a bit predict',
 'love the cast of jimmi buffet as the scienc teacher']

* We were able to stem all the words without using a `for` loop

# Pros and cons of stemming

### Advantages

In [None]:
# Collect an (original, stem) pair for every token that stemming changes.
toks_stems = []

# Same cleaning as before: lower-case, then drop everything except word
# characters, '+' and whitespace. regex=True is required on pandas >= 2.0,
# where str.replace treats the pattern as a literal string by default; the raw
# string avoids invalid-escape warnings for `\w` and `\s`.
docs = data['review'].str.lower().str.replace(r'[^\w+\s]', '', regex=True)
for review in docs:
  for tok in tokenizer.tokenize(review): # tokenizing
    stem = stemr.stem(tok) # stemming
    # Keep only the tokens the stemmer actually modified
    if tok != stem:
      toks_stems.append((tok, stem))

In [None]:
# One row per changed token: the original word and the root it was reduced to
pair_columns = ['original', 'root']
df = pd.DataFrame(data=toks_stems, columns=pair_columns)
df

Unnamed: 0,original,root
0,very,veri
1,very,veri
2,very,veri
3,slowmoving,slowmov
4,movie,movi
...,...,...
4481,exceptionally,except
4482,its,it
4483,ones,one
4484,intelligence,intellig


In [None]:
# Drop repeated (original, root) pairs and show the first ten unique ones
unique_pairs = df.drop_duplicates()
unique_pairs.head(10)

Unnamed: 0,original,root
0,very,veri
3,slowmoving,slowmov
4,movie,movi
5,distressed,distress
6,drifting,drift
7,was,wa
8,characters,charact
9,audience,audienc
10,nearly,nearli
11,walked,walk


In [None]:
# Count how many distinct original words collapse onto each root
distinct_pairs = df.drop_duplicates()
distinct_pairs['root'].value_counts()

continu    5
deliv      4
consid     4
relat      4
emot       4
          ..
machin     1
readi      1
roll       1
presenc    1
indic      1
Name: root, Length: 1464, dtype: int64

In [None]:
# Distinct original words that were all reduced to the root 'continu'.
# drop_duplicates must be *called*; without the parentheses the cell only
# displays the bound method and the duplicate rows are never removed.
df[df['root']=='continu'].drop_duplicates()

<bound method DataFrame.drop_duplicates of           original     root
1093   continually  continu
1116  continuation  continu
1815    continuity  continu
2168      continue  continu
2771  continuously  continu
3079    continuity  continu>

In [None]:
# Distinct original words that were all reduced to the root 'imagin'.
# drop_duplicates must be *called*; without the parentheses the cell only
# displays the bound method and the duplicate rows are never removed.
df[df['root']=='imagin'].drop_duplicates()

<bound method DataFrame.drop_duplicates of          original    root
118    imaginable  imagin
580   imagination  imagin
718    imaginable  imagin
977   imagination  imagin
1274  imagination  imagin
2380  imaginative  imagin
2570  imagination  imagin
3828      imagine  imagin
4349  imaginative  imagin>

### Disadvantage

In [None]:
# The stem of this word is itself a different English word, so meaning is lost
word = 'organization'
stemr.stem(word)

'organ'

In [None]:
# The stem produced here is not a dictionary word, which hurts readability
word = 'president'
stemr.stem(word)

'presid'