### Stemming & Lemmatization & POS

In [18]:
# Stemming with NLTK: run the Porter stemmer over a few sample words.
from nltk.stem import PorterStemmer, SnowballStemmer

words = [
    "eating", "eats", "eat", "ate", "adjustable",
    "rafting", "ability", "meeting", "generously",
]
stemmer = PorterStemmer()

# Show every word side by side with the stem produced for it.
for word in words:
    print(f"{word} | {stemmer.stem(word)}")

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet
generously | gener


In [17]:
# Same sample words as in the Porter example, stemmed with the English
# Snowball stemmer so the two outputs can be compared.
words = [
    "eating", "eats", "eat", "ate", "adjustable",
    "rafting", "ability", "meeting", "generously",
]
stemmer = SnowballStemmer("english")

for word in words:
    print(f"{word} | {stemmer.stem(word)}")

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet
generously | generous


In [21]:
# Lemmatization with spaCy: load the small English pipeline and print the
# lemma of every token in a sample sentence.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Niya talked for 3 hours although talking isn't her likes")

for token in doc:
    # Two spaces on each side of the bar keep the column layout unchanged.
    print(token, token.lemma_, sep="  |  ")

Niya  |  Niya
talked  |  talk
for  |  for
3  |  3
hours  |  hour
although  |  although
talking  |  talk
is  |  be
n't  |  not
her  |  her
likes  |  like


In [5]:
# Lemmatize (almost) the same words that were stemmed earlier, so the lemmas
# can be compared with the stems printed above.
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")

for token in doc:
    print(token, token.lemma_, sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meeting
better  |  well


In [19]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [22]:
# Baseline: lemmas the pipeline assigns to slang tokens such as "Bro" and
# "Brah" before any custom rule is registered.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

for token in doc:
    print(f"{token.text} | {token.lemma_}")

Bro | bro
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brah
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [29]:
# Customize lemmas through the pipeline's attribute ruler: map the slang
# tokens "Bro" and "Brah" to the lemma "Brother".
ar = nlp.get_pipe('attribute_ruler')

# Each inner list is one token pattern, so the two spellings get one pattern
# each. Writing them as a single dict ({"TEXT": "Bro", "TEXT": "Brah"})
# repeats the "TEXT" key; Python keeps only the last value, which leaves a
# pattern that matches "Brah" alone and never "Bro".
ar.add([[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]], {"LEMMA": "Brother"})

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [31]:
# Part-of-speech tagging: print each token's coarse POS label together with
# spaCy's human-readable explanation of that label.
doc = nlp("Wow! Harry said, what an amazing movie is Avenger!")

for token in doc:
    pos_label = token.pos_
    print(token, pos_label, spacy.explain(pos_label), sep="  |  ")

Wow  |  INTJ  |  interjection
!  |  PUNCT  |  punctuation
Harry  |  PROPN  |  proper noun
said  |  VERB  |  verb
,  |  PUNCT  |  punctuation
what  |  PRON  |  pronoun
an  |  DET  |  determiner
amazing  |  ADJ  |  adjective
movie  |  NOUN  |  noun
is  |  AUX  |  auxiliary
Avenger  |  PROPN  |  proper noun
!  |  PUNCT  |  punctuation


In [33]:
# Fine-grained tags: alongside the coarse POS label, print each token's
# detailed tag and the explanation spaCy gives for both.
doc = nlp("Wow! Harry said, what an amazing movie is Avenger!")

for token in doc:
    columns = (
        token,
        token.pos_,
        spacy.explain(token.pos_),
        token.tag_,
        spacy.explain(token.tag_),
    )
    print(*columns, sep="  |  ")

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
Harry  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
said  |  VERB  |  verb  |  VBD  |  verb, past tense
,  |  PUNCT  |  punctuation  |  ,  |  punctuation mark, comma
what  |  PRON  |  pronoun  |  WP  |  wh-pronoun, personal
an  |  DET  |  determiner  |  DT  |  determiner
amazing  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
movie  |  NOUN  |  noun  |  NN  |  noun, singular or mass
is  |  AUX  |  auxiliary  |  VBZ  |  verb, 3rd person singular present
Avenger  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer


In [34]:
# Drop whitespace, punctuation and "X" (other) tokens, keeping everything else
# from the most recently parsed `doc`.
filtered_tokens = [
    token
    for token in doc
    if token.pos_ not in ["SPACE", "PUNCT", "X"]
]

In [35]:
# Display the tokens that survived the POS filter.
print(filtered_tokens)

[Wow, Harry, said, what, an, amazing, movie, is, Avenger]
