In [2]:
import nltk
import pandas as pd

## Load Dataset

In [3]:
# Read the tab-separated training file, using its PhraseId column as the index.
train = pd.read_csv("data/train.tsv", sep="\t", index_col="PhraseId")

# Print the (rows, columns) shape, then preview the first few rows.
print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


### Stemmer

In [4]:
from nltk.stem.snowball import SnowballStemmer

# Build one English Snowball stemmer; the cells below reuse it by name.
stemmer = SnowballStemmer('english')
stemmer

<nltk.stem.snowball.SnowballStemmer at 0x26b7b66bf98>

In [5]:
# Plural noun: the trailing "s" is stripped ('foods' -> 'food').
stemmer.stem('foods')

'food'

In [6]:
# Inflected verb: the "-ed" suffix is removed ('recommended' -> 'recommend').
stemmer.stem('recommended')

'recommend'

In [7]:
# A stem need not be a dictionary word: 'studied' becomes 'studi'.
stemmer.stem('studied')

'studi'

In [8]:
# Pull one review phrase (PhraseId 2274) out of the frame to experiment on,
# addressing row label and column in a single .loc lookup.
phrase = train.loc[2274, "Phrase"]
phrase

'Highly recommended viewing for its courage , ideas , technical proficiency and great acting .'

In [9]:
# Tokenize naively on single spaces; punctuation is already space-separated in this phrase.
words = phrase.split(" ")

stemmed_words = []

# Stem each token in turn, keeping the original order.
for word in words:
    stemmed_word = stemmer.stem(word)
    stemmed_words.append(stemmed_word)
    
# Glue the stems back together into a single string and display it.
stemmed_phrase = " ".join(stemmed_words)
stemmed_phrase

'high recommend view for it courag , idea , technic profici and great act .'

In [10]:
# Same space-split / stem / re-join pipeline, with the stemming step written
# as a single list comprehension.
words = phrase.split(" ")
stemmed_words = [stemmer.stem(token) for token in words]

# Re-assemble the stems into one space-separated string and display it.
stemmed_phrase = " ".join(stemmed_words)
stemmed_phrase

'high recommend view for it courag , idea , technic profici and great act .'

In [11]:
!pip install tqdm



In [12]:
from tqdm import tqdm

def stem_phrase(phrase):
    """Return `phrase` with every space-delimited token replaced by its stem.

    The phrase is split on single spaces, each piece is passed through the
    notebook-level `stemmer`, and the resulting stems are joined back
    together with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in phrase.split(" "))

# Enable pandas' progress_apply, with a labelled progress bar.
tqdm.pandas(desc="stemming...")
# Stem every phrase and store the result in a new column next to the original text.
train["Phrase(Stemmed)"] = train["Phrase"].progress_apply(stem_phrase)

train.head()

stemming...: 100%|███████████████████████████████████████████████████████████| 156060/156060 [00:24<00:00, 6417.68it/s]


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(Stemmed)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,a seri of escapad demonstr the adag that what ...
2,1,A series of escapades demonstrating the adage ...,2,a seri of escapad demonstr the adag that what ...
3,1,A series,2,a seri
4,1,A,2,a
5,1,series,2,seri


### Lemmatizer

In [13]:
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, so make sure that
# corpus is installed before the first lemmatize() call. The download is
# skipped when the package is already up to date; quiet=True keeps this
# cell's output limited to the lemmatizer repr.
nltk.download('wordnet', quiet=True)

lemmatizer = WordNetLemmatizer()
lemmatizer

<WordNetLemmatizer>

In [14]:
# Irregular plural: the stemmer leaves 'feet' untouched, the lemmatizer maps it to 'foot'.
stemmer.stem('feet'), lemmatizer.lemmatize('feet')

('feet', 'foot')

In [15]:
# The stemmer truncates to 'studi'; the lemmatizer returns the dictionary form 'study'.
stemmer.stem('studies'), lemmatizer.lemmatize('studies')

('studi', 'study')

In [16]:
# pos = part of speech. Told that 'went' is a verb (pos='v'), the lemmatizer
# returns 'go'; the stemmer leaves 'went' unchanged.
stemmer.stem('went'), lemmatizer.lemmatize('went', pos='v')

('went', 'go')

### POS (Part-of-Speech) Tagger

In [17]:
# Fetch the NLTK data the tagging cells rely on: the averaged-perceptron model
# behind pos_tag, and the Punkt models behind word_tokenize. The punkt call
# stays last so the cell's displayed value is unchanged.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\optim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Re-select the sample phrase (PhraseId 2274) with a single label/column lookup.
phrase = train.loc[2274, "Phrase"]

# Tokenize the raw phrase, then attach a part-of-speech tag to every token.
words_pos = pos_tag(word_tokenize(phrase))
words_pos

[('Highly', 'RB'),
 ('recommended', 'JJ'),
 ('viewing', 'NN'),
 ('for', 'IN'),
 ('its', 'PRP$'),
 ('courage', 'NN'),
 (',', ','),
 ('ideas', 'NNS'),
 (',', ','),
 ('technical', 'JJ'),
 ('proficiency', 'NN'),
 ('and', 'CC'),
 ('great', 'JJ'),
 ('acting', 'NN'),
 ('.', '.')]

In [19]:
# Stem first, then tag. Stemming leaves every word of this sentence unchanged,
# and the tagger labels 'close' as a base-form verb (VB).
phrase = "you should close the door"
words = phrase.split(" ")

stemmed_words = list(map(stemmer.stem, words))
stemmed_phrase = " ".join(stemmed_words)

pos_tag(word_tokenize(stemmed_phrase))

[('you', 'PRP'),
 ('should', 'MD'),
 ('close', 'VB'),
 ('the', 'DT'),
 ('door', 'NN')]

In [20]:
# Same pipeline on a sentence where stemming changes the words: 'already'
# becomes 'alreadi' and 'closed' becomes 'close' before tagging, and the
# tagger then labels 'alreadi' as VBN and 'close' as RB.
phrase = "the door is already closed"
words = phrase.split(" ")

stemmed_words = list(map(stemmer.stem, words))
stemmed_phrase = " ".join(stemmed_words)

pos_tag(word_tokenize(stemmed_phrase))

[('the', 'DT'),
 ('door', 'NN'),
 ('is', 'VBZ'),
 ('alreadi', 'VBN'),
 ('close', 'RB')]