In [1]:
import nltk #import library nltk
from nltk.tokenize import word_tokenize #import word_tokenize for tokenizing text into words 
from nltk.tokenize import sent_tokenize #import sent_tokenize for tokenizing paragraph into sentences
from nltk.stem.porter import PorterStemmer #import Porter Stemmer Algorithm 
from nltk.stem import WordNetLemmatizer #import WordNet lemmatizer 
from nltk.corpus import stopwords #import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #import Indonesian Stemmer
import re #import regular expression

In [13]:
#sentence tokenization
def sentence_tokenization(s):
    """Split the text ``s`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize`` called with its default
    settings; the tokenizer's result is returned unchanged (the sample run
    in this notebook yields a list with one string per sentence).
    """
    return sent_tokenize(s)

# Demo: split a short three-sentence Indonesian paragraph into sentences.
# The bare call on the last line is the value the cell displays.
text_data = "Saya suka belajar. Karena ingin menjadi pintar. Selain itu, saya ingin membuat bahagia kedua orang tua."
sentence_tokenization(text_data)

['Saya suka belajar.',
 'Karena ingin menjadi pintar.',
 'Selain itu, saya ingin membuat bahagia kedua orang tua.']

In [15]:
#word tokenization
def word_tokenization(s):
    """Split the text ``s`` into word-level tokens.

    Thin wrapper around NLTK's ``word_tokenize`` called with its default
    settings; the tokenizer's result is returned unchanged. In the sample
    run in this notebook, punctuation marks come back as separate tokens.
    """
    return word_tokenize(s)
    
# Demo: tokenize the same Indonesian sample paragraph into words.
# The bare call on the last line is the value the cell displays.
text_data = "Saya suka belajar. Karena ingin menjadi pintar. Selain itu, saya ingin membuat bahagia kedua orang tua."
word_tokenization(text_data)

['Saya',
 'suka',
 'belajar',
 '.',
 'Karena',
 'ingin',
 'menjadi',
 'pintar',
 '.',
 'Selain',
 'itu',
 ',',
 'saya',
 'ingin',
 'membuat',
 'bahagia',
 'kedua',
 'orang',
 'tua',
 '.']

In [16]:
#casefolding
def casefolding(s):
    """Return ``s`` converted to lower case.

    Uses ``s.lower()``; characters without a lower-case form (digits,
    punctuation, whitespace) are left as they are.
    """
    return s.lower()

# Demo: lower-case the Indonesian sample paragraph.
# The bare call on the last line is the value the cell displays.
text_data = "Saya suka belajar. Karena ingin menjadi pintar. Selain itu, saya ingin membuat bahagia kedua orang tua."
casefolding(text_data)

'saya suka belajar. karena ingin menjadi pintar. selain itu, saya ingin membuat bahagia kedua orang tua.'

In [17]:
#Stemming Indonesian
def stemmingIndo(str):
    """Stem Indonesian text with the Sastrawi stemmer.

    Args:
        str: The text to stem. The parameter name shadows the built-in
            ``str`` inside this function; it is kept so existing callers
            (including keyword callers) keep working.

    Returns:
        Whatever ``stemmer.stem`` returns for the text. In the sample run in
        this notebook the result is also lower-cased and has its punctuation
        removed.
    """
    # Build the stemmer once and reuse it: constructing a StemmerFactory and
    # a stemmer on every call repeats the same setup work for no benefit.
    stemmer = getattr(stemmingIndo, "_stemmer", None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemmingIndo._stemmer = stemmer
    return stemmer.stem(str)

# Demo: stem the Indonesian sample paragraph.
# The bare call on the last line is the value the cell displays.
text_data = "Saya suka belajar. Karena ingin menjadi pintar. Selain itu, saya ingin membuat bahagia kedua orang tua."
stemmingIndo(text_data)

'saya suka ajar karena ingin jadi pintar selain itu saya ingin buat bahagia dua orang tua'

In [19]:
#Stemming English
def stemmingEnglish(str):
    """Stem every token of an English text with the Porter stemmer.

    The text is tokenized with ``word_tokenize``, each token is stemmed,
    and the stems are joined back together with single spaces. Punctuation
    tokens are joined the same way, so the result has spaces around them
    (see ' , ' and ' .' in the sample output).

    Args:
        str: The text to stem (the name shadows the built-in ``str`` inside
            this function).

    Returns:
        A single space-separated string of stems.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(str))

# Demo: stem an English sample sentence.
# The bare call on the last line is the value the cell displays.
text_data = "She had been with her father and sister when she was attacked and received first aid at the scene, an official said."
stemmingEnglish(text_data)

'she had been with her father and sister when she wa attack and receiv first aid at the scene , an offici said .'

In [23]:
# Print every token of a sample sentence next to its Porter stem.
porter_stemmer = PorterStemmer()

word_data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
# Tokenize first, then stem one token at a time.
nltk_tokens = word_tokenize(word_data)
for w in nltk_tokens:
    print("Actual: %s  Stem: %s" % (w, porter_stemmer.stem(w)))

Actual: It  Stem: It
Actual: originated  Stem: origin
Actual: from  Stem: from
Actual: the  Stem: the
Actual: idea  Stem: idea
Actual: that  Stem: that
Actual: there  Stem: there
Actual: are  Stem: are
Actual: readers  Stem: reader
Actual: who  Stem: who
Actual: prefer  Stem: prefer
Actual: learning  Stem: learn
Actual: new  Stem: new
Actual: skills  Stem: skill
Actual: from  Stem: from
Actual: the  Stem: the
Actual: comforts  Stem: comfort
Actual: of  Stem: of
Actual: their  Stem: their
Actual: drawing  Stem: draw
Actual: rooms  Stem: room


In [21]:
#Lemmatization
# Print every token of a sample sentence next to its WordNet lemma.
# lemmatize() is called with the token only (no part-of-speech argument);
# in the recorded output plural nouns are reduced ('readers' -> 'reader')
# while 'originated' and 'learning' come back unchanged.
wordnet_lemmatizer = WordNetLemmatizer()

word_data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
nltk_tokens = word_tokenize(word_data)
for w in nltk_tokens:
    print("Actual: %s  Lemma: %s" % (w, wordnet_lemmatizer.lemmatize(w)))

Actual: It  Lemma: It
Actual: originated  Lemma: originated
Actual: from  Lemma: from
Actual: the  Lemma: the
Actual: idea  Lemma: idea
Actual: that  Lemma: that
Actual: there  Lemma: there
Actual: are  Lemma: are
Actual: readers  Lemma: reader
Actual: who  Lemma: who
Actual: prefer  Lemma: prefer
Actual: learning  Lemma: learning
Actual: new  Lemma: new
Actual: skills  Lemma: skill
Actual: from  Lemma: from
Actual: the  Lemma: the
Actual: comforts  Lemma: comfort
Actual: of  Lemma: of
Actual: their  Lemma: their
Actual: drawing  Lemma: drawing
Actual: rooms  Lemma: room


In [24]:
#remove digit from string
def removeDigit(str):
    """Replace every ASCII digit (0-9) in the text with a single space.

    Digits are substituted one-for-one rather than deleted, so the result
    has the same length as the input and a run of digits becomes a run of
    spaces.

    Args:
        str: The text to clean (the name shadows the built-in ``str`` inside
            this function).

    Returns:
        The text with each digit replaced by a space.
    """
    return re.sub(r"[0-9]", " ", str)

# Demo: blank out the digits of a date written in Indonesian.
# The bare call on the last line is the value the cell displays.
text_data = "saya lahir tanggal 1 Januari 2016"
removeDigit(text_data)

'saya lahir tanggal   Januari     '

In [25]:
#pos tagging
def postag(str):
    """Tag every token of a text with its part of speech.

    The text is tokenized with ``nltk.word_tokenize`` and the tokens are
    passed to ``nltk.pos_tag``, both with default settings.

    Args:
        str: The text to tag (the name shadows the built-in ``str`` inside
            this function).

    Returns:
        The result of ``nltk.pos_tag``; in the sample run in this notebook
        that is a list of ``(token, tag)`` tuples.
    """
    return nltk.pos_tag(nltk.word_tokenize(str))

# Demo: part-of-speech tag an English sample sentence.
# The bare call on the last line is the value the cell displays.
text_data = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
postag(text_data)

[('It', 'PRP'),
 ('originated', 'VBD'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('idea', 'NN'),
 ('that', 'IN'),
 ('there', 'EX'),
 ('are', 'VBP'),
 ('readers', 'NNS'),
 ('who', 'WP'),
 ('prefer', 'VBP'),
 ('learning', 'VBG'),
 ('new', 'JJ'),
 ('skills', 'NNS'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('comforts', 'NNS'),
 ('of', 'IN'),
 ('their', 'PRP$'),
 ('drawing', 'NN'),
 ('rooms', 'NNS')]