## Inisialisasi

In [1]:
import spacy

In [2]:
nlp = spacy.load("id_nusantara")

In [3]:
sample_text = "Presiden Joko Widodo tersenyum gembira ketika Kereta Cepat␣Jakarta-Bandung atau KCJB bisa meraih kecepatan 350 kilometer per jam␣sehingga jarak 142,3 km jalur kereta cepat tersebut bisa ditempuh hanya␣dalam waktu 30 menit. Penunjuk kecepatan kereta dengan angka 350 km/h muncul␣di layar yang terpajang di bagian atas pintu penghubung antargerbong.␣Presiden menegaskan kereta tersebut tetap nyaman digunakan, bahkan saat␣mencapai kecepatan maksimal yang diperbolehkan, yaitu 350 km/jam."

In [4]:
doc = nlp(sample_text)

## Tokenisasi

In [5]:
for sent in doc.sents:
    print(sent.text)

Presiden Joko Widodo tersenyum gembira ketika Kereta Cepat␣Jakarta-Bandung atau KCJB bisa meraih kecepatan 350 kilometer per jam␣sehingga jarak 142,3 km jalur kereta cepat tersebut bisa ditempuh hanya␣dalam waktu 30 menit.
Penunjuk kecepatan kereta dengan angka 350 km/h muncul␣di layar yang terpajang di bagian atas pintu penghubung antargerbong.␣Presiden menegaskan kereta tersebut tetap nyaman digunakan, bahkan saat␣mencapai kecepatan maksimal yang diperbolehkan, yaitu 350 km/jam.


In [6]:
sentences = [sent for sent in doc.sents]

In [7]:
sample_sent = sentences[0]

In [8]:
print(sample_sent)

Presiden Joko Widodo tersenyum gembira ketika Kereta Cepat␣Jakarta-Bandung atau KCJB bisa meraih kecepatan 350 kilometer per jam␣sehingga jarak 142,3 km jalur kereta cepat tersebut bisa ditempuh hanya␣dalam waktu 30 menit.


## Langkah 2: Tokenisasi Kata

In [9]:
for token in sentences[0]:
    print(token.text)

Presiden
Joko
Widodo
tersenyum
gembira
ketika
Kereta
Cepat
␣
Jakarta
-
Bandung
atau
KCJB
bisa
meraih
kecepatan
350
kilometer
per
jam
␣
sehingga
jarak
142,3
km
jalur
kereta
cepat
tersebut
bisa
ditempuh
hanya
␣
dalam
waktu
30
menit
.


# Lematisasi

In [10]:
for token in sentences[0]:
    print(token.text, token.lemma_)

Presiden presiden
Joko joko
Widodo widodo
tersenyum tersenyum
gembira gembira
ketika ketika
Kereta kereta
Cepat cepat
␣ ␣
Jakarta jakarta
- -
Bandung bandung
atau atau
KCJB KCJB
bisa bisa
meraih raih
kecepatan cepat
350 350
kilometer kilometer
per per
jam jam
␣ ␣
sehingga sehingga
jarak jarak
142,3 142,3
km km
jalur jalur
kereta kereta
cepat cepat
tersebut tersebut
bisa bisa
ditempuh tempuh
hanya hanya
␣ ␣
dalam dalam
waktu waktu
30 30
menit menit
. .


In [11]:
for token in sentences[0]:
    print(token.text, token.norm_)

Presiden presiden
Joko joko
Widodo widodo
tersenyum tersenyum
gembira gembira
ketika ketika
Kereta kereta
Cepat cepat
␣ ␣
Jakarta jakarta
- -
Bandung bandung
atau atau
KCJB kcjb
bisa bisa
meraih meraih
kecepatan kecepatan
350 350
kilometer kilometer
per per
jam jam
␣ ␣
sehingga sehingga
jarak jarak
142,3 142,3
km km
jalur jalur
kereta kereta
cepat cepat
tersebut tersebut
bisa bisa
ditempuh ditempuh
hanya hanya
␣ ␣
dalam dalam
waktu waktu
30 30
menit menit
. .


## Tagging

In [12]:
for token in sentences[0]:
    print(token.text, token.pos_)

Presiden PROPN
Joko PROPN
Widodo PROPN
tersenyum VERB
gembira NOUN
ketika SCONJ
Kereta PROPN
Cepat PROPN
␣ PROPN
Jakarta PROPN
- PUNCT
Bandung PROPN
atau CCONJ
KCJB PROPN
bisa AUX
meraih VERB
kecepatan NOUN
350 NUM
kilometer NOUN
per DET
jam NOUN
␣ PROPN
sehingga SCONJ
jarak NOUN
142,3 NUM
km NOUN
jalur NOUN
kereta NOUN
cepat ADJ
tersebut DET
bisa AUX
ditempuh VERB
hanya ADV
␣ PROPN
dalam ADP
waktu NOUN
30 NUM
menit NOUN
. PUNCT


In [13]:
for token in sentences[0]:
    print(token.text, token.tag_)

Presiden NSD
Joko F--
Widodo F--
tersenyum X--
gembira VSA
ketika S--
Kereta NSD
Cepat ASP
␣ X--
Jakarta NSD
- Z--
Bandung NSD
atau H--
KCJB X--
bisa M--
meraih VSA
kecepatan NSD
350 CC-
kilometer F--
per R--
jam NSD
␣ X--
sehingga S--
jarak ASP
142,3 CC-
km F--
jalur NSD
kereta NSD
cepat ASP
tersebut B--
bisa M--
ditempuh VSP
hanya D--
␣ X--
dalam ASP
waktu NSD
30 CC-
menit NSD
. Z--


## Parsing

In [14]:
for token in sentences[0]:
    print(token.text, token.dep_)

Presiden nsubj
Joko flat:name
Widodo flat:name
tersenyum ROOT
gembira obj
ketika case
Kereta nsubj
Cepat flat:name
␣ flat:name
Jakarta flat:name
- punct
Bandung flat:name
atau cc
KCJB conj
bisa aux
meraih dep
kecepatan obj
350 nummod
kilometer nummod
per det
jam compound
␣ nmod
sehingga case
jarak nsubj:pass
142,3 nummod
km dep
jalur compound
kereta compound
cepat amod
tersebut det
bisa aux
ditempuh parataxis
hanya advmod
␣ obj
dalam case
waktu obl
30 nummod
menit nmod
. punct


In [15]:
from spacy import displacy
displacy.render(sentences[0], style="dep")

In [16]:
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="dep")

## Case Identification

In [17]:
for token in sentences[0]:
    print(token.text, token.lower_)

Presiden presiden
Joko joko
Widodo widodo
tersenyum tersenyum
gembira gembira
ketika ketika
Kereta kereta
Cepat cepat
␣ ␣
Jakarta jakarta
- -
Bandung bandung
atau atau
KCJB kcjb
bisa bisa
meraih meraih
kecepatan kecepatan
350 350
kilometer kilometer
per per
jam jam
␣ ␣
sehingga sehingga
jarak jarak
142,3 142,3
km km
jalur jalur
kereta kereta
cepat cepat
tersebut tersebut
bisa bisa
ditempuh ditempuh
hanya hanya
␣ ␣
dalam dalam
waktu waktu
30 30
menit menit
. .


# Morphology

In [18]:
for token in sentences[0]:
    print(token.text, token.morph)

Presiden 
Joko 
Widodo 
tersenyum 
gembira 
ketika 
Kereta 
Cepat 
␣ 
Jakarta 
- 
Bandung 
atau 
KCJB 
bisa 
meraih Mood=Ind|Voice=Act
kecepatan Number=Sing
350 NumType=Card
kilometer 
per PronType=Tot
jam Number=Sing
␣ 
sehingga 
jarak 
142,3 NumType=Card
km 
jalur Number=Sing
kereta Number=Sing
cepat 
tersebut PronType=Dem
bisa 
ditempuh Mood=Ind|Voice=Pass
hanya 
␣ 
dalam 
waktu Number=Sing
30 NumType=Card
menit Number=Sing
. 


## UGD 

Nama : Emmnuel Mathew Krisna Rata 
NPM : 200710530

In [19]:
import nltk
import pandas as pd
from pprint import pprint
import re
import string
import collections
import numpy as np
from contractionsid import CONTRACTION_MAP

In [20]:
df_sms_10530=pd.read_csv('dataset_sms_spam _v1.csv')

In [21]:
df_sms_10530.head(20)

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2
5,5 HARI LAGI ! EKSTRA Pulsa 50rb dg beli paket ...,2
6,"Ada iRing dgn tarif Rp. 0,1/7hr (perpanjangan ...",2
7,Akhir bulan harus tetap eksis loh! Internetan ...,2
8,Aktifkan iRing Coboy Jr - Terhebat. Tekan *808...,2
9,Ambil bonus harianmu di *600# (Bebas Pulsa). D...,2


Tokenisasi Kalimat

In [22]:
default_st = nltk.sent_tokenize
df_sms_10530['teks_token_kalimat'] = df_sms_10530['Teks'].apply(default_st)
df_sms_10530['teks_token_spacy_kalimat'] = df_sms_10530['Teks'].apply(lambda x: [sent.text for sent in nlp(x).sents])

In [23]:
print ("-------------------------------------------")
print ('\nTotal Kalimat teks_token_kalimat NLTK', len(df_sms_10530['teks_token_kalimat']))
print ('5 Lima Kalimat teks token dari awal:-')
pprint(df_sms_10530['teks_token_kalimat'].head(5))

-------------------------------------------

Total Kalimat teks_token_kalimat NLTK 1143
5 Lima Kalimat teks token dari awal:-
0    [[PROMO] Beli paket Flash mulai 1GB di MY TELK...
1    [2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...
2    [2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...
3    [2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...
4    [4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...
Name: teks_token_kalimat, dtype: object


In [24]:
print ("-------------------------------------------")
print ('\nTotal Kalimat teks_token_kalimat spacy', len(df_sms_10530['teks_token_spacy_kalimat']))
print ('5 Lima Kalimat teks token dari awal:-')
pprint(df_sms_10530['teks_token_spacy_kalimat'].head(5))

-------------------------------------------

Total Kalimat teks_token_kalimat spacy 1143
5 Lima Kalimat teks token dari awal:-
0    [[PROMO] Beli paket Flash mulai 1GB di MY TELK...
1    [2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...
2    [2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...
3    [2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...
4    [4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...
Name: teks_token_spacy_kalimat, dtype: object


In [25]:
# bentuk Tabel sekarang
df_sms_10530.head(5)

Unnamed: 0,Teks,label,teks_token_kalimat,teks_token_spacy_kalimat
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,"[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...","[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash..."
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,"[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...","[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash..."
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...


Tokenisasi Kata 

In [26]:
default_wt = nltk.word_tokenize
df_sms_10530['teks_token_kata'] = df_sms_10530['Teks'].apply(default_wt)
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['Teks'].apply(lambda x: [token.text for token in nlp(x)])

In [27]:
print ("-------------------------------------------")
print ('\nTotal Kalimat teks_token kata NLTK', len(df_sms_10530['teks_token_kata']))
print ('5 Lima Kalimat teks token dari awal:-')
pprint(df_sms_10530['teks_token_kata'].head(5))

-------------------------------------------

Total Kalimat teks_token kata NLTK 1143
5 Lima Kalimat teks token dari awal:-
0    [[, PROMO, ], Beli, paket, Flash, mulai, 1GB, ...
1    [2.5, GB/30, hari, hanya, Rp, 35, Ribu, Spesia...
2    [2016-07-08, 11:47:11.Plg, Yth, ,, sisa, kuota...
3    [2016-08-07, 11:29:47.Plg, Yth, ,, sisa, kuota...
4    [4.5GB/30, hari, hanya, Rp, 55, Ribu, Spesial,...
Name: teks_token_kata, dtype: object


In [28]:
print ("-------------------------------------------")
print ('\nTotal Kalimat teks_token kata spacy', len(df_sms_10530['teks_token_spacy_kata']))
print ('5 Lima Kalimat teks token dari awal:-')
pprint(df_sms_10530['teks_token_spacy_kata'].head(5))

-------------------------------------------

Total Kalimat teks_token kata spacy 1143
5 Lima Kalimat teks token dari awal:-
0    [[, PROMO, ], Beli, paket, Flash, mulai, 1, GB...
1    [2.5, GB/30, hari, hanya, Rp, 35, Ribu, Spesia...
2    [2016, -, 07, -, 08, 11:47:11.Plg, Yth, ,, sis...
3    [2016, -, 08, -, 07, 11:29:47.Plg, Yth, ,, sis...
4    [4.5GB/30, hari, hanya, Rp, 55, Ribu, Spesial,...
Name: teks_token_spacy_kata, dtype: object


In [29]:
df_sms_10530.head(5)

Unnamed: 0,Teks,label,teks_token_kalimat,teks_token_spacy_kalimat,teks_token_kata,teks_token_spacy_kata
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...,"[[, PROMO, ], Beli, paket, Flash, mulai, 1GB, ...","[[, PROMO, ], Beli, paket, Flash, mulai, 1, GB..."
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...,"[2.5, GB/30, hari, hanya, Rp, 35, Ribu, Spesia...","[2.5, GB/30, hari, hanya, Rp, 35, Ribu, Spesia..."
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,"[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...","[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...","[2016-07-08, 11:47:11.Plg, Yth, ,, sisa, kuota...","[2016, -, 07, -, 08, 11:47:11.Plg, Yth, ,, sis..."
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,"[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...","[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...","[2016-08-07, 11:29:47.Plg, Yth, ,, sisa, kuota...","[2016, -, 08, -, 07, 11:29:47.Plg, Yth, ,, sis..."
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...,"[4.5GB/30, hari, hanya, Rp, 55, Ribu, Spesial,...","[4.5GB/30, hari, hanya, Rp, 55, Ribu, Spesial,..."


## Normalisasi

Removing Stopwords && case conversion

In [30]:
stop_words = set(nltk.corpus.stopwords.words('indonesian'))
def normalize_and_remove_stopwords(tokens):
    normalized_tokens = []
    for token in tokens:
        token = token.lower()
        token = ''.join(e for e in token if e.isalnum())
        if token not in stop_words:
            normalized_tokens.append(token)
    return normalized_tokens


In [31]:
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['teks_token_spacy_kata'].apply(normalize_and_remove_stopwords)
df_sms_10530['teks_token_kata'] = df_sms_10530['teks_token_kata'].apply(normalize_and_remove_stopwords)

In [32]:
# Teks Token Spacy_kata
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))

0    [, promo, , beli, paket, flash, 1, gb, my, tel...
1    [25, gb30, rp, 35, ribu, spesial, terpilih, , ...
2    [2016, , 07, , 08, 114711plg, yth, , sisa, kuo...
3    [2016, , 08, , 07, 112947plg, yth, , sisa, kuo...
4    [45gb30, rp, 55, ribu, spesial, terpilih, , ak...
5    [5, , ekstra, pulsa, 50, rb, dg, beli, paket, ...
6    [iring, dgn, tarif, rp, , 01, , 7hr, , perpanj...
7    [eksis, loh, , internetan, pake, volume, ultim...
8    [aktifkan, iring, coboy, jr, , terhebat, , tek...
9    [ambil, bonus, harianmu, , 600, , , bebas, pul...
Name: teks_token_spacy_kata, dtype: object


In [33]:
# Teks Token NLTK_kata
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))

0    [, promo, , beli, paket, flash, 1, gb, my, tel...
1    [25, gb30, rp, 35, ribu, spesial, terpilih, , ...
2    [2016, , 07, , 08, 114711plg, yth, , sisa, kuo...
3    [2016, , 08, , 07, 112947plg, yth, , sisa, kuo...
4    [45gb30, rp, 55, ribu, spesial, terpilih, , ak...
5    [5, , ekstra, pulsa, 50, rb, dg, beli, paket, ...
6    [iring, dgn, tarif, rp, , 01, , 7hr, , perpanj...
7    [eksis, loh, , internetan, pake, volume, ultim...
8    [aktifkan, iring, coboy, jr, , terhebat, , tek...
9    [ambil, bonus, harianmu, , 600, , , bebas, pul...
Name: teks_token_spacy_kata, dtype: object


In [34]:
df_sms_10530.head(10)

Unnamed: 0,Teks,label,teks_token_kalimat,teks_token_spacy_kalimat,teks_token_kata,teks_token_spacy_kata
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...,"[, promo, , beli, paket, flash, 1gb, my, telko...","[, promo, , beli, paket, flash, 1, gb, my, tel..."
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...,"[25, gb30, rp, 35, ribu, spesial, terpilih, , ...","[25, gb30, rp, 35, ribu, spesial, terpilih, , ..."
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,"[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...","[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...","[20160708, 114711plg, yth, , sisa, kuota, flas...","[2016, , 07, , 08, 114711plg, yth, , sisa, kuo..."
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,"[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...","[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...","[20160807, 112947plg, yth, , sisa, kuota, flas...","[2016, , 08, , 07, 112947plg, yth, , sisa, kuo..."
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...,"[45gb30, rp, 55, ribu, spesial, terpilih, , ak...","[45gb30, rp, 55, ribu, spesial, terpilih, , ak..."
5,5 HARI LAGI ! EKSTRA Pulsa 50rb dg beli paket ...,2,"[5 HARI LAGI !, EKSTRA Pulsa 50rb dg beli pake...",[5 HARI LAGI ! EKSTRA Pulsa 50rb dg beli paket...,"[5, , ekstra, pulsa, 50rb, dg, beli, paket, in...","[5, , ekstra, pulsa, 50, rb, dg, beli, paket, ..."
6,"Ada iRing dgn tarif Rp. 0,1/7hr (perpanjangan ...",2,"[Ada iRing dgn tarif Rp., 0,1/7hr (perpanjanga...","[Ada iRing dgn tarif Rp. 0,1/7hr (perpanjangan...","[iring, dgn, tarif, rp, , 017hr, , perpanjanga...","[iring, dgn, tarif, rp, , 01, , 7hr, , perpanj..."
7,Akhir bulan harus tetap eksis loh! Internetan ...,2,"[Akhir bulan harus tetap eksis loh!, Interneta...","[Akhir bulan harus tetap eksis loh!, Interneta...","[eksis, loh, , internetan, pake, volume, ultim...","[eksis, loh, , internetan, pake, volume, ultim..."
8,Aktifkan iRing Coboy Jr - Terhebat. Tekan *808...,2,"[Aktifkan iRing Coboy Jr - Terhebat., Tekan *8...","[Aktifkan iRing Coboy Jr - Terhebat., Tekan *8...","[aktifkan, iring, coboy, jr, , terhebat, , tek...","[aktifkan, iring, coboy, jr, , terhebat, , tek..."
9,Ambil bonus harianmu di *600# (Bebas Pulsa). D...,2,"[Ambil bonus harianmu di *600# (Bebas Pulsa).,...","[Ambil bonus harianmu di *600# (Bebas Pulsa).,...","[ambil, bonus, harianmu, , 600, , , bebas, pul...","[ambil, bonus, harianmu, , 600, , , bebas, pul..."


Removing Unecessary Character Or Symbol

In [35]:
def join_tokens(tokens):
    return ' '.join(tokens)
def remove_special_characters(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [36]:
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['teks_token_spacy_kata'].apply(join_tokens)
df_sms_10530['teks_token_kata'] = df_sms_10530['teks_token_kata'].apply(join_tokens)

In [37]:
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['teks_token_spacy_kata'].apply(remove_special_characters)
df_sms_10530['teks_token_kata'] = df_sms_10530['teks_token_kata'].apply(remove_special_characters)

In [38]:
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))
print("------------------------------------------")
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))

0     promo  beli paket flash 1 gb my telkomsel app...
1    25 gb30 rp 35 ribu spesial terpilih  aktifkan ...
2    2016  07  08 114711plg yth  sisa kuota flash 4...
3    2016  08  07 112947plg yth  sisa kuota flash 7...
4    45gb30 rp 55 ribu spesial terpilih  aktifkan  ...
5    5  ekstra pulsa 50 rb dg beli paket internet b...
6    iring dgn tarif rp  01  7hr  perpanjangan rp  ...
7    eksis loh  internetan pake volume ultima 900mb...
8    aktifkan iring coboy jr  terhebat  tekan  808 ...
9    ambil bonus harianmu  600   bebas pulsa   dptk...
Name: teks_token_spacy_kata, dtype: object
------------------------------------------
0     promo  beli paket flash 1 gb my telkomsel app...
1    25 gb30 rp 35 ribu spesial terpilih  aktifkan ...
2    2016  07  08 114711plg yth  sisa kuota flash 4...
3    2016  08  07 112947plg yth  sisa kuota flash 7...
4    45gb30 rp 55 ribu spesial terpilih  aktifkan  ...
5    5  ekstra pulsa 50 rb dg beli paket internet b...
6    iring dgn tarif rp  01  7hr  

Expanding Contraction

In [39]:
def expand_contractions(text, contraction_mapping):
    def replace(match):
        return contraction_mapping[match.group(0)]
    contraction_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                     flags=re.IGNORECASE | re.DOTALL)
    expanded_text = contraction_pattern.sub(replace, text)
    return expanded_text

In [40]:
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['teks_token_spacy_kata'].apply(remove_special_characters)
df_sms_10530['teks_token_kata'] = df_sms_10530['teks_token_kata'].apply(remove_special_characters)

In [41]:
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))
print("------------------------------------------")
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))

0     promo  beli paket flash 1 gb my telkomsel app...
1    25 gb30 rp 35 ribu spesial terpilih  aktifkan ...
2    2016  07  08 114711plg yth  sisa kuota flash 4...
3    2016  08  07 112947plg yth  sisa kuota flash 7...
4    45gb30 rp 55 ribu spesial terpilih  aktifkan  ...
5    5  ekstra pulsa 50 rb dg beli paket internet b...
6    iring dgn tarif rp  01  7hr  perpanjangan rp  ...
7    eksis loh  internetan pake volume ultima 900mb...
8    aktifkan iring coboy jr  terhebat  tekan  808 ...
9    ambil bonus harianmu  600   bebas pulsa   dptk...
Name: teks_token_spacy_kata, dtype: object
------------------------------------------
0     promo  beli paket flash 1 gb my telkomsel app...
1    25 gb30 rp 35 ribu spesial terpilih  aktifkan ...
2    2016  07  08 114711plg yth  sisa kuota flash 4...
3    2016  08  07 112947plg yth  sisa kuota flash 7...
4    45gb30 rp 55 ribu spesial terpilih  aktifkan  ...
5    5  ekstra pulsa 50 rb dg beli paket internet b...
6    iring dgn tarif rp  01  7hr  

POS Tagging

In [42]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [43]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [44]:
# df_sms_10530['teks_token_spacy_kata_Factory'] = df_sms_10530['teks_token_spacy_kata'].apply(stemmer.stem)
# df_sms_10530['teks_token_kata_Factory'] = df_sms_10530['teks_token_kata'].apply(stemmer.stem)

In [45]:
# pprint(df_sms_10530['teks_token_spacy_kata_Factory'].head(10))
# print("------------------------------------------")
# pprint(df_sms_10530['teks_token_kata_Factory'].head(10))

In [46]:
def normalize_pos(text):
    doc = nlp(text)
    normalized_pos = " ".join([token.text + "_" + token.pos_ for token in doc])
    return normalized_pos

In [47]:
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['teks_token_spacy_kata'].apply(normalize_pos)
df_sms_10530['teks_token_kata'] = df_sms_10530['teks_token_kata'].apply(normalize_pos)

In [48]:
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))
print("------------------------------------------")
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))

0     _NOUN promo_PROPN  _PROPN beli_VERB paket_NOU...
1    25_NUM gb30_NOUN rp_ADP 35_NUM ribu_NOUN spesi...
2    2016_NUM  _NOUN 07_NUM  _NOUN 08_NUM 114711plg...
3    2016_NUM  _NOUN 08_NUM  _NOUN 07_NUM 112947plg...
4    45gb30_NUM rp_NOUN 55_NUM ribu_NOUN spesial_AD...
5    5_NUM  _NOUN ekstra_NOUN pulsa_NOUN 50_NUM rb_...
6    iring_NOUN dgn_NOUN tarif_NOUN rp_ADP  _NOUN 0...
7    eksis_NOUN loh_X  _NOUN internetan_NOUN pake_N...
8    aktifkan_VERB iring_NOUN coboy_NOUN jr_NOUN  _...
9    ambil_NOUN bonus_NOUN harianmu_ADJ  _ADV 600_N...
Name: teks_token_spacy_kata, dtype: object
------------------------------------------
0     _NOUN promo_PROPN  _PROPN beli_VERB paket_NOU...
1    25_NUM gb30_NOUN rp_ADP 35_NUM ribu_NOUN spesi...
2    2016_NUM  _NOUN 07_NUM  _NOUN 08_NUM 114711plg...
3    2016_NUM  _NOUN 08_NUM  _NOUN 07_NUM 112947plg...
4    45gb30_NUM rp_NOUN 55_NUM ribu_NOUN spesial_AD...
5    5_NUM  _NOUN ekstra_NOUN pulsa_NOUN 50_NUM rb_...
6    iring_NOUN dgn_NOUN tarif_NOU

Lemmatization

In [49]:
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return lemmatized_text

In [50]:
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['teks_token_spacy_kata'].apply(lemmatize_text)
df_sms_10530['teks_token_kata'] = df_sms_10530['teks_token_kata'].apply(lemmatize_text)

In [51]:
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))
print("------------------------------------------")
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))

0      _ NOUN promo_PROPN   _ PROPN beli_VERB paket...
1    25_NUM gb30_NOUN rp_ADP 35_NUM ribu_NOUN spesi...
2    2016_NUM   _ NOUN 07_NUM   _ NOUN 08_NUM 11471...
3    2016_NUM   _ NOUN 08_NUM   _ NOUN 07_NUM 11294...
4    45gb30_NUM rp_NOUN 55_NUM ribu_NOUN spesial_AD...
5    5_NUM   _ NOUN ekstra_NOUN pulsa_NOUN 50_NUM r...
6    iring_NOUN dgn_NOUN tarif_NOUN rp_ADP   _ NOUN...
7    eksis_NOUN loh_X   _ NOUN internetan_NOUN pake...
8    aktifkan_VERB iring_NOUN coboy_NOUN jr_NOUN   ...
9    ambil_NOUN bonus_NOUN harianmu_ADJ   _ ADV 600...
Name: teks_token_spacy_kata, dtype: object
------------------------------------------
0      _ NOUN promo_PROPN   _ PROPN beli_VERB paket...
1    25_NUM gb30_NOUN rp_ADP 35_NUM ribu_NOUN spesi...
2    2016_NUM   _ NOUN 07_NUM   _ NOUN 08_NUM 11471...
3    2016_NUM   _ NOUN 08_NUM   _ NOUN 07_NUM 11294...
4    45gb30_NUM rp_NOUN 55_NUM ribu_NOUN spesial_AD...
5    5_NUM   _ NOUN ekstra_NOUN pulsa_NOUN 50_NUM r...
6    iring_NOUN dgn_NOUN tarif_NOU

Dependency Parsing

In [52]:
def normalize_dependency_parsing(text):
    doc = nlp(text)
    normalized_dependencies = " ".join([token.text + "_" + token.dep_ for token in doc])
    return normalized_dependencies

In [53]:
df_sms_10530['teks_token_spacy_kata'] = df_sms_10530['teks_token_spacy_kata'].apply(normalize_dependency_parsing)
df_sms_10530['teks_token_kata'] = df_sms_10530['teks_token_kata'].apply(normalize_dependency_parsing)

In [54]:
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))
print("------------------------------------------")
pprint(df_sms_10530['teks_token_spacy_kata'].head(10))

0      _nsubj __punct NOUN_nmod promo_PROPN_flat:na...
1    25_NUM_nummod gb30_NOUN_ROOT rp_ADP_nmod 35_NU...
2    2016_NUM_nummod   _ROOT __punct NOUN_nmod 07_N...
3    2016_NUM_nummod   _ROOT __punct NOUN_nmod 08_N...
4    45gb30_NUM_nummod rp_NOUN_ROOT 55_NUM_nummod r...
5    5_NUM_nsubj   _compound __punct NOUN_nmod ekst...
6    iring_NOUN_nsubj:pass dgn_NOUN_ROOT tarif_NOUN...
7    eksis_NOUN_ROOT loh_X_amod   _compound __punct...
8    aktifkan_VERB_nsubj:pass iring_NOUN_compound c...
9    ambil_NOUN_ROOT bonus_NOUN_case harianmu_ADJ_n...
Name: teks_token_spacy_kata, dtype: object
------------------------------------------
0      _nsubj __punct NOUN_nmod promo_PROPN_flat:na...
1    25_NUM_nummod gb30_NOUN_ROOT rp_ADP_nmod 35_NU...
2    2016_NUM_nummod   _ROOT __punct NOUN_nmod 07_N...
3    2016_NUM_nummod   _ROOT __punct NOUN_nmod 08_N...
4    45gb30_NUM_nummod rp_NOUN_ROOT 55_NUM_nummod r...
5    5_NUM_nsubj   _compound __punct NOUN_nmod ekst...
6    iring_NOUN_nsubj:pass dgn_NOU

In [55]:
df_sms_10530.head(10)

Unnamed: 0,Teks,label,teks_token_kalimat,teks_token_spacy_kalimat,teks_token_kata,teks_token_spacy_kata
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...,[[PROMO] Beli paket Flash mulai 1GB di MY TELK...,_ROOT __punct NOUN_nmod promo_PROPN_flat:nam...,_nsubj __punct NOUN_nmod promo_PROPN_flat:na...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...,[2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat ...,25_NUM_nummod gb30_NOUN_ROOT rp_ADP_nmod 35_NU...,25_NUM_nummod gb30_NOUN_ROOT rp_ADP_nmod 35_NU...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,"[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...","[2016-07-08 11:47:11.Plg Yth, sisa kuota Flash...",20160708_NUM_ROOT 114711plg_NUM_nmod yth_NOUN_...,2016_NUM_nummod _ROOT __punct NOUN_nmod 07_N...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,"[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...","[2016-08-07 11:29:47.Plg Yth, sisa kuota Flash...",20160807_NUM_ROOT 112947plg_NOUN_nummod yth_NO...,2016_NUM_nummod _ROOT __punct NOUN_nmod 08_N...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...,[4.5GB/30 hari hanya Rp 55 Ribu Spesial buat a...,45gb30_NUM_nummod rp_NOUN_ROOT 55_NUM_nummod r...,45gb30_NUM_nummod rp_NOUN_ROOT 55_NUM_nummod r...
5,5 HARI LAGI ! EKSTRA Pulsa 50rb dg beli paket ...,2,"[5 HARI LAGI !, EKSTRA Pulsa 50rb dg beli pake...",[5 HARI LAGI ! EKSTRA Pulsa 50rb dg beli paket...,5_NUM_nsubj _compound __punct NOUN_nmod ekst...,5_NUM_nsubj _compound __punct NOUN_nmod ekst...
6,"Ada iRing dgn tarif Rp. 0,1/7hr (perpanjangan ...",2,"[Ada iRing dgn tarif Rp., 0,1/7hr (perpanjanga...","[Ada iRing dgn tarif Rp. 0,1/7hr (perpanjangan...",iring_NOUN_nsubj:pass dgn_NOUN_ROOT tarif_NOUN...,iring_NOUN_nsubj:pass dgn_NOUN_ROOT tarif_NOUN...
7,Akhir bulan harus tetap eksis loh! Internetan ...,2,"[Akhir bulan harus tetap eksis loh!, Interneta...","[Akhir bulan harus tetap eksis loh!, Interneta...",eksis_NOUN_ROOT loh_X_amod _compound __punct...,eksis_NOUN_ROOT loh_X_amod _compound __punct...
8,Aktifkan iRing Coboy Jr - Terhebat. Tekan *808...,2,"[Aktifkan iRing Coboy Jr - Terhebat., Tekan *8...","[Aktifkan iRing Coboy Jr - Terhebat., Tekan *8...",aktifkan_VERB_ROOT iring_NOUN_compound coboy_N...,aktifkan_VERB_nsubj:pass iring_NOUN_compound c...
9,Ambil bonus harianmu di *600# (Bebas Pulsa). D...,2,"[Ambil bonus harianmu di *600# (Bebas Pulsa).,...","[Ambil bonus harianmu di *600# (Bebas Pulsa).,...",ambil_NOUN_ROOT bonus_NOUN_case harianmu_ADJ_n...,ambil_NOUN_ROOT bonus_NOUN_case harianmu_ADJ_n...
