In [1]:
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data_prediksi/data_prediksi.csv")
data.head()

Unnamed: 0,Komen
0,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu..."
1,"Sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...
3,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng..."
4,"Sharing pengalaman aja, kemarin jam 18.00 bata..."


Case Folding dan Pembersihan Teks

In [3]:
# Proses Case Folding 
# Proses Membersihkan teks dari simbol yang tidak diperlukan
import re 
def casefolding(komen):
    komen = komen.lower()
    komen = komen.strip(" ")
    komen = re.sub(r'(url)|(username)|(<[A-Za-z0-9]+>)|(#[A-Za-z0-9]+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)',"",komen)
    return komen
data['Komen'] = data['Komen'].apply(casefolding)
data.head(100)

Unnamed: 0,Komen
0,soal jln jatibarupolisi tdk bs gertak gubernur...
1,sesama cewe lho kayaknya harusnya bisa lebih r...
2,kepingin gudeg mbarek bu hj amad foto dari goo...
3,jln jatibarubagian dari wilayah tn abangpengat...
4,sharing pengalaman aja kemarin jam 1800 batali...
...,...
95,mudah2an sudah terupload smua sebelum z mudik ...
96,orang pendukung khilafah memang harus di black...
97,jangan sok akrab ye mention mention gue malin...
98,alhamdulillah prof setelah berbicara semalam d...


Tokenizing

In [4]:
# Proses Tokenizing
# Memecah sebuah teks menjadi bagian-bagian yang lebih kecil
def token(komen):
    nstr = komen.split(" ")
    dat = []
    a = -1

    for i in nstr:
        a = a + 1

    if i == "":
        dat.append(a)

    p = 0
    b = 0

    for q in dat:
        b = q - p
        del nstr[b]
        p = p + 1

    return nstr

data["Komen"] = data["Komen"].apply(token)
data.head()

Unnamed: 0,Komen
0,"[soal, jln, jatibarupolisi, tdk, bs, gertak, g..."
1,"[sesama, cewe, lho, kayaknya, harusnya, bisa, ..."
2,"[kepingin, gudeg, mbarek, bu, hj, amad, foto, ..."
3,"[jln, jatibarubagian, dari, wilayah, tn, abang..."
4,"[sharing, pengalaman, aja, kemarin, jam, 1800,..."


Filtering

In [5]:
# Proses Filtering
# Menghilangkan kata yang tidak bermanfaat
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def stopword_removal(komen):
    filtering = stopwords.words("indonesian")
    x = []
    data = []
    def myFunc(x):
        if x in filtering:
            return False
        else:
            return True
        
    fit = filter(myFunc, komen)
    for x in fit:
        data.append(x)

    return data

data["Komen"] = data["Komen"].apply(stopword_removal)
data.head(100)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\verda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Komen
0,"[jln, jatibarupolisi, tdk, bs, gertak, gubernu..."
1,"[cewe, lho, kayaknya, rasain, sibuk, jaga, ras..."
2,"[kepingin, gudeg, mbarek, bu, hj, amad, foto, ..."
3,"[jln, jatibarubagian, wilayah, tn, abangpengat..."
4,"[sharing, pengalaman, aja, kemarin, jam, 1800,..."
...,...
95,"[mudah2an, terupload, smua, z, mudik, karna, f..."
96,"[orang, pendukung, khilafah, black, mail, , , ..."
97,"[sok, akrab, ye, mention, mention, gue, , mali..."
98,"[alhamdulillah, prof, berbicara, semalam, , ad..."


Stemming

In [6]:
# Proses Steming
from sklearn.pipeline import Pipeline
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def stemming(komen):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    kt = []

    for w in komen:
        dt = stemmer.stem(w)
        kt.append(dt)

    dt_clean = []
    dt_clean = " ".join(kt)
    return dt_clean

data["Komen"] = data["Komen"].apply(stemming)

data.to_csv("data_prediksi/data_prediksi_clean.csv", index=False)
data_clean = pd.read_csv("data_prediksi/data_prediksi_clean.csv")
data_clean.head

<bound method NDFrame.head of                                                  Komen
0    jln jatibarupolisi tdk bs gertak gubernur eman...
1    cewe lho kayak rasain sibuk jaga rasain sakit ...
2    kepingin gudeg mbarek bu hj amad foto google s...
3    jln jatibarubagian wilayah tn abangpengaturan ...
4    sharing alam aja kemarin jam 1800 batalin tike...
..                                                 ...
490  selamat  resmi dar saran si jago kasih dengar ...
491  heran rkhup ngawur kek yg rancang ngawur deh s...
492       iya wangsit sih undur pesimis deh negara ...
493  um thanks for  nayangin teleseries  jejak suar...
494  yg tnya nawarin knpa sih gak kerja bank sih ga...

[495 rows x 1 columns]>

In [7]:
data_clean = data_clean.astype({'Komen' : 'string'})
data_clean.dtypes

Komen    string[python]
dtype: object

### Ubah data bersih menjadi CSV

In [8]:
data_clean.to_csv("data_prediksi/data_prediksi_clean.csv", index=False)