# Import data

In [None]:
import pandas as pd
import numpy as np

# Location of the raw, tab-separated dataset (the path looks like a Colab Drive mount).
RAW_DATA_PATH = "/content/drive/MyDrive/segari_data_not_clean.tsv"
data = pd.read_csv(RAW_DATA_PATH, sep='\t')

# Preview the first rows; as the cell's last expression it is rendered by the notebook.
data.head()

Unnamed: 0,score,content,category
0,5,Mantabbbbb,positive
1,2,Semenjak cod ditiadakan yg ke wilayahku udah g...,negative
2,5,Sayang bgt ya buat si kecil kurang lengkap klo...,positive
3,5,"Belum pernah belanja, tapi liatt teman selalu ...",positive
4,5,"sayuran fresh,bersih & semua barang kualitas baik",positive


# Case Folding

In [None]:
# Case folding: lower-case every entry of the 'content' column so that later
# steps (stopword lookup, normalization, stemming) compare text case-insensitively.
lowered_content = data['content'].str.lower()
data['content'] = lowered_content

print('Case Folding Result : \n')
print(data['content'].head())
print('\n\n\n')

Case Folding Result : 

0                                           mantabbbbb
1    semenjak cod ditiadakan yg ke wilayahku udah g...
2    sayang bgt ya buat si kecil kurang lengkap klo...
3    belum pernah belanja, tapi liatt teman selalu ...
4    sayuran fresh,bersih & semua barang kualitas baik
Name: content, dtype: object






# Tokenize

In [None]:
!pip install nltk



In [None]:
import nltk

# Recent NLTK releases load the Punkt sentence models used by word_tokenize from
# the 'punkt_tab' resource; without it word_tokenize raises LookupError on a
# fresh install. On older releases this id is unknown and the call just reports
# an error and returns False, so requesting it is harmless.
nltk.download('punkt_tab')

# Legacy Punkt resource, still used by older NLTK releases. Kept as the last
# expression so the cell output stays the result of this download.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import string
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_content_special(text):
    """Strip escaped control sequences, non-ASCII characters, mentions, hashtags and links.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    str
        The cleaned document. Each non-ASCII character is turned into ``?`` by
        the ASCII round-trip, and runs of whitespace are collapsed to one space
        by the split/join step (before the bare URL schemes are blanked out).
    """
    # These are *literal* backslash sequences (the two characters "\" + "t", ...),
    # not real tab/newline characters; any remaining lone backslash is dropped.
    text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', '').replace('\\', '')

    # Replace every non-ASCII character (emoji, CJK, ...) with '?'.
    text = text.encode('ascii', 'replace').decode('ascii')

    # Remove @mentions, #hashtags and scheme://... links. The pattern is a raw
    # string so that \w and \S reach the regex engine instead of being treated
    # as (invalid) string escape sequences by the Python parser.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())

    # Remove URL schemes that were left without anything after them.
    text = text.replace("http://", " ").replace("https://", " ")

    return text

# Clean every document in place: escaped sequences, non-ASCII, mentions, hashtags, links.
data['content'] = data['content'].map(remove_content_special)

# strip digits
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in their place)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Drop all digits from every document.
data['content'] = data['content'].map(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Replace every ASCII punctuation character in ``text`` with a space.

    Punctuation is replaced rather than deleted so that words separated only
    by punctuation ("fresh,bersih") stay separate tokens instead of being
    fused into one ("freshbersih"). The extra spaces this introduces are
    expected to be cleaned up by the whitespace-normalization steps that run
    after this one.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    str
        ``text`` with each character of ``string.punctuation`` turned into ' '.
    """
    # One-to-one table: each punctuation character maps to a single space.
    to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    return text.translate(to_space)

# Strip punctuation from every document.
data['content'] = data['content'].map(remove_punctuation)

# trim whitespace at both ends
def remove_whitespace_LT(text):
    """Return ``text`` without leading (L) and trailing (T) whitespace."""
    trimmed = text.lstrip().rstrip()
    return trimmed

# Trim leading/trailing whitespace of every document.
data['content'] = data['content'].map(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading/trailing runs are collapsed too (to one space), not removed.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    str
        ``text`` with each whitespace run replaced by ' '.
    """
    # Raw string: \s must reach the regex engine, not be parsed as an
    # (invalid) Python string escape.
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace inside every document.
data['content'] = data['content'].map(remove_whitespace_multiple)

# drop stand-alone single letters
def remove_single_char(text):
    """Delete every ASCII letter that stands alone between word boundaries.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Remove stand-alone single letters from every document.
data['content'] = data['content'].map(remove_single_char)

# tokenization via NLTK
def word_tokenize_wrapper(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every cleaned document; the token lists go to a new column so the
# cleaned text in 'content' stays available.
data['content_tokens'] = data['content'].map(word_tokenize_wrapper)

print('Tokenizing Result : \n')
print(data['content_tokens'].head(5))
print('\n\n\n')

Tokenizing Result : 

0                                         [mantabbbbb]
1    [semenjak, cod, ditiadakan, yg, ke, wilayahku,...
2    [sayang, bgt, ya, buat, si, kecil, kurang, len...
3    [belum, pernah, belanja, tapi, liatt, teman, s...
4    [sayuran, freshbersih, semua, barang, kualitas...
Name: content_tokens, dtype: object






# Stopwords

In [None]:
# nltk was already imported in an earlier cell; importing it again is harmless.
import nltk
# Fetch the NLTK 'stopwords' corpus, which is read below via stopwords.words('indonesian').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords

# ----------------------- get stopwords from the NLTK corpus -----------------------------
# start from NLTK's Indonesian stopword list
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manually add stopwords -----------------------------------
# informal / abbreviated words to treat as stopwords in addition to the NLTK list
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopwords from txt file ------------------------------------
# read the txt stopword file with pandas (a single column named "stopwords")
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# Add the whitespace-separated words of EVERY row, not only the first one, so a
# file with more than one line is not silently truncated. For a single-line
# file this yields the same words as before.
# NOTE(review): assumes the file holds whitespace-separated words without
# commas - confirm against the actual stopwords.txt.
for line in txt_stopword["stopwords"].dropna():
    list_stopwords.extend(str(line).split())

# ---------------------------------------------------------------------------------------

# convert the list to a set: removes duplicates and makes membership tests fast
list_stopwords = set(list_stopwords)


# drop stopwords from a list of tokens
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in their original order."""
    kept = []
    for word in words:
        if word in list_stopwords:
            continue
        kept.append(word)
    return kept

# Filter the stopwords out of every token list; the result goes to a new column.
data['content_tokens_stw'] = data['content_tokens'].map(stopwords_removal)


print(data['content_tokens_stw'].head(5))

0                                         [mantabbbbb]
1    [semenjak, cod, ditiadakan, wilayahku, udah, p...
2    [sayang, bgt, lengkap, lengkap, skalian, kn, b...
3    [belanja, liatt, teman, belanja, lewatt, aplik...
4             [sayuran, freshbersih, barang, kualitas]
Name: content_tokens_stw, dtype: object


# Normalization

In [None]:
# Lookup table for word normalization, read from a CSV file.
# NOTE(review): presumably column 0 is the informal spelling and column 1 its
# standard form - confirm against normalisasi2.csv.
normalizad_word = pd.read_csv("normalisasi2.csv", encoding='ISO-8859-1')

# Map first column -> second column. The columns are addressed by position with
# .iloc, because integer keys such as row[0] on a label-indexed row are
# deprecated in pandas. setdefault keeps the FIRST mapping when a key appears
# more than once, matching the previous "only add if missing" rule.
normalizad_word_dict = {}

for source_term, normalized in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(source_term, normalized)

def normalized_term(document):
    """Replace each term of ``document`` by its entry in ``normalizad_word_dict``; unknown terms are kept as they are."""
    return [normalizad_word_dict.get(term, term) for term in document]

# Normalize every stopword-filtered token list; the result goes to a new column.
data['content_normalized'] = data['content_tokens_stw'].map(normalized_term)

data['content_normalized'].head(n=10)

0                                             [mantap]
1    [semenjak, cod, ditiadakan, wilayahku, sudah, ...
2    [sayang, banget, lengkap, lengkap, sekalian, k...
3    [belanja, lihat, teman, belanja, lewat, aplika...
4             [sayuran, freshbersih, barang, kualitas]
5    [kualitas, barang, fresh, segar, bersih, pengi...
6    [sekarang, belanja, yang, komplain, mengaco, c...
7             [produk, bagus, packaging, safe, banget]
8                     [pengiriman, cepatbarang, bagus]
9                              [bagus, cepat, praktis]
Name: content_normalized, dtype: object

# Stemming

In [None]:
!pip install swifter



In [None]:
!pip install Sastrawi



In [None]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# build the Sastrawi stemmer once; it is reused for every term below
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# thin wrapper around the shared stemmer
def stemmed_wrapper(term):
    """Return the result of ``stemmer.stem`` for a single ``term``."""
    stemmed = stemmer.stem(term)
    return stemmed

# Collect every distinct term of the normalized documents. dict.fromkeys keeps
# first-seen order and removes duplicates, so each term is stemmed only once.
unique_terms = dict.fromkeys(
    term for document in data['content_normalized'] for term in document
)

print(len(unique_terms))
print("------------------------")

# term -> stem lookup used to stem the whole corpus
term_dict = {term: stemmed_wrapper(term) for term in unique_terms}

# Show only a small sample of the mapping. Printing every term and then the
# whole dictionary floods the cell output (the notebook truncates it).
STEM_PREVIEW_SIZE = 10
for term, stem in list(term_dict.items())[:STEM_PREVIEW_SIZE]:
    print(term, ":", stem)

print("------------------------")


# look up the precomputed stem of every term in a document
def get_stemmed_term(document):
    """Return the stems of ``document``'s terms, looked up in ``term_dict`` (KeyError for an unknown term)."""
    stems = []
    for term in document:
        stems.append(term_dict[term])
    return stems

data['content_tokens_stemmed'] = data['content_normalized'].swifter.apply(get_stemmed_term)
print(data['content_tokens_stemmed'])

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
mengaco : aco
transfer : transfer
nugelo : nugelo
segari : segar
produk : produk
bagus : bagus
packaging : packaging
safe : safe
cepatbarang : cepatbarang
praktis : praktis
beberapa : beberapa
kali : kali
kosong : kosong
display : display
lahbaik : lahbaik
promo : promo
ada : ada
uang : uang
coin : coin
begitu : begitu
biaya : biaya
penanganan : tangan
gabung : gabung
mending : mending
pindah : pindah
toko : toko
sebelah : belah
keren : keren
membantu : bantu
emak : emak
dari : dari
tukang : tukang
sayur : sayur
tempatku : tempat
min : min
sampai : sampai
sayurnya : sayur
promonya : promonya
very : very
worth : worth
itu : itu
anak : anak
riweh : riweh
pesan : pesan
hp : hp
langsung : langsung
buah : buah
buahan : buah
juga : juga
harga : harga
ok : ok
sesuai : sesuai
jadwal : jadwal
pengemasan : emas
telur : telur
ditingkatkan : tingkat
pengamanan : aman
ditindih : tindih
good : good
aplication : aplication
keban

Pandas Apply:   0%|          | 0/5206 [00:00<?, ?it/s]

0                                                [mantap]
1       [semenjak, cod, tiada, wilayah, sudah, pakai, ...
2       [sayang, banget, lengkap, lengkap, sekali, kan...
3       [belanja, lihat, teman, belanja, lewat, aplika...
4                  [sayur, freshbersih, barang, kualitas]
                              ...                        
5201                        [barang, fresh, kirim, cepat]
5202                                               [baik]
5203                                  [seller, bagaimana]
5204    [aplikasi, bantu, sibuk, rumah, pandemi, langs...
5205    [suruh, masuk, kode, pin, error, nomor, telpon...
Name: content_tokens_stemmed, Length: 5206, dtype: object


In [None]:
# Save the fully preprocessed frame as TSV (no index column) to the Drive folder.
data.to_csv(
    path_or_buf="/content/drive/MyDrive/3Text_Preprocessing_daritsv.tsv",
    sep='\t',
    index=False,
)

In [None]:
# Save the same frame as TSV (no index column) to the current working directory.
data.to_csv(
    path_or_buf="Text_Preprocessing_daritsv.tsv",
    sep='\t',
    index=False,
)

In [None]:
# Save the frame as comma-separated CSV in the current working directory.
# index=False matches the two TSV exports: writing the index would add one more
# unnamed column on top of the 'Unnamed: 0' column the data already carries.
data.to_csv("3Text_Preprocessing_daritsv.csv", index=False)