In [14]:
import pickle
import pandas as pd
import re
import string
import unicodedata
import nltk
from emo_unicode import UNICODE_EMO, EMOTICONS
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [15]:
class TextPreprocessing:
    """Chainable text-cleaning pipeline for Indonesian complaint text.

    Every step mutates ``self.text`` (or ``self.words`` once the text has been
    tokenized) and returns ``self`` so steps can be chained.  ``do_all`` runs
    the complete pipeline on one string and returns the cleaned string, which
    makes it suitable for ``Series.apply``.

    Expensive resources (slang table, stemmer, stopword set, emoticon regex)
    are created lazily on first use and then reused, so applying the pipeline
    to many rows does not rebuild them for every row.

    Parameters
    ----------
    text : str, optional
        Initial text to work on.
    norm_path : str, optional
        Path of the slang-normalisation CSV; it must provide the columns
        ``singkat`` (slang form) and ``hasil`` (replacement).  When omitted,
        the class-level default path is used.
    """

    # Default location of the slang-normalisation table.
    # NOTE(review): absolute, machine-specific path; pass ``norm_path`` to override.
    norm_path = "C:/Users/ASUS/TA01/00_data/key_norm.csv"

    # Patterns are raw strings so that ``\S`` / ``\[`` are not treated as
    # (invalid) string escape sequences.
    _URL_RE = re.compile(r"https?://\S+|www\.\S+")
    _EMAIL_RE = re.compile(r"\S*@\S*\s?")
    _BRACKET_RE = re.compile(r"\[[^]]*\]")
    _NUMBER_RE = re.compile(r"[-+]?[0-9]+")
    _PUNCT_RE = re.compile(r"[^\w\s]")
    _EMOJI_RE = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, dingbats, arrows
        "]+",
        flags=re.UNICODE,
    )

    # Domain-specific stopwords added on top of the NLTK Indonesian list.
    _EXTRA_STOPWORDS = (
        "assalamualaikum", "wr", "wb", "pak",
        "bu", "selamat", "siang", "pagi",
        "sore", "malam", "saya",
        "terimakasih", "terima",
        "kasih", "kepada", "bpk",
        "ibu", "mohon", "tolong",
        "maaf", "dear", "wassalamualaikum",
        "regards", "nbsp", "amp", "lg", "lgi", "kak",
        "bapakibu", "bapak", "admin", "pakbu", "bupak",
        "wrwb", "ya", "min", "nim", "jurus",
    )
    # Words that must survive stopword removal even though they are stopwords.
    _KEEP_WORDS = ("tak", "akhir")

    # Lazily populated caches (class-level ``None`` keeps them defined even
    # when an instance is created without running ``__init__``).
    _emoticon_re = None
    _norm_cache = None  # tuple: (path the mapping was loaded from, mapping)
    _stemmer = None
    _stopword_set = None

    def __init__(self, text="test", norm_path=None):
        self.text = text
        if norm_path is not None:
            self.norm_path = norm_path

    def lowercase(self):
        """Convert the text to lowercase and strip surrounding whitespace."""
        self.text = str(self.text).lower().strip()
        return self

    def remove_url(self):
        """Remove URLs (http/https/www)."""
        self.text = self._URL_RE.sub("", self.text)
        return self

    def remove_email(self):
        """Remove e-mail addresses (any token containing ``@``)."""
        self.text = self._EMAIL_RE.sub("", self.text)
        return self

    def remove_between_square_brackets(self):
        """Remove square brackets together with the text between them."""
        self.text = self._BRACKET_RE.sub("", self.text)
        return self

    def remove_numbers(self):
        """Remove digit runs, including an optional leading sign."""
        self.text = self._NUMBER_RE.sub("", self.text)
        return self

    def remove_emoji(self):
        """Remove emoji and pictographic symbols."""
        self.text = self._EMOJI_RE.sub(r"", self.text)
        return self

    def remove_emoticon(self):
        """Remove text emoticons listed in ``EMOTICONS``.

        NOTE(review): the keys of ``EMOTICONS`` are used as regex alternatives
        as-is; this presumes they are already valid/escaped patterns — confirm
        against ``emo_unicode``.
        """
        if self._emoticon_re is None:
            # Built once: the alternation over all emoticons is large.
            self._emoticon_re = re.compile(
                u"(" + u"|".join(k for k in EMOTICONS) + u")"
            )
        self.text = self._emoticon_re.sub(r"", self.text)
        return self

    def remove_punctuation(self):
        """Remove every character that is neither a word character nor whitespace."""
        self.text = self._PUNCT_RE.sub("", self.text)
        return self

    def _get_norm_map(self):
        """Return the slang -> normal-form mapping, loading the CSV only once per path."""
        cached = self._norm_cache
        if cached is None or cached[0] != self.norm_path:
            table = pd.read_csv(self.norm_path)
            # Keep the first row per slang word: same precedence as picking
            # ``.values[0]`` of the matching rows.
            table = table.drop_duplicates(subset="singkat", keep="first")
            mapping = dict(zip(table["singkat"], table["hasil"]))
            self._norm_cache = (self.norm_path, mapping)
        return self._norm_cache[1]

    def normalize_word(self):
        """Replace slang words by their normal form using the normalisation table.

        Words are obtained by whitespace splitting; words without an entry in
        the table are kept unchanged.  The result is re-joined with single
        spaces.
        """
        mapping = self._get_norm_map()
        self.text = " ".join([mapping.get(word, word) for word in self.text.split()])
        return self

    def stemming(self):
        """Stem the text with the Sastrawi stemmer."""
        if self._stemmer is None:
            # Creating the stemmer is costly, so it is built once and reused.
            self._stemmer = StemmerFactory().create_stemmer()
        self.text = self._stemmer.stem(self.text)
        return self

    def tokenize(self):
        """Tokenize ``self.text`` into ``self.words``."""
        self.words = nltk.word_tokenize(self.text)
        return self

    def _get_stopword_set(self):
        """Return the stopword set (NLTK Indonesian + extras, minus the kept words)."""
        if self._stopword_set is None:
            words = set(stopwords.words("indonesian"))
            words.update(self._EXTRA_STOPWORDS)
            # discard() does not fail when a word is absent from the list.
            for keep in self._KEEP_WORDS:
                words.discard(keep)
            self._stopword_set = words
        return self._stopword_set

    def stopwords_removal(self):
        """Drop stopwords from ``self.words`` (order of remaining words is kept)."""
        stop_set = self._get_stopword_set()
        self.words = [word for word in self.words if word not in stop_set]
        return self

    def join_words(self):
        """Join the tokens back into one space-separated string in ``self.words``."""
        self.words = " ".join(self.words)
        return self

    def do_all(self, text):
        """Run the whole preprocessing pipeline on ``text``.

        Parameters
        ----------
        text : object
            Raw text; it is converted with ``str()`` by the first step.

        Returns
        -------
        str
            The cleaned text: remaining tokens joined by single spaces.

        NOTE(review): emoticon removal runs after lowercasing and number
        removal, so emoticons that contain capitals or digits may no longer
        match at that point — confirm this ordering is intended.
        """
        self.text = text
        (
            self.lowercase()
            .remove_url()
            .remove_email()
            .remove_between_square_brackets()
            .remove_numbers()
            .remove_emoticon()
            .remove_emoji()
            .remove_punctuation()
            .normalize_word()
            .stemming()
            .tokenize()
            .stopwords_removal()
            .join_words()
        )
        return self.words

In [16]:
# Location of the pickled training data.
# NOTE(review): absolute, machine-specific path — this cell only runs as-is where that file exists.
data_path = 'C:/Users/ASUS/TA01/01_data_analysis/01_pickle/01_data_training.pickle'

# pickle.load can execute arbitrary code while deserialising; only open pickles from a trusted source.
with open(data_path, 'rb') as data_training:
    data = pickle.load(data_training)

# Bare expression so the notebook renders the loaded object as the cell output.
data

Unnamed: 0,keluhan,bagian
0,1. Pembayaran SPP Genap 1415 dianggap belum lu...,AKUNTANSI
1,Selamat sore.\n Saya Andika mahasiswa sistem i...,AKUNTANSI
2,Mohon maaf saya haidar mau komplain pada saat ...,AKUNTANSI
3,"Assalamualaikum pak/bu, permisi saya\n ABBAS P...",AKUNTANSI
4,"Halo saya mau tanya, tadi saya ke BNI untuk ba...",AKUNTANSI
...,...,...
1548,bagaimana cara masuk igracias ketika saya lupa...,RISET DAN LAYANAN TEKNOLOGI INFORMASI
1549,mohon untuk mempermudah untuk melakukan keluha...,RISET DAN LAYANAN TEKNOLOGI INFORMASI
1550,saya tidak bisa memasukin akun igracias padaha...,RISET DAN LAYANAN TEKNOLOGI INFORMASI
1551,tolong igracias diperbarui lagi dong tampilannya,RISET DAN LAYANAN TEKNOLOGI INFORMASI


In [17]:
tp = TextPreprocessing() # text preprocessing pipeline instance

data['clean_keluhan'] = data['keluhan'].apply(tp.do_all) # run the full pipeline on every value of 'keluhan'

data.to_csv('C:/Users/ASUS/TA01/00_data/clean_data_training.csv', encoding='utf-8') # save the cleaned dataset

In [None]:
import time
import dask.dataframe as dd
from dask.multiprocessing import get

tp = TextPreprocessing()  # text preprocessing pipeline instance

def dask_this(data):
    """Return the partition with a 'clean_keluhan' column added.

    The column holds the result of ``tp.do_all`` for every value of
    'keluhan'.  ``assign`` builds a new frame, so the partition handed in by
    dask is not modified in place.
    """
    return data.assign(clean_keluhan=data['keluhan'].apply(tp.do_all))

# Split the frame into 10 partitions that are processed in parallel.
ddata = dd.from_pandas(data, npartitions=10)

start_time = time.time()
try:
    data = ddata.map_partitions(dask_this).compute(scheduler='processes', num_workers=10)
except Exception as exc:
    # Catch only real errors (not KeyboardInterrupt/SystemExit) and show the
    # cause instead of hiding it; `data` is left untouched in this case.
    print('Text preprocessing failed !')
    print('Reason:', repr(exc))
else:
    data.to_csv('C:/Users/ASUS/TA01/00_data/clean_data_training.csv', encoding='utf-8')
    print('Text preprocessing success !')
    print('Elapsed time:', time.time() - start_time, 'seconds')
finally:
    print('\nFinish')

In [18]:
# Keep only the cleaned text column and the 'bagian' column.
columns = ['clean_keluhan', 'bagian']
data = data[columns]

# Bare expression so the notebook renders the reduced frame as the cell output.
data

Unnamed: 0,clean_keluhan,bagian
0,bayar spp genap anggap lunas igracias bayar ta...,AKUNTANSI
1,andika mahasiswa sistem informasi akuntansi an...,AKUNTANSI
2,haidar komplain bayar bank bni bank milik tera...,AKUNTANSI
3,permisi abbas pahlawan nazarsyah kelas tt laku...,AKUNTANSI
4,halo bni bayar semester tellernya nominal baya...,AKUNTANSI
...,...,...
1548,masuk igracias lupa password akun,RISET DAN LAYANAN TEKNOLOGI INFORMASI
1549,mudah laku keluh masuk igracias,RISET DAN LAYANAN TEKNOLOGI INFORMASI
1550,memasukin akun igracias akun,RISET DAN LAYANAN TEKNOLOGI INFORMASI
1551,igracias tampil,RISET DAN LAYANAN TEKNOLOGI INFORMASI


In [19]:
# Persist `data` as a pickle. The path is relative to the current working
# directory, and open() does not create the '02_pickle' directory if it is missing.
with open('02_pickle/02_clean_data.pickle', 'wb') as output:
    pickle.dump(data, output)