In [41]:
import pandas as pd
import re
import nltk

In [55]:
# Download the 'punkt' tokenizer data; nltk.word_tokenize is called later in
# remove_stopwords.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shafiranaya/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
# Read the three dataset splits (train, test, dev) in a single pass.
train, test, dev = map(
    pd.read_csv,
    ("../dataset/train.csv", "../dataset/test.csv", "../dataset/dev.csv"),
)

In [27]:
# Show the training rows labelled 'yes'.
train.loc[train['label'] == 'yes']

Unnamed: 0.1,Unnamed: 0,text_a,label
2,2,e100ss gini buka informasi sejelas nya identit...,yes
7,7,jokowi menteri2 nya silakan tes corona,yes
8,8,pencegahan corona other moms minum multivitami...,yes
11,11,kemenkesri mohon yg transparan mudah spt utk d...,yes
18,18,jateng jatim jakarta siaga corona info disebar...,yes
...,...,...,...
21582,21582,do ra dong depok terpaksa diumumkan walikota n...,yes
21584,21584,keren nya pt kai commuter jabodetabek mengaku ...,yes
21586,21586,detikcom ga tindakan pemerintah wilayah depok ...,yes
21593,21593,cnnindonesia klo istana diliburkan gak nya pen...,yes


In [28]:
# Show the training rows labelled 'no'.
train.loc[train['label'] == 'no']

Unnamed: 0.1,Unnamed: 0,text_a,label
0,0,betewe buka twitter cuman ngetweet liat home b...,no
1,1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,no
3,3,neng solo wes ono terduga corona cobo neng ati...,no
4,4,midiahn nii akun gak takut takut nya isu coron...,no
5,5,hey corona prrgi sna,no
...,...,...,...
21596,21596,depok panas ga karuan kereta sampe pasming huj...,no
21597,21597,oxfara arie kriting yg lebi goblo nya orang ke...,no
21598,21598,virus corona menyaba depok cuci tangan makan n...,no
21599,21599,mata sipit tinggal depok udah abis dah bahan c...,no


In [29]:
# Peek at 10 randomly chosen test rows (unseeded, so the rows differ per run).
test.sample(n=10)

Unnamed: 0,text_a,label
2656,orang depok jan temenin,no
1474,10 00 pelayanan simling 2 tmcpoldametro ntmcla...,no
1777,announcement pagi terkait corona indonesia kel...,no
1751,info dampak vius corona demam kunjung reda ses...,yes
2409,anisadanur cnnindonesia hmm iya ya nya beda ga...,no
2068,ya anak yg seangkatan sm gue kyk unlucky bgt p...,no
516,"Selamat sore bapak/ibu, semoga semua dalam kea...",yes
2280,zarazettirazr msaid didu keretaapikita patah s...,no
1811,menikmati sensasi kesegaran minuman sampah ose...,no
2407,menkes ngapain sih anjing https t co 6djyatb9bo,no


# Preprocessing

In [34]:
# • Tokenization: Text is tokenized into tokens such as words
# • Lemmatization: Word is lemmatized into its lemma form
# • Morphological analyzer: word is analyzed into its root word and its affixes
# • Stemming: Word is stemmed into its stemmed form
# • Lowercase: all words are lowercased
# • Stopword elimination: words are filtered by a stop word list
# • POS tagging and stopword elimination: run a POS tagger and filter words based on their POS tags
# • Spelling correction: incorrect words (including informal) are corrected
# • Word normalization: acronym
# • Entity masking: words fulfilling certain patterns are masked

In [30]:
# Function that cleans a raw text string
def clean_data(text):
    """Normalize a raw text string into lowercase, punctuation-free words.

    Steps, in order: lowercase; remove literal backslash-x escape residue,
    URLs, @mentions and #hashtags; replace digit runs with a space; drop a
    bare "nan" placeholder, underscores and a fixed set of mojibake
    characters; cap runs of a repeated character at two; remove punctuation
    and characters outside the Latin Unicode ranges; finally collapse
    whitespace and trim both ends.

    Args:
        text: The raw text. ``None`` and float NaN are treated as missing
            and yield an empty string; any other non-string value is
            converted with ``str`` first.

    Returns:
        str: The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Missing values (None, or the float NaN pandas uses for empty cells)
    # carry no text; return early instead of failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # just comment out any step that you don't want to use

    # lowercase
    normal_tw = str(text).lower()
#     # remove leading b'
#     normal_tw = re.sub(r'^b\'', '', normal_tw)
#     # remove RT
#     normal_tw = re.sub(r'^rt ', '', normal_tw)
    # remove literal \xNN sequences (escape residue, e.g. from emoji bytes)
    normal_tw = re.sub(r'\\x.{2}', '', normal_tw)
    # remove www.* or https?://* (URLs)
    normal_tw = re.sub(r'((www\.[^\s]*)|(https?://[^\s]*))', '', normal_tw)
    # trim both ends (needed so the bare-"nan" check below can match)
    normal_tw = normal_tw.strip()
    # remove @username
    normal_tw = re.sub(r'@[^\s]+', '', normal_tw)
    # remove hashtags
    normal_tw = re.sub(r'#[^\s]+', '', normal_tw)
    # replace numbers with a space
    normal_tw = re.sub(r'\d+', ' ', normal_tw)

    # remove a text that is exactly "nan"
    normal_tw = re.sub(r'^nan$', '', normal_tw)
    # remove underscores
    normal_tw = re.sub(r'[_]+', '', normal_tw)
    # remove undefined (mojibake) letters
    normal_tw = re.sub(r'[Ã°Âã¯¹¢²ðâ]', '', normal_tw)
    # cap repeated characters such as "haiiii" at two (for unigram features)
    normal_tw = re.sub(r"(.)\1{1,}", r"\1\1", normal_tw)
    # collapse runs of whitespace characters
    normal_tw = re.sub(r'\s+', ' ', normal_tw)
    # remove punctuation and emoji
    normal_tw = re.sub(r'[^\w\s]', '', normal_tw)
    # remove characters outside the Latin Unicode ranges
    normal_tw = re.sub(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', '', normal_tw)

    # The removals above (mentions, hashtags, punctuation) can leave leading,
    # trailing or doubled spaces behind, so normalize whitespace once more.
    normal_tw = re.sub(r'\s+', ' ', normal_tw).strip()

    return normal_tw

In [50]:
# Load the list of stopwords (Indonesian), one word per line.
# The encoding is given explicitly so the result does not depend on the
# platform default; blank lines are skipped so no empty-string entries are
# added to the list.
with open("../stopwords.txt", "r", encoding="utf-8") as file:
    stopwords_list = [line.strip() for line in file if line.strip()]
    
# print(stopwords_list)

In [51]:
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stopwords_list`` removed.

    The text is tokenized with ``nltk.word_tokenize``; the tokens that are
    kept are joined back together with single spaces.
    """
    kept_tokens = [
        word
        for word in nltk.word_tokenize(text)
        if word not in stopwords_list
    ]
    return ' '.join(kept_tokens)

In [52]:
# Function that runs the preprocessing steps on a single text
def preprocess(text):
    """Clean ``text``, lowercase it, and remove stopwords.

    Applies ``clean_data``, then ``str.lower``, then ``remove_stopwords``
    and returns the resulting string.
    """
    cleaned = clean_data(text)
    # Lowercase (redundant with clean_data's own lowercasing unless that
    # step has been commented out there)
    lowered = cleaned.lower()
    # Remove stopwords
    processed_text = remove_stopwords(lowered)

    # TODO: convert non-standard (informal) words to their standard form
    # TODO: stemming with Sastrawi

    return processed_text

In [60]:
# Print the first 20 test texts before and after preprocessing.
for text in test['text_a'].iloc[:20]:
    print(f"Before\t: {text}")
    print(f"After\t: {preprocess(text)}")

Before	: jek dajal ga depok bang
After	: jek dajal ga depok bang
Before	: detikcom untung depok masuk wilayah nya ridwan kamil kalo masuk wilayah nya anis abis lu bully ama buzzer kolam
After	: detikcom untung depok masuk wilayah nya ridwan kamil kalo masuk wilayah nya anis abis lu bully ama buzzer kolam
Before	: df dom jakarta depok yg gunain vc cabang nya cabang yg tercantum pas kesana gabisa bayar pake shopeepay
After	: df dom jakarta depok yg gunain vc cabang nya cabang yg tercantum pas kesana gabisa bayar pake shopeepay
Before	: your2rl depok jkt
After	: your rl depok jkt
Before	: doakan indonesia selamat virus corona pkb depok gelar nusantara bershalawat
After	: doakan indonesia selamat virus corona pkb depok gelar nusantara bershalawat
Before	: warga depok terganggu isu corona
After	: warga depok terganggu isu corona
Before	: kenapaa mendengar kabar salah wni positif corona depok dimana tinggal ku ku kawatir takut
After	: kenapaa mendengar kabar salah wni positif corona depok di

# Modeling

In [None]:
# Undersampling

# Referensi
- Stopwords: https://github.com/masdevid/ID-Stopwords
- Kamus alay: https://raw.githubusercontent.com/JhonLimbong17/Perbaikan-kata-tidak-baku-Bahasa-Indonesia-dengan-perbandingan-kata-dari-kateglo-dan-colloquial-indo/master/colloquial-indonesian-lexicon.csv
- Kamus alay: https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection/master/new_kamusalay.csv