In [1]:
# Import Library

import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import re
import warnings
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load Dataset

dataset = pd.read_csv('../dataset/data_modelling/data_preparation_notneutral.csv', index_col=None)
dataset.head()

Unnamed: 0,Tweet,HS,Abusive,Char_Length
0,- disaat semua cowok berusaha melacak perhatia...,1,1,138
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,120
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,254
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,75
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,89


# Casefolding

In [3]:
def lowercase(text):
    return text.lower()

def remove_unnecessary_char(text):
    text = re.sub('\n',' ',text)
    text = re.sub('rt',' ',text)
    text = re.sub('user',' ',text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text)
    text = re.sub('  +', ' ', text)
    return text
    
def remove_nonaplhanumeric(text):
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    return text

def casefold(text):
    text = lowercase(text)
    text = remove_nonaplhanumeric(text)
    text = remove_unnecessary_char(text)
    return text

dataset['Casefolding'] = tqdm(dataset['Tweet'].apply(casefold))
dataset.head()

  0%|          | 0/13169 [00:00<?, ?it/s]

Unnamed: 0,Tweet,HS,Abusive,Char_Length,Casefolding
0,- disaat semua cowok berusaha melacak perhatia...,1,1,138,disaat semua cowok berusaha melacak perhatian...
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,120,siapa yang telat ngasih tau elu edan sarap gu...
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,254,41 kadang aku berfikir kenapa aku tetap percay...
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,75,aku itu aku n nku tau matamu sipit tapi dilia...
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,89,kaum cebong kapir udah keliatan dongoknya dar...


In [None]:
#data_casefolding = dataset[["Tweet", "Casefolding"]][0:15]
#data_casefolding.to_csv("../../../asset/data_casefolding.xlsx", index=False)

In [4]:
dataset.to_csv("../dataset/data_modelling/data_casefolding_not_neutral.csv", index=False)

In [5]:
for index, text in enumerate(dataset['Casefolding'][100:110]):
  print('Review %d:\n'%(index+1),text)

Review 1:
 prabowo sudah kalah menyebut bantuan jokowi hanya pencitraan adalah ratapan pilu
Review 2:
  dan yang takut dengan adzan adalah iblis 
Review 3:
  yg goblog itu adalah bani cebong tukang tipu penjilat penguasa yg ketahuan gerakin masa yg dibayar pakai nasi bungkus propaganda nasi bungkus memang selalu gagal 
Review 4:
  wuih cebong sewot n xf0 x9f x98 x82 xf0 x9f x98 x82 xf0 x9f x98 x82 
Review 5:
 2 gerakan ini menekankan pentingnya kerja keras secara total untuk tingkatkan potensi bangsa
Review 6:
 padahal gubernur saat ini djarot mayoritas pa ai politik di dprd juga pendukung ahok djarot payah ya mereka url
Review 7:
  rezim rusak bukan memperbaiki systeamnya malah sibuk cari2 kesalahan klu itu pasti ketemu aja kpk alatnya besok pemerintah berganti berulang kembali engah pernah selesai siapapun organisasi orangnya 
Review 8:
 saat orang orang saling menuding antek aseng aseng sesungguhnya be epuk tangan 
Review 9:
  nah loh katanya anti aseng 
Review 10:
 selamat pak semo

# Stopwords

In [6]:
def normalize_alay(text):
    alay_dict = pd.read_csv('../dataset/stopwords/kamusalay.csv', encoding='latin-1', header=None)
    alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})
    alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
    return ' '.join([alay_dict_map[word] if word in alay_dict_map else word for word in text.split(' ')])

def remove_stopword(text):
    id_stopword = pd.read_csv('../dataset/stopwords/idstopwords.csv', header=None)
    id_stopword = id_stopword.rename(columns={0: 'stopword'})
    text = ' '.join(['' if word in id_stopword.stopword.values else word for word in text.split(' ')])
    text = re.sub('  +', ' ', text)
    text = text.strip()
    return text

def stopword(text):
    text = normalize_alay(text)
    text = remove_stopword(text)
    return text

dataset['Stopwords'] = tqdm(dataset['Casefolding'].apply(stopword))
dataset.head()

  0%|          | 0/13169 [00:00<?, ?it/s]

Unnamed: 0,Tweet,HS,Abusive,Char_Length,Casefolding,Stopwords
0,- disaat semua cowok berusaha melacak perhatia...,1,1,138,disaat semua cowok berusaha melacak perhatian...,cowok berusaha melacak perhatian lantas remehk...
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,120,siapa yang telat ngasih tau elu edan sarap gu...,telat tau edan sarap bergaul licew
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,254,41 kadang aku berfikir kenapa aku tetap percay...,41 kadang berpikir percaya tuhan jatuh berkali...
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,75,aku itu aku n nku tau matamu sipit tapi dilia...,ku tau matamu sipit
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,89,kaum cebong kapir udah keliatan dongoknya dar...,kaum cebong kafir dongoknya dungu haha


In [None]:
#data_stopwords = dataset[["Casefolding", "Stopwords"]][0:15]
#data_stopwords.to_csv("../../../asset/data_stopwords.xlsx")

In [7]:
dataset.to_csv("../dataset/data_modelling/data_stopwords_not_neutral.csv", index=False)

In [8]:
for index, text in enumerate(dataset['Stopwords'][100:110]):
  print('Review %d:\n'%(index+1),text)

Review 1:
 prabowo kalah menyebut bantuan jokowi pencitraan ratapan pilu
Review 2:
 takut azan iblis
Review 3:
 goblok bani cebong tukang tipu penjilat penguasa ketahuan gerakan dibayar pakai nasi bungkus propaganda nasi bungkus gagal
Review 4:
 cebong sewot
Review 5:
 2 gerakan menekankan kerja keras total tingkatkan potensi bangsa
Review 6:
 gubernur djarot mayoritas ai politik dewan perwakilan rakyat daerah pendukung ahok djarot payah
Review 7:
 rezim rusak memperbaiki sistemnya sibuk cari cari kesalahan ketemu komisi pemberantasan korupsi alatnya besok pemerintah berganti berulang engah selesai organisasi orangnya
Review 8:
 orang orang menuding antek asing asing sesungguhnya be epuk tangan
Review 9:
 anti asing
Review 10:
 selamat semoga berkiprah tingkat nasional


# Tokenizing 

In [None]:
# def tokenizing(text):
#     words = word_tokenize(text)
#     return words

# dataset['Tokenizing'] = tqdm(dataset['Stopwords'].apply(tokenizing))
# dataset.head()

In [None]:
# for index, text in enumerate(dataset['Tokenizing'][100:110]):
#   print('Review %d:\n'%(index+1),text)

In [None]:
#data_tokenizing = dataset[["Stemming", "Tokenizing"]][0:15]
#data_tokenizing.to_csv("../../../asset/data_tokenizing.xlsx", index=None)

In [None]:
dataset.to_csv("../dataset/data_modelling/data_tokenizing.csv", index=False)

# Stemming

In [None]:
# def stemming(text):
#     factory = StemmerFactory()
#     stemmer = factory.create_stemmer()
#     do = []
#     for w in text:
#         dt = stemmer.stem(w)
#         do.append(dt)
#     d_clean = []
#     d_clean = " ".join(do)
#     print(d_clean)
#     return d_clean

# dataset["Stemming"] = dataset["Tokenizing"].apply(stemming)
# dataset.head()

In [None]:
# for index, text in enumerate(dataset['Stemming'][100:110]):
#     print('Review %d:\n'%(index+1),text)

In [None]:
#data_stemming = dataset[["Stopwords", "Stemming"]][0:15]
#data_stemming.to_csv("../../../asset/data_stemming.xlsx", index=False)

In [None]:
dataset.to_csv("../dataset/data_modelling/data_stemming.csv", index=False)

In [None]:
# Character Length

dataset['Char_Length_Prep'] = dataset['Stopwords'].apply(lambda x: len(str(x)))

dataset.head()

In [None]:
dataset.isna().sum()

In [None]:
# save dataset

dataset.to_csv("../dataset/data_modelling/data_preprocessing.csv", index=False)