# Preprocessing Data

Data preprocessing adalah proses yang mengubah data mentah ke dalam bentuk yang lebih mudah dipahami. Proses ini penting dilakukan karena data mentah sering kali tidak memiliki format yang teratur. Selain itu, data mining juga tidak dapat memproses data mentah, sehingga proses ini sangat penting dilakukan untuk mempermudah proses berikutnya, yakni analisis data.


## Import Library

In [289]:
import pandas as pd
import numpy as np
import string
import re #regrex libray
import nltk
import swifter
import Sastrawi
import networkx as nx

from nltk.tokenize import word_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

## Load Dataset

In [304]:
data_berita = pd.read_csv('beritatempo.csv')
data_berita.head()

Unnamed: 0,deskripsi,judul,tanggal
0,\n\r\n\t\t\t\t\t\t\tInggris Larang Impor Produ...,Inggris Larang Impor Produk Kesehatan Xinjiang...,23 April 2022 02:28 WIB
1,\n\n\nTempo.co\nRamadan\n\n\n\n\nTak Hanya Ind...,"Tak Hanya Indonesia, 5 Negara Ini Memiliki Tra...",23 April 2022 02:38 WIB
2,"\n\r\n\t\t\t\t\t\t\tTumbuh 40,1 Persen, Amar B...","Tumbuh 40,1 Persen, Amar Bank Salurkan Pinjama...",23 April 2022 04:39 WIB
3,\n\r\n\t\t\t\t\t\t\tKiat Menjaga Kesehatan Hat...,Kiat Menjaga Kesehatan Hati,23 April 2022 04:36 WIB
4,\n\r\n\t\t\t\t\t\t\tPesenam Kelahiran New York...,Pesenam Kelahiran New York Tetap Tak Boleh Ber...,23 April 2022 04:36 WIB


## Case Folding

In [305]:
# ------ Case Folding --------
# gunakan fungsi Series.str.lower() pada Pandas
data_berita['deskripsi'] = data_berita['deskripsi'].str.lower()


print('Case Folding Result : \n')
print(data_berita['deskripsi'].head(20))
print('\n\n\n')

Case Folding Result : 

0     \n\r\n\t\t\t\t\t\t\tinggris larang impor produ...
1     \n\n\ntempo.co\nramadan\n\n\n\n\ntak hanya ind...
2     \n\r\n\t\t\t\t\t\t\ttumbuh 40,1 persen, amar b...
3     \n\r\n\t\t\t\t\t\t\tkiat menjaga kesehatan hat...
4     \n\r\n\t\t\t\t\t\t\tpesenam kelahiran new york...
5     \n\r\n\t\t\t\t\t\t\tbursa transfer liga 1: ran...
6     \n\r\n\t\t\t\t\t\t\tsiaran tv analog dihentika...
7     \n\r\n\t\t\t\t\t\t\tkualifikasi formula 1 emil...
8     \n\r\n\t\t\t\t\t\t\tbursa transfer liga 1: psi...
9     \n\r\n\t\t\t\t\t\t\tgempa terkini di sulawesi ...
10    \n\r\n\t\t\t\t\t\t\ttop 3 dunia: sekjen pbb in...
11    \n\r\n\t\t\t\t\t\t\tskuter listrik yamaha e01 ...
12    \n\r\n\t\t\t\t\t\t\tmudik lebaran, waspada ker...
13    \n\r\n\t\t\t\t\t\t\tterpopuler bisnis: rincian...
14    \n\r\n\t\t\t\t\t\t\tprakiraan cuaca bmkg, anti...
15    \n\r\n\t\t\t\t\t\t\tjadwal bola sabtu malam 23...
16    \n\r\n\t\t\t\t\t\t\tdengan aplikasi ini, siswa...
17    \n\r\n\t\t\t\t\t\t

## Tokenizing

In [306]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
nltk.download('punkt')
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
data_berita['deskripsi'] = data_berita['deskripsi'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

data_berita['deskripsi'] = data_berita['deskripsi'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

data_berita['deskripsi'] = data_berita['deskripsi'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

data_berita['deskripsi'] = data_berita['deskripsi'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

data_berita['deskripsi'] = data_berita['deskripsi'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

data_berita['deskripsi'] = data_berita['deskripsi'].apply(remove_singl_char)

# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

data_berita['deskripsi_tokens'] = data_berita['deskripsi'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(data_berita['deskripsi_tokens'].head(20))
print('\n\n\n')

Tokenizing Result : 

0     [inggris, larang, impor, produk, kesehatan, xi...
1     [tempoco, ramadan, tak, hanya, indonesia, nega...
2     [tumbuh, persen, amar, bank, salurkan, pinjama...
3     [kiat, menjaga, kesehatan, hati, reporter, bis...
4     [pesenam, kelahiran, new, york, tetap, tak, bo...
5     [bursa, transfer, liga, rans, cilegon, fc, kem...
6     [siaran, tv, analog, dihentikan, april, kominf...
7     [kualifikasi, formula, emilia, romagna, versta...
8     [bursa, transfer, liga, psis, semarang, dapatk...
9     [gempa, terkini, di, sulawesi, tengah, giliran...
10    [top, dunia, sekjen, pbb, ingin, ketemu, presi...
11    [skuter, listrik, yamaha, tak, pakai, baterai,...
12    [mudik, lebaran, waspada, kerawanan, di, jalur...
13    [terpopuler, bisnis, rincian, aset, ketua, bpk...
14    [prakiraan, cuaca, bmkg, antisipasi, hujan, da...
15    [jadwal, bola, sabtu, malam, april, berpeluang...
16    [dengan, aplikasi, ini, siswa, mau, ujian, bis...
17    [zendaya, absen, di,

## Menghitung Frekuensi Distribusi Token

In [307]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

data_berita['deskripsi_tokens_fdist'] = data_berita['deskripsi_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(data_berita['deskripsi_tokens_fdist'].head(20).apply(lambda x : x.most_common()))

Frequency Tokens : 

0     [(di, 10), (yang, 10), (dan, 9), (inggris, 6),...
1     [(mudik, 18), (di, 18), (tradisi, 11), (yang, ...
2     [(amar, 11), (bank, 11), (pinjaman, 9), (yang,...
3     [(hati, 22), (yang, 19), (dan, 16), (untuk, 9)...
4     [(tidak, 19), (sea, 14), (games, 13), (di, 10)...
5     [(pemain, 15), (fc, 8), (rans, 6), (cilegon, 6...
6     [(di, 8), (tahap, 6), (siaran, 5), (tv, 5), (a...
7     [(di, 13), (yang, 11), (dan, 11), (untuk, 9), ...
8     [(psis, 11), (fajar, 7), (liga, 6), (pemain, 6...
9     [(gempa, 11), (yang, 8), (terkini, 6), (di, 6)...
10    [(di, 28), (yang, 14), (ukraina, 13), (dan, 11...
11    [(baterai, 13), (listrik, 8), (motor, 8), (yan...
12    [(jalur, 23), (yogyakarta, 19), (dan, 15), (di...
13    [(bpk, 5), (berita, 4), (ketua, 3), (bank, 3),...
14    [(di, 14), (dan, 10), (hujan, 7), (hingga, 6),...
15    [(vs, 22), (wib, 18), (liga, 14), (akan, 10), ...
16    [(bisa, 8), (pengajar, 8), (yang, 7), (platfor...
17    [(dan, 13), (di, 11),

## Filtering (Stopword Removal)

In [308]:
nltk.download('stopwords')

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
txt_stopword = pd.read_csv("beritatempo.csv", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert list to dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

data_berita['deskripsi_tokens_WSW'] = data_berita['deskripsi_tokens'].apply(stopwords_removal) 


print(data_berita['deskripsi_tokens_WSW'].head(20))

0     [inggris, larang, impor, produk, kesehatan, xi...
1     [tempoco, ramadan, indonesia, negara, memiliki...
2     [tumbuh, persen, amar, bank, salurkan, pinjama...
3     [kiat, menjaga, kesehatan, hati, reporter, bis...
4     [pesenam, kelahiran, new, york, berangkat, man...
5     [bursa, transfer, liga, rans, cilegon, fc, dap...
6     [siaran, tv, analog, dihentikan, april, kominf...
7     [kualifikasi, formula, emilia, romagna, versta...
8     [bursa, transfer, liga, psis, semarang, dapatk...
9     [gempa, terkini, sulawesi, giliran, morowali, ...
10    [top, dunia, sekjen, pbb, ketemu, presiden, pu...
11    [skuter, listrik, yamaha, pakai, baterai, swap...
12    [mudik, lebaran, waspada, kerawanan, jalur, ma...
13    [terpopuler, bisnis, rincian, aset, ketua, bpk...
14    [prakiraan, cuaca, bmkg, antisipasi, hujan, pe...
15    [jadwal, bola, sabtu, malam, april, berpeluang...
16    [aplikasi, siswa, ujian, ajari, dosen, profeso...
17    [zendaya, absen, met, gala, berturutturut,

## Normalization

In [309]:
normalizad_word = pd.read_csv("beritatempo.csv")

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

data_berita['deskripsi_normalized'] = data_berita['deskripsi_tokens_WSW'].apply(normalized_term)

data_berita['deskripsi_normalized'].head(20)

0     [inggris, larang, impor, produk, kesehatan, xi...
1     [tempoco, ramadan, indonesia, negara, memiliki...
2     [tumbuh, persen, amar, bank, salurkan, pinjama...
3     [kiat, menjaga, kesehatan, hati, reporter, bis...
4     [pesenam, kelahiran, new, york, berangkat, man...
5     [bursa, transfer, liga, rans, cilegon, fc, dap...
6     [siaran, tv, analog, dihentikan, april, kominf...
7     [kualifikasi, formula, emilia, romagna, versta...
8     [bursa, transfer, liga, psis, semarang, dapatk...
9     [gempa, terkini, sulawesi, giliran, morowali, ...
10    [top, dunia, sekjen, pbb, ketemu, presiden, pu...
11    [skuter, listrik, yamaha, pakai, baterai, swap...
12    [mudik, lebaran, waspada, kerawanan, jalur, ma...
13    [terpopuler, bisnis, rincian, aset, ketua, bpk...
14    [prakiraan, cuaca, bmkg, antisipasi, hujan, pe...
15    [jadwal, bola, sabtu, malam, april, berpeluang...
16    [aplikasi, siswa, ujian, ajari, dosen, profeso...
17    [zendaya, absen, met, gala, berturutturut,

## Stemming

In [310]:

# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in data_berita['deskripsi_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    i=0
    if i<10:
        term_dict[term] = stemmed_wrapper(term)
        #print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

data_berita['deskripsi_tokens_stemmed'] = data_berita['deskripsi_normalized'].swifter.apply(get_stemmed_term)
print(data_berita['deskripsi_tokens_stemmed'])

strikdistrik', 'berhasil': 'hasil', 'memangkas': 'mangkas', 'penularan': 'tular', 'nol': 'nol', 'pasalnya': 'pasal', 'area': 'area', 'karantina': 'karantina', 'lagiotoritas': 'lagiotoritas', 'distrik': 'distrik', 'jingan': 'jingan', 'penduduk': 'duduk', 'mengizinkan': 'izin', 'penduduknya': 'duduk', 'kompleks': 'kompleks', 'perumahan': 'rumah', 'menghindari': 'hindar', 'kegiatan': 'giat', 'berkerumum': 'berkerumum', 'chongming': 'chongming', 'terletak': 'letak', 'terpencilpada': 'terpencilpada', 'konferensi': 'konferensi', 'reguler': 'reguler', 'wakil': 'wakil', 'gubernur': 'gubernur', 'melaporkan': 'lapor', 'ribu': 'ribu', 'semestinya': 'mesti', 'meninggalkan': 'tinggal', 'merekawakil': 'merekawakil', 'zhang': 'zhang', 'zhitong': 'zhitong', 'supermarket': 'supermarket', 'tutup': 'tutup', 'beroperasi': 'operasi', 'tangga': 'tangga', 'chongmingbaca': 'chongmingbaca', 'siniselaluupdateinfo': 'siniselaluupdateinfo', 'skuter': 'skuter', 'listrik': 'listrik', 'yamaha': 'yamaha', 'pakai': 'p

AttributeError: 'Series' object has no attribute '_is_builtin_func'

## Simpan Data ke CSV

In [325]:
data_berita.to_csv("TextPreprocessing.csv")

## Prepare Data

In [338]:
import pandas as pd 
import numpy as np

data_berita = pd.read_csv("TextPreprocessing.csv", usecols=["deskripsi_tokens"])
data_berita.columns = ["deskripsi"]

data_berita.head(20)

Unnamed: 0,deskripsi
0,"['inggris', 'larang', 'impor', 'produk', 'kese..."
1,"['tempoco', 'ramadan', 'tak', 'hanya', 'indone..."
2,"['tumbuh', 'persen', 'amar', 'bank', 'salurkan..."
3,"['kiat', 'menjaga', 'kesehatan', 'hati', 'repo..."
4,"['pesenam', 'kelahiran', 'new', 'york', 'tetap..."
5,"['bursa', 'transfer', 'liga', 'rans', 'cilegon..."
6,"['siaran', 'tv', 'analog', 'dihentikan', 'apri..."
7,"['kualifikasi', 'formula', 'emilia', 'romagna'..."
8,"['bursa', 'transfer', 'liga', 'psis', 'semaran..."
9,"['gempa', 'terkini', 'di', 'sulawesi', 'tengah..."


## Term Frekuensi

In [339]:
from sklearn.feature_extraction.text import CountVectorizer

a=len(document)
document = data_berita['deskripsi']

# Create a Vectorizer Object
vectorizer = CountVectorizer()

vectorizer.fit(document)

# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

# Encode the Document
vector = vectorizer.transform(document)

# Summarizing the Encoded Texts
print("Encoded Document is:")
print(vector.toarray())

: 479, 'bmkg': 321, 'relatif': 1943, 'dekat': 441, 'daratanbmkg': 430, 'menyebut': 1449, 'skalaguncangan': 2192, 'dirasakandi': 544, 'morowalidan': 1517, 'sekitarnya': 2089, 'gambar': 690, 'intensitas': 830, 'setidaknya': 2157, 'dirasakan': 543, 'skala': 2191, 'iii': 796, 'mmi': 1499, 'setara': 2149, 'getaran': 706, 'rumah': 1999, 'truk': 2405, 'melintasgempa': 1288, 'untukditeruskan': 2450, 'bunyi': 349, 'arahan': 93, 'diberikan': 485, 'disarankannya': 554, 'kemudian': 1010, 'hatihati': 756, 'bumi': 348, 'susulan': 2241, 'berselang': 279, 'poso': 1837, 'ibukota': 794, 'provinsi': 1872, 'berkekuatan': 249, 'berpusat': 276, 'darat': 429, 'mmicatatandalam': 1500, 'informasi': 812, 'telahdisusulkan': 2287, 'pagi': 1615, 'bahodopi': 143, 'morowalibaca': 1516, 'jugabeasiswa': 911, 'djarumplus': 588, 'mahasiswa': 1209, 'simak': 2174, 'daftar': 415, 'kriteriaselaluupdateinfo': 1106, 'simakbreaking': 2175, 'newsdan': 1554, 'pilihan': 1807, 'daritempocodi': 433, 'kanal': 946, 'telegram': 2288, 

In [340]:
a = vectorizer.get_feature_names()

## TF-IDF

In [341]:
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
tf = tfidf.fit_transform(vectorizer.fit_transform(document)).toarray()

In [342]:
dfb = pd.DataFrame(data=tf,index=list(range(1, len(tf[:,1])+1, )),columns=[a])
# dfb

Unnamed: 0,abang,abidi,absen,abuabuselaluupdateinfo,acara,aceh,ada,adalah,adalahbengkulu,adam,...,yudha,yudono,yuliyanto,zabina,zacharias,zelensky,zendaya,zhang,zhitong,zhoualfa
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030388,0.0,0.0,...,0.0,0.051826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.032533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.03208,0.036189,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.028324,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071591,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032242
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.081981,0.0,0.0,0.0,0.0,0.0


## Convert File to CSV

In [343]:
dfb.to_csv("TF-IDF.csv")