In [None]:
import pandas as pd
import numpy as np

# Load the labelled tweets; the file is decoded as ISO-8859-1 (Latin-1).
TWEET_DATA = pd.read_csv("mypertamina_label.csv", encoding = "ISO-8859-1")
# NOTE: a bare `TWEET_DATA['Teksbersih'].str.encode('ascii', 'ignore')` used to
# sit here. Its result was never assigned, so it had no effect on the frame and
# has been dropped. Non-ASCII characters are handled in remove_tweet_special().
TWEET_DATA.head()

Unnamed: 0,Teksbersih,translate,klasifikasi
0,alun al kahfi malam moga allah rahmat jalan ba...,alun al kahfi night moga allah rahmat jalan ba...,negative
1,beli bensin dah pake apk,buy gasoline already using apk,negative
2,warga kota tasikmalaya pilih beli bbm spbu dae...,residents of the city of tasikmalaya choose bu...,negative
3,tuku bbm subsidi sing kudu nganggo aplikasi,tuku bbm subsidized sing kudu nganggo the appl,negative
4,daftar subsidi tp ga save data kendara ny mu e...,list of subsidies but not saving data kendara ...,negative


In [None]:
 #------ Case Folding --------
# Case folding: lower-case every tweet with pandas' vectorised Series.str.lower().
TWEET_DATA['Teksbersih'] = TWEET_DATA['Teksbersih'].str.lower()


# Show the first five lower-cased tweets between the banner and the blank-line
# spacer, emitted with a single print call.
print(
    'Case Folding Result : \n',
    TWEET_DATA['Teksbersih'].head(),
    '\n\n\n',
    sep='\n',
)

Case Folding Result : 

0    alun al kahfi malam moga allah rahmat jalan ba...
1                            beli bensin dah pake apk 
2    warga kota tasikmalaya pilih beli bbm spbu dae...
3         tuku bbm subsidi sing kudu nganggo aplikasi 
4    daftar subsidi tp ga save data kendara ny mu e...
Name: Teksbersih, dtype: object






In [None]:
import string
import re #regex library
import nltk
nltk.download('punkt')
# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    """Strip Twitter-specific noise from one tweet.

    Steps, in order:
      1. Literal escape residue (the two-character sequences ``\\t``, ``\\n``,
         ``\\u``) becomes a space and any remaining backslash is deleted.
      2. Every non-ASCII character is turned into ``?`` (``errors='replace'``).
      3. Mentions / hashtags (``@name`` / ``#tag``, underscores included) and
         full ``scheme://...`` links are removed and whitespace is collapsed.
      4. A dangling ``http://`` or ``https://`` prefix is replaced by a space.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # remove tab, new line and backslash residue (literal escape sequences)
    text = (
        text.replace('\\t', " ")
            .replace('\\n', " ")
            .replace('\\u', " ")
            .replace('\\', "")
    )
    # replace non-ASCII (emoticons, CJK characters, etc.) with '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag.
    # Raw string avoids invalid-escape warnings; `\w` also covers the underscore
    # that is legal in handles and hashtags (text is pure ASCII at this point).
    without_entities = re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text)
    text = ' '.join(without_entities.split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")

# Run the tweet-specific cleanup element-wise over the text column.
TWEET_DATA['Teksbersih'] = TWEET_DATA['Teksbersih'].map(remove_tweet_special)

# Strip digits.
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Delete digits from every tweet.
TWEET_DATA['Teksbersih'] = TWEET_DATA['Teksbersih'].map(remove_number)

# Strip punctuation.
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = [ch for ch in text if ch not in string.punctuation]
    return "".join(kept_chars)

# Delete punctuation from every tweet.
TWEET_DATA['Teksbersih'] = TWEET_DATA['Teksbersih'].map(remove_punctuation)

# Trim whitespace at both ends.
def remove_whitespace_LT(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    left_trimmed = text.lstrip()
    return left_trimmed.rstrip()

# Trim leading/trailing whitespace on every tweet.
TWEET_DATA['Teksbersih'] = TWEET_DATA['Teksbersih'].map(remove_whitespace_LT)

# Collapse runs of whitespace into a single space.
def remove_whitespace_multiple(text):
    """Return ``text`` with every run of whitespace replaced by one space.

    The pattern is written as a raw string so that ``\\s`` reaches the regex
    engine as intended instead of being an invalid string escape.
    """
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in every tweet.
TWEET_DATA['Teksbersih'] = TWEET_DATA['Teksbersih'].map(remove_whitespace_multiple)

# Drop stand-alone single letters.
def remove_singl_char(text):
    """Return ``text`` with every isolated ASCII letter deleted.

    A letter counts as isolated when it has a word boundary on both sides;
    the surrounding whitespace is left in place.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Delete isolated single letters from every tweet.
TWEET_DATA['Teksbersih'] = TWEET_DATA['Teksbersih'].map(remove_singl_char)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return its result."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize each cleaned tweet and store the token lists in their own column.
TWEET_DATA['tweet_tokens'] = TWEET_DATA['Teksbersih'].map(word_tokenize_wrapper)

# Banner, first five token lists, then the blank-line spacer.
print(
    'Tokenizing Result : \n',
    TWEET_DATA['tweet_tokens'].head(),
    '\n\n\n',
    sep='\n',
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Tokenizing Result : 

0    [alun, al, kahfi, malam, moga, allah, rahmat, ...
1                       [beli, bensin, dah, pake, apk]
2    [warga, kota, tasikmalaya, pilih, beli, bbm, s...
3    [tuku, bbm, subsidi, sing, kudu, nganggo, apli...
4    [daftar, subsidi, tp, ga, save, data, kendara,...
Name: tweet_tokens, dtype: object






In [None]:
# Per-document token frequencies via NLTK.
def freqDist_wrapper(text):
    """Build and return an NLTK ``FreqDist`` from ``text`` (called here with a token list)."""
    distribution = FreqDist(text)
    return distribution

# One frequency distribution per tweet, built from its token list.
TWEET_DATA['tweet_tokens_fdist'] = TWEET_DATA['tweet_tokens'].map(freqDist_wrapper)

# Preview the first five rows as (token, count) pairs, most frequent first.
print(
    'Frequency Tokens : \n',
    TWEET_DATA['tweet_tokens_fdist'].head().apply(lambda fdist: fdist.most_common()),
    sep='\n',
)

Frequency Tokens : 

0    [(alun, 1), (al, 1), (kahfi, 1), (malam, 1), (...
1    [(beli, 1), (bensin, 1), (dah, 1), (pake, 1), ...
2    [(beli, 2), (bbm, 2), (warga, 1), (kota, 1), (...
3    [(tuku, 1), (bbm, 1), (subsidi, 1), (sing, 1),...
4    [(ga, 2), (daftar, 1), (subsidi, 1), (tp, 1), ...
Name: tweet_tokens_fdist, dtype: object


In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# ----------------------- get stopword from NLTK stopword -------------------------------
# Start from NLTK's Indonesian stopword list.
list_stopwords = stopwords.words('indonesian')
print(len(list_stopwords))

# ---------------------------- manually add stopword  -----------------------------------
# Append extra informal / abbreviated stopwords.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
# ----------------------- add stopword from csv file ------------------------------------
# Read the stopword file into a single "stopwords" column using pandas.
txt_stopword = pd.read_csv("stopwordbahasa.csv", names= ["stopwords"], header = None)

# Use EVERY row of the file, splitting each on whitespace. The previous code
# read only row 0, which silently ignored all but the first entry whenever the
# file stores one word per line; a single space-separated line still works.
for stopword_row in txt_stopword["stopwords"].dropna().astype(str):
    list_stopwords.extend(stopword_row.split())
# ---------------------------------------------------------------------------------------

# Convert the list to a set: duplicates vanish and the membership tests in
# stopwords_removal() become constant-time. The name is kept for its callers.
list_stopwords = set(list_stopwords)


# Remove stopwords from a token list.
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, order preserved."""
    return list(filter(lambda word: word not in list_stopwords, words))

# Token lists with stopwords filtered out ("WSW" column).
TWEET_DATA['tweet_tokens_WSW'] = TWEET_DATA['tweet_tokens'].map(stopwords_removal)


# Preview the first five filtered token lists.
print(TWEET_DATA['tweet_tokens_WSW'].head(5))

758
0    [alun, al, kahfi, malam, moga, allah, rahmat, ...
1                       [beli, bensin, dah, pake, apk]
2    [warga, kota, tasikmalaya, pilih, beli, bbm, s...
3    [tuku, bbm, subsidi, sing, kudu, nganggo, apli...
4    [daftar, subsidi, tp, save, data, kendara, mu,...
Name: tweet_tokens_WSW, dtype: object


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the normalisation table; its first column is used as the lookup key and
# its second column as the replacement.
normalizad_word = pd.read_csv("normalisasi.csv")

normalizad_word_dict = {}

# Pair the two columns explicitly by position (.iloc) instead of indexing each
# iterrows() row with row[0] / row[1]: integer keys on a label-indexed Series
# are deprecated in pandas. setdefault keeps the FIRST mapping seen for a key,
# the same rule as the former "if key not in dict" guard.
for raw_term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(raw_term, replacement)

def normalized_term(document):
    """Replace each term found in ``normalizad_word_dict`` by its mapped value.

    Terms without an entry are passed through unchanged; a new list is returned.
    """
    return [normalizad_word_dict.get(term, term) for term in document]

# Normalise the stopword-filtered tokens through the lookup table.
TWEET_DATA['tweet_normalized'] = TWEET_DATA['tweet_tokens_WSW'].map(normalized_term)

# Display the first ten normalised token lists.
TWEET_DATA['tweet_normalized'].head(n=10)

0    [alun, al, kahfi, malam, semoga, allah, rahmat...
1                      [beli, bensin, deh, pakai, apk]
2    [warga, kota, tasikmalaya, pilih, beli, bbm, s...
3    [tuku, bbm, subsidi, sing, harus, nganggo, apl...
4    [daftar, subsidi, tapi, save, data, kendara, m...
5    [solusi, efektif, tekan, beban, uang, negara, ...
6                                  [gin, melulu, dahh]
7    [syarat, beli, bahan, bakar, minyak, bbm, subs...
8    [antri, gara-gara, ternate, masuk, kota, ujico...
9    [beli, pertalite, solar, pakai, beli, tabung, ...
Name: tweet_normalized, dtype: object

In [None]:
# import Sastrawi package
!pip install Sastrawi
!pip install swifter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
# Create the stemmer: instantiate Sastrawi's StemmerFactory and ask it for a
# stemmer object. The result is kept in the module-level name `stemmer`, which
# stemmed_wrapper() below calls.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem one term.
def stemmed_wrapper(term):
    """Return the result of ``stemmer.stem`` for ``term``."""
    stem = stemmer.stem(term)
    return stem

# Collect every distinct normalized term once, in first-seen order, with a
# placeholder value. Stemming each unique term a single time avoids re-stemming
# the same word for every tweet that contains it.
term_dict = dict.fromkeys(
    (term for document in TWEET_DATA['tweet_normalized'] for term in document),
    ' ',
)

print(len(term_dict))
print("------------------------")

# Fill in the stem for each unique term (values only; the key set is unchanged).
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)

# Show a short sample instead of printing every term and then the whole dict:
# the full dump ran to thousands of lines and was truncated by the notebook.
for term, stem in list(term_dict.items())[:10]:
    print(term, ":", stem)
print("------------------------")


# Look up the stems for one document.
def get_stemmed_term(document):
    """Return the ``term_dict`` value for each term of ``document``, in order.

    Raises ``KeyError`` if a term has no entry in ``term_dict``.
    """
    stemmed_terms = []
    for term in document:
        stemmed_terms.append(term_dict[term])
    return stemmed_terms

# Map every normalized token list to its stems through swifter's apply
# accessor, store the result in a new column, then print that column.
TWEET_DATA['tweet_tokens_stemmed'] = TWEET_DATA['tweet_normalized'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['tweet_tokens_stemmed'])

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 5.1 MB/s 
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swifter
  Downloading swifter-1.1.4.tar.gz (647 kB)
[K     |████████████████████████████████| 647 kB 5.2 MB/s 
Collecting psutil>=5.6.6
  Downloading psutil-5.9.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 46.5 MB/s 
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 76.3 MB/s 
Collecting lock

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ok : ok
bpa : bpa
gmna : gmna
mksain : mksain
pkenya : pkenya
boros : boros
lurus : lurus
siur : siur
distributif : distributif
paka : paka
minus : minus
ale : ale
cec : cec
sambel : sambel
cabe : cabe
idul : idul
adha : adha
daging : daging
sapi : sapi
buktipelanggaran : buktipelanggaran
tahan : tahan
jul : jul
newsone : newsone
cariberitaditvone : cariberitaditvone
kripto : kripto
gateio : gateio
dailycryptonews : dailycryptonews
bitcoinetf : bitcoinetf
websitenya : websitenya
maksyarakat : maksyarakat
moderen : moderen
su : su
curahanhati : curahanhati
bnyak : bnyak
testimoni : testimoni
btp : btp
apl : apl
diterapin : diterapin
mlibatkan : mlibatkan
rusuh : rusuh
ptgs : ptgs
perspektif : perspektif
pedjabat : pedjabat
gaksie : gaksie
gampangin : gampangin
maksay : maksay
grabpools : grabpools
shio : shio
kerbau : kerbau
batikpoker : batikpoker
togelgrabpools : togelgrabpools
sihhh : sihhh
tega : tega
liga : liga
dibal

Pandas Apply:   0%|          | 0/6738 [00:00<?, ?it/s]

0       [alun, al, kahfi, malam, moga, allah, rahmat, ...
1                         [beli, bensin, deh, pakai, apk]
2       [warga, kota, tasikmalaya, pilih, beli, bbm, s...
3       [tuku, bbm, subsidi, sing, harus, nganggo, apl...
4       [daftar, subsidi, tapi, save, data, kendara, m...
                              ...                        
6733    [halah, curang, kontrol, isi, minyak, jarak, k...
6734    [app, tolong, bang, brando, kumandang, alias, ...
6735                                       [kerja, hasil]
6736    [lah, jaring, internet, remang, atur, aneh, ke...
6737    [barusan, beli, pertalite, eh, lupa, beli, per...
Name: tweet_tokens_stemmed, Length: 6738, dtype: object


In [None]:
# Write the full frame (original and derived columns) to CSV.
# NOTE(review): the row index is written as well; the loaded file already has
# an "Unnamed: 0" column, which looks like a previously saved index. Consider
# index=False — confirm with whatever reads this file.
TWEET_DATA.to_csv("Text_Preprocessing.csv")

In [None]:
# Display the first rows of the frame, including the columns added by the
# preprocessing steps.
TWEET_DATA.head()

Unnamed: 0,Teksbersih,translate,klasifikasi,tweet_tokens,tweet_tokens_fdist,tweet_tokens_WSW,tweet_normalized,tweet_tokens_stemmed
0,alun al kahfi malam moga allah rahmat jalan ba...,alun al kahfi night moga allah rahmat jalan ba...,negative,"[alun, al, kahfi, malam, moga, allah, rahmat, ...","{'alun': 1, 'al': 1, 'kahfi': 1, 'malam': 1, '...","[alun, al, kahfi, malam, moga, allah, rahmat, ...","[alun, al, kahfi, malam, semoga, allah, rahmat...","[alun, al, kahfi, malam, moga, allah, rahmat, ..."
1,beli bensin dah pake apk,buy gasoline already using apk,negative,"[beli, bensin, dah, pake, apk]","{'beli': 1, 'bensin': 1, 'dah': 1, 'pake': 1, ...","[beli, bensin, dah, pake, apk]","[beli, bensin, deh, pakai, apk]","[beli, bensin, deh, pakai, apk]"
2,warga kota tasikmalaya pilih beli bbm spbu dae...,residents of the city of tasikmalaya choose bu...,negative,"[warga, kota, tasikmalaya, pilih, beli, bbm, s...","{'warga': 1, 'kota': 1, 'tasikmalaya': 1, 'pil...","[warga, kota, tasikmalaya, pilih, beli, bbm, s...","[warga, kota, tasikmalaya, pilih, beli, bbm, s...","[warga, kota, tasikmalaya, pilih, beli, bbm, s..."
3,tuku bbm subsidi sing kudu nganggo aplikasi,tuku bbm subsidized sing kudu nganggo the appl,negative,"[tuku, bbm, subsidi, sing, kudu, nganggo, apli...","{'tuku': 1, 'bbm': 1, 'subsidi': 1, 'sing': 1,...","[tuku, bbm, subsidi, sing, kudu, nganggo, apli...","[tuku, bbm, subsidi, sing, harus, nganggo, apl...","[tuku, bbm, subsidi, sing, harus, nganggo, apl..."
4,daftar subsidi tp ga save data kendara ny mu e...,list of subsidies but not saving data kendara ...,negative,"[daftar, subsidi, tp, ga, save, data, kendara,...","{'daftar': 1, 'subsidi': 1, 'tp': 1, 'ga': 2, ...","[daftar, subsidi, tp, save, data, kendara, mu,...","[daftar, subsidi, tapi, save, data, kendara, m...","[daftar, subsidi, tapi, save, data, kendara, m..."


In [None]:
# Write the same frame to an Excel workbook.
# NOTE(review): presumably requires an Excel writer engine to be installed in
# the kernel's environment — confirm before running on a fresh kernel.
TWEET_DATA.to_excel("Text_Preprocessing_myPertamina.xlsx")