In [2]:
import pandas as pd
import numpy as np

# Load the tweet/sentiment workbook from the working directory and preview it
data = pd.read_excel(io="datasentimen.xlsx")
data.head(n=5)

Unnamed: 0,Tweet,Sentimen
0,Banyak otak manusia yang hancur gara gara coro...,Negatif
1,"Segeralah beranjak dari negeri ini hai corona,...",Positif
2,pas ada corona byk hal yg jd 'sakitnya ga ngot...,Negatif
3,corona kapan selese ak mau nonton konser anjim,Negatif
4,pemerintah plin plan ngasih izin mudik / nggak...,Negatif


In [3]:
#Cleansing
import string 
import re #regex library

def remove_tweet_special(text):
    """Strip Twitter-specific noise from one raw tweet.

    Steps, in order:
      1. non-ASCII characters (emoji, CJK, ...) are replaced by ``?``;
      2. a leading ``b'`` / ``b"`` left over from a stringified bytes object
         is dropped;
      3. literal ``\\xNN`` escapes, @mentions, #hashtags and URLs (including a
         bare ``scheme://``) are replaced by a space;
      4. the HTML entity ``&amp;`` and the stand-alone retweet marker ``RT``
         are removed;
      5. whitespace is collapsed to single spaces and trimmed.

    Parameters
    ----------
    text : str
        Raw tweet. ``None`` / NaN (an empty spreadsheet cell) yields ``""``;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # An empty Excel cell arrives as NaN (a float); it has no .encode().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')

    # remove the b' / b" prefix of a stringified bytes literal (start only,
    # so an ordinary word containing "b'" is left intact)
    text = re.sub(r"^b['\"]", "", text)

    # remove literal \xNN escapes, mentions, hashtags and links.
    # Only a real "\xNN" escape is matched, so words that merely contain an
    # "x" ("extra", "taxi") survive; \w covers "_" so handles such as
    # @user_name are removed whole.
    text = re.sub(r"\\x[0-9A-Fa-f]{2}|[@#]\w+|\w+://\S*", " ", text)

    # remove the &amp entity and the RT marker as whole tokens only, so that
    # words like "kampung" or "KARTU" are not mangled
    text = re.sub(r"&amp;?", " ", text)
    text = re.sub(r"\bRT\b", " ", text)

    # collapse the gaps left by the removals above
    return " ".join(text.split())
                
# Run the tweet-noise remover over every row; the 'Tweet' column is overwritten.
data['Tweet'] = data['Tweet'].map(remove_tweet_special)

#remove number
def remove_number(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Strip digits from every tweet; the 'Tweet' column is overwritten.
data['Tweet'] = data['Tweet'].map(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    # A translation table that maps a code point to None deletes that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Strip punctuation from every tweet; the 'Tweet' column is overwritten.
data['Tweet'] = data['Tweet'].map(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Return ``text`` with leading and trailing whitespace trimmed."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Trim both ends of every tweet; the 'Tweet' column is overwritten.
data['Tweet'] = data['Tweet'].map(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Tabs and newlines count as whitespace, so they are turned into a space
    as well. Leading/trailing runs become one space each; they are not
    trimmed here.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # raises a DeprecationWarning (a SyntaxWarning on Python 3.12+).
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in every tweet; the 'Tweet' column is overwritten.
data['Tweet'] = data['Tweet'].map(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Delete every ASCII letter that stands alone as a one-character word.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Final cleansing step: drop stray one-letter words, then tidy the whitespace
# that removal leaves behind (the strip/collapse steps above ran before it, so
# without this pass "ke x sana" would end up with a double space).
# remove_singl_char is applied once; a second pass would change nothing.
data['tweet_cleansing'] = (
    data['Tweet']
    .apply(remove_singl_char)
    .apply(remove_whitespace_multiple)
    .apply(remove_whitespace_LT)
)
# Keep 'Tweet' identical to the cleansed column, which later cells read.
data['Tweet'] = data['tweet_cleansing']

#view
print('Cleansing Result : \n')
print(data['Tweet'].head())
print('\n\n\n')

Cleansing Result : 

0    Banyak otak manusia yang hancur gara gara coro...
1    Segeralah beranjak dari negeri ini hai corona ...
2    pas ada corona byk hal yg jd sakitnya ga ngota...
3       corona kapan selese ak mau nonton konser anjim
4    pemerintah plin plan ngasih izin mudik nggak i...
Name: Tweet, dtype: object






In [4]:
# ------ Case Folding --------
# Lower-case every cleansed tweet with the pandas Series.str.lower() accessor
data['tweet_casefolding'] = data['tweet_cleansing'].str.lower()

# Header, first five rows and the trailing blank lines in one print call
print('Case Folding Result : \n', data['tweet_casefolding'].head(), '\n\n\n', sep='\n')

Case Folding Result : 

0    banyak otak manusia yang hancur gara gara coro...
1    segeralah beranjak dari negeri ini hai corona ...
2    pas ada corona byk hal yg jd sakitnya ga ngota...
3       corona kapan selese ak mau nonton konser anjim
4    pemerintah plin plan ngasih izin mudik nggak i...
Name: tweet_casefolding, dtype: object






In [5]:
# Negation words glued to the word that follows them, in the order the passes
# are applied. The order matters: a word merged by an earlier pass is no
# longer a separate token for a later pass, and a merge can itself create a
# later negation word (e.g. "ngga" + "k" -> "nggak").
_NEGATION_WORDS = (
    'gak', 'ga', 'ngga', 'gk', 'nggak', 'enggak',
    'tidak', 'tdk', 'tida', 'tak',
    'bukan', 'bkan', 'bkn',
    'jangan', 'jgn',
    'kurang',
)


def convert_negation(text):
    """Attach each negation word to the word that follows it.

    The text is split on whitespace once per entry of ``_NEGATION_WORDS``.
    In each pass every token is written back followed by a space, except a
    token equal to the current negation word, which is written without the
    space and therefore fuses with the next token ("ga suka" -> "gasuka").

    Parameters
    ----------
    text : str
        Whitespace-separated text; the comparison is case-sensitive, so it
        should already be lower-cased.

    Returns
    -------
    str
        The rewritten text. It ends with a single trailing space unless the
        last token is ``'kurang'`` (the final pass) or the input has no
        tokens, in which case ``""`` is returned.
    """
    for negation in _NEGATION_WORDS:
        text = ''.join(
            token if token == negation else token + ' '
            for token in text.split()
        )
    return text
              
# Glue negation words to their following word in every case-folded tweet
data['tweet_convertnegation'] = data['tweet_casefolding'].map(convert_negation)
data['tweet_convertnegation'].head(n=80)

0     banyak otak manusia yang hancur gara gara coro...
1     segeralah beranjak dari negeri ini hai corona ...
2     pas ada corona byk hal yg jd sakitnya gangotak...
3       corona kapan selese ak mau nonton konser anjim 
4     pemerintah plin plan ngasih izin mudik nggakit...
                            ...                        
75                                       jancok corona 
76    setidaknya gue bersyukur masih bisa doain lu p...
77    april mudahan corona ada obatnya udah seneng b...
78            kemarin jualan corona lalu jualan vaksin 
79    zaman corona emg banyak isis ini susah itu susah 
Name: tweet_convertnegation, Length: 80, dtype: object

In [6]:
# ------ Tokenizing ---------
# import word_tokenize from NLTK
from nltk.tokenize import word_tokenize 

def tokenizing(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenise every negation-merged tweet into a list of words
data['tweet_tokens'] = data['tweet_convertnegation'].map(tokenizing)

# First 80 rows followed by the trailing blank lines, in one print call
print(data['tweet_tokens'].head(80), '\n\n\n', sep='\n')

0     [banyak, otak, manusia, yang, hancur, gara, ga...
1     [segeralah, beranjak, dari, negeri, ini, hai, ...
2     [pas, ada, corona, byk, hal, yg, jd, sakitnya,...
3     [corona, kapan, selese, ak, mau, nonton, konse...
4     [pemerintah, plin, plan, ngasih, izin, mudik, ...
                            ...                        
75                                     [jancok, corona]
76    [setidaknya, gue, bersyukur, masih, bisa, doai...
77    [april, mudahan, corona, ada, obatnya, udah, s...
78      [kemarin, jualan, corona, lalu, jualan, vaksin]
79    [zaman, corona, emg, banyak, isis, ini, susah,...
Name: tweet_tokens, Length: 80, dtype: object






In [7]:
normalization_word = pd.read_excel("normalization.xlsx")

# Lookup table built from the first two columns of the workbook:
# first column -> second column. Judging by the results (e.g. "byk" ->
# "banyak") the first column holds the informal spelling and the second its
# normalised form -- TODO confirm against the workbook.
normalization_word_dict = {}

# Columns are addressed by position with .iloc: plain row[0] / row[1] on a
# label-indexed row relies on the deprecated positional fallback of
# Series.__getitem__. Zipping the two columns also avoids building a Series
# per row as iterrows() does.
for raw_term, normalized_term in zip(normalization_word.iloc[:, 0],
                                     normalization_word.iloc[:, 1]):
    # setdefault keeps the first mapping when a term is listed more than once
    normalization_word_dict.setdefault(raw_term, normalized_term)

def normalization_term(document):
    """Return a new list in which every term found in ``normalization_word_dict``
    is replaced by its mapped value; terms without an entry are kept as they are.
    """
    normalized_terms = []
    for token in document:
        # .get(token, token) falls back to the token itself when it is not a key
        normalized_terms.append(normalization_word_dict.get(token, token))
    return normalized_terms

# Replace dictionary-listed terms in every token list
data['tweet_normalization'] = data['tweet_tokens'].map(normalization_term)

data['tweet_normalization'].head(n=80)

0     [banyak, otak, manusia, yang, hancur, gara, ga...
1     [segeralah, beranjak, dari, negeri, ini, hai, ...
2     [pas, ada, corona, banyak, hal, yang, jadi, sa...
3     [corona, kapan, selese, aku, mau, menonton, ko...
4     [pemerintah, plin, plan, kasih, izin, mudik, n...
                            ...                        
75                                     [jancok, corona]
76    [setidaknya, saya, bersyukur, masih, bisa, doa...
77    [april, mudahan, corona, ada, obatnya, sudah, ...
78      [kemarin, jualan, corona, lalu, jualan, vaksin]
79    [zaman, corona, memang, banyak, isis, ini, sus...
Name: tweet_normalization, Length: 80, dtype: object

In [11]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

stopword_factory = StopWordRemoverFactory()

# Read the extra stop words, one per line, from stopword_id.txt.
# The with-block closes the file as soon as it has been read; a bare open()
# left the handle open for the lifetime of the kernel.
with open("stopword_id.txt", "r") as stopwords_txt:
    # rstrip() drops the newline (and any other trailing whitespace)
    stopword_dict = [word.rstrip() for word in stopwords_txt]

# Full stop-word list: the words returned by the Sastrawi factory followed by
# the ones read from the file.
list_stopword = stopword_factory.get_stop_words() + stopword_dict

# Hashed snapshot of list_stopword: a membership test on the list scans it
# from the start for every token, while the set answers in constant time.
# It is taken when this cell runs, so re-run the cell after changing
# list_stopword.
_stopword_lookup = frozenset(list_stopword)


def remove_stopword(words):
    """Return the tokens of ``words`` that are not stop words.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens not present in ``list_stopword``, in their original order
        (duplicates are kept).
    """
    return [word for word in words if word not in _stopword_lookup]

# Drop stop words from every normalised token list
data['tweet_stopwordremoval'] = data['tweet_normalization'].map(remove_stopword)
data['tweet_stopwordremoval'].head(n=80)

0     [otak, manusia, hancur, gara, gara, corona, ku...
1     [segeralah, beranjak, negeri, hai, corona, pen...
2                  [pas, corona, sakitnya, gangotakhem]
3             [corona, selese, menonton, konser, anjim]
4     [pemerintah, plin, plan, kasih, izin, mudik, n...
                            ...                        
75                                     [jancok, corona]
76    [bersyukur, doakan, pas, sedih, terjangkit, co...
77    [april, mudahan, corona, obatnya, senang, bang...
78            [kemarin, jualan, corona, jualan, vaksin]
79                  [zaman, corona, isis, susah, susah]
Name: tweet_stopwordremoval, Length: 80, dtype: object

In [12]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build a stemmer through the Sastrawi factory. `stemmer` is the shared
# instance whose .stem() method Stem() calls for every word.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def Stem(remove_stopword):
    """Return a new list holding ``stemmer.stem(word)`` for each word given.

    ``remove_stopword`` is the list of tokens to stem; inside this function
    the parameter shadows the module-level function of the same name.
    """
    stemmed_words = []
    for token in remove_stopword:
        stemmed_words.append(stemmer.stem(token))
    return stemmed_words

# Stem every token that survived stop-word removal
data['tweet_stemming'] = data['tweet_stopwordremoval'].map(Stem)
data['tweet_stemming'].head(n=80)

0     [otak, manusia, hancur, gara, gara, corona, ku...
1     [segera, anjak, negeri, hai, corona, didik, be...
2                     [pas, corona, sakit, gangotakhem]
3               [corona, selese, tonton, konser, anjim]
4     [perintah, plin, plan, kasih, izin, mudik, ngg...
                            ...                        
75                                     [jancok, corona]
76           [syukur, doa, pas, sedih, jangkit, corona]
77    [april, mudah, corona, obat, senang, banget, sih]
78                [kemarin, jual, corona, jual, vaksin]
79                  [zaman, corona, isis, susah, susah]
Name: tweet_stemming, Length: 80, dtype: object

In [13]:
# Write the frame, including every intermediate preprocessing column, to Excel
data.to_excel(excel_writer="Preprocessing.xlsx")