# Text Pre-Processing

In [1]:
import pandas as pd
import numpy as np
import re
import itertools
import collections
import string 
from collections import OrderedDict

In [2]:
raw = pd.read_csv( "Data Tweet Vaksinasi Jakarta.csv")

In [3]:
stop_words = np.array(pd.read_csv("data_support/stopwords_ID.txt",
                        sep="\n", header=None).values)
neg_words = np.array(pd.read_csv("data_support/negative_keyword_ID.txt",
                        sep="\n", header=None).values)
pos_words = np.array(pd.read_csv("data_support/positive_keyword_ID.txt",
                        sep="\n", header=None).values)
slang_words = pd.read_csv("data_support/kbba_ID.txt",
                        sep="\t", header=None)
root_words = np.array(pd.read_csv("data_support/rootword_ID.txt",
                        sep="\n", header=None).values)
slang = pd.read_csv("data_support/slangword_ID.txt",
                        sep=":", header=None)
emoticon = pd.read_csv("data_support/emoticon.txt",
                        sep="\t", header=None)
booster_words = np.array(pd.read_csv("data_support/boosterword_ID.txt",
                        sep="\n", header=None).values)
baku_words = pd.read_csv("data_support/katabaku_ID.txt",
                        sep="|", header=None)
baku_words.columns = [1,0]

In [4]:
slang_words = pd.concat([slang_words, slang, baku_words])
sentiment_words = np.concatenate((pos_words, neg_words, booster_words))
slang_words.drop_duplicates(inplace=True)
emoticon.drop_duplicates(inplace=True)
emoticon = dict(zip(emoticon[0], emoticon[1]))
slang_words = dict(zip(slang_words[0],slang_words[1]))
neg_words = np.unique(neg_words)
pos_words = np.unique(pos_words)
stop_words = np.unique(stop_words)
stop_words = [word for word in stop_words if word not in sentiment_words]

In [5]:
print("------Dictionary Info---------")
print("Slang words = "  + str(len(slang_words)) + " entries")
print("Emoticon = "  + str(len(emoticon)) + " entries" )
print("Root words = "  + str(len(root_words)) + " entries")
print("Sentiment words = "  + str(len(sentiment_words)) + " entries")

------Dictionary Info---------
Slang words = 2398 entries
Emoticon = 110 entries
Root words = 27979 entries
Sentiment words = 8449 entries


In [6]:
print(raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   created_at      25000 non-null  object
 1   user            25000 non-null  object
 2   location        24667 non-null  object
 3   coordinates     48 non-null     object
 4   text            25000 non-null  object
 5   retweet_count   25000 non-null  int64 
 6   favorite_count  25000 non-null  int64 
 7   id              25000 non-null  object
 8   nama_kota       25000 non-null  object
dtypes: int64(2), object(7)
memory usage: 1.7+ MB
None


In [7]:
raw = raw.dropna(subset=['text'],how='all')
raw = raw.drop_duplicates(subset=['text'])

In [8]:
print(raw.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5380 entries, 0 to 19972
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   created_at      5380 non-null   object
 1   user            5380 non-null   object
 2   location        5291 non-null   object
 3   coordinates     13 non-null     object
 4   text            5380 non-null   object
 5   retweet_count   5380 non-null   int64 
 6   favorite_count  5380 non-null   int64 
 7   id              5380 non-null   object
 8   nama_kota       5380 non-null   object
dtypes: int64(2), object(7)
memory usage: 420.3+ KB
None


Prepare the Helper Text 


1.   Angka Satuan & Translate Angka (0-9) (Satu, Dua, Tiga)
2.   Translate Repeating Words, Emoticon, Non-Alpha Numeric, Normalize Word
3.   Delete Suffix






In [9]:
satuan = ['', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
          'delapan', 'sembilan', 'sepuluh', 'sebelas']
    
def terbilang_(n):
    if n >= 0 and n <= 11:
        hasil = [satuan[n]]
    elif n >= 12 and n <= 19:
        hasil = terbilang_(n % 10) + ['belas']
    elif n >= 20 and n <= 99:
        hasil = terbilang_(n // 10) + ['puluh'] + terbilang_(n % 10)
    elif n >= 100 and n <= 199:
        hasil = ['seratus'] + terbilang_(n - 100)
    elif n >= 200 and n <= 999:
        hasil = terbilang_(n // 100) + ['ratus'] + terbilang_(n % 100)
    elif n >= 1000 and n <= 1999:
        hasil = ['seribu'] + terbilang_(n - 1000)
    elif n >= 2000 and n <= 999999:
        hasil = terbilang_(n // 1000) + ['ribu'] + terbilang_(n % 1000)
    elif n >= 1000000 and n <= 999999999:
        hasil = terbilang_(n // 1000000) + ['juta'] + terbilang_(n % 1000000)
    else:
        hasil = terbilang_(n // 1000000000) + ['milyar'] + terbilang_(n % 100000000)
    return hasil

def terbilang(n):
    if n == 0:
        return 'nol'
    t = terbilang_(n)
    while '' in t:
        t.remove('')
    return ' '.join(t)

def translate_angka(review):
    number = re.findall("[0-9]+",review)
    for digit in number:
        review = re.sub(digit,terbilang(int(digit))+" ",review)
    return review

In [10]:
def translate_repeating_words(review):
    repeating_words = re.findall(r'\w*(?:2|")',review)
    for word in repeating_words:
        cleaned = word[:-1]
        review = re.sub(word,cleaned + " " + cleaned, review)
    return review

In [11]:
def delete_suffix_nya(review):
    return re.sub("(?:nya|ny)[$|\s]"," ",review)

In [12]:
def translate_emoticon(t):
    for w, v in emoticon.items():
        pattern = re.compile(re.escape(w))
        match = re.search(pattern,t)
        if match:
            t = re.sub(pattern,v,t)
    return t

In [13]:
def translate_non_alpha_num(t):
    non_alpha_num = {
        '%' : 'persen',
        '$' : 'dolar',
        '@' : 'di',
        '&' : 'dan',
        '/' : 'atau',
        '+' : 'plus'
    }
    for w, v in non_alpha_num.items():
        pattern = re.compile(re.escape(w))
        match = re.search(pattern,t)
        if match:
            t = re.sub(pattern,v + " ",t)
    return t

In [14]:
def remove_non_alphanumeric(review):
    return re.sub("[^a-zA-Z\d]"," ", review)

In [15]:

def normalizing_words(review):
    return ''.join(''.join(s)[:1] for _, s in itertools.groupby(review))

Data Cleaning Process 

1.   Translate Repeating Words
2.   Translate Emoticon
3.   Lower Words
4.   Translate Non Alpha Numeric & Remove Non Alpha Numeric
5.   Delete Suffix
6.   Translate Angka
7.   Normalize Words
8.   Spell Checking (Norvig's Algorithm)
8.   Remove Stopword 




In [16]:
raw['text'] = raw['text'].apply(translate_repeating_words)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,Jajaran Polsek Ciledug Polres Metro Tangerang ...,0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"Abis tu kebangun, lsg brasa lrga cm mimpi.\n\n...",0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,@bertanyarl Loh bersyukur masih bisa dapet ast...,0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,@trianadewi_td Dipegangin pak bupati biar tida...,0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,buset abis vaksin gue makan muluu,0,0,ID3172,JAKARTA TIMUR


In [17]:

raw['text'] = raw['text'].apply(translate_emoticon)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,Jajaran Polsek Ciledug Polres Metro Tangerang ...,0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"Abis tu kebangun, lsg brasa lrga cm mimpi.\n\n...",0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,@bertanyarl Loh bersyukur masih bisa dapet ast...,0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,@trianadewi_td Dipegangin pak bupati biar tida...,0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,buset abis vaksin gue makan muluu,0,0,ID3172,JAKARTA TIMUR


In [18]:
raw['text'] = raw['text'].apply(lambda x: x.lower())
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,jajaran polsek ciledug polres metro tangerang ...,0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"abis tu kebangun, lsg brasa lrga cm mimpi.\n\n...",0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,@bertanyarl loh bersyukur masih bisa dapet ast...,0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,@trianadewi_td dipegangin pak bupati biar tida...,0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,buset abis vaksin gue makan muluu,0,0,ID3172,JAKARTA TIMUR


In [19]:
raw['text'] = raw['text'].apply(translate_non_alpha_num)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,jajaran polsek ciledug polres metro tangerang ...,0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"abis tu kebangun, lsg brasa lrga cm mimpi.\n\n...",0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,di bertanyarl loh bersyukur masih bisa dapet a...,0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,di trianadewi_td dipegangin pak bupati biar ti...,0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,buset abis vaksin gue makan muluu,0,0,ID3172,JAKARTA TIMUR


In [20]:
raw['text'] = raw['text'].apply(remove_non_alphanumeric)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,jajaran polsek ciledug polres metro tangerang ...,0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,abis tu kebangun lsg brasa lrga cm mimpi se...,0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,di bertanyarl loh bersyukur masih bisa dapet a...,0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,di trianadewi td dipegangin pak bupati biar ti...,0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,buset abis vaksin gue makan muluu,0,0,ID3172,JAKARTA TIMUR


In [21]:
raw['text'] = raw['text'].apply(delete_suffix_nya)

In [22]:
raw['text'] = raw['text'].apply(translate_angka)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,jajaran polsek ciledug polres metro tangerang ...,0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,abis tu kebangun lsg brasa lrga cm mimpi se...,0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,di bertanyarl loh bersyukur masih bisa dapet a...,0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,di trianadewi td dipegangin pak bupati biar ti...,0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,buset abis vaksin gue makan muluu,0,0,ID3172,JAKARTA TIMUR


In [23]:
raw['text'] = raw['text'].apply(normalizing_words)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,jajaran polsek ciledug polres metro tangerang ...,0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,abis tu kebangun lsg brasa lrga cm mimpi segit...,0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,di bertanyarl loh bersyukur masih bisa dapet a...,0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,di trianadewi td dipegangin pak bupati biar ti...,0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,buset abis vaksin gue makan mulu,0,0,ID3172,JAKARTA TIMUR


In [24]:
raw['text'] = raw['text'].apply(lambda x: x.split())
raw['text'] = raw['text'].apply(lambda x: np.array(x))
def mapping_slang_words(review):
    return [slang_words[word] if word in slang_words else word for word in review]
raw['text'] = raw['text'].apply(mapping_slang_words)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,"[jajaran, polsek, ciledug, polres, metro, tang...",0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"[habis, tu, kebangun, lsg, brasa, lrga, cuma, ...",0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,"[di, bertanyarl, lho, bersyukur, masih, bisa, ...",0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,"[di, trianadewi, tadi, dipegangin, pak, bupati...",0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,"[buset, habis, vaksin, aku, makan, melulu]",0,0,ID3172,JAKARTA TIMUR


In [25]:
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,"[jajaran, polsek, ciledug, polres, metro, tang...",0,0,ID3172,JAKARTA TIMUR
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"[habis, tu, kebangun, lsg, brasa, lrga, cuma, ...",0,0,ID3172,JAKARTA TIMUR
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,"[di, bertanyarl, lho, bersyukur, masih, bisa, ...",0,0,ID3172,JAKARTA TIMUR
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,"[di, trianadewi, tadi, dipegangin, pak, bupati...",0,0,ID3172,JAKARTA TIMUR
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,"[buset, habis, vaksin, aku, makan, melulu]",0,0,ID3172,JAKARTA TIMUR


In [26]:
raw = raw.dropna(subset=['text'],how='all')
raw = raw[raw['text'].map(len) > 0]
def avg_word(words):
  return (sum(len(word) for word in words)/len(words))

raw['avg_word'] = raw['text'].apply(lambda x: avg_word(x))
raw['word_count'] = raw['text'].map(len)

In [27]:
raw

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota,avg_word,word_count
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,"[jajaran, polsek, ciledug, polres, metro, tang...",0,0,ID3172,JAKARTA TIMUR,5.681818,22
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"[habis, tu, kebangun, lsg, brasa, lrga, cuma, ...",0,0,ID3172,JAKARTA TIMUR,4.629630,27
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,"[di, bertanyarl, lho, bersyukur, masih, bisa, ...",0,0,ID3172,JAKARTA TIMUR,5.200000,25
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,"[di, trianadewi, tadi, dipegangin, pak, bupati...",0,0,ID3172,JAKARTA TIMUR,5.181818,11
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,"[buset, habis, vaksin, aku, makan, melulu]",0,0,ID3172,JAKARTA TIMUR,5.000000,6
...,...,...,...,...,...,...,...,...,...,...,...
14997,2021-08-13 5:41:58,dityaawahyu,"Pamulang, Indonesia",,"[halo, kalau, ada, teman, sodara, atau, sekita...",3,1,ID3171,JAKARTA SELATAN,5.040000,25
17629,2021-08-13 19:45:00,dicxvlr,"Tangerang, INDONESIA",,"[sakit, kepala, sekali, abs, vaksin]",0,0,ID3173,JAKARTA PUSAT,5.200000,5
19943,2021-08-13 5:01:52,vascodagambir,"Cikarang, Indonesia",,"[di, helianthus, an, di, gringozika, iya, kan,...",0,0,ID3173,JAKARTA PUSAT,4.933333,15
19945,2021-08-13 5:01:25,rakapradana07,"Jakarta Capital Region, Indonesia",,"[dari, kemarin, mencari, apakah, ada, badan, s...",0,0,ID3173,JAKARTA PUSAT,5.178571,28


In [28]:
raw.to_csv("data_tweet_vaksinasi_cleaned.csv", index=False)

# Data Cleaning V.2

In [29]:
import pandas as pd
import numpy as np
import string
import re
from collections import Counter

In [31]:
raw = pd.read_csv( "data_tweet_vaksinasi_cleaned.csv")
slang_words = pd.read_csv("data_support/kbba_ID.txt",
                        sep="\t", header=None)
slang = pd.read_csv("data_support/slangword_ID.txt",
                        sep=":", header=None)
baku_words = pd.read_csv("data_support/katabaku_ID.txt",
                        sep="|", header=None)
baku_words.columns = [1,0]
slang_words = pd.concat([slang_words, slang, baku_words])

In [32]:
slang_words.drop_duplicates(inplace=True)
slang_words = dict(zip(slang_words[0],slang_words[1]))
print(raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5380 entries, 0 to 5379
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   created_at      5380 non-null   object 
 1   user            5380 non-null   object 
 2   location        5291 non-null   object 
 3   coordinates     13 non-null     object 
 4   text            5380 non-null   object 
 5   retweet_count   5380 non-null   int64  
 6   favorite_count  5380 non-null   int64  
 7   id              5380 non-null   object 
 8   nama_kota       5380 non-null   object 
 9   avg_word        5380 non-null   float64
 10  word_count      5380 non-null   int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 462.5+ KB
None


In [33]:
def remove_numeric(review):
    return re.sub("\d"," ", review)

raw['text'] = raw['text'].apply(remove_numeric)

In [34]:
def csv_string_to_list(csv_string):
    return csv_string[1:-1].split()

def string_without_quotes(word_list):
    new  = []
    for word in word_list:
        new.append(word.replace("'",""))
    return new

def delete_suffix_nya(review):
    return re.sub("(?:nya|ny)[$|\s]"," ",review)
    
raw['text'] = raw['text'].apply(delete_suffix_nya)
raw['text']  = raw['text'].apply(csv_string_to_list)
raw['text']  = raw['text'].apply(string_without_quotes)
raw['text'] 

0       [jajaran,, polsek,, ciledug,, polres,, metro,,...
1       [habis,, tu,, kebangun,, lsg,, brasa,, lrga,, ...
2       [di,, bertanyarl,, lho,, bersyukur,, masih,, b...
3       [di,, trianadewi,, tadi,, dipegangin,, pak,, b...
4         [buset,, habis,, vaksin,, aku,, makan,, melulu]
                              ...                        
5375    [halo,, kalau,, ada,, teman,, sodara,, atau,, ...
5376             [sakit,, kepala,, sekali,, abs,, vaksin]
5377    [di,, helianthus,, an,, di,, gringozika,, iya,...
5378    [dari,, kemarin,, mencari,, apakah,, ada,, bad...
5379    [di,, ekokuntadhi,, memang,, gabener,, bisa,, ...
Name: text, Length: 5380, dtype: object

In [35]:
def mapping_slang_words(review):
    return [slang_words[word] if word in slang_words else word for word in review]
raw['text'] = raw['text'].apply(mapping_slang_words)
raw['text']

0       [jajaran,, polsek,, ciledug,, polres,, metro,,...
1       [habis,, tu,, kebangun,, lsg,, brasa,, lrga,, ...
2       [di,, bertanyarl,, lho,, bersyukur,, masih,, b...
3       [di,, trianadewi,, tadi,, dipegangin,, pak,, b...
4         [buset,, habis,, vaksin,, aku,, makan,, melulu]
                              ...                        
5375    [halo,, kalau,, ada,, teman,, sodara,, atau,, ...
5376             [sakit,, kepala,, sekali,, abs,, vaksin]
5377    [di,, helianthus,, an,, di,, gringozika,, iya,...
5378    [dari,, kemarin,, mencari,, apakah,, ada,, bad...
5379    [di,, ekokuntadhi,, memang,, gabener,, bisa,, ...
Name: text, Length: 5380, dtype: object

In [36]:
def remove_single_alphabet_only(review):
    return [word for word in review if word not in string.ascii_lowercase]

def remove_too_short_words(review):
    return [word for word in review if len(word) > 2]
raw['text'] = raw['text'].apply(remove_single_alphabet_only)
raw['text'] = raw['text'].apply(remove_too_short_words)

In [37]:
raw = raw.dropna(subset=['text'],how='all')
raw = raw[raw['text'].map(len) > 0]
raw['word_count'] = raw['text'].map(len)
def convert_list_to_string(word_list):
    return ",".join(word_list)
raw['text'] = raw['text'].apply(convert_list_to_string)
raw = raw.dropna(subset=['text'],how='all')
raw = raw[raw['text'].map(len) > 0]

In [38]:
def avg_word(words):
  return (sum(len(word) for word in words)/len(words))

raw['avg_word'] = raw['text'].apply(lambda x: avg_word(x.split(",")))

In [39]:
raw.to_csv("data_tweet_vaksinasi_cleaned_v2.csv", index=False)

# Final Pre-processing & Stemming

In [40]:
import pandas as pd
import numpy as np
import string
import re
from collections import Counter

In [41]:
raw = pd.read_csv( "data_tweet_vaksinasi_cleaned_v2.csv")
slang_words = pd.read_csv("data_support/kbba_ID.txt",
                        sep="\t", header=None)
slang = pd.read_csv("data_support/slangword_ID.txt",
                        sep=":", header=None)
baku_words = pd.read_csv("data_support/katabaku_ID.txt",
                        sep="|", header=None)
baku_words.columns = [1,0]
slang_words = pd.concat([slang_words, slang, baku_words])

In [42]:
slang_words.drop_duplicates(inplace=True)
slang_words = dict(zip(slang_words[0],slang_words[1]))
raw['text'] = raw['text'].apply(lambda x: x.replace(","," "))
def delete_suffix_nya(review):
    return re.sub("(?:nya|ny|y)[$|\s]"," ",review)
    
raw['text'] = raw['text'].apply(delete_suffix_nya)
raw['text'] = raw['text'].apply(lambda x: x.split())

In [43]:
def mapping_slang_words(review):
    return [slang_words[word] if word in slang_words else word for word in review]
raw['text'] = raw['text'].apply(mapping_slang_words)

In [44]:
def convert_list_to_string(word_list):
    return ",".join(word_list)
raw['text'] = raw['text'].apply(convert_list_to_string)
raw = raw.dropna(subset=['text'],how='all')
raw = raw[raw['text'].map(len) > 0]

In [45]:
def avg_word(words):
  return (sum(len(word) for word in words)/len(words))

raw['avg_word'] = raw['text'].apply(lambda x: avg_word(x.split(",")))

In [46]:
raw.to_csv("data_tweet_vaksinasi_cleaned_pure.csv", index=False)

In [47]:
stop_words = np.array(pd.read_csv("data_support/stopwords_ID.txt",
                        sep="\n", header=None).values)
neg_words = np.array(pd.read_csv("data_support/negative_keyword_ID.txt",
                        sep="\n", header=None).values)
pos_words = np.array(pd.read_csv("data_support/positive_keyword_ID.txt",
                        sep="\n", header=None).values)
booster_words = np.array(pd.read_csv("data_support/boosterword_ID.txt",
                        sep="\n", header=None).values)
sentiment_words = np.concatenate((pos_words, neg_words, booster_words))
stop_words = np.reshape(stop_words, -1)

In [48]:
raw.head()
raw['text'] = raw['text'].apply(lambda x: x.split(","))

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)
tvec_weights = vectorizer.fit_transform(raw['text'])
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': vectorizer.get_feature_names(), 'weight': weights})
weights_df = weights_df.sort_values(by='weight', ascending=False)
domain_spesific_stop_words = np.array(weights_df['term'][:200].values)
stop_words = np.concatenate((domain_spesific_stop_words,stop_words[:,]))
stop_words = [word for word in stop_words if word not in sentiment_words]
stop_words = np.unique(stop_words)

In [50]:
def remove_stop_words(word_list):
    return [word for word in word_list if word not in stop_words]
raw['text'] = raw['text'].apply(remove_stop_words)
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota,avg_word,word_count
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,"[jajaran, polsek, ciledug, polres, metro, tang...",0,0,ID3172,JAKARTA TIMUR,5.904762,21
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"[habis, tu, kebangun, lsg, brasa, lrga, hanya,...",0,0,ID3172,JAKARTA TIMUR,4.807692,26
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,"[bertanyarl, lho, bersyukur, bisa, astra, bera...",0,0,ID3172,JAKARTA TIMUR,5.375,24
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,"[trianadewi, dipegangin, bupati, biar, tidak, ...",0,0,ID3172,JAKARTA TIMUR,5.181818,11
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,"[buset, habis, makan, melulu]",0,0,ID3172,JAKARTA TIMUR,5.0,6


In [None]:
raw = raw.dropna(subset=['text'],how='all')
raw = raw[raw['text'].map(len) > 0]
def avg_word(words):
  return (sum(len(word) for word in words)/len(words))

raw['avg_word'] = raw['text'].apply(lambda x: avg_word(x))
raw['word_count'] = raw['text'].map(len)
raw['review'] = raw['text'].apply(convert_list_to_string)

In [51]:
raw.to_csv("data_tweet_vaksinasi_cleaned_stop_removed.csv", index=False)

#Stemming with Sastrawi
Stemming is basically removing the suffix from a word and reduce it to its root word.
For example: “Flying” is a word and its suffix is “ing”, if we remove “ing” from “Flying” then we will get base word or root word which is “Fly”.

In [52]:

import pandas as pd
import numpy as np
import string
import re
from collections import Counter

In [53]:
raw = pd.read_csv( "data_tweet_vaksinasi_cleaned_stop_removed.csv")

In [54]:
print(raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5380 entries, 0 to 5379
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   created_at      5380 non-null   object 
 1   user            5380 non-null   object 
 2   location        5291 non-null   object 
 3   coordinates     13 non-null     object 
 4   text            5380 non-null   object 
 5   retweet_count   5380 non-null   int64  
 6   favorite_count  5380 non-null   int64  
 7   id              5380 non-null   object 
 8   nama_kota       5380 non-null   object 
 9   avg_word        5380 non-null   float64
 10  word_count      5380 non-null   int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 462.5+ KB
None


In [55]:
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota,avg_word,word_count
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,"['jajaran', 'polsek', 'ciledug', 'polres', 'me...",0,0,ID3172,JAKARTA TIMUR,5.904762,21
1,2021-08-14 14:53:42,dryxanne,Jakarta,,"['habis', 'tu', 'kebangun', 'lsg', 'brasa', 'l...",0,0,ID3172,JAKARTA TIMUR,4.807692,26
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,"['bertanyarl', 'lho', 'bersyukur', 'bisa', 'as...",0,0,ID3172,JAKARTA TIMUR,5.375,24
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,"['trianadewi', 'dipegangin', 'bupati', 'biar',...",0,0,ID3172,JAKARTA TIMUR,5.181818,11
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,"['buset', 'habis', 'makan', 'melulu']",0,0,ID3172,JAKARTA TIMUR,5.0,6


In [56]:
raw['text'] = raw['text'].apply(lambda x: x.replace(","," "))
raw['text'] = raw['text'].apply(lambda x: x.replace(",,"," "))

In [57]:
raw.head()

Unnamed: 0,created_at,user,location,coordinates,text,retweet_count,favorite_count,id,nama_kota,avg_word,word_count
0,2021-08-14 14:53:43,restrotangkot,"Jl. Daan Mogot No.5, Tangerang",,['jajaran' 'polsek' 'ciledug' 'polres' 'me...,0,0,ID3172,JAKARTA TIMUR,5.904762,21
1,2021-08-14 14:53:42,dryxanne,Jakarta,,['habis' 'tu' 'kebangun' 'lsg' 'brasa' 'l...,0,0,ID3172,JAKARTA TIMUR,4.807692,26
2,2021-08-14 14:53:24,bukanlucinta,"Jakarta, Indonesia.",,['bertanyarl' 'lho' 'bersyukur' 'bisa' 'as...,0,0,ID3172,JAKARTA TIMUR,5.375,24
3,2021-08-14 14:53:09,mporatne,"DKI Jakarta, Indonesia",,['trianadewi' 'dipegangin' 'bupati' 'biar' ...,0,0,ID3172,JAKARTA TIMUR,5.181818,11
4,2021-08-14 14:52:20,whopsy_opsy,Jakarta,,['buset' 'habis' 'makan' 'melulu'],0,0,ID3172,JAKARTA TIMUR,5.0,6


In [58]:
!pip install Sastrawi

Defaulting to user installation because normal site-packages is not writeable
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[K     |################################| 209 kB 2.6 MB/s eta 0:00:01
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [59]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [60]:
raw['text']

0       ['jajaran'  'polsek'  'ciledug'  'polres'  'me...
1       ['habis'  'tu'  'kebangun'  'lsg'  'brasa'  'l...
2       ['bertanyarl'  'lho'  'bersyukur'  'bisa'  'as...
3       ['trianadewi'  'dipegangin'  'bupati'  'biar' ...
4                   ['buset'  'habis'  'makan'  'melulu']
                              ...                        
5375    ['sodara'  'lingkungan'  'terkena'  'tolong'  ...
5376                 ['sakit'  'kepala'  'sekali'  'abs']
5377    ['helianthus'  'an'  'gringozika'  'dimasukin'...
5378    ['mencari'  'badan'  'sertifikasi'  'vacine'  ...
5379    ['ekokuntadhi'  'memang'  'gabener'  'bisa'  '...
Name: text, Length: 5380, dtype: object

In [61]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
raw['text'] = raw['text'].apply(lambda x: stemmer.stem(x).replace(" ",","))

In [62]:
def avg_word(words):
  return (sum(len(word) for word in words)/len(words))

raw['avg_word'] = raw['text'].apply(lambda x: avg_word(x.split(",")))

In [63]:
raw.to_csv("data_tweet_vaksinasi_cleaned_stemmed.csv", index=False)