# Data Cleaning
---------------------
## Objective
+ Unstructured text data --> normalized word list
+ Extract features from text data
----------------------
## Method
+ Number to Words Translation
+ Case to Lower
+ Emoticon to Words Translation
+ Symbol to Words Translation
+ Translate Reduplication
+ Non-Alphanumeric removal
+ Slang and Abbreviation Lookup
+ Word Standardization
+ Spell correction
+ Tokenization
-------------------------------

### As we can see, our data is *super messy*, so we need to clean it up

In [1]:
import pandas as pd
import numpy as np
import re
import itertools
import collections
import string
from collections import OrderedDict

In [2]:
# Load the raw review dataset (the info() output further down shows two
# columns: rating and review).
raw = pd.read_csv( "../data/raw/lazada_reviews.csv")

In [3]:
# Load the external lexicons used by the cleaning steps below.
# Files read with sep="\n" are treated as one entry per line; taking .values
# of the resulting single-column frame yields a 2-D (n, 1) array.
# NOTE(review): newer pandas releases are reported to reject "\n" as a
# separator -- confirm against the installed version.
stop_words = np.array(pd.read_csv("../data/external/stopwords_ID.txt",
                        sep="\n", header=None).values)
neg_words = np.array(pd.read_csv("../data/external/negative_keyword_ID.txt",
                        sep="\n", header=None).values)
pos_words = np.array(pd.read_csv("../data/external/positive_keyword_ID.txt",
                        sep="\n", header=None).values)
# Tab-separated two-column table; kept as a DataFrame.
slang_words = pd.read_csv("../data/external/kbba_ID.txt",
                        sep="\t", header=None)
root_words = np.array(pd.read_csv("../data/external/rootword_ID.txt",
                        sep="\n", header=None).values)
# Colon-separated table; kept as a DataFrame.
slang = pd.read_csv("../data/external/slangword_ID.txt",
                        sep=":", header=None)
# Tab-separated table; kept as a DataFrame.
emoticon = pd.read_csv("../data/external/emoticon.txt",
                        sep="\t", header=None)
booster_words = np.array(pd.read_csv("../data/external/boosterword_ID.txt",
                        sep="\n", header=None).values)
# Pipe-separated table; kept as a DataFrame.
baku_words = pd.read_csv("../data/external/katabaku_ID.txt",
                        sep="|", header=None)
# Relabel so the first file column is 1 and the second is 0 -- presumably so
# this table lines up with the other slang tables when concatenated; verify
# against the file's column order.
baku_words.columns = [1,0]

## Preparing the dict

In [4]:
# Stack the three lookup tables into one slang table (columns 0 and 1).
slang_words = pd.concat([slang_words, slang, baku_words])
# Positive, negative and booster words together form the sentiment lexicon.
sentiment_words = np.concatenate((pos_words, neg_words, booster_words))
slang_words.drop_duplicates(inplace=True)
emoticon.drop_duplicates(inplace=True)
# Turn both tables into plain dicts: column 0 is the key, column 1 the value.
emoticon = dict(zip(emoticon[0], emoticon[1]))
slang_words = dict(zip(slang_words[0],slang_words[1]))
neg_words = np.unique(neg_words)
pos_words = np.unique(pos_words)
stop_words = np.unique(stop_words)
# Keep sentiment-bearing words out of the stop list so that stop-word removal
# cannot drop them.
stop_words = [word for word in stop_words if word not in sentiment_words]

In [5]:
# Report how many entries each lookup structure holds.
print("------Dictionary Info---------")
print("Slang words = {} entries".format(len(slang_words)))
print("Emoticon = {} entries".format(len(emoticon)))
print("Root words = {} entries".format(len(root_words)))
print("Sentiment words = {} entries".format(len(sentiment_words)))

------Dictionary Info---------
Slang words = 2294 entries
Emoticon = 110 entries
Root words = 27979 entries
Sentiment words = 8439 entries


In [6]:
# DataFrame.info() prints its summary itself and returns None, which is why
# the recorded output ends with a stray "None" line.
print(raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280803 entries, 0 to 280802
Data columns (total 2 columns):
rating    280803 non-null int64
review    220233 non-null object
dtypes: int64(1), object(1)
memory usage: 4.3+ MB
None


## There are a lot of duplicated reviews. Drop them

In [7]:
# Discard rows without review text, then keep one row per distinct review.
raw = raw.dropna(subset=['review'],how='all')
raw = raw.drop_duplicates(subset=['review'])

In [8]:
# Same summary after cleaning; the trailing "None" in the output is the
# return value of info() being printed.
print(raw.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19974 entries, 0 to 280801
Data columns (total 2 columns):
rating    19974 non-null int64
review    19974 non-null object
dtypes: int64(1), object(1)
memory usage: 468.1+ KB
None


## Prepare the helper

In [9]:
# Indonesian words for 0-11. Index 0 is the empty string so that a zero
# remainder contributes nothing once empty entries are filtered out.
satuan = ['', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
          'delapan', 'sembilan', 'sepuluh', 'sebelas']


def terbilang_(n):
    """Return the Indonesian number words for ``n`` as a list of fragments.

    The list may contain empty strings (produced by zero remainders); they
    are meant to be dropped before the fragments are joined.

    Args:
        n: Non-negative integer to spell out.

    Returns:
        List of word fragments in reading order.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        # A negative value would otherwise reach the last branch and recurse
        # without ever terminating.
        raise ValueError("terbilang_ only supports non-negative integers")
    if n <= 11:
        return [satuan[n]]
    if n <= 19:
        return terbilang_(n % 10) + ['belas']
    if n <= 99:
        return terbilang_(n // 10) + ['puluh'] + terbilang_(n % 10)
    if n <= 199:
        # 1xx uses the fused form "seratus" rather than "satu ratus".
        return ['seratus'] + terbilang_(n - 100)
    if n <= 999:
        return terbilang_(n // 100) + ['ratus'] + terbilang_(n % 100)
    if n <= 1999:
        # 1xxx uses the fused form "seribu" rather than "satu ribu".
        return ['seribu'] + terbilang_(n - 1000)
    if n <= 999999:
        return terbilang_(n // 1000) + ['ribu'] + terbilang_(n % 1000)
    if n <= 999999999:
        return terbilang_(n // 1000000) + ['juta'] + terbilang_(n % 1000000)
    # Billions: the remainder must use the same 10**9 divisor as the quotient,
    # otherwise the hundreds-of-millions part is silently dropped.
    return terbilang_(n // 1000000000) + ['milyar'] + terbilang_(n % 1000000000)

def terbilang(n):
    """Spell the integer ``n`` as a space-separated Indonesian phrase.

    Zero is special-cased to 'nol'; for any other value the fragments
    returned by ``terbilang_`` are joined after dropping empty strings.
    """
    if n == 0:
        return 'nol'
    return ' '.join(kata for kata in terbilang_(n) if kata != '')

def translate_angka(review):
    """Replace every run of ASCII digits in ``review`` with Indonesian words.

    Each number is converted exactly where it occurs. Re-applying every found
    digit string as a global pattern would let a short number such as "1"
    rewrite the digits inside a longer one such as "11".

    Args:
        review: Text that may contain digit sequences.

    Returns:
        The text with each digit run replaced by its spelled-out form
        followed by a single space.
    """
    return re.sub(
        r"[0-9]+",
        lambda match: terbilang(int(match.group())) + " ",
        review,
    )

In [10]:
def translate_repeating_words(review):
    """Expand reduplication shorthand such as ``anak2`` into ``anak anak``.

    A word made of letters that is immediately followed by ``2`` or ``"`` is
    written out twice. The stem must contain at least one letter and start at
    a word boundary, and the ``2`` must not be followed by another digit, so
    plain numbers ("2", "12", "2018") are left for the number-translation
    step instead of being mangled.

    NOTE: a closing quotation mark directly after a word is indistinguishable
    from the ``"`` shorthand and is expanded as well.

    Args:
        review: Raw review text.

    Returns:
        The text with each shorthand marker replaced by the repeated word.
    """
    shorthand = r'\b([^\W\d_]+)(?:2(?!\d)|")'
    return re.sub(shorthand, r'\1 \1', review)

In [11]:
def delete_suffix_nya(review):
    """Strip a trailing ``nya``/``ny`` from words in ``review``.

    The suffix is removed when it is followed by whitespace or by the end of
    the string; the suffix together with that whitespace is replaced by a
    single space. A character class such as ``[$|\\s]`` would only match a
    literal ``$`` or ``|``, never the end of the string, so an alternation
    is used instead.

    NOTE: there is no stem check, so words that merely end in these letters
    (for example "hanya") are shortened as well.

    Args:
        review: Review text.

    Returns:
        The text with the suffixes removed.
    """
    return re.sub(r"(?:nya|ny)(?:\s|$)", " ", review)

In [12]:
def translate_emoticon(t):
    """Replace each emoticon found in ``t`` with its mapped text.

    Walks the module-level ``emoticon`` mapping in its iteration order and
    substitutes every literal occurrence of a key with the matching value.
    """
    for symbol, meaning in emoticon.items():
        # Keys are escaped so characters like ")" or "(" are matched literally.
        matcher = re.compile(re.escape(symbol))
        if matcher.search(t) is not None:
            t = matcher.sub(meaning, t)
    return t

In [13]:
def translate_non_alpha_num(t):
    """Spell out a fixed set of symbols as Indonesian words.

    Each symbol is replaced by its word padded with a space on both sides, so
    the word never fuses with the neighbouring tokens (``a&b`` becomes
    ``a dan b`` rather than ``adan b``).

    Args:
        t: Review text.

    Returns:
        The text with ``% $ @ & / +`` replaced by words.
    """
    non_alpha_num = {
        '%': 'persen',
        '$': 'dolar',
        '@': 'di',
        '&': 'dan',
        '/': 'atau',
        '+': 'plus',
    }
    for symbol, word in non_alpha_num.items():
        # Plain string replacement: the symbols are literals, no regex needed.
        t = t.replace(symbol, " " + word + " ")
    return t

In [14]:
def remove_non_alphanumeric(review):
    """Replace every character that is not an ASCII letter or digit by a space.

    The class is spelled ``0-9`` rather than ``\\d`` so that only the ASCII
    digits matched by the ``[0-9]+`` number-translation pattern survive, and
    the pattern is a raw string to avoid an invalid-escape warning.

    Args:
        review: Review text.

    Returns:
        Text of the same length containing only ``a-z``, ``A-Z``, ``0-9``
        and spaces.
    """
    return re.sub(r"[^a-zA-Z0-9]", " ", review)

In [15]:
def normalizing_words(review):
    """Collapse every run of identical consecutive characters to one.

    For example "sangattttt" becomes "sangat". This applies to all
    characters, including spaces and legitimately doubled letters.
    """
    # groupby yields one key per run of equal characters; the key is the
    # single character that should be kept.
    return ''.join(char for char, _run in itertools.groupby(review))

## Applying Changes. Do it sequentially. Order matters

In [16]:
# Step 1: expand reduplication shorthand while the text still has its
# original case and punctuation.
raw['review'] = raw['review'].apply(translate_repeating_words)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui NINJA sangattttt lamaaaa. j...
1,1,pesananku pada no order ini terkirim dgn baik....
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,"Setelah 7 kali gagal flashsale, akhirnya dapat..."
4,5,"saya kurang setuju dengan FS, memang sih untuk..."


In [17]:
# Step 2: replace emoticons with words before lowercasing and punctuation
# removal alter them.
raw['review'] = raw['review'].apply(translate_emoticon)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui NINJA sangattttt lamaaaa. j...
1,1,pesananku pada no order ini terkirim dgn baik....
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,"Setelah 7 kali gagal flashsale, akhirnya dapat..."
4,5,"saya kurang setuju dengan FS, memang sih untuk..."


In [18]:
# Step 3: lowercase every review.
raw['review'] = raw['review'].apply(lambda x: x.lower())
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui ninja sangattttt lamaaaa. j...
1,1,pesananku pada no order ini terkirim dgn baik....
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,"setelah 7 kali gagal flashsale, akhirnya dapat..."
4,5,"saya kurang setuju dengan fs, memang sih untuk..."


In [19]:
# Step 4: spell out %, $, @, &, / and + as words before the remaining
# symbols are stripped.
raw['review'] = raw['review'].apply(translate_non_alpha_num)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui ninja sangattttt lamaaaa. j...
1,1,pesananku pada no order ini terkirim dgn baik....
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,"setelah 7 kali gagal flashsale, akhirnya dapat..."
4,5,"saya kurang setuju dengan fs, memang sih untuk..."


In [20]:
# Step 5: replace every remaining non-alphanumeric character with a space.
raw['review'] = raw['review'].apply(remove_non_alphanumeric)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui ninja sangattttt lamaaaa j...
1,1,pesananku pada no order ini terkirim dgn baik ...
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,setelah 7 kali gagal flashsale akhirnya dapat...
4,5,saya kurang setuju dengan fs memang sih untuk...


In [21]:
# Step 6: strip the "nya"/"ny" suffix; this cell displays the whole column
# rather than just head().
raw['review'] = raw['review'].apply(delete_suffix_nya)
raw['review']

0         pengiriman melalui ninja sangattttt lamaaaa  j...
1         pesananku pada no order ini terkirim dgn baik ...
2         ga sia sia susah payah ikutan flashsale akhir ...
3         setelah 7 kali gagal flashsale  akhir dapat ju...
4         saya kurang setuju dengan fs  memang sih untuk...
5         kurir ninja express leleeeeetttt kecewa   kena...
6         pengiriman ke kota depok membutuhkan 11 hari t...
7         barang udah nyampe   cuma lama banget mending ...
8         saran tolong pengitiman paket saya jng melalui...
9         hp xiomi emng ok  cepat nyampe kurir ramah  te...
10        beli flash sale xiaomi redmi 5a plus powerbank...
11        barang sudah sampai secara cepat dan tepat  tp...
12        proses waktu pengiriman setelah pemesanan sang...
13        barang bagus banget garansi tam dikirim jam 1 ...
14        bulan lalu dpet redmi 5 plus gold dan item  sk...
15        lazadaa ituuu kadangh lamaa pengiriman kadang ...
16        ini pertama kali dapet flash s

In [22]:
# Step 7: spell out digit sequences as Indonesian number words.
raw['review'] = raw['review'].apply(translate_angka)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui ninja sangattttt lamaaaa j...
1,1,pesananku pada no order ini terkirim dgn baik ...
2,5,ga sia sia susah payah ikutan flashsale akhir ...
3,5,setelah tujuh kali gagal flashsale akhir dap...
4,5,saya kurang setuju dengan fs memang sih untuk...


In [23]:
# Step 8: collapse runs of repeated characters ("sangattttt" -> "sangat").
raw['review'] = raw['review'].apply(normalizing_words)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui ninja sangat lama jauh berb...
1,1,pesananku pada no order ini terkirim dgn baik ...
2,5,ga sia sia susah payah ikutan flashsale akhir ...
3,5,setelah tujuh kali gagal flashsale akhir dapat...
4,5,saya kurang setuju dengan fs memang sih untuk ...


In [24]:
# Step 9: tokenise on whitespace, then wrap each token list in a NumPy array.
raw['review'] = raw['review'].apply(lambda x: x.split())
raw['review'] = raw['review'].apply(lambda x: np.array(x))
def mapping_slang_words(review):
    """Map each token through the module-level ``slang_words`` dictionary.

    Tokens that have an entry are replaced by the mapped value; all other
    tokens are kept unchanged. Returns a new list in the same order.
    """
    translated = []
    for token in review:
        translated.append(slang_words.get(token, token))
    return translated
# Step 10: replace slang/abbreviation tokens by their dictionary forms; the
# column holds plain Python lists afterwards.
raw['review'] = raw['review'].apply(mapping_slang_words)
raw.head()

Unnamed: 0,rating,review
0,1,"[pengiriman, melalui, ninja, sangat, lama, jau..."
1,1,"[pesananku, pada, nomor, order, ini, terkirim,..."
2,5,"[tidak, sia, sia, susah, payah, ikutan, flash ..."
3,5,"[setelah, tujuh, kali, gagal, flash sale, akhi..."
4,5,"[saya, kurang, setuju, dengan, flash sale, mem..."


## Norvig Spell-checking Algorithm

In [25]:
class spellCheck:
    """Norvig-style spelling corrector backed by a word-frequency model.

    The model is a ``defaultdict`` whose default count is 1; each occurrence
    of a word in the training corpus adds one to its count. Candidate
    corrections are known words at edit distance 0, 1 or 2 (in that order of
    preference), and the most frequent candidate wins.
    """

    def __init__(self, corpus_path='../data/external/spellingset_ID.txt'):
        """Train the frequency model from the text file at ``corpus_path``.

        Args:
            corpus_path: Path of the training corpus; defaults to the
                project's spelling set.
        """
        # The context manager guarantees the corpus file handle is closed.
        with open(corpus_path) as corpus:
            text = corpus.read()
        self.NWORDS = self.train(self.words(text))
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

    def train(self, features):
        """Count the words in ``features``; every count starts from 1."""
        model = collections.defaultdict(lambda: 1)
        for f in features:
            model[f] += 1
        return model

    def words(self, text):
        """Return all lowercase ``a-z`` runs found in ``text``."""
        return re.findall('[a-z]+', text.lower())

    def edits1(self, word):
        """Return the set of strings one edit away from ``word``.

        Edits are single-character deletions, adjacent transpositions,
        replacements and insertions over ``self.alphabet``.
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts = [a + c + b for a, b in splits for c in self.alphabet]
        return set(deletes + transposes + replaces + inserts)

    def known_edits2(self, word):
        """Return the known words that are two edits away from ``word``."""
        return set(
            e2
            for e1 in self.edits1(word)
            for e2 in self.edits1(e1)
            if e2 in self.NWORDS
        )

    def known(self, words):
        """Return the subset of ``words`` present in the model."""
        # Membership tests do not insert default entries into the defaultdict.
        return set(w for w in words if w in self.NWORDS)

    def correct(self, word):
        """Return the most frequent known candidate for ``word``.

        Falls back to ``word`` itself when nothing within two edits is known.
        """
        candidates = (
            self.known([word])
            or self.known(self.edits1(word))
            or self.known_edits2(word)
            or [word]
        )
        return max(candidates, key=self.NWORDS.get)

# Shared corrector instance, created on first use. Building a spellCheck
# reads and trains on the whole corpus file, so it must not be rebuilt for
# every word.
_SPELL_CHECKER = None


def correctSpelling(text):
    """Return ``text`` unchanged if it is a root word, else its correction.

    Args:
        text: A single token.

    Returns:
        The token itself when it appears in the module-level ``root_words``,
        otherwise the result of ``spellCheck.correct`` for it.
    """
    global _SPELL_CHECKER
    if text in root_words:
        return text
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = spellCheck()
    return _SPELL_CHECKER.correct(text)

In [26]:
def spelling_correction(word_list):
    """Spell-correct every token of ``word_list``.

    Returns a NumPy array holding the result of ``correctSpelling`` for each
    token, in the original order.
    """
    return np.array([correctSpelling(token) for token in word_list])

In [27]:
# Step 11: spell-correct every review token by token.
raw['review'] = raw['review'].apply(spelling_correction)

In [28]:
# Drop missing reviews and reviews left with zero tokens, so the per-review
# average computed next never divides by zero.
raw = raw.dropna(subset=['review'],how='all')
raw = raw[raw['review'].map(len) > 0]
def avg_word(words):
    """Return the mean token length of ``words``.

    ``words`` must be non-empty; an empty sequence raises ZeroDivisionError.
    """
    total_chars = sum(map(len, words))
    return total_chars / len(words)

# Per-review features: mean token length and number of tokens.
raw['avg_word'] = raw['review'].apply(lambda x: avg_word(x))
raw['word_count'] = raw['review'].map(len)

In [29]:
raw

Unnamed: 0,rating,review,avg_word,word_count
0,1,"[pengiriman, melalui, ninja, sangat, lama, jau...",5.571429,21
1,1,"[pesananku, pada, nomor, order, ini, terkirim,...",5.193878,98
2,5,"[tidak, sia, sia, susah, payah, ikutan, flash ...",5.888889,27
3,5,"[setelah, tujuh, kali, gagal, flash sale, akhi...",5.962963,27
4,5,"[saya, kurang, setuju, dengan, flash sale, mem...",5.620690,58
5,1,"[kurir, ninja, express, lambat, kecewa, kenapa...",5.125000,16
6,1,"[pengiriman, ke, kota, depak, membutuhkan, seb...",5.747899,119
7,5,"[barang, sudah, sampai, hanya, lama, sekali, l...",5.571429,21
8,5,"[saran, tolong, pengiriman, paket, saya, jeng,...",5.733333,15
9,5,"[telepon genggam, xiaomi, memang, oke, cepat, ...",5.631579,19


In [30]:
# Save the cleaned data with stop words still present; the index is not written.
raw.to_csv("../data/interim/1.0_lazada_review_clean_without_stop_removal.csv", index=False)

## Traditional Stop Words Removal (Optional)
This preprocessing step is still debated, because a generic stop-word list can hurt sentiment-analysis performance: the removed stop words may carry information that matters for the sentiment.

In [31]:
def remove_stop_words(word_list):
    """Return the tokens of ``word_list`` that are not in ``stop_words``.

    Order is preserved and the result is a new list.
    """
    kept = []
    for token in word_list:
        if token in stop_words:
            continue
        kept.append(token)
    return kept
# Remove stop words from every review. The avg_word and word_count columns
# displayed by head() here still hold the values computed before removal.
raw['review'] = raw['review'].apply(remove_stop_words)
raw.head()

Unnamed: 0,rating,review,avg_word,word_count
0,1,"[pengiriman, ninja, sangat, lama, jauh, berbed...",5.571429,21
1,1,"[pesananku, nomor, order, terkirim, baik, nomo...",5.193878,98
2,5,"[tidak, sia, sia, susah, payah, ikutan, flash ...",5.888889,27
3,5,"[tujuh, kali, gagal, flash sale, padahal, tida...",5.962963,27
4,5,"[kurang, setuju, flash sale, sih, strategi, pe...",5.62069,58


In [32]:
# Stop-word removal can leave a review with no tokens; drop those rows (and
# any missing ones) before recomputing the features.
raw = raw.dropna(subset=['review'],how='all')
raw = raw[raw['review'].map(len) > 0]
def avg_word(words):
    """Return the average length of the tokens in ``words``.

    Raises ZeroDivisionError when ``words`` is empty.
    """
    char_total = 0
    for token in words:
        char_total += len(token)
    return char_total / len(words)

# Recompute both features on the stop-word-filtered tokens.
raw['avg_word'] = raw['review'].apply(lambda x: avg_word(x))
raw['word_count'] = raw['review'].map(len)

In [33]:
# Save the stop-word-filtered variant; the index is not written.
raw.to_csv("../data/interim/1.0_lazada_review_clean_with_stop_removal.csv", index=False)