In [1]:
import pandas as pd
import numpy as np
import re
import itertools
import collections
import string
from collections import OrderedDict

In [2]:
def _read_word_list(path):
    """Read a one-entry-per-line text file and return its values as a NumPy array.

    The file is parsed with ``pd.read_csv`` using a newline separator and no
    header row; the frame's ``.values`` are copied into a fresh array.
    """
    frame = pd.read_csv(path, sep="\n", header=None)
    return np.array(frame.values)


# Raw reviews (columns: rating, review).
raw = pd.read_csv("../data/raw/lazada_reviews.csv")

# Single-column lexicons.
stop_words = _read_word_list("../data/external/stopwords_ID.txt")
neg_words = _read_word_list("../data/external/negative_keyword_ID.txt")
pos_words = _read_word_list("../data/external/positive_keyword_ID.txt")
root_words = _read_word_list("../data/external/rootword_ID.txt")

# Two slang tables with different delimiters, stacked into one frame.
kbba = pd.read_csv("../data/external/kbba_ID.txt", sep="\t", header=None)
slang = pd.read_csv("../data/external/slangword_ID.txt", sep=":", header=None)
slang_words = pd.concat([kbba, slang])

# Positive and negative keywords combined into one sentiment lexicon.
sentiment_words = np.concatenate((pos_words, neg_words))

In [3]:
# Collapse the slang table into a lookup dict (column 0 -> column 1).
# The isinstance guard keeps this cell re-runnable: once `slang_words` has been
# rebound to a dict, the DataFrame-only drop_duplicates() call would fail.
if isinstance(slang_words, pd.DataFrame):
    slang_words = slang_words.drop_duplicates()
    slang_words = dict(zip(slang_words[0], slang_words[1]))

# De-duplicate (and sort) each lexicon.
neg_words = np.unique(neg_words)
pos_words = np.unique(pos_words)
stop_words = np.unique(stop_words)

# Sentiment-bearing words must survive stop-word removal, so take them out of
# the stop list. A set gives constant-time membership instead of scanning the
# whole sentiment array once per stop word.
sentiment_lookup = set(np.ravel(sentiment_words))
stop_words = [word for word in stop_words if word not in sentiment_lookup]

In [4]:
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280803 entries, 0 to 280802
Data columns (total 2 columns):
rating    280803 non-null int64
review    220233 non-null object
dtypes: int64(1), object(1)
memory usage: 4.3+ MB
None


In [5]:
# Discard rows without review text, then keep one row per distinct review.
raw = (
    raw
    .dropna(subset=['review'], how='all')
    .drop_duplicates(subset=['review'])
)

In [6]:
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19974 entries, 0 to 280801
Data columns (total 2 columns):
rating    19974 non-null int64
review    19974 non-null object
dtypes: int64(1), object(1)
memory usage: 468.1+ KB
None


In [7]:
# Preview the first rows of `raw`.
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui NINJA sangattttt lamaaaa. j...
1,1,pesananku pada no order ini terkirim dgn baik....
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,"Setelah 7 kali gagal flashsale, akhirnya dapat..."
4,5,"saya kurang setuju dengan FS, memang sih untuk..."


In [8]:
# Indonesian words for 0..11; index 0 is an empty placeholder that callers
# are expected to filter out.
satuan = ['', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
          'delapan', 'sembilan', 'sepuluh', 'sebelas']


def terbilang_(n):
    """Spell a non-negative integer as a list of Indonesian number words.

    The list may contain empty strings (the placeholder for zero in a
    position); joining and filtering is left to the caller.

    Args:
        n: Non-negative integer to spell out.

    Returns:
        list[str]: Number words in reading order, possibly with '' entries.

    Raises:
        ValueError: If ``n`` is negative (it would otherwise recurse forever).
    """
    if n < 0:
        raise ValueError("terbilang_ only supports non-negative integers")
    if n <= 11:
        hasil = [satuan[n]]
    elif n <= 19:
        hasil = terbilang_(n % 10) + ['belas']
    elif n <= 99:
        hasil = terbilang_(n // 10) + ['puluh'] + terbilang_(n % 10)
    elif n <= 199:
        # "one hundred" has its own word instead of "satu ratus".
        hasil = ['seratus'] + terbilang_(n - 100)
    elif n <= 999:
        hasil = terbilang_(n // 100) + ['ratus'] + terbilang_(n % 100)
    elif n <= 1999:
        # "one thousand" has its own word instead of "satu ribu".
        hasil = ['seribu'] + terbilang_(n - 1000)
    elif n <= 999999:
        hasil = terbilang_(n // 1000) + ['ribu'] + terbilang_(n % 1000)
    elif n <= 999999999:
        hasil = terbilang_(n // 1000000) + ['juta'] + terbilang_(n % 1000000)
    else:
        # The remainder must be taken modulo 10**9 (same as the divisor);
        # using 10**8 here silently dropped the hundreds-of-millions digit.
        hasil = terbilang_(n // 1000000000) + ['milyar'] + terbilang_(n % 1000000000)
    return hasil

def terbilang(n):
    """Return ``n`` spelled out in Indonesian as a single space-separated string.

    Zero is special-cased to 'nol'; for any other value the word list from
    ``terbilang_`` is joined after its empty placeholder entries are dropped.
    """
    if n == 0:
        return 'nol'
    return ' '.join(kata for kata in terbilang_(n) if kata != '')

def translate_angka(review):
    """Replace every run of digits in ``review`` with its Indonesian spelling.

    Each maximal digit run is converted exactly once, at its own position.
    Substituting the digit strings one after another across the whole text
    (as was done before) let a short number such as "1" rewrite the leading
    digit of a longer one such as "12", corrupting it.

    Args:
        review: Text that may contain decimal digit runs.

    Returns:
        str: ``review`` with each digit run replaced by ``terbilang`` output.
    """
    return re.sub(r"[0-9]+", lambda match: terbilang(int(match.group())), review)

In [9]:
def translate_repeating_words(review):
    """Expand shorthand reduplication markers into the repeated word.

    A word immediately followed by ``2`` or ``"`` is written out twice
    (``anak2`` -> ``anak anak``, ``barang2nya`` -> ``barang barangnya``).
    Only the word carrying the marker is duplicated; previously the whole
    review text was used as the replacement, so every marker duplicated the
    entire review and every ``2`` inside real numbers was deleted.

    A ``2`` that starts or belongs to a longer number (``rp250``, ``12``) is
    left alone so number translation can handle it later.

    NOTE(review): a closing quotation mark directly after a word is also
    treated as a marker, mirroring the original pattern — confirm that is
    acceptable for quoted text.

    Args:
        review: Review text.

    Returns:
        str: Text with reduplication shorthand expanded.
    """
    return re.sub(r'\b([A-Za-z]+)(?:2(?![0-9])|")', r'\1 \1', review)

In [10]:
# Lower-case each review, expand repeated-word shorthand, then spell out numbers.
raw['review'] = (
    raw['review']
    .apply(str.lower)
    .apply(translate_repeating_words)
    .apply(translate_angka)
)

In [11]:
def remove_non_letter(review):
    """Return ``review`` with every character outside a-z / A-Z turned into a space."""
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", review)
                  
# Blank out everything that is not an ASCII letter, then preview the result.
raw['review'] = raw['review'].map(remove_non_letter)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui ninja sangattttt lamaaaa j...
1,1,pesananku pada no order ini terkirim dgn baik ...
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,setelah tujuh kali gagal flashsale akhirnya d...
4,5,saya kurang setuju dengan fs memang sih untuk...


In [12]:
def normalizing_words(review, max_repeat=1):
    """Shorten runs of identical consecutive characters.

    With the default ``max_repeat=1`` every run collapses to one character
    ("lamaaaa" -> "lama"). Note that this also shortens legitimate double
    letters ("maaf" -> "maf"); pass ``max_repeat=2`` to keep up to two.

    Args:
        review: Text to normalise.
        max_repeat: Maximum number of characters kept from each run (>= 1).

    Returns:
        str: ``review`` with each run truncated to ``max_repeat`` characters.

    Raises:
        ValueError: If ``max_repeat`` is smaller than 1.
    """
    if max_repeat < 1:
        raise ValueError("max_repeat must be at least 1")
    return ''.join(
        ''.join(itertools.islice(run, max_repeat))
        for _, run in itertools.groupby(review)
    )
# Collapse repeated characters in every review, then preview the result.
raw['review'] = raw['review'].map(normalizing_words)
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui ninja sangat lama jauh berb...
1,1,pesananku pada no order ini terkirim dgn baik ...
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,setelah tujuh kali gagal flashsale akhirnya da...
4,5,saya kurang setuju dengan fs memang sih untuk ...


In [13]:
class spellCheck:
    """Edit-distance spelling corrector driven by a word-frequency table.

    The table (``NWORDS``) is built from a text file; ``correct`` then picks
    the most frequent known word that is zero, one or two single-character
    edits away from its input.
    """

    def train(self, features):
        """Count how often each item occurs in ``features``.

        The table is a ``defaultdict`` whose default is 1, so the first
        occurrence of an item yields a count of 2.

        Returns:
            collections.defaultdict: item -> count.
        """
        model = collections.defaultdict(lambda: 1)
        for f in features:
            model[f] += 1
        return model

    def __init__(self, path='../data/external/spellingset_ID.txt'):
        """Build the frequency table from the text file at ``path``.

        Args:
            path: Corpus file to read; defaults to the project's spelling set.
        """
        # Context manager so the file handle is closed as soon as it is read.
        with open(path) as corpus:
            text = corpus.read()
        self.NWORDS = self.train(self.words(text))
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

    def words(self, text):
        """Return every run of a-z letters in the lower-cased ``text``."""
        return re.findall('[a-z]+', text.lower())

    def edits1(self, word):
        """Return the set of strings one edit away from ``word``.

        An edit is a single-character deletion, adjacent transposition,
        replacement, or insertion using ``self.alphabet``.
        """
        splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes    = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces   = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts    = [a + c + b     for a, b in splits for c in self.alphabet]
        return set(deletes + transposes + replaces + inserts)

    def known_edits2(self, word):
        """Return the known words that are two edits away from ``word``."""
        return set(e2 for e1 in self.edits1(word) for e2 in self.edits1(e1) if e2 in self.NWORDS)

    def known(self, words):
        """Return the subset of ``words`` present in the frequency table."""
        return set(w for w in words if w in self.NWORDS)

    def correct(self, word):
        """Return the best correction for ``word``.

        Candidates are tried in order: the word itself if known, known words
        one edit away, known words two edits away, and finally the word
        unchanged. Among candidates the highest count wins.
        """
        candidates = (
            self.known([word])
            or self.known(self.edits1(word))
            or self.known_edits2(word)
            or [word]
        )
        # Sets iterate in hash order, which varies between interpreter runs;
        # sorting first makes equal-count ties resolve the same way each run.
        return max(sorted(candidates), key=self.NWORDS.get)

# Shared corrector instance, created on first use. Building a spellCheck
# reads and tallies the whole corpus file, so it must not happen per word.
_spell_checker = None


def correctSpelling(text):
    """Return ``text`` unchanged if it is a root word, else its spelling correction.

    Args:
        text: A single token.

    Returns:
        The token itself when it is found in ``root_words``; otherwise the
        result of ``spellCheck.correct`` for it.
    """
    global _spell_checker
    if text in root_words:
        return text
    if _spell_checker is None:
        _spell_checker = spellCheck()
    return _spell_checker.correct(text)

# Tokenise: split each review string on whitespace into a list of words.
raw['review'] = raw['review'].apply(str.split)

In [14]:
# Convert each review's value to a NumPy array before the slang lookup below.
raw['review'] = raw['review'].apply(lambda x: np.array(x))
def mapping_slang_words(review):
    """Return the tokens of ``review`` with slang entries swapped for their mapping.

    Tokens that are keys of ``slang_words`` are replaced by the mapped value;
    all other tokens are passed through unchanged. The result is a list.
    """
    expanded = []
    for token in review:
        expanded.append(slang_words.get(token, token))
    return expanded
# Replace slang tokens in every review using the slang lookup.
raw['review'] = raw['review'].apply(mapping_slang_words)

In [15]:
def spelling_correction(word_list):
    """Run ``correctSpelling`` on each item of ``word_list``.

    Returns:
        numpy.ndarray: The corrected items, in the original order.
    """
    return np.array([correctSpelling(token) for token in word_list])

In [16]:
# Run spelling correction over the tokens of every review.
# NOTE(review): unknown words fall through to a two-edit candidate search, so
# this is presumably the slowest cell — confirm with %%time.
raw['review'] = raw['review'].apply(spelling_correction)

In [17]:
def remove_stop_words(word_list):
    """Return a list of the items in ``word_list`` that are not in ``stop_words``."""
    kept = []
    for token in word_list:
        if token in stop_words:
            continue
        kept.append(token)
    return kept
# Drop stop words from every review, then preview the result.
raw['review'] = raw['review'].apply(remove_stop_words)
raw.head()

Unnamed: 0,rating,review
0,1,"[pengiriman, ninja, sangat, lama, jauh, berbed..."
1,1,"[pesananku, nomor, order, terkirim, baik, nomo..."
2,5,"[tidak, sia, sia, susah, payah, ikutan, flashs..."
3,5,"[tujuh, kali, gagal, flashsale, padahal, nga, ..."
4,5,"[kurang, setuju, flash sale, sih, strategi, ma..."


In [18]:
# Remove rows whose review is missing, then rows whose token list is empty.
raw = raw.dropna(subset=['review'], how='all')
has_tokens = raw['review'].map(len) > 0
raw = raw[has_tokens]
def avg_word(words):
    """Return the mean length of the items in ``words``.

    ``words`` must be non-empty; an empty sequence divides by zero.
    """
    total_chars = 0
    for word in words:
        total_chars += len(word)
    return total_chars / len(words)

# Per-review features: mean token length and number of tokens.
raw['avg_word'] = raw['review'].apply(avg_word)
raw['word_count'] = raw['review'].apply(len)

In [19]:
# Show a bounded preview instead of handing the whole frame to the renderer.
raw.head(10)

Unnamed: 0,rating,review,avg_word,word_count
0,1,"[pengiriman, ninja, sangat, lama, jauh, berbed...",5.812500,16
1,1,"[pesananku, nomor, order, terkirim, baik, nomo...",6.152985,268
2,5,"[tidak, sia, sia, susah, payah, ikutan, flashs...",6.105263,19
3,5,"[tujuh, kali, gagal, flashsale, padahal, nga, ...",6.125000,16
4,5,"[kurang, setuju, flash sale, sih, strategi, ma...",6.678571,28
5,1,"[kurir, ninja, express, lambat, kecewa, sih, l...",5.500000,10
6,1,"[pengiriman, kota, depak, membutuhkan, sebelas...",6.689189,74
7,5,"[barang, cuma, lama, lebih baik, pakai, eksped...",5.400000,15
8,5,"[saran, tolong, pengiriman, paket, jeng, jasa,...",5.909091,11
9,5,"[telepon genggam, xiaomi, oke, cepat, kurir, r...",5.800000,15


In [20]:
# Persist the cleaned frame without the index column.
# NOTE(review): 'review' holds Python lists at this point (remove_stop_words
# returns a list), so the CSV presumably stores their string representation —
# confirm downstream readers parse it back into tokens.
raw.to_csv("../data/interim/lazada_review_clean_1.csv", index=False)