In [335]:
import pandas as pd
import numpy as np
import re
import itertools
import collections
from collections import OrderedDict

In [336]:
# Load the raw review table and the external word lists used by the later
# cleaning steps.
raw = pd.read_csv( "../data/raw/lazada_reviews.csv")
# The stop-word, keyword and root-word files are read with a newline
# separator and no header, i.e. as a single-column frame (presumably one
# entry per line), then converted to NumPy arrays.
# NOTE(review): newer pandas releases are reported to reject sep="\n" -
# confirm against the installed version before upgrading.
stop_words = np.array(pd.read_csv("../data/external/stopwords_ID.txt",
                        sep="\n", header=None).values)
neg_words = np.array(pd.read_csv("../data/external/negative_keyword_ID.txt",
                        sep="\n", header=None).values)
pos_words = np.array(pd.read_csv("../data/external/positive_keyword_ID.txt",
                        sep="\n", header=None).values)
# Two slang sources with different delimiters (tab and colon). Both are read
# without a header, so their columns are labelled 0 and 1.
slang_words = pd.read_csv("../data/external/kbba_ID.txt",
                        sep="\t", header=None)
root_words = np.array(pd.read_csv("../data/external/rootword_ID.txt",
                        sep="\n", header=None).values)
slang = pd.read_csv("../data/external/slangword_ID.txt",
                        sep=":", header=None)
# Stack both slang tables into one frame; it is still a DataFrame here.
slang_words = pd.concat([slang_words,slang])

In [337]:
# Turn the stacked slang table into a {column 0: column 1} lookup dict and
# de-duplicate the keyword arrays.
# The isinstance guard keeps this cell re-runnable: once `slang_words` has
# been rebound to a dict it no longer has `drop_duplicates`, so running the
# cell a second time used to raise AttributeError.
if isinstance(slang_words, pd.DataFrame):
    slang_words = slang_words.drop_duplicates()
    slang_words = dict(zip(slang_words[0], slang_words[1]))
# np.unique returns the sorted distinct values as a flat array, so these
# three lines are safe to run repeatedly.
neg_words = np.unique(neg_words)
pos_words = np.unique(pos_words)
stop_words = np.unique(stop_words)

In [338]:
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280803 entries, 0 to 280802
Data columns (total 2 columns):
rating    280803 non-null int64
review    220233 non-null object
dtypes: int64(1), object(1)
memory usage: 4.3+ MB
None


In [339]:
# Discard rows without review text, then discard repeated review texts.
raw = (
    raw
    .dropna(subset=['review'], how='all')
    .drop_duplicates(subset=['review'])
)

In [340]:
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19974 entries, 0 to 280801
Data columns (total 2 columns):
rating    19974 non-null int64
review    19974 non-null object
dtypes: int64(1), object(1)
memory usage: 468.1+ KB
None


In [341]:
# Preview the first rows of the frame before any text cleaning is applied.
raw.head()

Unnamed: 0,rating,review
0,1,pengiriman melalui NINJA sangattttt lamaaaa. j...
1,1,pesananku pada no order ini terkirim dgn baik....
2,5,ga sia sia susah payah ikutan flashsale akhirn...
3,5,"Setelah 7 kali gagal flashsale, akhirnya dapat..."
4,5,"saya kurang setuju dengan FS, memang sih untuk..."


In [342]:
def remove_non_letter(review):
    """Replace every character that is not an ASCII letter with a space.

    Parameters
    ----------
    review : str
        Review text.

    Returns
    -------
    str
        Text of the same length in which each digit, punctuation mark,
        whitespace character and non-ASCII character has become one space.
    """
    # The original call was missing its closing parenthesis, which made the
    # whole cell a SyntaxError.
    return re.sub("[^a-zA-Z]", " ", review)
                  
# Lower-case each review, then blank out everything that is not a letter.
raw['review'] = raw['review'].map(lambda text: remove_non_letter(text.lower()))

In [343]:
def normalizing_words(review):
    """Squeeze every run of identical consecutive characters down to one.

    For example ``"sangattttt"`` becomes ``"sangat"``. Legitimate double
    letters are squeezed as well (``"maaf"`` becomes ``"maf"``).
    """
    squeezed = []
    for _, run in itertools.groupby(review):
        # Keep only the first character of each run.
        squeezed.append(''.join(run)[:1])
    return ''.join(squeezed)
# Squeeze repeated characters in every review.
raw['review'] = raw['review'].map(normalizing_words)

In [344]:
class spellCheck:
    """Frequency-based spelling corrector built from a reference word file.

    A word is corrected to the most frequent known word reachable with at
    most two single-character edits (delete, transpose, replace, insert).
    Words that are already known, or that have no known neighbour, are
    returned unchanged.
    """

    def __init__(self, path='../data/external/spellingset_ID.txt'):
        """Build the word-frequency model from the text file at ``path``.

        Parameters
        ----------
        path : str, optional
            Text file whose words form the vocabulary. Defaults to the
            spelling set that used to be hard-coded here.
        """
        # Read inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(path) as handle:
            self.NWORDS = self.train(self.words(handle.read()))
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

    def train(self, features):
        """Count words; each word starts from 1, so one occurrence scores 2."""
        model = collections.defaultdict(lambda: 1)
        for f in features:
            model[f] += 1
        return model

    def words(self, text):
        """Return all runs of the letters a-z found in the lower-cased text."""
        return re.findall('[a-z]+', text.lower())

    def edits1(self, word):
        """Return the set of strings one edit away from ``word``."""
        splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes    = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces   = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts    = [a + c + b     for a, b in splits for c in self.alphabet]
        return set(deletes + transposes + replaces + inserts)

    def known_edits2(self, word):
        """Return the known words that are exactly two edits away from ``word``."""
        return set(e2 for e1 in self.edits1(word) for e2 in self.edits1(e1) if e2 in self.NWORDS)

    def known(self, words):
        """Return the subset of ``words`` present in the vocabulary."""
        return set(w for w in words if w in self.NWORDS)

    def correct(self, word):
        """Return the best correction for ``word`` (or ``word`` itself).

        Candidates are tried in order: the word itself if known, then known
        words one edit away, then known words two edits away, and finally
        the unchanged word.
        """
        candidates = (
            self.known([word])
            or self.known(self.edits1(word))
            or self.known_edits2(word)
            or [word]
        )
        # Sort before taking the max so equally frequent candidates resolve
        # to the alphabetically first one. Iterating the raw set depends on
        # string hash order, which can differ between interpreter runs.
        return max(sorted(candidates), key=self.NWORDS.get)

def correctSpelling(text):
    """Return ``text`` if it is a root word, otherwise its spelling correction.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    str
        ``text`` unchanged when it is found in the module-level
        ``root_words``; otherwise the result of ``spellCheck.correct``.
    """
    if text in root_words:
        return text
    # Build the checker once and reuse it. Constructing spellCheck re-reads
    # and re-counts the whole spelling file, and the original did that for
    # every single word. The cache lives on the function object, so
    # re-running this cell starts from a fresh checker.
    checker = getattr(correctSpelling, "_checker", None)
    if checker is None:
        checker = correctSpelling._checker = spellCheck()
    return checker.correct(text)

# Tokenise: split each cleaned review on whitespace into a list of words.
raw['review'] = raw['review'].map(str.split)

In [345]:
def spelling_correction(word_list):
    """Spell-correct every token of a tokenised review, preserving order.

    Returns a new list; ``word_list`` itself is not modified.
    """
    return [correctSpelling(token) for token in word_list]

In [346]:
# Run the spelling corrector over every tokenised review.
raw['review'] = raw['review'].map(spelling_correction)

In [347]:
def mapping_slang_words(review):
    """Replace each token found in ``slang_words`` with its mapped value.

    Tokens without an entry are kept as they are. A new list is returned.
    """
    normalised = []
    for token in review:
        if token in slang_words:
            token = slang_words[token]
        normalised.append(token)
    return normalised
# Substitute slang tokens in every review.
raw['review'] = raw['review'].map(mapping_slang_words)

In [348]:
def remove_stop_words(word_list):
    """Return the tokens of ``word_list`` that are not in ``stop_words``.

    Order is preserved and the input list is left untouched.
    """
    kept = []
    for token in word_list:
        if token in stop_words:
            continue
        kept.append(token)
    return kept
# Strip stop words from every review.
raw['review'] = raw['review'].map(remove_stop_words)

In [349]:
# Drop rows whose review is missing, then rows whose token list is empty.
raw = (
    raw
    .dropna(subset=['review'], how='all')
    .loc[lambda frame: frame['review'].map(len) > 0]
)
def avg_word(words):
    """Return the mean character length of the items in ``words``.

    Raises ZeroDivisionError when ``words`` is empty.
    """
    total_chars = 0
    for token in words:
        total_chars += len(token)
    return total_chars / len(words)

# Per-review features: mean token length and number of tokens.
raw['avg_word'] = raw['review'].map(avg_word)
raw['word_count'] = raw['review'].apply(len)

In [350]:
# Show a bounded sample rather than rendering the entire frame.
raw.head(10)

Unnamed: 0,rating,review,avg_word,word_count
0,1,"[pengiriman, ninja, berbeda, kurir, internal, ...",6.363636,11
1,1,"[pesananku, nomor, order, terkirim, nomor, ord...",6.685714,35
2,5,"[sia, sia, susah, payah, ikutan, flashsale, ha...",6.000000,18
3,5,"[kali, gagal, flashsale, nga, nyangka, estimas...",6.214286,14
4,5,"[setuju, flash sale, sih, strategi, marketing,...",6.863636,22
5,1,"[kurir, ninja, expres, lambat, kecewa, sih, la...",5.222222,9
6,1,"[pengiriman, kota, depok, membutuhkan, terhitu...",7.033333,60
7,5,"[barang, lebih baik, pakai, ekspedisi, diperca...",5.909091,11
8,5,"[saran, tolong, pengitiman, paket, jng, melalu...",5.750000,12
9,5,"[telepon genggam, xiaomi, oke, cepat, kurir, r...",5.800000,15
