In [1]:
import pandas as pd
import numpy as np
import string
import re
from collections import Counter

In [2]:
raw = pd.read_csv( "../data/interim/lazada_review_clean_v2.0_without_stop.csv")
slang_words = pd.read_csv("../data/external/kbba_ID.txt",
                        sep="\t", header=None)
slang = pd.read_csv("../data/external/slangword_ID.txt",
                        sep=":", header=None)
baku_words = pd.read_csv("../data/external/katabaku_ID.txt",
                        sep="|", header=None)
baku_words.columns = [1,0]
slang_words = pd.concat([slang_words, slang, baku_words])

In [3]:
slang_words.drop_duplicates(inplace=True)
slang_words = dict(zip(slang_words[0],slang_words[1]))

In [4]:
print(raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19937 entries, 0 to 19936
Data columns (total 4 columns):
rating        19937 non-null int64
review        19937 non-null object
avg_word      19937 non-null float64
word_count    19937 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 623.1+ KB
None


In [5]:
raw['review'][2]

"['tidak' 'sia' 'sia' 'susah' 'payah' 'ikutan' 'flash sale' 'akhir' 'dapat'\n 'juga' 'walaupun' 'harga' 'naik' 'turunin' 'lagi' 'dong' 'gara'\n 'secara keseluruhan' 'ok' 'mantap' 'terima kasih' 'lazada' 'terima kasih'\n 'xiaomi' 'redmi' 'lima' 'a']"

In [6]:
def remove_numeric(review):
    return re.sub("\d"," ", review)

raw['review'] = raw['review'].apply(remove_numeric)

In [7]:
raw.head()

Unnamed: 0,rating,review,avg_word,word_count
0,1,['pengiriman' 'melalui' 'ninja' 'sangat' 'lama...,5.571429,21
1,1,['pesananku' 'pada' 'nomor' 'order' 'ini' 'ter...,5.193878,98
2,5,['tidak' 'sia' 'sia' 'susah' 'payah' 'ikutan' ...,5.888889,27
3,5,['setelah' 'tujuh' 'kali' 'gagal' 'flash sale'...,5.962963,27
4,5,['saya' 'kurang' 'setuju' 'dengan' 'flash sale...,5.62069,58


In [8]:
def csv_string_to_list(csv_string):
    return csv_string[1:-1].split()

def string_without_quotes(word_list):
    new  = []
    for word in word_list:
        new.append(word.replace("'",""))
    return new
                   
raw['review'] = raw['review'].apply(csv_string_to_list)
raw['review'] = raw['review'].apply(string_without_quotes)

In [9]:
def mapping_slang_words(review):
    return [slang_words[word] if word in slang_words else word for word in review]
raw['review'] = raw['review'].apply(mapping_slang_words)

In [10]:
raw['review']

0        [pengiriman, melalui, ninja, sangat, lama, jau...
1        [pesananku, pada, nomor, pemesanan, ini, terki...
2        [tidak, sia, sia, susah, payah, ikutan, flash,...
3        [setelah, tujuh, kali, gagal, flash, sale, akh...
4        [saya, kurang, setuju, dengan, flash, sale, me...
5        [kurir, ninja, express, lambat, kecewa, kenapa...
6        [pengiriman, ke, kota, depok, membutuhkan, seb...
7        [barang, sudah, sampai, hanya, lama, sekali, l...
8        [saran, tolong, pengiriman, paket, saya, janga...
9        [telepon, genggam, xiaomi, memang, oke, cepat,...
10       [beli, flash, sale, xiaomi, redmi, lima, a, pl...
11       [barang, sudah, sampai, secara, cepat, dan, te...
12       [proses, waktu, pengiriman, setelah, pesanan, ...
13       [barang, bagus, sekali, garansi, tam, dikirim,...
14       [bulan, lalu, dapat, redmi, lima, plus, emas, ...
15       [lazada, itu, kadang, lama, pengiriman, kadang...
16       [ini, pertama, kali, dapat, flash, sale, baran.

In [11]:
def remove_single_alphabet_only(review):
    return [word for word in review if word not in string.ascii_lowercase]

def remove_too_short_words(review):
    return [word for word in review if len(word) > 2]

In [12]:
raw['review'] = raw['review'].apply(remove_single_alphabet_only)
raw['review'] = raw['review'].apply(remove_too_short_words)

In [13]:
raw = raw.dropna(subset=['review'],how='all')
raw = raw[raw['review'].map(len) > 0]
raw['word_count'] = raw['review'].map(len)
def convert_list_to_string(word_list):
    return ",".join(word_list)
raw['review'] = raw['review'].apply(convert_list_to_string)
raw = raw.dropna(subset=['review'],how='all')
raw = raw[raw['review'].map(len) > 0]

In [14]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19900 entries, 0 to 19936
Data columns (total 4 columns):
rating        19900 non-null int64
review        19900 non-null object
avg_word      19900 non-null float64
word_count    19900 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 777.3+ KB


In [15]:
def avg_word(words):
  return (sum(len(word) for word in words)/len(words))

raw['avg_word'] = raw['review'].apply(lambda x: avg_word(x.split(",")))

In [16]:
raw.to_csv("../data/interim/lazada_review_clean_v2.0_2.csv", index=False)

In [17]:
most_common = pd.Series(''.join(raw['review']).split(",")).value_counts()[:10]
most_common

tidak     4941
dan       4308
barang    4304
sampai    3817
terima    3721
yang      3668
bagus     3641
kasih     3569
cepat     3447
sudah     3422
dtype: int64

In [18]:
def add_coma(review):
    return review + ','
raw['review'] = raw['review'].apply(add_coma)

In [19]:
least_common = Counter("".join(raw["review"]).split(",")).most_common()[-10:]
least_common

[('suhariandhy', 1),
 ('gilap', 1),
 ('begi', 1),
 ('selaras', 1),
 ('pnirimn', 1),
 ('kuningan', 1),
 ('salah tingkah', 1),
 ('cardholder', 1),
 ('garuda', 1),
 ('ckckckck', 1)]