In [1]:
import pandas as pd
import numpy as np
import string
import re
from collections import Counter

In [2]:
# Load the review corpus from the interim data folder
# (presumably already stemmed, judging by the file name — confirm upstream).
raw = pd.read_csv("../data/interim/lazada_review_stemmed_v2.0_1.csv")

In [3]:
def _split_tokens(value):
    """Convert one `review` cell into a list of tokens.

    * str  -> split on commas (the on-disk representation of a token list)
    * list -> returned unchanged, so re-running this cell is a no-op
    * else -> empty list (e.g. NaN for a missing review), so downstream
              consumers that iterate over tokens do not crash
    """
    if isinstance(value, str):
        return value.split(",")
    if isinstance(value, list):
        return value
    return []


# Idempotent: the original lambda called `.split` unconditionally and raised
# AttributeError when the cell was executed a second time on the same kernel.
raw['review'] = raw['review'].apply(_split_tokens)

In [4]:
# Peek at the token list of the row labelled 0 to sanity-check the split.
raw.loc[0, 'review']

['kirim',
 'lalu',
 'ninja',
 'sangat',
 'lama',
 'jauh',
 'beda',
 'dengan',
 'kurir',
 'internal',
 'lazada',
 'lebih',
 'baik',
 'kasih',
 'opsi',
 'langgan',
 'agar',
 'bisa',
 'pilih',
 'kurir']

In [5]:
import logging

# Route INFO-level (and above) log records to the notebook output with a
# "timestamp : level : message" layout.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [6]:
from gensim.models import word2vec

# Word2Vec hyper-parameters; they are only defined here and are consumed by
# the Word2Vec(...) call in the following cell.
num_features = 300     # word vector dimensionality
context = 10           # context window size
min_word_count = 40    # minimum word count for a term to be kept
downsampling = 1e-3    # downsample setting for frequent words
num_workers = 4        # number of threads to run in parallel

# Training corpus: the `review` column of the loaded frame.
sentences = raw['review']
print("Training model...")

2018-07-11 08:32:49,183 : INFO : 'pattern' package not found; tag filters are not available for English


Training model...


In [7]:
# Fit Word2Vec on the review corpus with the hyper-parameters defined above.
# NOTE(review): `size=` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4.0) — confirm the gensim version this notebook is pinned to.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Keep only the L2-normalised vectors to save memory; per the gensim docs the
# model cannot be trained further after replace=True.
model.init_sims(replace=True)

# Persist the trained model under a name that encodes the key settings.
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-07-11 08:32:49,198 : INFO : collecting all words and their counts
2018-07-11 08:32:49,203 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-11 08:32:49,228 : INFO : PROGRESS: at sentence #10000, processed 95241 words, keeping 2799 word types
2018-07-11 08:32:49,255 : INFO : collected 4668 word types from a corpus of 228695 raw words and 19898 sentences
2018-07-11 08:32:49,256 : INFO : Loading a fresh vocabulary
2018-07-11 08:32:49,261 : INFO : min_count=40 retains 527 unique words (11% of original 4668, drops 4141)
2018-07-11 08:32:49,262 : INFO : min_count=40 leaves 209230 word corpus (91% of original 228695, drops 19465)
2018-07-11 08:32:49,270 : INFO : deleting the raw counts dictionary of 4668 items
2018-07-11 08:32:49,273 : INFO : sample=0.001 downsamples 78 most-common words
2018-07-11 08:32:49,274 : INFO : downsampling leaves estimated 128216 word corpus (61.3% of prior 209230)
2018-07-11 08:32:49,276 : INFO : estimated required memory for 5

In [8]:
# Ten nearest neighbours of "kurir" in the embedding space.
model.wv.most_similar("kurir", topn=10)

[('lex', 0.933557391166687),
 ('ninja', 0.8309380412101746),
 ('rumah', 0.8126482963562012),
 ('antar', 0.8105541467666626),
 ('express', 0.7822293639183044),
 ('ramah', 0.7594882845878601),
 ('one', 0.6775049567222595),
 ('layan', 0.6756379008293152),
 ('tempat', 0.6627054810523987),
 ('proses', 0.6541668176651001)]

In [9]:
# Ten nearest neighbours of "mantap" in the embedding space.
model.wv.most_similar("mantap", topn=10)

[('jiwa', 0.8912090063095093),
 ('pokok', 0.8723122477531433),
 ('rekomendasi', 0.8274660706520081),
 ('oke', 0.8247707486152649),
 ('keren', 0.8217877149581909),
 ('muas', 0.7952912449836731),
 ('gin', 0.7788281440734863),
 ('riah', 0.7736654281616211),
 ('elegan', 0.7718223929405212),
 ('ekspektasi', 0.754769504070282)]

In [10]:
# Ten nearest neighbours of "terima" in the embedding space.
model.wv.most_similar("terima", topn=10)

[('gawai', 0.7034754753112793),
 ('lazada', 0.690081000328064),
 ('darat', 0.6896671652793884),
 ('selamat', 0.6381320357322693),
 ('selera', 0.6152774095535278),
 ('kerja', 0.6084685325622559),
 ('tuju', 0.5951776504516602),
 ('kamu', 0.5823189616203308),
 ('alhamdulilah', 0.503341555595398),
 ('jadwal', 0.49905356764793396)]

In [11]:
# Ten nearest neighbours of "kasih" in the embedding space.
model.wv.most_similar("kasih", topn=10)

[('gawai', 0.6531863212585449),
 ('selera', 0.6213202476501465),
 ('lazada', 0.5957981944084167),
 ('sukses', 0.575785219669342),
 ('kamu', 0.5577393174171448),
 ('darat', 0.5537471175193787),
 ('cinta', 0.5337961316108704),
 ('dan', 0.530433177947998),
 ('maju', 0.5183781385421753),
 ('alhamdulilah', 0.5167912840843201)]

In [12]:
# Ten nearest neighbours of "jelek" in the embedding space.
model.wv.most_similar("jelek", topn=10)

[('cewek', 0.9715321660041809),
 ('kaca', 0.9433884620666504),
 ('kaya', 0.9425203800201416),
 ('yah', 0.9410569667816162),
 ('jarum', 0.9304667711257935),
 ('putus', 0.9303438067436218),
 ('main', 0.9299447536468506),
 ('cowok', 0.9272971153259277),
 ('tali', 0.927254319190979),
 ('sedih', 0.9260545969009399)]

In [13]:
# Ten nearest neighbours of "kecewa" in the embedding space.
model.wv.most_similar("kecewa", topn=10)

[('pernah', 0.8638545870780945),
 ('jual', 0.8139086961746216),
 ('customer', 0.8089202046394348),
 ('mau', 0.7955259084701538),
 ('langgan', 0.7924652099609375),
 ('tolong', 0.7831017374992371),
 ('dong', 0.7827646136283875),
 ('jangan', 0.781039834022522),
 ('minta', 0.7753928303718567),
 ('pasti', 0.770636260509491)]

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _passthrough(tokens):
    """Return the document unchanged; used as a no-op tokenizer."""
    return tokens


# Each review is handed to the vectorizer as-is: the tokenizer is a no-op and
# lowercasing is disabled, so no string preprocessing is applied to it.
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=_passthrough)
tvec_weights = vectorizer.fit_transform(raw['review'])

In [62]:
# Average TF-IDF weight of every vocabulary term over all documents
# (column-wise mean of the document-term matrix), flattened to a plain list.
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# One row per term, shown from the highest mean weight downwards.
weights_df = pd.DataFrame({'term': terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(150)

Unnamed: 0,term,weight
312,barang,0.067850
241,bagus,0.059888
4288,terima,0.054614
672,cepat,0.052042
1904,kasih,0.048056
2047,kirim,0.045308
2261,lazada,0.045139
2480,mantap,0.043854
3657,sampai,0.042312
3877,sesuai,0.039849
